Amazing-Python-Scripts

Форк
0
103 строки · 4.5 Кб
1
import requests as req
2
import re
3
import csv
4

5
# Social Media links to look for
media_links = [
    "instagram",
    "facebook",
    "twitter",
    "linkedin",
    "youtube",
]

# One result row (dict) per successfully scraped site in CSV mode.
output_link_format = []

# Seconds to wait for a site before giving up, so that a single
# unresponsive host cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Matches URL-like tokens: an optional http(s) scheme, then a domain name,
# an IPv4 address or an IPv6 address, an optional port and an optional path.
# Compiled once here because both modes use exactly the same pattern.
LINK_PATTERN = re.compile(
    r"\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b"
)


def extract_social_links(page_text, url):
    """Return the social media links found in a page's text.

    Args:
        page_text: Raw text (typically HTML) of the page to scan.
        url: Address of the page; stored under the "url" key of the result.

    Returns:
        A dict with the key "url" followed by one key per entry of
        ``media_links``. Each platform maps to the last matching link found
        in ``page_text``, or to "" when none was found.
    """
    found_links = {"url": url}
    found_links.update((name, "") for name in media_links)

    for candidate in LINK_PATTERN.findall(page_text):
        for name in media_links:
            # The platform name must appear *after* the first character.
            # Links with a scheme or a "www." prefix qualify, while bare
            # tokens that merely start with the name (for example an asset
            # called "twitter.png") are ignored.
            if candidate.find(name) > 0:
                found_links[name] = candidate
    return found_links


def scrape_site(url):
    """Download a page and collect its social media links.

    Args:
        url: Address of the website to fetch.

    Returns:
        The dict produced by ``extract_social_links``, or ``None`` when the
        request failed or the response status was not 200. In both failure
        cases a message naming the url is printed.
    """
    try:
        response = req.get(url, timeout=REQUEST_TIMEOUT)
    except req.exceptions.RequestException as error:
        # Invalid URLs, DNS failures and timeouts end up here; report them
        # instead of aborting, so a CSV batch can continue with other sites.
        print(url, "could not be fetched:", error)
        return None

    if response.status_code != 200:
        print(url, "did not return status code 200")
        return None
    return extract_social_links(response.text, url)


def main():
    """Run the interactive scraper.

    Choice 1 scrapes a single website and prints the result; choice 2 reads
    one url per line from a file and writes the results to ``output.csv``.
    Anything else (including non-numeric input) is reported as invalid.
    """
    try:
        val = int(
            input(
                "\n----------------------------\n Social Media Links Scraper\n----------------------------\n1. Single Website\n2. CSV file of websites\nEnter your choice from 1 or 2 : "
            ))
    except ValueError:
        # Non-numeric input falls through to the "Invalid choice" branch.
        val = None

    # Single Website Data
    if val == 1:
        link = input("\nEnter the website url : ")
        found_links = scrape_site(link)

        if found_links is not None:
            # Displaying output
            print()
            for k, v in found_links.items():
                print(k, ":", v)

    # Reading sites from csv file and writing output to a new csv file
    elif val == 2:
        # Read every line up front so the input file is closed before any
        # network request is made.
        with open(input("\nEnter file path : ")) as csv_file:
            links = [line.strip() for line in csv_file]

        for link in links:
            if not link:
                # Blank lines are not urls; skip them.
                continue
            found_links = scrape_site(link)
            if found_links is not None:
                output_link_format.append(found_links)

        if not output_link_format:
            # There is no row to take the header from and nothing to write.
            print("No website returned data, output csv file not generated")
            return

        # Generating output csv file
        print("Generating output csv file...")
        keys = output_link_format[0].keys()

        with open('output.csv', 'w', newline='') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(output_link_format)

        print("Output csv file generated successfully")
    else:
        print("Invalid choice! Enter choice between 1-2")


if __name__ == "__main__":
    main()
104

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.