Amazing-Python-Scripts

import requests
import json
import os
import time
from bs4 import BeautifulSoup

# to scrape the page title


def getTitle(soup):
    ogTitle = soup.find("meta", property="og:title")
    twitterTitle = soup.find("meta", attrs={"name": "twitter:title"})
    documentTitle = soup.find("title")
    h1Title = soup.find("h1")
    h2Title = soup.find("h2")
    pTitle = soup.find("p")

    # take the first candidate that exists on the page
    res = ogTitle or twitterTitle or documentTitle or h1Title or h2Title or pTitle
    if res is None:
        return "Not available"
    # meta tags keep the title in their "content" attribute, other tags in their text
    res = res.get("content") or res.get_text()
    if len(res.split()) == 0:
        return "Not available"
    if len(res) > 60:
        res = res[:60]
    return res.strip()
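
# A minimal illustration of getTitle, kept as a comment so the script's
# behaviour is unchanged (the HTML snippet is made up for the example):
#   demoSoup = BeautifulSoup('<meta property="og:title" content="Demo"/>', "html.parser")
#   getTitle(demoSoup)  # -> "Demo"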

# to scrape the page description


def getDesc(soup):
    ogDesc = soup.find("meta", property="og:description")
    twitterDesc = soup.find("meta", attrs={"name": "twitter:description"})
    metaDesc = soup.find("meta", attrs={"name": "description"})
    pDesc = soup.find("p")

    # take the first candidate that exists on the page
    res = ogDesc or twitterDesc or metaDesc or pDesc
    if res is None:
        return "Not available"
    # meta tags keep the description in "content", <p> tags in their text
    res = res.get("content") or res.get_text()
    if len(res.split()) == 0:
        return "Not available"
    if len(res) > 60:
        res = res[:60]
    return res.strip()
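
# Illustrative fallback behaviour (comment only, snippet is made up):
#   getDesc(BeautifulSoup("<p>A short description.</p>", "html.parser"))
#   # -> "A short description." (used because no description meta tag exists)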

# to scrape the preview image link


def getImage(soup, url):
    ogImg = soup.find("meta", property="og:image")
    twitterImg = soup.find("meta", attrs={"name": "twitter:image"})
    metaImg = soup.find("link", attrs={"rel": "img_src"})
    img = soup.find("img")

    # take the first candidate that exists on the page
    res = ogImg or twitterImg or metaImg or img
    if res is None:
        return "Not available"
    # meta tags keep the link in "content", <link> in "href", <img> in "src"
    res = res.get("content") or res.get("href") or res.get("src")
    if res is None or len(res.split()) == 0:
        return "Not available"
    # drop a leading "./" or "/" so the path can be joined with the base url
    res = res.lstrip("./")
    # turn a relative path into an absolute link
    if not res.startswith("http://") and not res.startswith("https://"):
        res = url + "/" + res
    return res
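
# Illustrative relative-link handling (comment only, names are made up):
#   soup = BeautifulSoup('<img src="./static/logo.png"/>', "html.parser")
#   getImage(soup, "https://example.com")
#   # -> "https://example.com/static/logo.png"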

# print the preview dictionary


def printData(data):
    for key, value in data.items():
        print(f'{key.capitalize()}: {value}')
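
# Illustrative call (comment only, values are made up):
#   printData({"title": "Demo", "url": "https://example.com"})
#   # Title: Demo
#   # Url: https://example.com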


# start
print("\n======================")
print("- Link Preview -")
print("======================\n")

# get url from user
url = input("Enter URL to preview : ")

# normalise and check the url
if url == "":
    url = 'www.girlscript.tech'
if not url.startswith("http://") and not url.startswith("https://"):
    url = "https://" + url

# look the url up in the local cache first
db = {}
# create the db file if it doesn't exist
if not os.path.exists('Link-Preview/db.json'):
    with open('Link-Preview/db.json', "w") as f:
        f.write("{}")

# read the db
with open('Link-Preview/db.json', 'r+') as file:
    data = file.read()
    if len(data) == 0:
        data = "{}"
        file.write(data)
    db = json.loads(data)
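
# db.json maps each url to its cached preview; one entry looks roughly like
# this (illustrative values; "time" is the expiry as a unix timestamp in seconds):
#   {"https://example.com": {"title": "...", "description": "...",
#                            "url": "https://example.com", "image": "...", "time": 1700000000}}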

# serve from the cache while the entry has not expired yet
if url in db and round(time.time()) < db[url]["time"]:
    printData(db[url])
else:
    # not cached (or expired), so fetch the page

    # getting the html
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    sevenDaysInSec = 7 * 24 * 60 * 60
    # scrape the preview data; the entry expires seven days from now
    newData = {
        "title": getTitle(soup),
        "description": getDesc(soup),
        "url": url,
        "image": getImage(soup, url),
        "time": round(time.time()) + sevenDaysInSec
    }
    printData(newData)
    # persist the updated cache
    db[url] = newData
    with open('Link-Preview/db.json', 'w') as file:
        json.dump(db, file)

print("\n--END--\n")
