Amazing-Python-Scripts
143 lines · 3.4 KB
1import requests2import json3import os4import time5from bs4 import BeautifulSoup6
# to scrape title


def getTitle(soup):
    """Return a page title scraped from *soup*, truncated to 60 chars.

    Tries the og:title / twitter:title meta tags first, then the
    document <title>, then h1 / h2 / p as a last resort.  Returns
    "Not available" when no usable title can be found.
    """
    candidate = (
        soup.find("meta", property="og:title")
        or soup.find("meta", attrs={"name": "twitter:title"})
        or soup.find("title")
        or soup.find("h1")
        or soup.find("h2")
        or soup.find("p")
    )
    if candidate is None:  # bug fix: original crashed (AttributeError) when nothing matched
        return "Not available"
    # meta tags carry their text in the "content" attribute; other tags in their body
    res = candidate.get_text() or candidate.get("content", None)
    if res is None or not res.split():  # bug fix: None-check must precede len()/slicing
        return "Not available"
    if len(res) > 60:
        res = res[:60]
    return res.strip()
# to scrape page description


def getDesc(soup):
    """Return a short page description (max 60 chars) scraped from *soup*.

    Tries the og:description / twitter:description / description meta
    tags first, then falls back to the first <p>.  Returns
    "Not available" when no usable description is found.
    """
    candidate = (
        soup.find("meta", property="og:description")
        or soup.find("meta", attrs={"name": "twitter:description"})
        or soup.find("meta", attrs={"name": "description"})
        or soup.find("p")
    )
    if candidate is None:  # bug fix: original crashed (AttributeError) when nothing matched
        return "Not available"
    # meta tags carry their text in the "content" attribute; <p> in its body
    res = candidate.get_text() or candidate.get("content", None)
    if res is None or not res.split():  # bug fix: None-check must precede len()/slicing
        return "Not available"
    if len(res) > 60:
        res = res[:60]
    return res.strip()
# to scrape image link


def getImage(soup, url):
    """Return an absolute image URL scraped from *soup*.

    Prefers the og:image / twitter:image meta tags, then
    <link rel="img_src">, then the first <img>.  Relative paths are
    resolved against *url*.  Returns "Not available" when no usable
    image reference is found.
    """
    candidate = (
        soup.find("meta", property="og:image")
        or soup.find("meta", attrs={"name": "twitter:image"})
        or soup.find("link", attrs={"rel": "img_src"})
        or soup.find("img")
    )
    if candidate is None:  # bug fix: original crashed (AttributeError) when nothing matched
        return "Not available"
    # meta tags carry the link in "content", <img> in "src", <link> in its body
    res = candidate.get("content", None) or candidate.get_text() or candidate.get("src", None)
    if res is None or not res.split():  # bug fix: None-check must precede indexing
        return "Not available"
    # drop leading "./", "../" or "/" so the path can be joined onto the base url
    res = res.lstrip("./")
    # bug fix: the original tested `"https://" in res` twice instead of checking
    # both schemes, so absolute http:// links were mangled into url + "/" + link
    if not res.startswith(("http://", "https://")):
        res = url + "/" + res
    return res
# print dictionary


def printData(data):
    """Print every key/value pair of *data* as "Key: value", one per line."""
    for key, value in data.items():
        print(f'{key.capitalize()}: {value}')
# start

DB_PATH = 'Link-Preview/db.json'
SEVEN_DAYS_IN_SEC = 7 * 24 * 60 * 60  # cache entry lifetime

print("\n======================")
print("- Link Preview -")
print("======================\n")

# get url from user
url = input("Enter URL to preview : ")

# parsing and checking the url; fall back to a demo site on empty input
if url == "":
    url = 'www.girlscript.tech'
# bug fix: the original tested `"http://" in url OR "https://" in url` (negated),
# which is true for virtually every URL, so absolute http:// links were
# double-prefixed into "https://http://..."
if not url.startswith(("http://", "https://")):
    url = "https://" + url

# first check in the DB (a JSON file acting as a 7-day cache);
# create the file — and its directory — if it doesn't exist yet
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
if not os.path.exists(DB_PATH):
    with open(DB_PATH, "w") as file:
        file.write("{}")

# read db; an empty file is treated as an empty cache
with open(DB_PATH, 'r') as file:
    raw = file.read()
db = json.loads(raw) if raw.strip() else {}

# serve from the cache only while the entry has not yet expired
# (bug fix: the original stored the expiry in milliseconds but compared it
# against seconds, and the comparison was inverted, so only *expired*
# entries were ever served)
if url in db and db[url]["time"] > round(time.time()):
    printData(db[url])
else:
    # not cached (or stale): fetch the page and scrape it
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    newData = {
        "title": getTitle(soup),
        "description": getDesc(soup),
        "url": url,
        "image": getImage(soup, url),
        "time": round(time.time()) + SEVEN_DAYS_IN_SEC,  # expiry, unix seconds
    }
    printData(newData)

    # persist the refreshed entry
    db[url] = newData
    with open(DB_PATH, 'w') as file:
        json.dump(db, file)

print("\n--END--\n")