# Amazing-Python-Scripts
# Original listing: 49 lines · 1.7 KB
import re
import sys

import requests
from bs4 import BeautifulSoup

# Accepts https://[<subdomain>.]wikipedia.org/wiki/<title>. The dots are
# escaped so look-alike hosts are rejected, and the title runs up to the first
# "?", "#" or whitespace so titles containing "(", ")", "-", "%" etc. are kept
# whole instead of being cut at the first non-word character.
URL_PATTERN = re.compile(r"^https://(?:[\w-]+\.)*wikipedia\.org/wiki/[^\s?#]+")

# Section headings whose heading line is left out of the output.
EXCLUDED_HEADINGS = frozenset(
    {"See also", "References", "Sources", "Further reading", "External links"}
)

# Text looked for in the first paragraph to detect a page without content.
MISSING_PAGE_MARKER = "Other reasons this message may be displayed:"


def parse_wikipedia_url(url):
    """Return the article URL without query string or fragment.

    Surrounding whitespace is ignored. Returns None when *url* is not an
    https Wikipedia article URL.
    """
    match = URL_PATTERN.search(url.strip())
    return match.group() if match else None


def safe_filename(title):
    """Return *title* made safe for use as a file name.

    Path separators, characters reserved on common file systems and control
    characters are replaced with "_". An empty result falls back to
    "wikipedia_page" so the caller always gets a usable name.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title.strip())
    return cleaned or "wikipedia_page"


def main():
    """Prompt for a Wikipedia URL, print the article text and save it to a file.

    The output file is named after the page title and written as UTF-8 in the
    current directory.

    Returns:
        0 on success; 1 when the URL is invalid, the request fails, or the
        page has no article content.
    """
    # Taking the URL input and validating it using a regex
    article_url = parse_wikipedia_url(input("Enter a valid Wikipedia URL:\n"))
    if article_url is None:
        print("Wrong URL entered. Make sure to enter a valid Wikipedia URL. Make sure to add https:// before the URL if you forgot.")
        return 1

    # Requesting the HTML and making the BeautifulSoup object
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(article_url, timeout=10)
    except requests.RequestException as err:
        print(f"Could not fetch the page: {err}")
        return 1
    soup = BeautifulSoup(response.text, "lxml")

    # Validating if the site has content; find() returns None when the page
    # has no <p> at all, so guard before reading its text.
    first_paragraph = soup.find("p")
    if response.status_code == 404 or (
        first_paragraph is not None
        and first_paragraph.text.strip() == MISSING_PAGE_MARKER
    ):
        print("This Wikipedia site does not exists.\n")
        return 1

    # Retrieving and printing the page title
    heading = soup.find("h1", class_="firstHeading")
    if heading is None:
        print("Could not find the page title.")
        return 1
    page_title = heading.text
    print(f"\n---{page_title}---\n")

    # The title may contain "/" and similar characters, so it is sanitized for
    # the file name only; the text written into the file keeps the real title.
    with open(f"{safe_filename(page_title)}.txt", "w", encoding="utf-8") as out:
        out.write(f"//{article_url}\n---{page_title}---\n\n")

        # Scraping the site for headings and paragraphs, in document order
        for tag in soup.find_all(["span", "p"]):
            if tag.name == "p":
                para = tag.get_text()
                print(f"{para}")  # Printing the paragraph
                out.write(f"{para}")
            elif "mw-headline" in (tag.get("class") or []):
                headline = tag.get_text()
                # NOTE: only the heading line is suppressed for excluded
                # topics; paragraphs under that heading are still written.
                if headline not in EXCLUDED_HEADINGS:
                    print(f"{headline}:\n")  # Printing the heading
                    out.write(f"\n{headline}:\n\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())