Amazing-Python-Scripts

wiki_Scraper.py
49 строк · 1.7 Кб
Перенос по словам
1
import requests
2
from bs4 import BeautifulSoup
3
import re
4

5
# Wikipedia article scraper: asks for an article URL, downloads the page and
# saves its section headings and paragraphs to "<page title>.txt" while also
# echoing them to the console.

# Accepts only https URLs whose host is a subdomain of wikipedia.org (dots are
# escaped so look-alike hosts are rejected). The article title runs up to the
# first whitespace, "?" or "#", so titles containing parentheses, hyphens,
# slashes or percent-escapes are kept whole while query strings and fragments
# are dropped.
WIKI_URL_REGEX = re.compile(
    r"^https://(?:[\w-]+\.)+wikipedia\.org/wiki/[^\s?#]+"
)

# First-paragraph text that this script treats as the sign of a missing article.
MISSING_PAGE_MARKER = "Other reasons this message may be displayed:"

# Section headings that are not written to the output.
EXCLUDED_HEADINGS = (
    "See also",
    "References",
    "Sources",
    "Further reading",
    "External links",
)

# Seconds to wait for Wikipedia before giving up; without a timeout
# requests.get() can block indefinitely.
REQUEST_TIMEOUT = 10

# Characters that are not allowed in file names on at least one major platform.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def extract_wiki_url(url):
    """Return the Wikipedia article URL contained in *url*, or None.

    Surrounding whitespace is ignored. Any query string or fragment is
    stripped from the returned URL. None is returned when *url* is not an
    https link to a ``/wiki/`` page on a wikipedia.org subdomain.
    """
    match = WIKI_URL_REGEX.search(url.strip())
    return match.group() if match is not None else None


def safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced.

    Each of ``\\ / : * ? " < > |`` becomes an underscore so that titles such
    as "AC/DC" can be used as a file name. An empty title falls back to
    "wikipedia_page".
    """
    return _ILLEGAL_FILENAME_CHARS.sub("_", title) or "wikipedia_page"


def main():
    """Prompt for a Wikipedia URL and save the article text to a file.

    Prints a message and returns early (exit status 0, as before) when the
    URL is invalid, the download fails, or the article does not exist.
    """
    # Taking the URL input and validating using Regex
    url = extract_wiki_url(input("Enter a valid Wikipedia URL:\n"))
    if url is None:
        print("Wrong URL entered. Make sure to enter a valid Wikipedia URL. Make sure to add https:// before the URL if you forgot.")
        return

    # Requesting the HTML and making the BeautifulSoup object
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(f"Could not download the page: {err}")
        return
    soup = BeautifulSoup(response.text, "lxml")

    # Validating if the site has content. A page without any <p> element is
    # simply treated as "not the missing-article page" instead of crashing.
    first_paragraph = soup.find("p")
    page_missing = response.status_code == 404 or (
        first_paragraph is not None
        and first_paragraph.get_text().strip() == MISSING_PAGE_MARKER
    )
    if page_missing:
        print("This Wikipedia site does not exists.\n")
        return

    # Retrieving and printing the page title; fall back to the last URL
    # segment if the expected heading element is absent.
    heading = soup.find("h1", class_="firstHeading")
    if heading is not None:
        page_title = heading.get_text()
    else:
        page_title = url.rsplit("/", 1)[-1]
    print(f"\n---{page_title}---\n")

    # Making the text file to save the text data. The context manager closes
    # the file even if scraping raises part-way through.
    with open(f"{safe_filename(page_title)}.txt", "w", encoding="utf-8") as out:
        out.write(f"//{url}\n---{page_title}---\n\n")

        # Scraping the site for headings and paragraphs, in document order.
        # NOTE(review): headings are recognised only as <span> elements whose
        # first class is "mw-headline"; confirm this still matches the markup
        # Wikipedia currently serves.
        for node in soup.descendants:
            if node.name == "span":
                classes = node.get("class")
                if classes and classes[0] == "mw-headline":
                    headline = node.get_text()
                    if headline not in EXCLUDED_HEADINGS:
                        print(f"{headline}:\n")  # Printing the heading
                        out.write(f"\n{headline}:\n\n")
            elif node.name == "p":
                para = node.get_text()
                print(f"{para}")  # Printing the paragraph
                out.write(f"{para}")


if __name__ == "__main__":
    main()
50
Amazing-Python-Scripts

Использование cookies