# Amazing-Python-Scripts
#
# scraper.py
# 86 lines · 3.5 KB
# (code-viewer option: word wrap)
1
import requests
2
from bs4 import BeautifulSoup as bs
3
import argparse
4

5

6
# Command-line interface: one mandatory option holding the movie title.
# "--t" is the original spelling; "-t" and "--title" are conventional aliases.
# dest is pinned to "t" so the parsed value is still read as ``args.t``.
parser = argparse.ArgumentParser(description='IMDB Scraper')
parser.add_argument('-t', '--t', '--title', dest='t', action='store',
                    type=str, required=True,
                    help='Enter the title of the movie')
9

10

11
# Search endpoint, used when the user supplies a title: the query text is
# appended directly to this string.
base = "https://www.imdb.com/find?s=tt&q="
# Title-page prefix, used once a title id is known: the id path is appended
# directly to this string.
base_id = "https://www.imdb.com/title"
15

16

17
# Details-section headings (trailing colon removed) mapped to their info keys.
_DETAIL_KEYS = {
    "Release Date": "date",
    "Country": "country",
    "Language": "language",
    "Budget": "budget",
    "Cumulative Worldwide Gross": "gross",
    "Gross USA": "gross_usa",
    "Opening Weekend USA": "opening_week_usa",
}
# Detail entries whose value is the text of the block's first link rather
# than the text of the whole block.
_LINKED_DETAILS = frozenset({"country", "language"})


def get_info(soup):
    """Extract movie facts from a parsed IMDb title page and print them.

    Every field is looked up independently, so a tag that is missing from
    the page only drops that one field instead of discarding all the fields
    that come after it. The collected dict is printed (followed by blank
    lines) only when more than four fields were found.

    Args:
        soup: Parsed title page exposing the BeautifulSoup ``find`` /
            ``find_all`` / ``get_text`` API.

    Raises:
        AssertionError: With the message "No info found" when not a single
            field could be extracted. Raised explicitly (not via ``assert``)
            so the check survives ``python -O``.
    """
    info = {}

    # Each locator returns the tag holding one headline field. A missing tag
    # makes a lookup yield None, so the chain fails with AttributeError.
    locators = {
        "title": lambda: soup.find(
            'div', attrs={"class": "title_wrapper"}).h1,
        "year": lambda: soup.find(
            'span', attrs={"id": "titleYear"}).a,
        "rating": lambda: soup.find(
            'span', attrs={"itemprop": "ratingValue"}),
        "genre": lambda: soup.find(
            "div", attrs={"class": "subtext"}).a,
        "plot": lambda: soup.find(
            'div', attrs={"id": "titleStoryLine"}).find(
                'div', attrs={"class": "canwrap"}).p.span,
    }
    for key, locate in locators.items():
        try:
            info[key] = locate().get_text(strip=True)
        except AttributeError:
            # This field is absent from the page; keep going with the rest.
            continue

    details = soup.find('div', attrs={"id": "titleDetails"})
    blocks = []
    if details is not None:
        blocks = details.find_all('div', attrs={"class": "txt-block"})

    for block in blocks:
        if block.h4 is None:
            continue
        heading = block.h4.get_text(strip=True)
        # Compare without the trailing colon so "Language" and "Language:"
        # are both recognised.
        key = _DETAIL_KEYS.get(heading.rstrip(":"))
        if key is None:
            continue

        if key in _LINKED_DETAILS:
            if block.a is not None:
                info[key] = block.a.get_text(strip=True)
            continue

        text = block.get_text(strip=True)
        if key == "date":
            # Drop the trailing "See more" link text from the release date.
            text = text.replace("See more»", '')
        info[key] = text.replace(heading, '')

    if not info:
        raise AssertionError("No info found")

    if len(info) > 4:
        print(info, end="\n\n\n")
61

62

63
def find_movie(query, timeout=10):
    """Search IMDb for *query* and print details of the top title matches.

    The search page is fetched, at most five result rows are taken from it,
    and each linked title page is downloaded and handed to ``get_info``.
    Prints "No results found" when the search page has no result rows.

    Args:
        query: Free-text movie title supplied by the user.
        timeout: Seconds to wait on each HTTP request before giving up.
    """
    # Imported locally so this function does not rely on the file's import
    # list being extended.
    from urllib.parse import quote_plus

    # Encode the title so characters such as '&', '#' or '+' stay inside the
    # "q" parameter instead of changing the structure of the URL.
    search_page = requests.get(base + quote_plus(query), timeout=timeout)

    # lxml is used for parsing speed; if it is not installed, replace 'lxml'
    # with 'html.parser'.
    # A query yields far more rows than are useful, so only the first five
    # are collected.
    results = bs(search_page.text, 'lxml').find_all(
        "tr", attrs={"class": "findResult"}, limit=5)
    if not results:
        print("No results found")
        return

    for row in results:
        cell = row.find('td', attrs={"class": "result_text"})
        link = cell.a if cell is not None else None
        if link is None or not link.get("href"):
            # Row without a usable link: nothing to follow.
            continue

        # The href presumably has the form "/title/<id>/..."; dropping its
        # first six characters leaves the part that is appended to base_id.
        title_id = link["href"][6:]
        title_page = requests.get(base_id + title_id, timeout=timeout)
        get_info(bs(title_page.text, 'lxml'))
82

83

84
# Script entry point: read the movie title from the command line and run the
# search for it.
if __name__ == "__main__":
    find_movie(parser.parse_args().t)
87
# Amazing-Python-Scripts
#
# (site footer: cookie usage)