# Amazing-Python-Scripts
# 86 lines · 3.5 KB
import argparse
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup as bs
5
# Command-line interface: the movie title to look up is passed with --t.
parser = argparse.ArgumentParser(description='IMDB Scraper')
parser.add_argument('--t', action='store', type=str, required=True,
                    help='Enter the title of the movie')
10
# Base id url is used when the title id is known
base_id = "https://www.imdb.com/title"
# Base url is used when the user gives a title to search for
base = "https://www.imdb.com/find?s=tt&q="
16
def _node_text(locate):
    """Return the stripped text of the element ``locate()`` yields, else None.

    A lookup that finds nothing yields ``None``, so the next attribute access
    on it raises AttributeError; that is the only error treated as "absent".
    """
    try:
        return locate().get_text(strip=True)
    except AttributeError:
        return None


def get_info(soup):
    """Collect the key facts of a movie from a parsed title page.

    Every field is looked up on its own, so one element missing from the page
    only drops that field instead of discarding everything after it.

    Args:
        soup: Parsed title page. Only ``find`` and the tag-navigation
            attributes of the elements it returns are used (a BeautifulSoup
            object in this script).

    Returns:
        dict: The fields that could be extracted. Possible keys are
        "title", "year", "rating", "genre", "plot", "date", "country",
        "language", "budget", "gross", "gross_usa" and "opening_week_usa".
        The dict is also printed when it holds more than four fields.

    Raises:
        AssertionError: If not a single field could be extracted. Raised
            explicitly (not via ``assert``) so it still fires under ``-O``.
    """
    # Fields that sit at fixed places in the page header / storyline.
    header_fields = (
        ("title", lambda: soup.find(
            'div', attrs={"class": "title_wrapper"}).h1),
        ("year", lambda: soup.find(
            'span', attrs={"id": "titleYear"}).a),
        ("rating", lambda: soup.find(
            'span', attrs={"itemprop": "ratingValue"})),
        ("genre", lambda: soup.find(
            "div", attrs={"class": "subtext"}).a),
        ("plot", lambda: soup.find(
            'div', attrs={"id": "titleStoryLine"}).find(
                'div', attrs={"class": "canwrap"}).p.span),
    )
    # Detail headings whose value is the text of the block's first link.
    # Every other heading carries a trailing colon, so "Language:" is the
    # expected spelling; the colon-less form is kept for compatibility.
    link_fields = {
        "Country:": "country",
        "Language:": "language",
        "Language": "language",
    }
    # Detail headings whose value is the block text minus the heading itself.
    text_fields = {
        "Release Date:": "date",
        "Budget:": "budget",
        "Cumulative Worldwide Gross:": "gross",
        "Gross USA:": "gross_usa",
        "Opening Weekend USA:": "opening_week_usa",
    }

    info = {}
    for key, locate in header_fields:
        text = _node_text(locate)
        if text is not None:
            info[key] = text

    details = soup.find('div', attrs={"id": "titleDetails"})
    blocks = [] if details is None else details.find_all(
        'div', attrs={"class": "txt-block"})
    for block in blocks:
        if block.h4 is None:
            # Without a heading the block cannot be one of the wanted fields.
            continue
        heading = block.h4.get_text(strip=True)
        if heading in link_fields:
            if block.a is not None:
                info[link_fields[heading]] = block.a.get_text(strip=True)
        elif heading in text_fields:
            text = block.get_text(strip=True)
            if heading == "Release Date:":
                # The release-date block also contains a "See more" link.
                text = text.replace("See more»", '')
            info[text_fields[heading]] = text.replace(heading, '')

    if not info:
        raise AssertionError("No info found")

    if len(info) > 4:
        print(info, end="\n\n\n")
    return info
62
# Seconds to wait for IMDb before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10


def find_movie(query):
    """Search for *query* and print the details of the top five matches.

    Args:
        query: Movie title to search for, exactly as typed by the user.

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            answer within ``_REQUEST_TIMEOUT`` seconds.
    """
    # The title is free-form user input: escape it so characters such as
    # spaces, '&' or '#' cannot break or truncate the query string.
    resp = requests.get(base + quote_plus(query), timeout=_REQUEST_TIMEOUT)
    # lxml is used for parsing speed; if lxml does not work for you,
    # replace 'lxml' with 'html.parser'.
    search_page = bs(resp.text, 'lxml')
    # A query yields about 150-200 result rows; only the top 5 are looked up.
    results = search_page.find_all("tr", attrs={"class": "findResult"})[:5]
    if not results:
        print("No results found")
        return

    for row in results:
        cell = row.find('td', attrs={"class": "result_text"})
        link = cell.a if cell is not None else None
        if link is None or "href" not in link.attrs:
            # A row without a title link gives nothing to follow.
            continue
        # Drop the first six characters of the href (presumably the "/title"
        # prefix, which base_id already ends with) to get the title id path.
        title_id = link.attrs["href"][6:]
        title_page = requests.get(base_id + title_id,
                                  timeout=_REQUEST_TIMEOUT)
        get_info(bs(title_page.text, 'lxml'))
83
if __name__ == "__main__":
    # Look up the title given on the command line with --t.
    args = parser.parse_args()
    find_movie(args.t)