# dream
# 235 lines · 8.0 KB
1#!/usr/bin/env python
2
3import json4import numpy as np5import pandas as pd6import wget7from time import time8
9
# Flags selecting which strategy is used below to pick the set of movie ids.
collect_movies_based_on_rating = False
collect_movies_based_on_numvotes = True

# Download the IMDB dataset dumps used by the rest of the script.
# BUG FIX: "title.episode.tsv.gz" was listed twice, causing a redundant download.
for url in [
    "https://datasets.imdbws.com/name.basics.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.crew.tsv.gz",
    "https://datasets.imdbws.com/title.episode.tsv.gz",
    "https://datasets.imdbws.com/title.principals.tsv.gz",
]:
    filename = wget.download(url)
# # Choose imdb-ids of most popular movies

fpath = "./title.ratings.tsv.gz"
df_ratings = pd.read_table(fpath, low_memory=False)

# "\N" is IMDB's missing-value marker; map it to NaN for the year/adult columns.
df = pd.read_table(
    "./title.basics.tsv.gz", low_memory=False, na_values={"startYear": ["\\N"], "endYear": ["\\N"], "isAdult": ["\\N"]}
)

df = df.merge(df_ratings, on="tconst")

# Fill missing values so the integer casts below cannot fail.
# BUG FIX: the original first ran df["startYear"].fillna(value=df["startYear"]),
# which fills a column with itself — a no-op; it has been removed.
df["startYear"] = df["startYear"].fillna(value=0)
df["endYear"] = df["endYear"].fillna(value=df["startYear"])
df["isAdult"] = df["isAdult"].fillna(value=0)

df = df.astype(dtype={"startYear": np.int32, "endYear": np.int32, "isAdult": np.int32})

# Keep only feature films, TV movies and (mini-)series.
target = ["movie", "tvMovie", "tvSeries", "tvMiniSeries"]
ind_drop = df[~df["titleType"].isin(target)].index
df = df.drop(ind_drop)
# Collect the candidate movie ids according to the enabled strategies.
movies_ids = []

if collect_movies_based_on_rating:
    # Older movies must clear a higher rating bar to be included.
    movies_ids.extend(df.loc[(df["startYear"] <= 1990) & (df["averageRating"] > 8), "tconst"].values)
    movies_ids.extend(
        df.loc[(df["startYear"] > 1990) & (df["startYear"] <= 2005) & (df["averageRating"] > 7), "tconst"].values
    )
    movies_ids.extend(
        df.loc[(df["startYear"] > 2005) & (df["startYear"] <= 2015) & (df["averageRating"] > 6), "tconst"].values
    )
    movies_ids.extend(
        df.loc[(df["startYear"] > 2015) & (df["startYear"] <= 2021) & (df["averageRating"] > 5), "tconst"].values
    )

if collect_movies_based_on_numvotes:
    # Popularity proxy: either many votes or a solid rating.
    movies_ids.extend(df.loc[df["numVotes"] > 1000, "tconst"].values)
    movies_ids.extend(df.loc[df["averageRating"] > 6.0, "tconst"].values)

# BUG FIX: previously each branch opened "imdb_ids.txt" in "w" mode, so enabling
# both flags made the second branch silently overwrite the first branch's
# output. Deduplicate and write the union of both strategies once instead.
movies_ids = list(set(movies_ids))
with open("imdb_ids.txt", "w") as f:
    for movie_id in movies_ids:
        f.write(str(movie_id) + "\n")

with open("imdb_ids.txt", "r") as f:
    all_movies_ids = f.read().splitlines()

all_movies_ids = list(set(all_movies_ids))
print(f"Total number of considered movies: {len(all_movies_ids)}")
# # Collect titles and ratings

t0 = time()
fpath = "./title.ratings.tsv.gz"
df_ratings = pd.read_table(fpath, low_memory=False)

# Keep only the ratings rows of the selected movies.
ind_drop = df_ratings[~df_ratings["tconst"].isin(all_movies_ids)].index
df_ratings = df_ratings.drop(ind_drop)
# BUG FIX: the assert message used `print(...)`, which evaluates to None, so a
# failing assert carried no message. Use a plain string instead.
assert df_ratings.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

fpath = "./title.basics.tsv.gz"

df = pd.read_table(fpath, low_memory=False, na_values={"startYear": ["\\N"], "endYear": ["\\N"], "isAdult": ["\\N"]})

ind_drop = df[~df["tconst"].isin(all_movies_ids)].index
df = df.drop(ind_drop)

df = df.merge(df_ratings, on="tconst")

df.rename(
    columns={
        "originalTitle": "original title",
        "primaryTitle": "title",
        "genres": "genre",
        "averageRating": "imdb_rating",
        "tconst": "imdb_id",
    },
    inplace=True,
)

df.drop_duplicates(inplace=True)

# Append "Series" to the genre string for TV (mini-)series, normalize IMDB's
# "\N" marker to "", and split the genre string into a list.
df["titleType"] = df["titleType"].apply(lambda x: "Series" if "Series" in x else "")
df["genre"] = [",".join([x, y]) if y != "" else x for x, y in zip(df["genre"], df["titleType"])]
df["genre"] = df["genre"].apply(lambda x: x if x != "\\N" else "")
df["genre"] = df["genre"].apply(lambda x: x.split(","))

# 0 stands for "unknown year" until the final export maps it to None.
df.fillna({"startYear": 0, "endYear": 0}, inplace=True)
df["startYear"] = df["startYear"].astype("int")
df["endYear"] = df["endYear"].astype("int")
df.drop(["titleType", "isAdult", "runtimeMinutes"], axis=1, inplace=True)
assert df.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

print(f"Total time: {time() - t0}")
# # Collect names of actors etc

t0 = time()
fpath = "./title.principals.tsv.gz"

df_principals = pd.read_table(fpath)
df_principals = df_principals.loc[:, ["tconst", "nconst", "ordering", "category", "characters"]]
df_principals.rename(columns={"tconst": "imdb_id"}, inplace=True)
print(df_principals.head())

# Keep only rows for the selected movies.
ind_drop = df_principals[~df_principals["imdb_id"].isin(all_movies_ids)].index
df_principals = df_principals.drop(ind_drop)
print(df_principals.head())

# Keep only the top-6 billed principals per title.
ind_drop = df_principals[~df_principals["ordering"].isin([1, 2, 3, 4, 5, 6])].index
df_principals = df_principals.drop(ind_drop)
print(df_principals.head())

# Merge "actress" into "actor", then restrict to the professions we keep.
df_principals["category"] = df_principals["category"].apply(lambda x: x if x != "actress" else "actor")
target_profs = ["director", "producer", "actor", "writer"]
ind_drop = df_principals[~df_principals["category"].isin(target_profs)].index
df_principals = df_principals.drop(ind_drop)
print(df_principals.head())

fpath = "./name.basics.tsv.gz"

df_names = pd.read_table(fpath)
df_names = df_names.loc[:, ["primaryName", "nconst"]]
print(df_names.head())

df_principals = df_principals.merge(df_names, on="nconst")
print(df_principals["characters"])

# Parse the JSON-encoded character lists.
# BUG FIX: the "no character data" sentinel was read from an arbitrary row
# (special_char = df_principals.loc[4, "characters"]), which only works if that
# particular row happens to hold IMDB's missing-value marker "\N". Use the
# literal marker, and also treat non-string values (NaN) as empty.
df_principals["characters"] = df_principals["characters"].apply(
    lambda x: [] if not isinstance(x, str) or x == "\\N" or len(x) == 0 else json.loads(x)
)
print(df_principals.head())

print(f"Total time: {time() - t0}")
167# # Collect persons
168t0 = time()169
170
def collect_movie_persons(x):
    """Aggregate the principals rows of a single movie into per-role lists.

    Parameters
    ----------
    x : pd.DataFrame
        Principals rows of one movie; must contain the columns
        "ordering", "category", "primaryName" and "characters".

    Returns
    -------
    pd.Series
        Keys "directors", "producers", "actors", "writers" (lists of names
        in billing order) and "characters" (list of per-actor character
        lists, in the actors' billing order).
    """
    # BUG FIX: the original built a boolean mask from a sorted copy
    # (x.sort_values(...)["category"] == prof) and applied it to the
    # *unsorted* frame; .loc aligns boolean masks by index label, so the
    # sort had no effect and names came out in file order. Sort the frame
    # itself so selection order follows "ordering".
    ordered = x.sort_values(by=["ordering"])
    return pd.Series(
        {
            f"{role}s": ordered.loc[ordered["category"] == prof, name].values.tolist()
            for prof, role, name in zip(
                ["director", "producer", "actor", "writer", "actor"],
                ["director", "producer", "actor", "writer", "character"],
                ["primaryName", "primaryName", "primaryName", "primaryName", "characters"],
            )
        }
    )
183
# Aggregate principals per movie into role lists and join them onto df.
df_principals = pd.DataFrame(df_principals.groupby("imdb_id").apply(collect_movie_persons))
print(df_principals.head())
# Flatten the per-actor character lists into one flat list per movie.
df_principals["characters"] = df_principals["characters"].apply(lambda x: sum(x, []) if isinstance(x, list) else [])
print(df_principals.head())

df.set_index("imdb_id", inplace=True)
df = df.join(df_principals, on="imdb_id")
df.reset_index(inplace=True)

# Movies with no principals get "" placeholders (turned into [] at export time).
df.fillna(value={f"{prof}s": "" for prof in target_profs}, inplace=True)

# BUG FIX: the assert message used `print(...)`, which evaluates to None, so a
# failing assert carried no message. Use a plain string instead.
assert df.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

print(f"Total time: {time() - t0}")
# # Collect alternative titles

t0 = time()

fpath = "./title.akas.tsv.gz"

df_akas = pd.read_table(fpath, low_memory=False)
# BUG FIX: operate on a copy of the US-region slice so the in-place rename
# below does not act on a view and raise SettingWithCopyWarning.
df_akas = df_akas.loc[df_akas["region"] == "US", :].copy()
df_akas.rename(columns={"titleId": "imdb_id"}, inplace=True)

ind_drop = df_akas[~df_akas["imdb_id"].isin(all_movies_ids)].index
df_akas = df_akas.drop(ind_drop)

# Join all US titles of each movie into one "::"-separated string.
grouped_data = df_akas.groupby("imdb_id")["title"].apply(lambda x: "::".join(x))
df_titles = pd.DataFrame(grouped_data)
df_titles.rename(columns={"title": "all_titles"}, inplace=True)

df.set_index("imdb_id", inplace=True)
df = df.join(df_titles, on="imdb_id")
df.reset_index(inplace=True)
df.fillna(value={"all_titles": ""}, inplace=True)

# BUG FIX: the assert message used `print(...)` (evaluates to None); use a string.
assert df.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

print(f"Total time: {time() - t0}")
225database = df.to_dict("records")226for el in database:227el["genre"] = el["genre"] if el["genre"] != "" else None228el["startYear"] = el["startYear"] if el["startYear"] != 0 else None229el["endYear"] = el["endYear"] if el["endYear"] != 0 else None230el["all_titles"] = el["all_titles"].split("::") if el["all_titles"] != "" else []231for prof in ["director", "producer", "actor", "writer"]:232el[f"{prof}s"] = list(el[f"{prof}s"]) if list(el[f"{prof}s"]) != "" else []233
234with open("database_most_popular_main_info.json", "w") as f:235json.dump(database, f, indent=2)236