Amazing-Python-Scripts
149 строк · 4.7 Кб
# YouTube Trending Feed Scraper
# Written by XZANATOL
3from optparse import OptionParser4from selenium import webdriver5import pandas as pd6import mongoengine7import pymongo8import time9import sys10
# Help menu: the text printed by the entry point when neither -c nor -m is
# supplied on the command line.
# NOTE(review): any column alignment inside this literal is not visible in
# the reviewed copy -- confirm the intended spacing before reformatting it.
usage = """
<Script> [Options]

[Options]
-h, --help Shows this help message and exit.
-c, --csv Saves extracted contents to a CSV file.
-m, --mongo Saves extracted contents to a MongoDB.
"""
20
# Command-line flags: every entry is a boolean switch whose value stays None
# until the flag is given on the command line.
parser = OptionParser()
for _short, _long, _help_text in (
    ("-c", "--csv", "Saves extracted contents to a CSV file."),
    ("-m", "--mongo", "Saves extracted contents to a MongoDB."),
):
    parser.add_option(
        _short,
        _long,
        dest=_long.lstrip("-"),
        action="store_true",
        help=_help_text,
    )

# Module-level frame that accumulates scraped rows. It is created up front so
# the helpers that declare `global df` always find it defined.
df = pd.DataFrame()
# MongoDB Collection (Table) Template


class Trending(mongoengine.Document):
    """One scraped video, stored as a document in the Trending collection.

    Every field is a required string. ``views`` and ``date`` are stored as
    text rather than as parsed numbers or dates.
    """

    section = mongoengine.StringField(required=True)  # Trending section label
    title = mongoengine.StringField(required=True)  # Video title
    channel = mongoengine.StringField(required=True)  # Channel name
    link = mongoengine.StringField(required=True)  # Video URL
    views = mongoengine.StringField(required=True)  # View count text
    date = mongoengine.StringField(required=True)  # Release date text

    # Declare an index on ``section``.
    meta = {"indexes": ["section"]}
44
def load_driver(executable_path="chromedriver.exe"):
    """Start a Chrome session and return its webdriver.

    Args:
        executable_path: Location of the chromedriver binary. Defaults to
            ``chromedriver.exe``, the path this script has always used.

    Returns:
        The ``selenium.webdriver.Chrome`` instance that was started.
    """
    # Imported locally so the file's top-level import block stays untouched.
    from selenium.webdriver.chrome.service import Service

    try:
        # Selenium 4 takes the driver location through a Service object; the
        # positional path argument is no longer accepted from 4.10 onwards.
        return webdriver.Chrome(service=Service(executable_path))
    except TypeError:
        # Selenium 3 does not know the ``service`` keyword, so fall back to
        # the legacy positional form.
        return webdriver.Chrome(executable_path)
50
def page_scrap(driver):
    """Scrape up to the first ten videos of each YouTube trending section.

    Visits the Now, Music, Gaming and Movies feeds in turn and hands every
    video to ``save_to_db`` and/or ``append_to_df`` depending on the
    module-level ``mongo`` and ``csv`` flags. When ``csv`` is set, the
    collected rows are written out once all sections have been visited.

    Videos whose metadata block does not contain a channel, a view count and
    a date are skipped with a console message instead of aborting the run.

    Args:
        driver: Selenium webdriver used to load and query the pages.
    """
    # Imported locally so the file's top-level import block stays untouched.
    from selenium.webdriver.common.by import By

    # (section label, feed URL) pairs, visited in this order.
    feeds = (
        ("Now", "https://www.youtube.com/feed/trending"),
        ("Music",
         "https://www.youtube.com/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D"),
        ("Gaming",
         "https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D"),
        ("Movies",
         "https://www.youtube.com/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D"),
    )
    limit = 10  # Number of videos taken from each section

    for section, url in feeds:
        driver.get(url)
        time.sleep(3)  # Make sure that all the page is loaded

        # find_elements(By, value) is available in Selenium 3 and 4, whereas
        # the find_elements_by_* helpers were removed in Selenium 4.3.
        links = driver.find_elements(By.ID, "video-title")[:limit]
        meta_blocks = driver.find_elements(
            By.TAG_NAME, "ytd-video-meta-block")[:limit]

        # zip stops at the shorter list, so a page that yields fewer than
        # ``limit`` elements no longer raises IndexError.
        for link_element, meta_element in zip(links, meta_blocks):
            # The "•" separator is sometimes extracted as its own line;
            # drop it so the remaining fields keep their positions.
            fields = [part for part in meta_element.text.split("\n")
                      if part != "•"]
            if len(fields) < 3:
                print(f"[-]Skipped a '{section}' video with incomplete metadata!")
                continue

            channel, views, date = fields[:3]
            link = link_element.get_attribute("href")  # Video link
            title = link_element.text  # Video title

            # Checking the flags here avoids a separate scraping routine
            # for each output target.
            if mongo:
                save_to_db(section, title, channel, link, views, date)
            if csv:
                append_to_df(section, title, channel, link, views, date)

        print(f"[+]Finished scraping '{section}' section!")

    # The CSV file is written once, after every section has been collected.
    if csv:
        save_to_csv()
95
def save_to_db(section, title, channel, link, views, date):
    """Store one scraped video as a ``Trending`` document."""
    fields = {
        "section": section,
        "title": title,
        "channel": channel,
        "link": link,
        "views": views,
        "date": date,
    }
    # Build the document and write it in one step.
    Trending(**fields).save()
109
def append_to_df(section, title, channel, link, views, date):
    """Append one scraped video as a new row of the module-level ``df``.

    ``DataFrame.append`` was removed in pandas 2.0, so the row is built as
    its own single-row frame and joined with ``pd.concat``.

    Args:
        section: Trending section the video was found in.
        title: Video title.
        channel: Channel name.
        link: Video URL.
        views: View count text.
        date: Release date text.
    """
    global df
    row = pd.DataFrame([{
        "section": section,
        "title": title,
        "channel": channel,
        "link": link,
        "views": views,
        "date": date,
    }])
    # An empty frame contributes nothing, so start from the new row directly;
    # otherwise concatenate and renumber the index from zero.
    df = row if df.empty else pd.concat([df, row], ignore_index=True)
120
def save_to_csv():
    """Write the module-level ``df`` to ``Youtube.csv`` without the index.

    The columns are emitted in a fixed order regardless of how the frame
    happens to store them.
    """
    column_order = ["section", "title", "channel", "link", "views", "date"]
    df.to_csv("Youtube.csv", columns=column_order, index=False)
129
if __name__ == "__main__":
    options, args = parser.parse_args()

    # Flags (read as module-level globals by page_scrap)
    csv = options.csv
    mongo = options.mongo
    # At least one output target is required; otherwise show the help text.
    if not (csv or mongo):
        print(usage)
        sys.exit()

    if mongo:
        mongoengine.connect("Youtube")

    driver = load_driver()  # load driver
    try:
        page_scrap(driver)  # start scraping
        print("[+]Done !")
    finally:
        # Always end the browser session, even when scraping fails midway,
        # so no Chrome process is left running.
        driver.quit()
    sys.exit()