Amazing-Python-Scripts

# YouTube Trending Feed Scraper
# Written by XZANATOL
from optparse import OptionParser
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import mongoengine  # talks to MongoDB through pymongo internally
import time
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
    -h, --help    Shows this help message and exits.
    -c, --csv     Saves extracted contents to a CSV file.
    -m, --mongo   Saves extracted contents to a MongoDB database.
"""
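
# Example invocation (the script file name here is illustrative):
#   python scrape_trending.py --csv        # writes Youtube.csv
#   python scrape_trending.py -c -m        # writes the CSV and MongoDB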

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv",
                  help="Saves extracted contents to a CSV file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo",
                  help="Saves extracted contents to a MongoDB database.")

# Define an empty DataFrame up front so append_to_df can extend it later
df = pd.DataFrame()

# MongoDB Collection (Table) Template


class Trending(mongoengine.Document):
    section = mongoengine.StringField(required=True)
    title = mongoengine.StringField(required=True)
    channel = mongoengine.StringField(required=True)
    link = mongoengine.StringField(required=True)
    views = mongoengine.StringField(required=True)
    date = mongoengine.StringField(required=True)

    meta = {"indexes": ["section"]}
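
# An illustrative document as stored through the Trending model (the values
# are made-up examples, not real scraped data):
#   {"section": "Music", "title": "Some Video", "channel": "Some Channel",
#    "link": "https://www.youtube.com/watch?v=...", "views": "1.2M views",
#    "date": "3 days ago"}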


def load_driver():
    """Load the Chrome webdriver."""
    # Selenium 4.6+ resolves the chromedriver binary through Selenium
    # Manager, so the old hard-coded "chromedriver.exe" path (no longer
    # accepted as a positional argument since Selenium 4.10) is not needed.
    driver = webdriver.Chrome()
    return driver
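
# Headless alternative (a sketch; "--headless=new" requires a recent Chrome):
#   options = webdriver.ChromeOptions()
#   options.add_argument("--headless=new")
#   driver = webdriver.Chrome(options=options)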


def page_scrap(driver):
    """Scrape the YouTube trending feed."""
    # Pages to be scraped: Now, Music, Gaming, Movies
    pages = ["https://www.youtube.com/feed/trending",
             "https://www.youtube.com/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D"]
    sections = ["Now", "Music", "Gaming", "Movies"]

    for num in range(4):
        driver.get(pages[num])
        time.sleep(3)  # Make sure the whole page is loaded
        # Extract the first 10 entries (find_elements_by_* was removed in
        # Selenium 4, hence the By locators)
        links = driver.find_elements(By.ID, "video-title")[:10]
        meta_data = driver.find_elements(By.TAG_NAME,
                                         "ytd-video-meta-block")[:10]
        # Guard against pages that expose fewer than 10 entries
        for i in range(min(len(links), len(meta_data))):
            # Split the meta data that will be saved; the text typically
            # yields ["<channel>", "<views>", "<date>"]
            meta_split = meta_data[i].text.split("\n")
            # Sometimes a stray bullet character is extracted as well
            try:
                meta_split.remove("•")
            except ValueError:
                pass
            section = sections[num]     # Scraped from which section?
            link = links[i].get_attribute("href")  # Video link
            title = links[i].text     # Video title
            channel = meta_split[0]  # Channel name
            views = meta_split[1]  # Video views
            date = meta_split[2]  # Release date

            # Validating the flags here is simpler than writing a separate
            # scraping routine for each output target.
            if mongo:
                save_to_db(section, title, channel, link, views, date)
            if csv:
                append_to_df(section, title, channel, link, views, date)

        print(f"[+] Finished scraping the '{sections[num]}' section!")

    # Last validation for csv
    if csv:
        save_to_csv()
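

# A more robust alternative to the fixed time.sleep(3) above (a sketch using
# Selenium's explicit waits; the 10-second timeout is an arbitrary choice):
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(driver, 10).until(EC.presence_of_element_located(
#       (By.TAG_NAME, "ytd-video-renderer")))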


def save_to_db(section, title, channel, link, views, date):
    """Saves a record to the database."""
    # Create object
    record = Trending(
        section=section,
        title=title,
        channel=channel,
        link=link,
        views=views,
        date=date)
    # Save record
    record.save()


def append_to_df(section, title, channel, link, views, date):
    """Appends a record to the dataframe."""
    global df
    # DataFrame.append was removed in pandas 2.0; build a one-row frame and
    # concatenate it instead.
    row = pd.DataFrame([{"section": section,
                         "title": title,
                         "channel": channel,
                         "link": link,
                         "views": views,
                         "date": date}])
    df = pd.concat([df, row], ignore_index=True)


def save_to_csv():
    """Exports the dataframe to a CSV file."""
    df.to_csv("Youtube.csv", index=False, columns=["section", "title",
                                                   "channel", "link",
                                                   "views", "date"])
    # Function end (eye-friendly comment to separate the function end line)


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: at least one output target is required
    if not (csv or mongo):
        print(usage)
        sys.exit()

    if mongo:
        mongoengine.connect("Youtube")

    driver = load_driver()  # Load driver
    page_scrap(driver)  # Start scraping
    print("[+] Done!")
    # End session
    driver.quit()
    sys.exit()
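
# Dependency note (a suggested install; mongoengine pulls in pymongo):
#   pip install selenium pandas mongoengine
# The --mongo option assumes a MongoDB server at the default localhost:27017,
# which is where mongoengine.connect("Youtube") points when no host is given.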