Amazing-Python-Scripts

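"""
Subreddit scraper: asks for a subreddit name, a listing tag (hot, new, rising,
controversial or top) and a maximum number of posts, then scrapes
old.reddit.com with requests + BeautifulSoup and stores each post's title,
author, timestamp, upvote count, comment count and URL in an SQLite database
(SubredditDatabase.db).
"""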
import requests
import csv
import time
import sqlite3
from bs4 import BeautifulSoup


def sql_connection():
    """
    Establishes a connection to the SQLite database file
    :return: connection object
    """
    con = sqlite3.connect('SubredditDatabase.db')
    return con


def sql_table(con):
    """
    Creates a table in the database (if it does not already exist)
    to store the post info
    :param con: database connection object
    :return:
    """
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS posts(SUBREDDIT text, TAG text, "
                " TITLE text, AUTHOR text, TIMESTAMP text, UPVOTES int, "
                " COMMENTS text, URL text)")
    con.commit()


def sql_insert_table(con, entities):
    """
    Inserts the collected post info into the posts table
    :param con: database connection object
    :param entities: tuple of values for a single post
    :return:
    """
    cur = con.cursor()
    cur.execute('INSERT INTO posts(SUBREDDIT, TAG, TITLE, AUTHOR, '
                'TIMESTAMP, UPVOTES, COMMENTS, URL) '
                'VALUES(?, ?, ?, ?, ?, ?, ?, ?)', entities)
    con.commit()
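

# A minimal sketch of how the stored rows could be read back for a quick check.
# The helper name and the column selection are illustrative (nothing in the
# scraper calls it); it only assumes the 'posts' table and the
# 'SubredditDatabase.db' file created by the functions above.
def preview_posts(limit=5):
    con = sql_connection()
    cur = con.cursor()
    # Most recently inserted rows first
    cur.execute('SELECT SUBREDDIT, TAG, TITLE, AUTHOR, UPVOTES, URL '
                'FROM posts ORDER BY rowid DESC LIMIT ?', (limit,))
    for row in cur.fetchall():
        print(row)
    con.close()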


def scraper():
    """
    Scrapes the post info from the desired subreddit and stores it
    in the SQLite database.
    :return:
    """
    con = sql_connection()
    sql_table(con)

    while True:
        subreddit = input('\n\nEnter the name of the subreddit: r/').lower()
        max_count = int(
            input('Enter the maximum number of entries to collect: '))
        select = int(input('Select tags to add for the search: \n1. hot\n2. new'
                           '\n3. rising\n4. controversial\n5. top\nMake your choice: '))

        if select == 1:
            tag = 'hot'
            tag_url = '/'
        elif select == 2:
            tag = 'new'
            tag_url = '/new/'
        elif select == 3:
            tag = 'rising'
            tag_url = '/rising/'
        elif select == 4:
            tag = 'controversial'
            tag_url = '/controversial/'
        elif select == 5:
            tag = 'top'
            tag_url = '/top/'
        else:
            # Fall back to the default (hot) listing on an invalid choice
            tag = 'hot'
            tag_url = '/'

        # URL for the desired subreddit listing
        url = 'https://old.reddit.com/r/' + subreddit + tag_url

        # Using a user-agent to mimic browser activity
        headers = {'User-Agent': 'Mozilla/5.0'}

        req = requests.get(url, headers=headers)

        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            print(f'\nCOLLECTING INFORMATION FOR r/{subreddit}....')

            attrs = {'class': 'thing'}
            counter = 1
            full = 0
            while True:
                for post in soup.find_all('div', attrs=attrs):
                    try:
                        # To obtain the post title
                        title = post.find('a', class_='title').text

                        # To get the username of the post author
                        author = post.find('a', class_='author').text

                        # To obtain the time of the post
                        time_stamp = post.time.attrs['title']

                        # To obtain the number of comments on the post
                        comments = post.find(
                            'a', class_='comments').text.split()[0]
                        if comments == 'comment':
                            comments = 0

                        # To get the number of upvotes on the post
                        upvotes = post.find('div', class_='score likes').text
                        if upvotes == '•':
                            upvotes = 'None'

                        # To get the URL of the post (external links already
                        # carry a full URL, so only prefix relative ones)
                        link = post.find('a', class_='title')['href']
                        if link.startswith('/'):
                            link = 'www.reddit.com' + link

                        # Entering all the collected information into our database
                        entities = (subreddit, tag, title, author, time_stamp,
                                    upvotes, comments, link)
                        sql_insert_table(con, entities)

                        if counter == max_count:
                            full = 1
                            break

                        counter += 1
                    except AttributeError:
                        continue

                if full:
                    break

                try:
                    # To go to the next page
                    next_button = soup.find('span', class_='next-button')
                    next_page_link = next_button.find('a').attrs['href']

                    time.sleep(2)

                    req = requests.get(next_page_link, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                except (AttributeError, requests.RequestException):
                    # No next button (last page reached) or the request failed
                    break

            print('DONE!\n')
            ans = input(
                'Press (y) to continue or any other key to exit: ').lower()
            if ans == 'y':
                continue
            else:
                print('Exiting..')
                break
        else:
            print('Error fetching results.. Try again!')
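

# A minimal sketch of the field extraction used above, run against a small
# hypothetical snippet of old.reddit markup (the class names 'thing', 'title',
# 'author', 'score likes' and 'comments' are the ones the scraper relies on).
# It is illustrative only and is not called by the script.
def _selector_demo():
    sample = ('<div class="thing">'
              '<a class="title" href="/r/python/comments/abc/">Example post</a>'
              '<a class="author">example_user</a>'
              '<div class="score likes">42</div>'
              '<a class="comments">7 comments</a>'
              '</div>')
    post = BeautifulSoup(sample, 'html.parser').find('div', class_='thing')
    print(post.find('a', class_='title').text)                 # Example post
    print(post.find('a', class_='author').text)                # example_user
    print(post.find('div', class_='score likes').text)         # 42
    print(post.find('a', class_='comments').text.split()[0])   # 7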


if __name__ == '__main__':
    scraper()