# Amazing-Python-Scripts — Reddit subreddit scraper
# (161 lines · 5.0 KB)
1import requests
2import csv
3import time
4import sqlite3
5from bs4 import BeautifulSoup
6
7
def sql_connection():
    """
    Open (creating it if necessary) the SQLite database file used to
    store scraped posts.

    :return: an open ``sqlite3.Connection`` to ``SubredditDatabase.db``
    """
    return sqlite3.connect('SubredditDatabase.db')
16
def sql_table(con):
    """
    Create the ``posts`` table (if it does not exist already) that holds
    the scraped post info.

    :param con: open sqlite3 connection
    :return: None
    """
    ddl = ("CREATE TABLE IF NOT EXISTS posts(SUBREDDIT text, TAG text, "
           " TITLE text, AUTHOR text, TIMESTAMP text, UPVOTES int, "
           " COMMENTS text, URL text)")
    con.cursor().execute(ddl)
    con.commit()
30
def sql_insert_table(con, entities):
    """
    Insert one scraped post record into the ``posts`` table and commit.

    :param con: open sqlite3 connection
    :param entities: 8-tuple of
        (subreddit, tag, title, author, timestamp, upvotes, comments, url)
    :return: None
    """
    statement = ('INSERT INTO posts(SUBREDDIT, TAG, TITLE, AUTHOR, '
                 'TIMESTAMP, UPVOTES, COMMENTS, URL) '
                 'VALUES(?, ?, ?, ?, ?, ?, ?, ?)')
    con.cursor().execute(statement, entities)
    con.commit()
44
def scraper():
    """
    Interactively scrape post info (title, author, timestamp, upvotes,
    comment count, URL) from a user-chosen subreddit listing on
    old.reddit.com and store each post in the SQLite database.

    Prompts repeatedly until the user declines to continue.

    :return: None
    :raises ValueError: if the user enters a non-integer for the count
        or tag selection prompts.
    """
    con = sql_connection()
    sql_table(con)

    # Menu choice -> (tag name stored in the DB, URL path suffix).
    tag_options = {
        1: ('hot', '/'),
        2: ('new', '/new/'),
        3: ('rising', '/rising/'),
        4: ('controversial', '/controversial/'),
        5: ('top', '/top/'),
    }

    while True:
        subreddit = input('\n\nEnter the name of the subreddit: r/').lower()
        max_count = int(
            input('Enter the maximum number of entries to collect: '))
        select = int(input('Select tags to add for the search: \n1. hot\n2. new'
                           '\n3. rising\n4. controversial\n5. top\nMake your choice: '))

        # BUG FIX: an out-of-range choice previously left tag/tag_url
        # unbound and crashed with NameError further down.
        if select not in tag_options:
            print('Invalid tag selection.. Try again!')
            continue
        tag, tag_url = tag_options[select]

        # URL for the desired subreddit listing.
        # BUG FIX: tag_url was computed but never appended, so every tag
        # choice silently scraped the default ('hot') listing.
        url = 'https://old.reddit.com/r/' + subreddit + tag_url

        # Using a user-agent to mimic browser activity
        headers = {'User-Agent': 'Mozilla/5.0'}

        req = requests.get(url, headers=headers)

        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            print(f'\nCOLLECTING INFORMATION FOR r/{subreddit}....')

            attrs = {'class': 'thing'}  # each post is a div.thing
            counter = 1
            full = 0  # set to 1 once max_count posts are stored

            while True:
                for post in soup.find_all('div', attrs=attrs):
                    try:
                        # Post title
                        title = post.find('a', class_='title').text

                        # Username of the post author
                        author = post.find('a', class_='author').text

                        # Time of the post (human-readable tooltip text)
                        time_stamp = post.time.attrs['title']

                        # Number of comments; the text is "comment"/"N comments"
                        comments = post.find(
                            'a', class_='comments').text.split()[0]
                        if comments == 'comment':
                            comments = 0

                        # Upvote score; '•' means the score is still hidden
                        upvotes = post.find('div', class_='score likes').text
                        if upvotes == '•':
                            upvotes = "None"

                        # Permalink of the post
                        link = post.find('a', class_='title')['href']
                        link = 'www.reddit.com' + link

                        # Store the collected information in the database
                        entities = (subreddit, tag, title, author, time_stamp,
                                    upvotes, comments, link)
                        sql_insert_table(con, entities)

                        if counter == max_count:
                            full = 1
                            break

                        counter += 1
                    except AttributeError:
                        # Promoted/ad entries lack some fields; skip them.
                        continue

                if full:
                    break

                # Follow the "next" button to the next page, if any.
                # BUG FIX: replaced a bare `except:` (which swallowed even
                # KeyboardInterrupt) with an explicit end-of-listing check.
                next_button = soup.find('span', class_='next-button')
                if next_button is None:
                    break
                next_page_link = next_button.find('a').attrs['href']

                time.sleep(2)  # be polite to reddit between page fetches

                req = requests.get(next_page_link, headers=headers)
                soup = BeautifulSoup(req.text, 'html.parser')

            print('DONE!\n')
            ans = input(
                'Press (y) to continue or any other key to exit: ').lower()
            if ans == 'y':
                continue
            else:
                print('Exiting..')
                break
        else:
            print('Error fetching results.. Try again!')
# Script entry point: run the interactive scraper when executed directly.
if __name__ == '__main__':
    scraper()
162