# Source: Amazing-Python-Scripts
# Original listing: 223 lines · 6.7 KB
1import requests
2from bs4 import BeautifulSoup
3import tkinter as tk
4from tkinter import messagebox, simpledialog
5from tkinter import ttk
6from tkinter import font as tkFont
7import time
8import sqlite3
9from sqlite3 import Error
10
# Function to connect to the SQL database
12
13
def sql_connection():
    """Open a connection to the scraper's SQLite database.

    Returns:
        An open ``sqlite3.Connection`` to
        ``./Stack-overflow-scraper/stackoverflow.db``, or ``None`` when
        SQLite reports an error (the error is printed).
    """
    try:
        return sqlite3.connect('./Stack-overflow-scraper/stackoverflow.db')
    except Error as err:
        # Print the caught exception itself; printing the ``Error`` class
        # would say nothing about what actually went wrong.
        print(err)
        return None
20
# Function to create the questions table
22
23
def sql_table(con):
    """Create the ``questions`` table on *con* if it is not already there.

    Args:
        con: Open SQLite connection.
    """
    create_statement = (
        "CREATE TABLE IF NOT EXISTS questions("
        "question_text text, question_summary text, question_link text,"
        "votes integer, views integer )"
    )
    db_cursor = con.cursor()
    db_cursor.execute(create_statement)
    con.commit()
29
30
# Connect to the database and make sure the questions table exists
con = sql_connection()
if con is None:
    # sql_connection() signals failure by returning None; stop with a clear
    # message instead of crashing on con.cursor() inside sql_table().
    raise SystemExit("Could not open the Stack Overflow questions database.")
sql_table(con)
34
# Function to insert a row into the table
36
37
def sql_insert(con, entities):
    """Insert one scraped question into the ``questions`` table.

    Args:
        con: Open SQLite connection.
        entities: Five values in column order: question text, summary,
            link, votes, views.

    Raises:
        sqlite3.Error: Propagated from SQLite after the pending transaction
            has been rolled back.
    """
    # The connection context manager commits on success and rolls back on
    # error, so a failed INSERT never leaves a transaction open.
    with con:
        con.execute(
            'INSERT INTO questions(question_text, question_summary, question_link, votes, views) VALUES(?, ?, ?, ?, ?)',
            entities)
43
# Function to generate the search URL from the entered tag
45
46
def get_URL():
    """Build the Stack Overflow listing URL for the tag in the search box.

    Returns:
        The most-voted questions URL for the entered tag, or ``None`` (after
        showing an alert) when the search box is empty or only whitespace.
    """
    # Local import: quote() is needed only by this function.
    from urllib.parse import quote

    tag = search_box.get().strip()
    if not tag:
        messagebox.showinfo("Alert", "Please Enter tag!")
        return None
    # Percent-encode characters such as '#', '?', '/' and spaces so they stay
    # part of the tag instead of being parsed as URL syntax. '+' is left
    # untouched so inputs containing it produce the same URL as before.
    url = 'https://stackoverflow.com/questions/tagged/{}?sort=MostVotes&edited=true'.format(
        quote(tag, safe='+'))
    return url
55
56
def number_questions():
    """Read how many questions to scrape from the questions entry box.

    Returns:
        The entered count when it is a whole number from 1 to 15; otherwise
        15 (blank, non-numeric, non-positive, or larger than 15).
    """
    max_questions = 15
    try:
        requested = int(questions_box.get())
    except ValueError:
        # int() raises on blank or non-numeric text; use the cap instead of
        # letting the exception escape into the button callback.
        return max_questions
    if requested < 1 or requested > max_questions:
        return max_questions
    return requested
62
63
def scrape_questions():
    """Scrape the most-voted questions for the entered tag into the database.

    Plays a short progress-bar animation, downloads the listing page built
    by ``get_URL()``, extracts up to ``number_questions()`` question entries
    and stores each one with ``sql_insert``. Results and failures are
    reported through message boxes, and the progress bar is reset on every
    exit path.

    Returns:
        ``None``, or ``""`` when the page contains no question entries
        (kept as-is for backward compatibility).
    """
    # Cosmetic progress animation shown before the blocking download
    for _ in range(5):
        progress['value'] += 15
        window.update_idletasks()
        time.sleep(0.10)

    try:
        try:
            question_count = number_questions()
        except ValueError:
            # Blank or non-numeric count: use the same cap number_questions()
            # applies to out-of-range input.
            question_count = 15

        url = get_URL()
        if not url:
            return

        try:
            # A timeout keeps the GUI from hanging on a stalled connection
            page = requests.get(url, timeout=10)
        except requests.RequestException as err:
            messagebox.showerror(
                "Error", "Could not fetch questions: {}".format(err))
            return

        # Start scraping resultant html data
        soup = BeautifulSoup(page.content, 'html.parser')
        questions = soup.find_all('div', {'class': 'question-summary'})
        if not questions:
            messagebox.showinfo("Invalid", "Invalid search tag")
            return ""

        count = 0
        for question in questions:
            if count >= question_count:
                break

            title_link = question.find('a', {'class': 'question-hyperlink'})
            excerpt = question.find('div', {'class': 'excerpt'})
            vote_count = question.find('span', {'class': 'vote-count-post'})
            view_count = question.find('div', {'class': 'views'})
            # Skip entries missing any expected element instead of failing
            # with AttributeError on a None result.
            parts = (title_link, excerpt, vote_count, view_count)
            if any(part is None for part in parts):
                continue
            href = title_link.get('href')
            view_words = view_count.text.split()
            if href is None or not view_words:
                continue

            question_text = title_link.text.strip()
            question_summary = excerpt.text.strip().replace('\n', ' ')
            question_link = 'https://stackoverflow.com{}'.format(href)
            votes = vote_count.text.strip()
            views = view_words[0]
            entities = (question_text, question_summary,
                        question_link, votes, views)
            sql_insert(con, entities)
            count += 1

        messagebox.showinfo("Success!", "Questions scrapped successfully!")
    finally:
        # Reset the bar however the function exits, including on errors
        clear_progress()
108
# Function to fetch Stack Overflow questions from the DB
110
111
def _truncate_cell(value, limit):
    """Return *value* as text shorter than *limit* characters.

    Text of ``limit`` characters or more is cut to ``limit - 4`` characters
    followed by ``...``; ``None`` becomes an empty string.
    """
    text = "" if value is None else str(value)
    if len(text) < limit:
        return text
    return text[:limit - 4] + "..."


def sql_fetch(con):
    """Fetch the stored questions and format them as a fixed-width table.

    Args:
        con: Open SQLite connection containing the ``questions`` table.

    Returns:
        A header line plus one line per distinct row (newest first), each
        ending in a newline; ``" "`` after an alert when the table is empty;
        ``None`` when the query raises an SQLite error.
    """
    cursorObj = con.cursor()
    try:
        # SQL search query
        cursorObj.execute(
            'SELECT DISTINCT * FROM questions ORDER BY rowid DESC')
    except Error:
        print("Database empty... Fetch questions using GUI")
        return

    rows = cursorObj.fetchall()

    # Show messagebox in case of empty DB
    if not rows:
        messagebox.showinfo("Alert", "No questions scraped yet!")
        return " "

    header = ("{:^65}".format("Question") + "{:^65}".format("Summary")
              + "{:^40}".format("Link") + "{:^15}".format("Votes")
              + "{:^15}".format("Views") + '\n')
    lines = [header]

    # Format rows. Text cells are cut only when they would not fit in their
    # column (65 / 65 / 40 wide), so a link that fits is shown in full.
    for row in rows:
        question_text = "{:<65}".format(_truncate_cell(row[0], 60))
        question_summary = "{:<65}".format(_truncate_cell(row[1], 60))
        question_link = "{:<40}".format(_truncate_cell(row[2], 40))
        votes = "{:^15}".format(row[3])
        views = "{:^15}".format(row[4])
        lines.append(question_text + question_summary
                     + question_link + votes + views + '\n')

    return "".join(lines)
148
149
def show_results():
    """Show the stored questions in the read-only results text widget."""
    display_text = sql_fetch(con)
    if display_text is None:
        # sql_fetch() returns None when its query fails; show an empty view
        # rather than handing None to the text widget.
        display_text = ""

    # The widget is kept disabled so the user cannot edit it; enable it only
    # for the duration of the refresh.
    query_label.config(state=tk.NORMAL)
    try:
        query_label.delete("1.0", "end")
        query_label.insert("1.0", display_text)
    finally:
        # Return to read-only even if the refresh fails
        query_label.config(state=tk.DISABLED)
156
157
def clear_progress():
    """Reset the progress bar: show it full (100), then empty (0)."""
    for level in (100, 0):
        progress['value'] = level
        # Let Tk redraw the bar after each change
        window.update_idletasks()
164
165
# Build the main application window
window = tk.Tk()
window.title('Stack overflow question scraper')
window.geometry('1200x1000')
window.configure(bg='white')

# Widget styles on the 'alt' theme: white/orange buttons and a white frame
style = ttk.Style()
style.theme_use('alt')
style.map('my.TButton', background=[('active', 'white')])
style.configure(
    'my.TButton',
    font=('Helvetica', 16, 'bold'),
    background='white',
    foreground='orange',
)
style.configure('my.TFrame', background='white')

# Title
ttk.Label(
    window,
    text="Stack overflow question scraper",
    background='white',
    foreground="Orange",
    font=("Helvetica", 30, 'bold'),
).grid(row=0, column=1)

# Captions for the two entry boxes
ttk.Label(
    window,
    text="Enter tag (ex - python):",
    background='white',
    font=("Helvetica", 15),
).grid(column=0, row=5, padx=10, pady=25)

ttk.Label(
    window,
    text="No of questions to scrape:",
    background='white',
    font=("Helvetica", 15),
).grid(column=0, row=6, padx=10, pady=5)

# Action buttons: scrape into the DB, and display what the DB holds
scrape_btn = ttk.Button(
    window,
    text="Scrape questions!",
    style='my.TButton',
    command=scrape_questions,
)
scrape_btn.grid(row=5, column=2, pady=5, padx=15, ipadx=5)

display_btn = ttk.Button(
    window,
    text="Display from DB",
    style='my.TButton',
    command=show_results,
)
display_btn.grid(row=6, column=2, pady=5, padx=15, ipadx=5)

# Entry for the tag to search
search_box = tk.Entry(window, font="Helvetica 15", bd=2, width=60)
search_box.grid(row=5, column=1, pady=5, padx=15, ipadx=5)

# Entry for the number of questions to scrape
questions_box = tk.Entry(window, font="Helvetica 15", bd=2, width=60)
questions_box.grid(row=6, column=1, pady=5, padx=15, ipadx=5)

# Frame that holds the results text widget
frame = ttk.Frame(window, style='my.TFrame')
frame.place(relx=0.50, rely=0.18, relwidth=0.98, relheight=0.90, anchor="n")

# Progress bar driven by scrape_questions() and clear_progress()
progress = ttk.Progressbar(
    window,
    orient="horizontal",
    length=200,
    mode="determinate",
)
progress.grid(row=5, column=5, pady=5, padx=15, ipadx=5)

# Text widget that displays the questions read from the DB
query_label = tk.Text(frame, height="52", width="500", bg="alice blue")
query_label.grid(row=10, columnspan=2)

# Hand control to Tk; blocks until the window is closed
window.mainloop()
224