Amazing-Python-Scripts

Форк
0
223 строки · 6.7 Кб
1
import requests
2
from bs4 import BeautifulSoup
3
import tkinter as tk
4
from tkinter import messagebox, simpledialog
5
from tkinter import ttk
6
from tkinter import font as tkFont
7
import time
8
import sqlite3
9
from sqlite3 import Error
10

11
# Function to connect to the SQL Database
12

13

14
def sql_connection(db_path='./Stack-overflow-scraper/stackoverflow.db'):
    """Open a connection to the SQLite database that stores scraped questions.

    Args:
        db_path: Location of the database file. Defaults to the path this
            script has always used, so existing callers are unaffected.

    Returns:
        An open ``sqlite3.Connection``, or ``None`` when the database could
        not be opened (the underlying error is printed).
    """
    try:
        return sqlite3.connect(db_path)
    except Error as err:
        # Print the actual exception; printing the bare ``Error`` class
        # gives no hint about what went wrong.
        print(err)
        return None
20

21
# Function to create table
22

23

24
def sql_table(con):
    """Create the ``questions`` table if it does not already exist.

    Args:
        con: An open ``sqlite3.Connection``.
    """
    cursor = con.cursor()
    try:
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS questions(question_text text, question_summary text, question_link text,votes integer, views integer )")
        con.commit()
    finally:
        # Release the cursor even if the statement fails.
        cursor.close()
29

30

31
# Connect to the database and make sure the questions table exists.
con = sql_connection()
if con is None:
    # sql_connection() returns None when the database cannot be opened;
    # stop here with a clear message instead of failing later on con.cursor().
    raise SystemExit("Could not open the questions database; aborting.")
sql_table(con)
34

35
# Function to insert into table
36

37

38
def sql_insert(con, entities):
    """Insert one scraped question into the ``questions`` table.

    Args:
        con: An open ``sqlite3.Connection``.
        entities: A 5-item sequence in the order
            (question_text, question_summary, question_link, votes, views).

    Raises:
        sqlite3.Error: If the insert fails; the transaction is rolled back
            before the exception propagates.
    """
    # The connection context manager commits on success and rolls back if
    # the statement raises, so a failed insert leaves no open transaction.
    with con:
        con.execute(
            'INSERT INTO questions(question_text, question_summary, question_link, votes, views) VALUES(?, ?, ?, ?, ?)', entities)
43

44
# Function to generate URL based on choice
45

46

47
def get_URL(tag=None):
    """Build the Stack Overflow "most voted" listing URL for a tag.

    Args:
        tag: The tag to search for. When omitted, the text currently typed
            into ``search_box`` is used.

    Returns:
        The listing URL as a string, or ``None`` (after showing an alert)
        when the tag is empty.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    if tag is None:
        tag = search_box.get()
    tag = tag.strip()
    if not tag:
        messagebox.showinfo("Alert", "Please Enter tag!")
        return None
    # Percent-encode the tag so characters such as '#' or '+' (e.g. "c#",
    # "c++") stay part of the path instead of being read as URL syntax.
    return 'https://stackoverflow.com/questions/tagged/{}?sort=MostVotes&edited=true'.format(
        quote(tag, safe=''))
55

56

57
def number_questions(raw=None):
    """Return how many questions to scrape, between 1 and 15.

    Args:
        raw: The requested count as text (or an int). When omitted, the
            text currently typed into ``questions_box`` is used.

    Returns:
        The requested count if it is a whole number from 1 to 15;
        otherwise 15 (empty, non-numeric, non-positive or too-large input).
    """
    max_questions = 15
    if raw is None:
        raw = questions_box.get()
    try:
        questions = int(raw)
    except (TypeError, ValueError):
        # Empty or non-numeric input: fall back to the maximum.
        return max_questions
    if questions < 1 or questions > max_questions:
        return max_questions
    return questions
62

63

64
def scrape_questions():
    """Scrape the top-voted questions for the entered tag into the database.

    Reads the tag and the requested count from the GUI, downloads the
    Stack Overflow listing page, extracts title, excerpt, link, votes and
    views for each question summary, and stores them with ``sql_insert``.
    The user is told about the outcome through message boxes, and the
    progress bar is reset before returning.
    """
    # Advance the progress bar in five small steps before the real work.
    for _ in range(5):
        progress['value'] += 15
        window.update_idletasks()
        time.sleep(0.10)

    question_count = number_questions()

    url = get_URL()
    if not url:
        clear_progress()
        return

    try:
        # A timeout keeps the GUI from hanging forever on a stalled connection.
        page = requests.get(url, timeout=10)
    except requests.RequestException as err:
        messagebox.showerror(
            "Error", "Could not reach Stack Overflow: {}".format(err))
        clear_progress()
        return

    # Start scraping the resulting HTML
    soup = BeautifulSoup(page.content, 'html.parser')
    questions = soup.find_all('div', {'class': 'question-summary'})
    if not questions:
        messagebox.showinfo("Invalid", "Invalid search tag")
        clear_progress()
        return ""

    saved = 0
    for question in questions:
        if saved >= question_count:
            break

        link_tag = question.find('a', {'class': 'question-hyperlink'})
        excerpt_tag = question.find('div', {'class': 'excerpt'})
        votes_tag = question.find('span', {'class': 'vote-count-post'})
        views_tag = question.find('div', {'class': 'views'})
        # Skip entries that lack any expected element rather than crashing
        # on a None result from find().
        if any(tag is None for tag in (link_tag, excerpt_tag, votes_tag, views_tag)):
            continue
        href = link_tag.get('href')
        views_words = views_tag.text.strip().split()
        if href is None or not views_words:
            continue

        question_text = link_tag.text.strip()
        question_summary = excerpt_tag.text.strip().replace('\n', ' ')
        question_link = 'https://stackoverflow.com{}'.format(href)
        votes = votes_tag.text.strip()
        # The views text starts with the count; keep only that first word.
        views = views_words[0]

        sql_insert(con, (question_text, question_summary,
                         question_link, votes, views))
        saved += 1

    messagebox.showinfo("Success!", "Questions scrapped successfully!")
    clear_progress()
108

109
# Function to fetch stackoverflow questions from DB
110

111

112
def sql_fetch(con):
    """Return all stored questions as one fixed-width, printable table.

    Args:
        con: An open ``sqlite3.Connection`` containing the ``questions``
            table.

    Returns:
        A string with a header line followed by one line per distinct row,
        newest first. Returns ``" "`` (after showing an alert) when the
        table is empty, and ``""`` when the query itself fails.
    """
    cursor = con.cursor()
    try:
        cursor.execute(
            'SELECT DISTINCT * FROM questions ORDER BY rowid DESC')
    except Error:
        print("Database empty... Fetch questions using GUI")
        # Always hand back a string so callers can insert the result as text.
        return ""

    rows = cursor.fetchall()

    # Show a message box when nothing has been stored yet
    if not rows:
        messagebox.showinfo("Alert", "No questions scraped yet!")
        return " "

    header = ("{:^65}".format("Question") + "{:^65}".format("Summary")
              + "{:^40}".format("Link") + "{:^15}".format("Votes")
              + "{:^15}".format("Views"))
    lines = [header]

    for row in rows:
        lines.append(
            "{:<65}".format(_truncate(row[0], 60))
            + "{:<65}".format(_truncate(row[1], 60))
            # Links use a 40-wide column; the limit now matches it, so
            # a short link is no longer given a misleading "...".
            + "{:<40}".format(_truncate(row[2], 40))
            + "{:^15}".format(row[3])
            + "{:^15}".format(row[4]))

    # Every line, including the last one, ends with a newline.
    return "\n".join(lines) + "\n"


def _truncate(text, limit):
    """Return *text* unchanged if shorter than *limit*, else cut it to
    ``limit - 4`` characters and append ``"..."``."""
    if len(text) < limit:
        return text
    return text[:limit - 4] + "..."
148

149

150
def show_results():
    """Replace the contents of the results text box with the stored questions."""
    # sql_fetch() may return None when its query fails; insert an empty
    # string in that case instead of passing None to the widget.
    display_text = sql_fetch(con) or ""
    # The widget is kept read-only, so unlock it only while rewriting it.
    query_label.config(state=tk.NORMAL)
    try:
        query_label.delete("1.0", "end")
        query_label.insert("1.0", display_text)
    finally:
        query_label.config(state=tk.DISABLED)
156

157

158
def clear_progress():
    """Fill the progress bar to 100, then reset it to 0, redrawing each time."""
    for value in (100, 0):
        progress['value'] = value
        # Redraw immediately so each state is rendered.
        window.update_idletasks()
164

165

166
# Create the main tkinter window
window = tk.Tk()
window.title('Stack overflow question scraper')
window.geometry('1200x1000')
window.configure(bg='white')

# Shared ttk styles: orange-on-white bold buttons and a white frame
style = ttk.Style()
style.theme_use('alt')
style.map('my.TButton', background=[('active', 'white')])
style.configure('my.TButton', font=('Helvetica', 16, 'bold'),
                background='white', foreground='orange')
style.configure('my.TFrame', background='white')
179

180
# Title label
ttk.Label(window, text="Stack overflow question scraper",
          background='white', foreground="Orange",
          font=("Helvetica", 30, 'bold')).grid(row=0, column=1)

# Prompt labels for the two input fields
ttk.Label(window, text="Enter tag (ex - python):", background='white',
          font=("Helvetica", 15)).grid(column=0, row=5, padx=10, pady=25)

ttk.Label(window, text="No of questions to scrape:", background='white',
          font=("Helvetica", 15)).grid(column=0, row=6, padx=10, pady=5)
193

194

195
# Buttons: one starts a scrape, the other displays what is stored in the DB
scrape_btn = ttk.Button(window, text="Scrape questions!",
                        style='my.TButton', command=scrape_questions)
scrape_btn.grid(row=5, column=2, pady=5, padx=15, ipadx=5)

display_btn = ttk.Button(window, text="Display from DB",
                         style='my.TButton', command=show_results)
display_btn.grid(row=6, column=2, pady=5, padx=15, ipadx=5)
203

204
# Entry for the tag to search (read by get_URL)
search_box = tk.Entry(window, font="Helvetica 15", bd=2, width=60)
search_box.grid(row=5, column=1, pady=5, padx=15, ipadx=5)

# Entry for how many questions to scrape (read by number_questions)
questions_box = tk.Entry(window, font="Helvetica 15", bd=2, width=60)
questions_box.grid(row=6, column=1, pady=5, padx=15, ipadx=5)
210

211
# Frame that hosts the results text box
frame = ttk.Frame(window, style='my.TFrame')
frame.place(relx=0.50, rely=0.18, relwidth=0.98, relheight=0.90, anchor="n")

# Progress bar
progress = ttk.Progressbar(window, orient="horizontal",
                           length=200, mode="determinate")
progress.grid(row=5, column=5, pady=5, padx=15, ipadx=5)

# Text box that displays the questions data
query_label = tk.Text(frame, height="52", width="500", bg="alice blue")
query_label.grid(row=10, columnspan=2)

window.mainloop()

# The window has been closed: release the database connection.
con.close()
224

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.