# Amazing-Python-Scripts
# 91 lines · 2.9 KB
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Scrape dev.to search results for a category and save each article found
# as a PDF (title, author line, then one block of text per <p> tag) in the
# current working directory.

# Seconds to wait after each navigation so the page can finish rendering.
PAGE_LOAD_WAIT_SECONDS = 5


def _to_latin1(text):
    """Return *text* with every non-Latin-1 character replaced by '?'.

    NOTE(review): presumably needed because the "Arial" core font used for
    the PDF only supports Latin-1 text -- confirm against the FPDF docs.
    """
    return text.encode('latin-1', 'replace').decode('latin-1')


def _parse_post(post_html):
    """Pull the title, author and paragraphs out of a dev.to post page.

    Args:
        post_html: HTML source of the post page.

    Returns:
        A ``(title, author, paragraphs)`` tuple of stripped text
        (``paragraphs`` is a list of strings), or ``None`` when the page
        lacks any of the expected elements.
    """
    soup = BeautifulSoup(post_html, "html.parser")
    try:
        article_div = soup.find('div', {'class': 'article-wrapper'})
        article_content = article_div.find(
            'article', {'id': 'article-show-container'})

        # Title of post
        header_tag = article_content.find(
            'header', class_='crayons-article__header')
        title_div = header_tag.find(
            'div', class_='crayons-article__header__meta')
        title_content = title_div.find('h1')

        # Author of post
        author_tag = title_div.find(
            'div', class_='crayons-article__subheader')
        author_name = author_tag.find('a', class_='crayons-link')

        # Post content
        article_content_div = article_content.find(
            'div', class_='crayons-article__main')
        article_content_body = article_content_div.find(
            'div', class_='crayons-article__body')
        p_tags = article_content_body.find_all('p')
    except AttributeError:
        # One of the find() calls returned None, so the lookup chained on
        # it failed: the page does not have the expected layout.
        return None

    if title_content is None or author_name is None:
        return None

    return (
        title_content.text.strip(),
        author_name.text.strip(),
        [p_tag.text.strip() for p_tag in p_tags],
    )


# Get input for category and number of articles
category = input("Enter category: ")
number_articles = int(input("Enter number of articles: "))
driver_path = input("Enter chrome driver path: ")

# Encode the search term so spaces and characters such as '&' or '#'
# cannot break the query string.
url = 'https://dev.to/search?q={}'.format(quote_plus(category))

# Initiating the webdriver. Parameter includes the path of the webdriver.
# NOTE(review): a positional driver path is the older Selenium calling
# convention; newer releases expect a Service object -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome(driver_path)
try:
    driver.get(url)

    # This is just to ensure that the page is loaded
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    html = driver.page_source

    # Now apply bs4 to html variable
    soup = BeautifulSoup(html, "html.parser")
    results_div = soup.find('div', {'id': 'substories'})
    articles = results_div.find_all('article') if results_div is not None else []
    if not articles:
        print("No articles found for category: {}".format(category))

    # Getting articles from dev.to
    count = 0
    for article in articles:
        # Checked before doing any work so that a request for zero (or a
        # negative number of) articles produces no PDFs.
        if count >= number_articles:
            break

        link = article.find(
            'a', class_='crayons-story__hidden-navigation-link')
        if link is None or not link.get('href'):
            # Search result without a post link; nothing to fetch.
            continue

        post_url = "https://dev.to{}".format(link['href'])
        driver.get(post_url)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)

        parsed = _parse_post(driver.page_source)
        if parsed is None:
            print("Skipping {}: unexpected page layout".format(post_url))
            continue
        title_text, author_text, paragraphs = parsed

        title_string = _to_latin1(title_text)
        author_string = _to_latin1("By - {}".format(author_text))

        # Add a page
        pdf = FPDF()
        pdf.add_page()
        # Set style and size of font
        pdf.set_font("Arial", size=12)

        # Title cell
        pdf.cell(200, 5, txt=title_string, ln=1, align='C')
        # Author cell
        pdf.cell(200, 10, txt=author_string, ln=2, align='C')

        for paragraph in paragraphs:
            # Add part of article to pdf
            pdf.multi_cell(0, 5, txt=_to_latin1(paragraph), align='L')

        # Save the pdf under the title reduced to its alphanumeric
        # characters; fall back to a numbered name when nothing is left so
        # the file is never called just ".pdf".
        pdf_title = ''.join(e for e in title_string if e.isalnum())
        if not pdf_title:
            pdf_title = "article_{}".format(count + 1)
        pdf.output("{}.pdf".format(pdf_title))

        count += 1
finally:
    # Always end the browser session, even if scraping failed part-way.
    driver.quit()