Amazing-Python-Scripts

Форк
0
174 строки · 6.8 Кб
1
from selenium import webdriver
2
from selenium.webdriver.common.by import By
3
from selenium.webdriver.chrome.service import Service
4
from selenium.webdriver.support.ui import WebDriverWait
5
from selenium.webdriver.support import expected_conditions as EC
6
from selenium.webdriver.chrome.options import Options
7

8
"""
Example code:
    python_scraper = Courses("python", 5)
    print(python_scraper.scrape_all())
"""
13

14

15
class Courses:
    """Scrape Coursera search results for a keyword with headless Chrome.

    Every public method opens its own browser session, walks up to
    ``page_count`` result pages, closes the browser again and returns a
    dict of the form ``{"data": ..., "message": ...}``.  ``data`` is
    ``None`` (with a "No courses found" message) when anything goes wrong
    while reading the result pages, e.g. the result list never becomes
    visible within the wait timeout.

    Args:
        keyword: Search term passed to the Coursera search page.
        page_count: Maximum number of result pages to read.
        chromedriver_path: Optional path to a chromedriver executable.
            When omitted or empty, ``Service()`` is created without a path
            and Selenium is left to locate the driver itself.

    Example:
        python_scraper = Courses("python", 5)
        print(python_scraper.scrape_all())
    """

    _SEARCH_URL = 'https://www.coursera.org/search?query='
    _COURSE_SELECTOR = 'main ul>li'
    _NEXT_SELECTOR = 'button[aria-label="Next Page"]'
    _WAIT_SECONDS = 100

    # JavaScript executed against one result card (passed as arguments[0]).
    # Optional chaining makes a missing sub-element yield null/undefined
    # instead of throwing inside the browser.
    _FIELD_SCRIPTS = {
        "title": 'return arguments[0].querySelector("h3")?.innerText',
        "description": 'return arguments[0].querySelector("p>span")?.innerText',
        "review": 'return arguments[0].querySelector("div:has(>svg)")?.innerText.replace("\\n\\n","⭐")',
        "url": 'return String(arguments[0].querySelector("a")?.href)',
    }

    # Class-level default so subclasses that override __init__ without
    # setting this attribute keep working.
    chromedriver_path = None

    def __init__(self, keyword, page_count, chromedriver_path=None):
        self.keyword = keyword
        self.page_count = page_count
        self.chromedriver_path = chromedriver_path

    def _search_url(self):
        """Return the search URL with the keyword safely URL-encoded."""
        from urllib.parse import quote_plus

        # Encoding keeps characters such as '&', '#' or '+' inside the
        # query value instead of letting them alter the URL structure.
        return self._SEARCH_URL + quote_plus(str(self.keyword))

    def __scrape_page(self):
        """Start headless Chrome on the search page.

        Returns:
            A ``(wait, driver)`` tuple; the caller owns the driver and must
            call ``driver.quit()`` when done.
        """
        options = Options()
        options.add_argument("--headless")
        if self.chromedriver_path:
            service = Service(self.chromedriver_path)
        else:
            service = Service()
        driver = webdriver.Chrome(service=service, options=options)
        try:
            wait = WebDriverWait(driver, self._WAIT_SECONDS)
            driver.get(self._search_url())
        except BaseException:
            # Do not leave a browser process behind if navigation fails.
            driver.quit()
            raise
        return wait, driver

    def _field_reader(self, field):
        """Return a callable ``(driver, card) -> value`` for one field."""
        script = self._FIELD_SCRIPTS[field]
        return lambda driver, card: driver.execute_script(script, card)

    def _read_record(self, driver, card):
        """Read title, description, review and url of one result card."""
        return {
            field: driver.execute_script(script, card)
            for field, script in self._FIELD_SCRIPTS.items()
        }

    def _collect(self, extract, success_message):
        """Walk the result pages and gather ``extract(driver, card)`` values.

        Args:
            extract: Callable invoked once per visible result card.
            success_message: Message placed in the result on success.

        Returns:
            ``{"data": [...], "message": success_message}`` on success, or
            ``{"data": None, "message": "No courses found for ..."}`` if an
            error occurs while reading the pages.
        """
        wait, driver = self.__scrape_page()
        items = []
        try:
            for page in range(self.page_count):
                cards = wait.until(EC.visibility_of_all_elements_located(
                    (By.CSS_SELECTOR, self._COURSE_SELECTOR)))
                items.extend(extract(driver, card) for card in cards)

                # find_elements returns [] instead of raising, so a result
                # set without a pager is treated as "no more pages" rather
                # than discarding everything collected so far.
                buttons = driver.find_elements(
                    By.CSS_SELECTOR, self._NEXT_SELECTOR)
                if not buttons or 'disabled' in (
                        buttons[0].get_attribute('class') or ''):
                    print('There are no more pages')
                    break
                # Only navigate when another page will actually be read.
                # NOTE(review): nothing waits for the old cards to go stale
                # after the click, so the next wait may presumably still see
                # the previous page's cards - confirm against the live site.
                if page + 1 < self.page_count:
                    buttons[0].click()
        except Exception:
            return {
                "data": None,
                "message": f"No courses found for {self.keyword}"
            }
        finally:
            # Always release the browser session, on success and on failure.
            driver.quit()
        return {"data": items, "message": success_message}

    def scrape_all(self):
        """Return one record per course.

        Each record is a dict with the keys ``id`` (running index across
        all pages, starting at 0), ``title``, ``description``, ``review``
        and ``url``.
        """
        result = self._collect(
            self._read_record, f"Courses for {self.keyword}")
        if result["data"] is not None:
            result["data"] = [
                {"id": index, **record}
                for index, record in enumerate(result["data"])
            ]
        return result

    def course_titles(self):
        """Return the course titles found on the result pages."""
        return self._collect(
            self._field_reader("title"),
            f"Course Titles for {self.keyword}")

    def course_description(self):
        """Return the course descriptions found on the result pages."""
        return self._collect(
            self._field_reader("description"),
            f"Course Descriptions for {self.keyword}")

    def course_reviews(self):
        """Return the review/rating text found on the result pages."""
        return self._collect(
            self._field_reader("review"),
            f"Course Reviews for {self.keyword}")

    def course_urls(self):
        """Return the course link URLs found on the result pages."""
        return self._collect(
            self._field_reader("url"),
            f"Course URLs for {self.keyword}")
171

172

173
# Demo run: read up to five result pages for "python" and print the result.
python_scraper = Courses(keyword="python", page_count=5)
print(python_scraper.scrape_all())
175

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.