Amazing-Python-Scripts

Форк
0
160 строк · 5.6 Кб
import logging
from datetime import datetime
from dbConnector import FlipkartDatabaseConnector
from productList import product_categories
from genricHtmlib import SeleniumScraper
import os
import lxml.html as html
import concurrent.futures

# Shared scraping helper; note this rebinds the imported class name to an instance.
SeleniumScraper = SeleniumScraper()
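
# Assumed interface of the genricHtmlib helper, inferred only from the call
# sites below (the module itself is not shown here, so treat this as a sketch):
#   fetch_request_normal(url)        -> HTML text, or None on failure
#   fetch_request_selenium(url)      -> parsed lxml document (browser fallback)
#   get_xpath_link(doc, xpath, base) -> list of URLs joined against the base URL
#   get_xpath_data(doc, xpath)       -> list of extracted text values
#   cleanData(values)                -> list of cleaned strings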


class Scraper:
    def __init__(self):
        self.brand: str = "flipkart"
        self.website = "https://www.flipkart.com/search?q="
        self.websiteName = "https://www.flipkart.com"
        self.stamp: str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath: str = os.getcwd()

        # XPath selectors tied to Flipkart's current page markup; they may need
        # updating if the site's class names change.
        self.productLinksXpath = '//*[@rel="noopener noreferrer"]//@href'
        self.skuXpath = '//tr[contains(@class, "row")]//td[contains(text(), "Model Number")]/following-sibling::td[1]/ul/li/text()'
        self.nameXpath = '//*[@class="B_NuCI"]//text()'
        self.description = '//div[contains(text(), "Description")]/following-sibling::div[1]/div/text()'
        self.image = '//*[@class="_396cs4 _2amPTt _3qGmMb"]//@src'
        self.category = '//*[@class="_3GIHBu"]//text()'
        self.price = '//*[@class="_30jeq3 _16Jk6d"]//text()'

    def getProductList(self, keyword):
        # Collect product URLs from the search results pages for one keyword.
        productLinks = []
        try:
            url = self.website + keyword
            response = SeleniumScraper.fetch_request_normal(url)
            if response is None:
                doc = SeleniumScraper.fetch_request_selenium(url)
            else:
                doc = html.fromstring(response)

            Links = SeleniumScraper.get_xpath_link(
                doc, self.productLinksXpath, self.websiteName)
            productLinks.extend(Links)

            for page in range(2, 20):
                print(f'Getting page {page} for {keyword}')
                url = self.website + keyword + "&page=" + str(page)
                response = SeleniumScraper.fetch_request_normal(url)
                if response is None:
                    doc = SeleniumScraper.fetch_request_selenium(url)
                else:
                    doc = html.fromstring(response)

                Links = SeleniumScraper.get_xpath_link(
                    doc, self.productLinksXpath, self.websiteName)
                productLinks.extend(Links)

            print(f'Total products for {keyword} is {len(productLinks)}')
            return productLinks

        except Exception as e:
            print(e)
            # Return whatever was collected so the caller can still flatten the results.
            return productLinks
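
    # Hypothetical usage sketch (assumes the helper modules above are importable):
    #   links = Scraper().getProductList("laptop")   # "laptop" is a made-up keyword
    # This fetches https://www.flipkart.com/search?q=laptop plus pages 2-19 and
    # returns the product URLs found on them.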

    def getProductDetails(self, productLink):
        # Fetch one product page and extract its fields into a dict.
        print(f'Getting product details for {productLink}')
        response = SeleniumScraper.fetch_request_normal(productLink)
        if response is None:
            doc = SeleniumScraper.fetch_request_selenium(productLink)
        else:
            doc = html.fromstring(response)

        productDetails = {}

        try:
            sku = SeleniumScraper.get_xpath_data(doc, self.skuXpath)
            sku = sku[0]
        except Exception:
            sku = "None"

        try:
            name = SeleniumScraper.get_xpath_data(doc, self.nameXpath)
            name = name[0]
        except Exception:
            name = "None"

        try:
            description = SeleniumScraper.get_xpath_data(doc, self.description)
            description = ''.join(description)
        except Exception:
            description = "None"

        try:
            image_path = SeleniumScraper.get_xpath_link(
                doc, self.image, self.websiteName)
            image_path = image_path[0]
        except Exception:
            image_path = "None"

        try:
            category = SeleniumScraper.get_xpath_data(doc, self.category)
            category = category[1]
        except Exception:
            category = "None"

        try:
            price = SeleniumScraper.get_xpath_data(doc, self.price)
            price = SeleniumScraper.cleanData(price)
            price = price[0]
        except Exception:
            price = "None"

        productDetails["sku"] = str(sku)
        productDetails["name"] = str(name)
        productDetails["description"] = str(description)
        productDetails["image_path"] = str(image_path)
        productDetails["category"] = str(category)
        productDetails["timestamp"] = str(self.stamp)
        productDetails["URL"] = str(productLink)
        productDetails['price'] = price

        print(productDetails)
        return productDetails

    def start(self):
        productList = []
        number_of_threads: int = 1

        # Log start of scraper
        print(f"Starting {self.brand} scraper")

        # Create flipkart.db in the working directory if it doesn't exist yet.
        dbPath = os.path.join(self.storagePath, self.brand + ".db")
        if not os.path.exists(dbPath):
            print(f'Creating {self.brand}.db at {dbPath}')
            db = FlipkartDatabaseConnector(self.stamp)
            db.schemaMaker()
            print(db.welcomeMessage)

        self.db = FlipkartDatabaseConnector(self.stamp)
        print(self.db.welcomeMessage)

        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            productUrls = executor.map(self.getProductList, product_categories)
            productList.extend(productUrls)

        # Flatten the per-category lists into one list of product URLs.
        productList = [item for sublist in productList for item in sublist]
        print(f'Total products for {self.brand} is {len(productList)}')

        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            results = executor.map(self.getProductDetails, productList)

            for result in results:
                print(f"Saving {result['sku']} to db")
                self.db.insertProduct(result)

        self.db.removeDuplicates()


if __name__ == '__main__':
    scraper = Scraper()
    scraper.start()
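
# Minimal single-product sketch for checking the selectors without touching the
# database (hypothetical URL; assumes the helper modules above are importable):
#   details = Scraper().getProductDetails(
#       "https://www.flipkart.com/some-product/p/itm123")   # made-up product URL
#   print(details["name"], details["price"])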
