Amazing-Python-Scripts
160 lines · 5.6 KB
import concurrent.futures
import logging
import os
from datetime import datetime

import lxml.html as html

from dbConnector import FlipkartDatabaseConnector
from genricHtmlib import SeleniumScraper
from productList import product_categories
9
# Module-level scraping helper shared by all Scraper methods.
# NOTE(review): this rebinds the imported SeleniumScraper *class* name to an
# instance, shadowing the class for the rest of the module — works, but a
# distinct name (e.g. `scraper_helper`) would be clearer.
SeleniumScraper = SeleniumScraper()
11
12
class Scraper:
    """Scrape Flipkart search results for every product category and store
    product details (sku, name, description, image, category, price) in a
    local SQLite database via FlipkartDatabaseConnector."""

    def __init__(self):
        self.brand: str = "flipkart"
        self.website = "https://www.flipkart.com/search?q="
        self.websiteName = "https://www.flipkart.com"
        # Run timestamp used to tag every scraped row.
        # BUG FIX: format was "%Y-%ma-%d_%H-%M-%S" — "%ma" rendered the month
        # followed by a literal "a"; the intended directive is "%m".
        self.stamp: str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.storagePath: str = os.getcwd()

        # XPath selectors for Flipkart's markup. The class names are
        # obfuscated/rotated by Flipkart, so these are expected to need
        # periodic maintenance.
        self.productLinksXpath = '//*[@rel="noopener noreferrer"]//@href'
        self.skuXpath = '//tr[contains(@class, "row")]//td[contains(text(), "Model Number")]/following-sibling::td[1]/ul/li/text()'
        self.nameXpath = '//*[@class="B_NuCI"]//text()'
        self.description = '//div[contains(text(), "Description")]/following-sibling::div[1]/div/text()'
        self.image = '//*[@class="_396cs4 _2amPTt _3qGmMb"]//@src'
        self.category = '//*[@class="_3GIHBu"]//text()'
        self.price = '//*[@class="_30jeq3 _16Jk6d"]//text()'

    def _fetchDoc(self, url):
        """Fetch *url* and return a parsed lxml document.

        Tries a plain HTTP request first; falls back to Selenium when that
        fails (fetch_request_selenium already returns a parsed document).
        """
        response = SeleniumScraper.fetch_request_normal(url)
        if response is None:
            return SeleniumScraper.fetch_request_selenium(url)
        return html.fromstring(response)

    def getProductList(self, keyword):
        """Collect product URLs for *keyword* from search pages 1-19.

        Returns a list of absolute product links. BUG FIX: previously
        returned None on failure, which crashed the list-flattening step in
        start(); an empty list is returned instead.
        """
        try:
            productLinks = []
            for page in range(1, 20):
                url = self.website + keyword
                if page > 1:
                    print(f'Getting Page {page} for {keyword}')
                    url = url + "&page=" + str(page)
                doc = self._fetchDoc(url)
                Links = SeleniumScraper.get_xpath_link(
                    doc, self.productLinksXpath, self.websiteName)
                productLinks.extend(Links)

            print(f'Total products for {keyword} is {len(productLinks)}')
            return productLinks

        except Exception as e:
            print(e)
            return []

    def getProductDetails(self, productLink):
        """Scrape a single product page and return a dict of its fields.

        Each field extraction is isolated so a missing field degrades to the
        string "None" instead of aborting the whole product. BUG FIX: the
        bare `except:` clauses were narrowed to `except Exception` so
        KeyboardInterrupt/SystemExit still propagate.
        """
        print(f'Getting product details for {productLink}')
        doc = self._fetchDoc(productLink)

        try:
            sku = SeleniumScraper.get_xpath_data(doc, self.skuXpath)[0]
        except Exception:
            sku = "None"

        try:
            name = SeleniumScraper.get_xpath_data(doc, self.nameXpath)[0]
        except Exception:
            name = "None"

        try:
            description = ''.join(
                SeleniumScraper.get_xpath_data(doc, self.description))
        except Exception:
            description = "None"

        try:
            image_path = SeleniumScraper.get_xpath_link(
                doc, self.image, self.websiteName)[0]
        except Exception:
            image_path = "None"

        try:
            # Index 1: presumably skips the first breadcrumb entry ("Home")
            # — TODO confirm against a live page.
            category = SeleniumScraper.get_xpath_data(doc, self.category)[1]
        except Exception:
            category = "None"

        try:
            price = SeleniumScraper.cleanData(
                SeleniumScraper.get_xpath_data(doc, self.price))[0]
        except Exception:
            price = "None"

        productDetails = {
            "sku": str(sku),
            "name": str(name),
            "description": str(description),
            "image_path": str(image_path),
            "category": str(category),
            "timestamp": str(self.stamp),
            "URL": str(productLink),
            "price": price,
        }

        print(productDetails)
        return productDetails

    def start(self):
        """Run the full pipeline: create/open the db, gather product links
        for every category, scrape each product, and persist the rows."""
        productList = []
        number_of_threads: int = 1

        print(f"Starting {self.brand} scraper")

        dbPath = os.path.join(self.storagePath, self.brand + ".db")
        # Create the schema only on first run.
        if not os.path.exists(dbPath):
            # BUG FIX: the log message previously concatenated the directory
            # and file name without the "/" separator.
            print(f'Creating {self.brand}.db at {dbPath}')
            db = FlipkartDatabaseConnector(self.stamp)
            db.schemaMaker()
            print(db.welcomeMessage)

        self.db = FlipkartDatabaseConnector(self.stamp)
        print(self.db.welcomeMessage)

        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            productUrls = executor.map(self.getProductList, product_categories)
            productList.extend(productUrls)

        # Flatten the per-category link lists into one list of URLs.
        productList = [link for sublist in productList for link in sublist]
        print(f'Total products for {self.brand} is {len(productList)}')

        with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads) as executor:
            results = executor.map(self.getProductDetails, productList)
            # Consume results inside the `with` so workers are still alive.
            for result in results:
                print(f"Saving {result['sku']} to db")
                self.db.insertProduct(result)

        self.db.removeDuplicates()
156
157
if __name__ == '__main__':
    # Script entry point: build a scraper and run the whole pipeline.
    Scraper().start()
161