Amazing-Python-Scripts
112 lines · 3.7 KB
import csv
import os
import re
import time
import urllib

import requests
from bs4 import BeautifulSoup
7
8
def write_csv(loc, info):
    """Append the collected job openings to './Job Scraper/<loc>_openings.csv'.

    Parameters
    ----------
    loc : str
        Job-location string (already '+'-joined by the caller); used only to
        build the output filename.
    info : list[list]
        Rows of [title, company name, location, date, summary, url].
    """
    headers = ['Title', 'Company Name', 'Location', 'Date', 'Summary', 'Url']

    out_dir = './Job Scraper'
    # Create the target directory if it is missing, so open() cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # newline='' is required by the csv module: without it every row is
    # followed by a spurious blank line on Windows.
    with open(os.path.join(out_dir, loc + '_openings.csv'), 'a',
              newline='', encoding='utf-8') as csv_f:
        csv_p = csv.writer(csv_f, delimiter=',')
        csv_p.writerow(headers)
        csv_p.writerows(info)

    print(f'\n{loc}_openings.csv has been saved to your directory!\n')
22
23
def job_scraper():
    """Interactively scrape job postings from in.indeed.com.

    Prompts on stdin for a job title, a location and the number of openings
    to collect, walks the search-result pagination until enough postings
    have been gathered (or no "Next" link remains), and hands the collected
    rows to write_csv().
    """
    title = input("\nEnter job title: ").replace(" ", "+")
    loc = input("Enter job location: ").replace(" ", "+")
    num = int(input("Enter the number of job openings to obtain: "))

    url = f'https://in.indeed.com/jobs?q={title}&l={loc}'
    req_page = requests.get(url)

    job_array = []

    if req_page.status_code != 200:
        print('There seems to be a problem fetching the results. Check your inputs, connections and try again')
        return

    soup = BeautifulSoup(req_page.text, "html.parser")
    job_table = soup.find("td", id="resultsCol")
    count = 0

    flag = 1
    while flag:
        # Guard: results column missing (layout change / empty page).
        if job_table is None:
            break

        for job_card in job_table.find_all("div", class_="jobsearch-SerpJobCard"):
            # Job title — skip malformed cards instead of crashing.
            title_elem = job_card.find('a', class_='jobtitle turnstileLink')
            if title_elem is None:
                continue
            title = title_elem.text.strip()

            # Company name (may be absent on some cards).
            company_details = job_card.find('div', class_='sjcl')
            company_name = company_details.find('span', class_='company')
            company_name = company_name.text.strip() if company_name is not None else ''

            # Company location; fall back to the searched location.
            company_loc = company_details.find('span', class_='location')
            company_loc = company_loc.text.strip() if company_loc is not None else loc

            # URL of the post.
            link = 'https://in.indeed.com' + job_card.find('a')['href']

            # Date of the post.
            date_elem = job_card.find('span', class_='date')
            date = date_elem.text.strip() if date_elem is not None else ''

            # Job summary. BUG FIX: the original reused the loop variable
            # `span` from the *previous* card when a card had no summary
            # element (stale value, or NameError on the very first card).
            summary = ''
            for summary_elem in job_card.find_all('div', attrs={'class': 'summary'}):
                summary = summary_elem.text.strip()

            count += 1
            job_array.append(
                [title, company_name, company_loc, date, summary, link])
            if count == num:
                flag = 0
                break

        # Done — avoid fetching one extra page after the quota is reached.
        if not flag:
            break

        # Find the "Next" pagination link, if any.
        next_page_link = None
        pagination = soup.find("ul", class_="pagination-list")
        if pagination is not None:
            for anchor in pagination.find_all('a'):
                # .get() avoids a KeyError on anchors without aria-label.
                if anchor.attrs.get('aria-label') == 'Next':
                    next_page_link = 'https://in.indeed.com' + anchor.attrs['href']
                    break

        if next_page_link is None:
            flag = 0  # last page reached
        else:
            time.sleep(2)  # be polite to the server between page fetches

            req_page = requests.get(next_page_link)
            soup = BeautifulSoup(req_page.text, "html.parser")
            job_table = soup.find("td", id="resultsCol")

    write_csv(loc, job_array)
109
110
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    job_scraper()
113