Amazing-Python-Scripts
112 lines · 3.7 KB
import csv
import os
import re
import time
import urllib

import requests
from bs4 import BeautifulSoup
7
8
def write_csv(loc, info):
    """Append the collected job openings to './Job Scraper/<loc>_openings.csv'.

    Parameters
    ----------
    loc : str
        Job-location string (already '+'-joined by the caller); used only to
        build the output filename.
    info : list[list]
        Rows of [title, company name, location, date, summary, url].
    """
    headers = ['Title', 'Company Name', 'Location', 'Date', 'Summary', 'Url']

    out_dir = './Job Scraper'
    # Create the target directory if it is missing, so open() cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # newline='' is required by the csv module: without it every row is
    # followed by a spurious blank line on Windows.
    with open(os.path.join(out_dir, loc + '_openings.csv'), 'a',
              newline='', encoding='utf-8') as csv_f:
        csv_p = csv.writer(csv_f, delimiter=',')
        csv_p.writerow(headers)
        csv_p.writerows(info)

    print(f'\n{loc}_openings.csv has been saved to your directory!\n')
22
23
def job_scraper():
    """Interactively scrape job postings from in.indeed.com.

    Prompts on stdin for a job title, a location and the number of openings
    to collect, walks the search-result pagination until enough postings
    have been gathered (or no "Next" link remains), and hands the collected
    rows to write_csv().
    """
    title = input("\nEnter job title: ").replace(" ", "+")
    loc = input("Enter job location: ").replace(" ", "+")
    num = int(input("Enter the number of job openings to obtain: "))

    url = f'https://in.indeed.com/jobs?q={title}&l={loc}'
    req_page = requests.get(url)

    job_array = []

    if req_page.status_code != 200:
        print('There seems to be a problem fetching the results. Check your inputs, connections and try again')
        return

    soup = BeautifulSoup(req_page.text, "html.parser")
    job_table = soup.find("td", id="resultsCol")
    count = 0

    flag = 1
    while flag:
        # Guard: results column missing (layout change / empty page).
        if job_table is None:
            break

        for job_card in job_table.find_all("div", class_="jobsearch-SerpJobCard"):
            # Job title — skip malformed cards instead of crashing.
            title_elem = job_card.find('a', class_='jobtitle turnstileLink')
            if title_elem is None:
                continue
            title = title_elem.text.strip()

            # Company name (may be absent on some cards).
            company_details = job_card.find('div', class_='sjcl')
            company_name = company_details.find('span', class_='company')
            company_name = company_name.text.strip() if company_name is not None else ''

            # Company location; fall back to the searched location.
            company_loc = company_details.find('span', class_='location')
            company_loc = company_loc.text.strip() if company_loc is not None else loc

            # URL of the post.
            link = 'https://in.indeed.com' + job_card.find('a')['href']

            # Date of the post.
            date_elem = job_card.find('span', class_='date')
            date = date_elem.text.strip() if date_elem is not None else ''

            # Job summary. BUG FIX: the original reused the loop variable
            # `span` from the *previous* card when a card had no summary
            # element (stale value, or NameError on the very first card).
            summary = ''
            for summary_elem in job_card.find_all('div', attrs={'class': 'summary'}):
                summary = summary_elem.text.strip()

            count += 1
            job_array.append(
                [title, company_name, company_loc, date, summary, link])
            if count == num:
                flag = 0
                break

        # Done — avoid fetching one extra page after the quota is reached.
        if not flag:
            break

        # Find the "Next" pagination link, if any.
        next_page_link = None
        pagination = soup.find("ul", class_="pagination-list")
        if pagination is not None:
            for anchor in pagination.find_all('a'):
                # .get() avoids a KeyError on anchors without aria-label.
                if anchor.attrs.get('aria-label') == 'Next':
                    next_page_link = 'https://in.indeed.com' + anchor.attrs['href']
                    break

        if next_page_link is None:
            flag = 0  # last page reached
        else:
            time.sleep(2)  # be polite to the server between page fetches

            req_page = requests.get(next_page_link)
            soup = BeautifulSoup(req_page.text, "html.parser")
            job_table = soup.find("td", id="resultsCol")

    write_csv(loc, job_array)
109
110
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    job_scraper()
113