Amazing-Python-Scripts

Форк
0
112 строк · 3.8 Кб
1
import re
2
import requests
3
from urllib.parse import urlparse, unquote
4
import logging
5

6
# Module-level logger named after this module; GoogleSearchAPI reports
# API errors and unexpected exceptions through it.
logger = logging.getLogger(__name__)
7

8

9
class GoogleSearchAPI:
    """Small client for the Google Custom Search JSON API.

    Issues an exact-term query for a LinkedIn ``/in/<id>`` path and collects
    the ``items`` of every result page the API advertises.
    """

    def __init__(self, key: str, cx: str, timeout: float = 10):
        """Store the credentials and the base query parameters.

        Args:
            key: API key, sent as the ``key`` query parameter.
            cx: Search engine identifier, sent as the ``cx`` query parameter.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self._cx = cx
        self._key = key
        self._timeout = timeout
        self._api_url = "https://www.googleapis.com/customsearch/v1"
        self._params = {
            "num": 10,
            "cx": self._cx,
            "key": self._key,
        }

    def _hit_api(self, linkedin_id: str) -> list:
        """Query the API for ``/in/<linkedin_id>`` and return all result items.

        Pages are followed for as long as the response contains a
        ``queries.nextPage`` entry. The call is best-effort: a non-200
        response or any exception is logged and whatever was collected up to
        that point is returned.

        Args:
            linkedin_id: LinkedIn public identifier to search for.

        Returns:
            The accumulated ``items`` entries (possibly empty).
        """
        results = []
        # Work on a copy so paging state never leaks into the shared defaults.
        params = self._params.copy()
        params["exactTerms"] = f"/in/{linkedin_id}"
        try:
            while True:
                # A timeout keeps a stalled connection from hanging the caller.
                resp = requests.get(
                    self._api_url, params=params, timeout=self._timeout)
                if resp.status_code != 200:
                    logger.warning(
                        "Google Custom Search API error: %s - %s",
                        resp.status_code, resp.text)
                    break

                data = resp.json()
                results.extend(data.get("items", []))

                next_page = data.get("queries", {}).get("nextPage", [])
                if not next_page:
                    break
                params["start"] = next_page[0]["startIndex"]
        except Exception:
            # Deliberately broad: callers expect a list, never an exception.
            logger.exception("Error in _hit_api:")
        return results
43

44

45
class ProfilePicture:
    """Find a LinkedIn profile's picture URL and basic info via search results.

    Search results are obtained from ``GoogleSearchAPI`` and filtered down to
    the items whose link resolves to the requested LinkedIn ID.
    """

    # Compiled once instead of on every call.
    # NOTE(review): only ``media-exp<digit>.licdn.com`` hosts are accepted;
    # confirm whether other LinkedIn image hosts should match as well.
    _PICTURE_URL_RE = re.compile(
        r"(media-exp\d\.licdn\.com).+?(profile-displayphoto-shrink_)")

    def __init__(self, key: str, cx: str):
        """Create the underlying search client.

        Args:
            key: API key forwarded to ``GoogleSearchAPI``.
            cx: Search engine identifier forwarded to ``GoogleSearchAPI``.
        """
        self._api_obj = GoogleSearchAPI(key, cx)

    def extract_id(self, link: str) -> str:
        """Return a clean LinkedIn ID from a profile URL or a bare ID.

        The first ``/in/<id>`` segment of the URL path is used when present;
        otherwise the input itself is treated as the ID. Surrounding slashes
        are removed and percent-escapes are decoded.
        """
        linkedin_id = link
        match = re.search(r"/in/([^/]+)", urlparse(link).path)
        if match:
            linkedin_id = match.group(1).strip()
        return unquote(linkedin_id.strip("/"))

    def _check_picture_url(self, link: str) -> bool:
        """Return True if ``link`` looks like a LinkedIn profile photo URL."""
        # A metatag may carry a non-string value (e.g. None); treat it as
        # "not a picture" rather than letting the regex raise TypeError.
        if not isinstance(link, str):
            return False
        return bool(self._PICTURE_URL_RE.search(link))

    def _check_url_exists(self, link: str) -> bool:
        """Return True if a HEAD request to ``link`` answers with status 200."""
        try:
            resp = requests.head(link, timeout=5)
            return resp.status_code == 200
        except requests.RequestException:
            return False

    def _matching_items(self, linkedin_id: str, res: list):
        """Yield the search items whose link resolves to ``linkedin_id``."""
        for item in res:
            if self.extract_id(item.get("link", "")) == linkedin_id:
                yield item

    def _extract_profile_picture(self, linkedin_id: str, res: list) -> str:
        """Return the first valid, reachable profile picture URL, or ``""``.

        Args:
            linkedin_id: Clean LinkedIn ID the search items must match.
            res: Search result items as returned by the API client.
        """
        for item in self._matching_items(linkedin_id, res):
            metatags = item.get("pagemap", {}).get("metatags", [])
            for tag in metatags:
                if "og:image" not in tag:
                    continue
                url = tag.get("og:image")
                # Pattern check first so no network call is made for
                # URLs that cannot be profile photos.
                if self._check_picture_url(url) and self._check_url_exists(url):
                    return url
        return ""

    def _extract_profile_info(self, linkedin_id: str, res: list) -> dict:
        """Return name, headline and URL of the first matching item.

        Args:
            linkedin_id: Clean LinkedIn ID the search items must match.
            res: Search result items as returned by the API client.

        Returns:
            A dict with ``name``, ``headline`` and ``public_url`` keys, or an
            empty dict when no item matches.
        """
        for item in self._matching_items(linkedin_id, res):
            return {
                "name": item.get("title"),
                "headline": item.get("snippet"),
                "public_url": item.get("link", ""),
            }
        return {}

    def get_profile_picture(self, link: str) -> str:
        """Search for the profile behind ``link`` and return its picture URL.

        Returns an empty string when no suitable picture is found.
        """
        linkedin_id = self.extract_id(link)
        api_resp = self._api_obj._hit_api(linkedin_id)
        return self._extract_profile_picture(linkedin_id, api_resp)

    def get_profile_info(self, link: str) -> dict:
        """Search for the profile behind ``link`` and return its basic info.

        Returns an empty dict when no search item matches the profile.
        """
        linkedin_id = self.extract_id(link)
        api_resp = self._api_obj._hit_api(linkedin_id)
        return self._extract_profile_info(linkedin_id, api_resp)
113

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.