# Amazing-Python-Scripts
# 112 lines · 3.8 KB
import logging
import re
from urllib.parse import unquote, urlparse

import requests

# Module-level logger; handlers and levels are left to the importing application.
logger = logging.getLogger(__name__)

8
class GoogleSearchAPI:
    """Minimal client for the Google Custom Search JSON API.

    Every query is restricted to results containing the exact term
    ``/in/<linkedin_id>`` and all reachable result pages are collected.
    """

    # The Custom Search JSON API never serves more than 100 results per
    # query; asking for a page that extends past that index is an error.
    _MAX_RESULTS = 100

    def __init__(self, key: str, cx: str, timeout: float = 10.0):
        """Store the credentials and build the base query parameters.

        Args:
            key: Value sent as the ``key`` query parameter.
            cx: Value sent as the ``cx`` query parameter.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self._cx = cx
        self._key = key
        self._timeout = timeout
        self._api_url = "https://www.googleapis.com/customsearch/v1"
        self._params = {
            "num": 10,
            "cx": self._cx,
            "key": self._key,
        }

    @classmethod
    def _next_start(cls, data: dict, page_size: int) -> "int | None":
        """Return the start index of the next result page, if there is one.

        Args:
            data: Decoded JSON body of one API response.
            page_size: Number of results requested per page (``num``).

        Returns:
            The ``startIndex`` of the first ``nextPage`` entry, or ``None``
            when the response announces no further page or when that page
            would extend past ``_MAX_RESULTS``.
        """
        next_page = (data.get("queries") or {}).get("nextPage") or []
        if not next_page:
            return None
        start = next_page[0].get("startIndex")
        if not start:
            return None
        start = int(start)
        # Skip the request that is guaranteed to be rejected by the API.
        if start + page_size - 1 > cls._MAX_RESULTS:
            return None
        return start

    def _hit_api(self, linkedin_id: str) -> list:
        """Collect every search result item for the given LinkedIn ID.

        Args:
            linkedin_id: ID searched for as the exact term ``/in/<id>``.

        Returns:
            The concatenated ``items`` of all pages that were fetched. The
            call is best-effort: on an HTTP error status or any exception
            the problem is logged and the items gathered so far are
            returned (possibly an empty list).
        """
        results = []
        params = self._params.copy()
        params["exactTerms"] = f"/in/{linkedin_id}"
        try:
            while True:
                # A timeout keeps a stalled connection from blocking forever.
                resp = requests.get(
                    self._api_url, params=params, timeout=self._timeout)
                if resp.status_code != 200:
                    logger.warning(
                        "Google Custom Search API error: %s - %s",
                        resp.status_code, resp.text)
                    break

                data = resp.json()
                results.extend(data.get("items") or [])

                next_start = self._next_start(data, params["num"])
                if next_start is None:
                    break
                # Guard against a response that does not advance the cursor,
                # which would otherwise loop forever.
                if next_start <= params.get("start", 1):
                    break
                params["start"] = next_start
        except Exception:
            # Deliberately broad: callers rely on getting a (possibly
            # partial) list back instead of an exception.
            logger.exception("Error in _hit_api:")
        return results

44
class ProfilePicture:
    """Resolve a LinkedIn profile's picture URL and basic info.

    The data comes from Google Custom Search result items whose link points
    at the requested profile.
    """

    # Captures the path segment that follows "/in/".
    _ID_PATTERN = re.compile(r"/in/([^/]+)/?")
    # NOTE(review): only hosts of the form media-exp<digit>.licdn.com are
    # accepted here; confirm whether other image hosts should match too.
    _PICTURE_PATTERN = re.compile(
        r"(media-exp\d\.licdn\.com).+?(profile-displayphoto-shrink_)")

    def __init__(self, key: str, cx: str):
        """Create the underlying search client.

        Args:
            key: Passed through to ``GoogleSearchAPI`` as ``key``.
            cx: Passed through to ``GoogleSearchAPI`` as ``cx``.
        """
        self._api_obj = GoogleSearchAPI(key, cx)

    def extract_id(self, link: str) -> str:
        """Return a clean LinkedIn ID taken from a profile link.

        The ID is the URL path segment after ``/in/``. When the link has no
        such segment the whole input is used instead. In both cases
        surrounding slashes are stripped and percent-escapes are decoded.

        Args:
            link: Profile URL, or a bare ID.

        Returns:
            The decoded LinkedIn ID.
        """
        match = self._ID_PATTERN.search(urlparse(link).path)
        linkedin_id = match.group(1).strip() if match else link
        return unquote(linkedin_id.strip("/"))

    def _check_picture_url(self, link: str) -> bool:
        """Return True when ``link`` matches the profile-photo URL pattern."""
        return self._PICTURE_PATTERN.search(link) is not None

    def _check_url_exists(self, link: str) -> bool:
        """Return True when a HEAD request to ``link`` answers with HTTP 200.

        Any ``requests.RequestException`` (including a timeout after five
        seconds) is treated as "does not exist".
        """
        try:
            resp = requests.head(link, timeout=5)
        except requests.RequestException:
            return False
        return resp.status_code == 200

    def _matching_items(self, linkedin_id: str, res: list):
        """Yield the result items whose link resolves to ``linkedin_id``."""
        for item in res:
            # A missing or null link is treated as an empty string so that
            # one malformed item cannot abort the whole scan.
            if self.extract_id(item.get("link") or "") == linkedin_id:
                yield item

    def _extract_profile_picture(self, linkedin_id: str, res: list) -> str:
        """Return the first usable profile-picture URL for ``linkedin_id``.

        Args:
            linkedin_id: ID the result item's link must resolve to.
            res: Search result items as returned by the search client.

        Returns:
            The first ``og:image`` metatag value of a matching item that
            both looks like a profile photo and answers a HEAD request with
            200, or ``""`` when there is none.
        """
        for item in self._matching_items(linkedin_id, res):
            metatags = (item.get("pagemap") or {}).get("metatags") or []
            for tag in metatags:
                url = tag.get("og:image")
                # Null/empty values cannot be pattern-matched; skip them.
                if not url:
                    continue
                if self._check_picture_url(url) and self._check_url_exists(url):
                    return url
        return ""

    def _extract_profile_info(self, linkedin_id: str, res: list) -> dict:
        """Return name, headline and URL from the first matching item.

        Args:
            linkedin_id: ID the result item's link must resolve to.
            res: Search result items as returned by the search client.

        Returns:
            A dict with ``name`` (item title), ``headline`` (item snippet)
            and ``public_url`` (item link), or ``{}`` when no item matches.
        """
        item = next(self._matching_items(linkedin_id, res), None)
        if item is None:
            return {}
        return {
            "name": item.get("title"),
            "headline": item.get("snippet"),
            "public_url": item.get("link", ""),
        }

    def get_profile_picture(self, link: str) -> str:
        """Search for the profile behind ``link`` and return its picture URL.

        Returns ``""`` when no usable picture is found.
        """
        linkedin_id = self.extract_id(link)
        api_resp = self._api_obj._hit_api(linkedin_id)
        return self._extract_profile_picture(linkedin_id, api_resp)

    def get_profile_info(self, link: str) -> dict:
        """Search for the profile behind ``link`` and return its basic info.

        Returns ``{}`` when no search result matches the profile.
        """
        linkedin_id = self.extract_id(link)
        api_resp = self._api_obj._hit_api(linkedin_id)
        return self._extract_profile_info(linkedin_id, api_resp)