dream
208 lines · 8.1 KB
import logging
import os
import re
from collections import deque
from copy import deepcopy
from datetime import datetime
from os import getenv

import requests
import sentry_sdk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

sentry_sdk.init(getenv("SENTRY_DSN"))
logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)


# URL of the badlisted-words annotator service; may be None if the env var is unset
BADLIST_ANNOTATOR_URL = getenv("BADLIST_ANNOTATOR_URL")

# Pre-compiled (case-insensitive) pattern of words that make a news item unsuitable to show.
BADLISTED_WORDS = re.compile(
    r"\b(gun|shoot|die.?\b|murder|kill|victim|stolen" r"|decease|sick\b|sicken\b|sickness\b|hurt\b|hurting\b|ailing\b)",
    re.IGNORECASE,
)
# VADER sentiment classifier shared by the whole module
nltk_sentiment_classifier = SentimentIntensityAnalyzer()
27
def get_nltk_sentiment(text):
    """Classify `text` as "negative", "positive" or "neutral" using VADER polarity scores.

    The negative threshold (0.05) is far lower than the positive one (0.5),
    i.e. the classifier is deliberately quick to flag negativity — downstream
    code uses "negative" to drop news articles.
    """
    result = nltk_sentiment_classifier.polarity_scores(text)
    if result.get("neg", 0.0) >= 0.05:
        return "negative"
    if result.get("pos", 0.0) >= 0.5:
        return "positive"
    return "neutral"
36
37
class CachedRequestsAPI:
    """Caching client for the GNews REST API.

    Per-topic article lists are cached and refreshed at most once every
    `renew_freq_time` seconds.  API keys are kept in a deque and rotated so
    that a key which was rate-limited (HTTP 429) is tried last next time.
    """

    NEWS_SERVICE_URL = "https://gnews.io/api/v4/search?q=TOPIC&country=us&lang=en&max=20&sortby=publishedAt&token="
    ALL_NEWS_SERVICE_URL = "https://gnews.io/api/v4/top-headlines?country=us&lang=en&max=20&sortby=publishedAt&token="
    EXT_NEWS_SERVICE_URL = (
        "https://gnews.io/api/v4/search?q=TOPIC&country=us&lang=en&expand=content&max=20&sortby=publishedAt&token="
    )
    EXT_ALL_NEWS_SERVICE_URL = (
        "https://gnews.io/api/v4/top-headlines?country=us&lang=en&expand=content&max=20&sortby=publishedAt&token="
    )

    def __init__(self, renew_freq_time):
        """Args:
            renew_freq_time: minimal number of seconds between cache refreshes per topic.
        """
        self.renew_freq_time = renew_freq_time
        self.first_renew_time = datetime.now()
        self.prev_renew_times = {}  # topic -> datetime of the last refresh
        self.cached = {}  # topic -> list of article dicts
        self._api_keys = self._collect_api_keys()
        # FIX: original f-string concatenation was missing a space ("...s;api keys...")
        logger.info(
            f"CachedRequestAPI initialized with renew_freq_time: {renew_freq_time} s; api keys: {self._api_keys}"
        )

    def _collect_api_keys(self):
        """Read the GNews API key(s) from the environment into a rotatable deque.

        GNEWS_API_KEY is mandatory: a KeyError here is an intentional hard failure.
        """
        api_keys = [os.environ["GNEWS_API_KEY"]]
        # FIX: the assert message was `print(...)`, which evaluates to None;
        # use the f-string directly so failures carry the intended message.
        assert len(api_keys) > 0, f"news skill api keys is empty! api_keys {api_keys}"
        return deque(api_keys)

    def _construct_address(self, topic, api_key, return_list_of_news):
        """Build the GNews request URL for `topic`.

        `return_list_of_news=True` selects the `expand=content` endpoints that
        return full article bodies; topic "all" maps to the top-headlines feed.
        """
        if topic == "all":
            if return_list_of_news:
                request_address = self.EXT_ALL_NEWS_SERVICE_URL + api_key
            else:
                request_address = self.ALL_NEWS_SERVICE_URL + api_key
        else:
            if return_list_of_news:
                request_address = self.EXT_NEWS_SERVICE_URL + api_key
            else:
                request_address = self.NEWS_SERVICE_URL + api_key
        # no-op for the "all" URLs, which contain no TOPIC placeholder
        request_address = request_address.replace("TOPIC", f'"{topic}"')
        return request_address

    def _make_request(self, topic, return_list_of_news):
        """Try each API key in turn until a non-429 response; return the last response.

        Network/timeout errors are reported to Sentry and converted into a
        synthetic 504 response so the caller always gets a Response object.
        """
        for ind, api_key in enumerate(self._api_keys):
            try:
                request_address = self._construct_address(topic, api_key, return_list_of_news)
                resp = requests.get(url=request_address, timeout=1.5)
            except Exception as e:
                sentry_sdk.capture_exception(e)
                logger.exception(e)
                resp = requests.Response()
                resp.status_code = 504
            if resp.status_code == 429:
                msg = f"News API Response status code 429 with api key {api_key}"
                logger.warning(msg)
            else:
                # Change order of api_keys to use first success next time
                self._api_keys.rotate(-ind)
                break
        return resp

    def get_new_topic_news(self, topic, return_list_of_news):
        """Fetch fresh articles for `topic` and drop badlisted/negative ones.

        Returns an empty list on any non-200 response (logged + sent to Sentry).
        """
        result = []
        resp = self._make_request(topic, return_list_of_news)

        if resp.status_code != 200:
            logger.warning(
                f"result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
            sentry_sdk.capture_message(
                f"News API! result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
        else:
            result = resp.json().get("articles", [])
        return self.get_not_badlisted_english_news(result)

    def send(self, topic="all", status="", prev_news_urls=None, return_list_of_news=False):
        """Get news using cache and NewsAPI requests

        Args:
            topic: string topic (e.g. sport news, putin, politics)
            status: string news skill status
            prev_news_urls: list of all discussed previous news' URLs sent to user (list of strings)
            return_list_of_news: if True, bypass the cache and request full article content
        Returns:
            list of article dicts, latest first; empty list when nothing is available
            (FIX: the original docstring claimed a dictionary was returned)
        """
        prev_news_urls = [] if prev_news_urls is None else prev_news_urls
        topic = topic.lower() if len(topic) > 0 else "all"
        curr_time = datetime.now()

        if return_list_of_news:
            top_news = self.get_new_topic_news(topic, return_list_of_news)
        else:
            # refresh the per-topic cache when empty or stale
            if (
                len(self.cached.get(topic, [])) == 0
                or (curr_time - self.prev_renew_times.get(topic, self.first_renew_time)).seconds > self.renew_freq_time
            ):
                self.cached[topic] = self.get_new_topic_news(topic, return_list_of_news) + self.cached.get(topic, [])
                self.prev_renew_times[topic] = curr_time

            # deepcopy so callers cannot mutate the cache
            top_news = deepcopy(self.cached.get(topic, []))

        if len(prev_news_urls) > 0 and status == "headline":
            # some prev discussed news detected
            top_news = [news for news in top_news if "url" in news and news["url"] not in prev_news_urls]

        return top_news

    @staticmethod
    def get_not_badlisted_english_news(articles):
        """Filter `articles`, keeping only those with a title and a (possibly
        shortened) description, non-negative VADER sentiment, no badlisted words
        per the external annotator, and no BADLISTED_WORDS regex hit.

        BUGFIX: the badlist results were previously zipped against the ORIGINAL
        `articles` list although some articles had been skipped before the
        annotator request, so the flags were misaligned and trailing articles
        silently dropped.  A parallel `kept_articles` list now keeps the two
        sequences aligned.
        """
        kept_articles = []
        articles_to_check = []
        for article in articles:
            title = article.get("title", "") or ""
            if len(title) == 0:
                continue
            description = article.get("content", "") or ""
            sentences_content = sent_tokenize(description)
            if description and len(sentences_content) > 2:
                # keep only the first two sentences of the full content
                description = " ".join(sentences_content[:2])
                article["description"] = description
            elif description and len(sentences_content) > 1:
                # exactly two sentences: keep only the first — TODO confirm intent
                description = sentences_content[0]
                article["description"] = description
            else:
                # no usable content: fall back to the original description field
                description = article.get("description", "") or ""

            if len(description) == 0:
                continue
            if get_nltk_sentiment(f"{title} {description}") == "negative":
                continue

            kept_articles.append(article)
            articles_to_check.append(f"{title} {description}")

        try:
            resp = requests.request(
                url=BADLIST_ANNOTATOR_URL, json={"sentences": articles_to_check}, method="POST", timeout=0.5
            )
        except (requests.ConnectTimeout, requests.ReadTimeout) as e:
            sentry_sdk.capture_exception(e)
            logger.exception("Badlisted Annotator requests from News API Annotator Timeout")
            resp = requests.Response()
            resp.status_code = 504

        if resp.status_code != 200:
            logger.warning(
                f"result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
            # best effort: treat every article as non-badlisted on annotator failure
            result = [False] * len(articles_to_check)
            sentry_sdk.capture_message(
                "Badlisted Annotator requests from News API Annotator "
                f" result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
        else:
            # each element is like `{'bad_words': False}`
            result = [sum(d.values()) for d in resp.json()[0]["batch"]]

        return [
            article
            for article, is_black in zip(kept_articles, result)
            if not is_black
            and not BADLISTED_WORDS.search(f'{article.get("title", "")} {article.get("description", "")}')
        ]
209