dream

Форк
0
/
newsapi_service.py 
208 строк · 8.1 Кб
1
import logging
2
import os
3
import requests
4
import re
5
from collections import deque
6
from copy import deepcopy
7
from datetime import datetime
8
from os import getenv
9

10
import sentry_sdk
11
from nltk.sentiment.vader import SentimentIntensityAnalyzer
12
from nltk.tokenize import sent_tokenize
13

14
# Error reporting: the DSN is read from the SENTRY_DSN environment variable.
sentry_sdk.init(getenv("SENTRY_DSN"))

# Process-wide logging configuration and this module's logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
17

18

19
# Endpoint of the badlist annotator service; None when the variable is unset.
BADLIST_ANNOTATOR_URL = getenv("BADLIST_ANNOTATOR_URL")

# Case-insensitive pattern of words that make a news item unsuitable.
# Alternatives without a trailing \b deliberately match as word prefixes
# (e.g. "gun" also matches "gunman", "shoot" also matches "shooting").
# "die" is restricted to die/dies/died: the previous `die.?\b` used a wildcard
# and therefore also rejected unrelated words such as "diet".
BADLISTED_WORDS = re.compile(
    r"\b(gun|shoot|die[sd]?\b|murder|kill|victim|stolen"
    r"|decease|sick\b|sicken\b|sickness\b|hurt\b|hurting\b|ailing\b)",
    re.IGNORECASE,
)
25
# Module-level VADER sentiment analyzer, created once at import time and
# reused by get_nltk_sentiment.
nltk_sentiment_classifier = SentimentIntensityAnalyzer()
26

27

28
def get_nltk_sentiment(text):
    """Label ``text`` as "negative", "positive" or "neutral".

    The label is derived from the polarity scores of the module-level
    classifier. The negative check comes first and uses a much lower
    threshold (0.05) than the positive one (0.5), so a text with even a small
    negative share is labelled "negative".
    """
    scores = nltk_sentiment_classifier.polarity_scores(text)
    if scores.get("neg", 0.0) >= 0.05:
        return "negative"
    if scores.get("pos", 0.0) >= 0.5:
        return "positive"
    return "neutral"
36

37

38
class CachedRequestsAPI:
    """Fetch news from the GNews API, filter unsuitable items, cache per topic.

    Articles are dropped when they lack a title or description, when their
    sentiment is classified as negative, when the badlist annotator flags
    them, or when they match ``BADLISTED_WORDS``.
    """

    NEWS_SERVICE_URL = "https://gnews.io/api/v4/search?q=TOPIC&country=us&lang=en&max=20&sortby=publishedAt&token="
    ALL_NEWS_SERVICE_URL = "https://gnews.io/api/v4/top-headlines?country=us&lang=en&max=20&sortby=publishedAt&token="
    EXT_NEWS_SERVICE_URL = (
        "https://gnews.io/api/v4/search?q=TOPIC&country=us&lang=en&expand=content&max=20&sortby=publishedAt&token="
    )
    EXT_ALL_NEWS_SERVICE_URL = (
        "https://gnews.io/api/v4/top-headlines?country=us&lang=en&expand=content&max=20&sortby=publishedAt&token="
    )

    def __init__(self, renew_freq_time):
        """Create an empty cache.

        Args:
            renew_freq_time: number of seconds after which the cached news of
                a topic is refreshed on the next ``send`` call.

        Raises:
            KeyError: if the ``GNEWS_API_KEY`` environment variable is unset.
        """
        self.renew_freq_time = renew_freq_time
        self.first_renew_time = datetime.now()
        self.prev_renew_times = {}
        self.cached = {}
        self._api_keys = self._collect_api_keys()
        # Only the number of keys is logged: the keys themselves are secrets.
        logger.info(
            f"CachedRequestAPI initialized with renew_freq_time: {renew_freq_time} s; "
            f"api keys count: {len(self._api_keys)}"
        )

    @staticmethod
    def _mask_api_key(api_key):
        """Return a log-safe form of ``api_key`` (at most its last 4 characters)."""
        return f"***{api_key[-4:]}" if len(api_key) > 4 else "***"

    def _collect_api_keys(self):
        """Return a deque of API keys read from ``GNEWS_API_KEY``."""
        api_keys = [os.environ["GNEWS_API_KEY"]]
        assert len(api_keys) > 0, f"news skill api keys is empty! api_keys {api_keys}"
        return deque(api_keys)

    def _construct_address(self, topic, api_key, return_list_of_news):
        """Build the request URL for ``topic``.

        ``topic == "all"`` selects the top-headlines endpoint, anything else a
        search for the quoted topic. ``return_list_of_news`` selects the URL
        variant carrying ``expand=content``.
        """
        from urllib.parse import quote

        if topic == "all":
            template = self.EXT_ALL_NEWS_SERVICE_URL if return_list_of_news else self.ALL_NEWS_SERVICE_URL
            return template + api_key

        template = self.EXT_NEWS_SERVICE_URL if return_list_of_news else self.NEWS_SERVICE_URL
        # Percent-encode the quoted topic so characters such as "&" or "#"
        # cannot break the query string. The placeholder is substituted
        # before the key is appended, so the key is never rewritten.
        return template.replace("TOPIC", quote(f'"{topic}"', safe="")) + api_key

    def _make_request(self, topic, return_list_of_news):
        """Request news, trying each API key until one is not rate-limited.

        Any exception raised while requesting is reported and converted into
        a synthetic response with status code 504.

        Returns:
            The ``requests.Response`` of the last attempt.
        """
        for ind, api_key in enumerate(self._api_keys):
            try:
                request_address = self._construct_address(topic, api_key, return_list_of_news)
                resp = requests.get(url=request_address, timeout=1.5)
            except Exception as e:
                sentry_sdk.capture_exception(e)
                logger.exception(e)
                resp = requests.Response()
                resp.status_code = 504
            if resp.status_code == 429:
                msg = f"News API Response status code 429 with api key {self._mask_api_key(api_key)}"
                logger.warning(msg)
            else:
                # Change order of api_keys to use first success next time
                self._api_keys.rotate(-ind)
                break
        return resp

    def get_new_topic_news(self, topic, return_list_of_news):
        """Fetch fresh news for ``topic`` and return the filtered articles.

        A non-200 response or an unparsable body is reported and treated as
        "no news".
        """
        result = []
        resp = self._make_request(topic, return_list_of_news)

        if resp.status_code != 200:
            logger.warning(
                f"result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
            sentry_sdk.capture_message(
                f"News API! result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
        else:
            try:
                result = resp.json().get("articles") or []
            except (ValueError, AttributeError) as e:
                # Body is not JSON, or is JSON but not an object.
                sentry_sdk.capture_exception(e)
                logger.exception("News API returned a body that is not a JSON object")
                result = []
        return self.get_not_badlisted_english_news(result)

    def send(self, topic="all", status="", prev_news_urls=None, return_list_of_news=False):
        """Get news using cache and NewsAPI requests

        Args:
            topic: string topic (e.g. sport news, putin, politics); an empty
                topic means "all"
            status: string news skill status
            prev_news_urls: list of all discussed previous news' URLs sent to user (list of strings)
            return_list_of_news: if True, bypass the cache and request the
                ``expand=content`` URL variant
        Returns:
            list of article dictionaries (possibly empty)
        """
        prev_news_urls = [] if prev_news_urls is None else prev_news_urls
        topic = topic.lower() if topic else "all"
        curr_time = datetime.now()

        if return_list_of_news:
            top_news = self.get_new_topic_news(topic, return_list_of_news)
        else:
            last_renew = self.prev_renew_times.get(topic, self.first_renew_time)
            # total_seconds() rather than .seconds: the latter wraps around
            # every 24 hours, which could leave a day-old cache unrefreshed.
            is_stale = (curr_time - last_renew).total_seconds() > self.renew_freq_time
            if not self.cached.get(topic) or is_stale:
                # Fresh news goes in front of whatever was cached before.
                self.cached[topic] = self.get_new_topic_news(topic, return_list_of_news) + self.cached.get(topic, [])
                self.prev_renew_times[topic] = curr_time

            # Copy so that callers cannot mutate the cache.
            top_news = deepcopy(self.cached.get(topic, []))

        if len(prev_news_urls) > 0 and status == "headline":
            # some prev discussed news detected
            top_news = [news for news in top_news if "url" in news and news["url"] not in prev_news_urls]

        return top_news

    @staticmethod
    def get_not_badlisted_english_news(articles):
        """Return the articles that pass every content filter.

        Each article needs a non-empty title and description. When the
        article's ``content`` has more than one sentence, ``description`` is
        overwritten in place with its leading sentence(s). Articles with
        negative sentiment, flagged by the badlist annotator, or matching
        ``BADLISTED_WORDS`` are dropped. If the annotator is unreachable or
        answers unexpectedly, only the local filters apply.

        Args:
            articles: list of article dictionaries.
        Returns:
            list of the surviving article dictionaries, in input order.
        """
        # (article, text) pairs kept together so that the annotator verdicts
        # line up with the articles they were computed for.
        candidates = []
        for article in articles:
            title = article.get("title", "") or ""
            if not title:
                continue
            description = article.get("content", "") or ""
            sentences_content = sent_tokenize(description) if description else []
            if len(sentences_content) > 2:
                description = " ".join(sentences_content[:2])
                article["description"] = description
            elif len(sentences_content) > 1:
                description = sentences_content[0]
                article["description"] = description
            else:
                description = article.get("description", "") or ""

            if not description:
                continue
            text = f"{title} {description}"
            if get_nltk_sentiment(text) == "negative":
                continue

            candidates.append((article, text))

        if not candidates:
            # Nothing to check: skip the annotator round-trip.
            return []

        try:
            resp = requests.request(
                url=BADLIST_ANNOTATOR_URL,
                json={"sentences": [text for _, text in candidates]},
                method="POST",
                timeout=0.5,
            )
        except requests.RequestException as e:
            # Timeouts, refused connections, invalid URL, ...
            sentry_sdk.capture_exception(e)
            logger.exception("Badlisted Annotator request from News API Annotator failed")
            resp = requests.Response()
            resp.status_code = 504

        flags = [False] * len(candidates)
        if resp.status_code != 200:
            logger.warning(
                f"result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
            sentry_sdk.capture_message(
                "Badlisted Annotator requests from News API Annotator "
                f" result status code is not 200: {resp}. result text: {resp.text}; "
                f"result status: {resp.status_code}"
            )
        else:
            try:
                # each element is like `{'bad_words': False}`
                flags = [sum(d.values()) for d in resp.json()[0]["batch"]]
            except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                sentry_sdk.capture_exception(e)
                logger.exception("Badlisted Annotator returned an unexpected response body")
                flags = [False] * len(candidates)

        return [
            article
            for (article, text), is_badlisted in zip(candidates, flags)
            if not is_badlisted and not BADLISTED_WORDS.search(text)
        ]
209

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.