# dream
#
# gossip.py
# 343 lines · 10.4 KB
# Word wrap
1
import re
2
import logging
3
from common.combined_classes import TOPIC_GROUPS
4
from common import utils, universal_templates
5

6
logger = logging.getLogger(__name__)


def _load_frequent_words(path, limit=5000):
    """Return the first ``limit`` lines of the word list at ``path`` as a set.

    Each line is stripped of surrounding whitespace. The file is opened in a
    ``with`` block so the handle is closed as soon as the words are read.
    """
    with open(path, "r", encoding="utf-8") as words_file:
        return {line.strip() for line in words_file.readlines()[:limit]}


# Very common English words; `celebrity_from_uttr` refuses to treat an entity
# whose lower-cased name is in this set as a celebrity name.
TOP_5k_FREQUENT_WORDS = _load_frequent_words("common/google-10000-english-no-swears.txt")
11

12
# Word stems that mark an utterance as being about celebrities or gossip.
# They are matched case-insensitively at the start of a word.
_GOSSIP_WORD_STEMS = (
    "celebrit",
    "actor",
    "actress",
    "writer",
    "author",
    "entrepreneur",
    "sportsperson",
    "musician",
    "gossip",
)
GOSSIP_COMPILED_PATTERN = re.compile(r"\b(" + "|".join(_GOSSIP_WORD_STEMS) + ")", flags=re.IGNORECASE)

# Verbs that may precede "you ... gossip" in a question such as
# "have you heard any gossip".
_QUESTION_VERBS = ("would", "have", "did", "was", "had", "were", "are", "do")
HAVE_YOU_GOSSIP_TEMPLATE = re.compile("(" + "|".join(_QUESTION_VERBS) + r") you .*gossip", flags=re.IGNORECASE)

# Questions offering gossip to the user.
GOSSIP_SKILL_TRIGGER_PHRASES = [
    "Would you want to hear the latest gossip?",
    "Are you interested in the latest gossip?",
    "Would you be interested in the latest gossip?",
]

# Question asking the user about a favourite celebrity.
CELEBRITY_TRIGGER_PHRASES = ["What is your favourite celebrity?"]
24

25
# One entry per topic: {"Topic": <topic name>, "People": [<names>]}.
# `get_all_supported_occupations_lists` only considers topics whose "People"
# list is non-empty, so an empty list (or one with every name commented out)
# disables the topic there.
TOPICS_TO_PEOPLE_MAPPINGS = [
    {
        "Topic": "Entertainment_Movies",
        "People": [
            "Christian Bale",
            "Jake Gyllenhaal",
            "Leonardo DiCaprio",
            "Tom Hardy",
            "Joaquin Phoenix",
            "Hugh Jackman",
            "Brad Pitt",
            "Ryan Gosling",
            "Tom Cruise",
            "Bradley Cooper",
            "Amy Adams",
            "Scarlett Johansson",
            "Emma Stone",
            "Anne Hathaway",
            "Emily Blunt",
            "Margot Robbie",
            "Jennifer Lawrence",
            "Rachel McAdams",
            "Saoirse Ronan",
            "Amanda Seyfried",
        ],
    },
    {
        "Topic": "Entertainment_Music",
        "People": [
            "Ed Sheeran",
            "Justin Bieber",
            "Katy Perry",
            "Maroon 5",
            "Post Malone",
            "Lady Gaga",
            "Ariana Grande",
            "Imagine Dragons",
            "The Weeknd",
            "Nicki Minaj",
            "Eminem",
            "Luke Bryan",
            "P!nk",
            "One Direction",
            "Justin Timberlake",
            "Kendrick Lamar",
            "Lady A",
            "Beyonce",
            "Jason Aldean",
            "Sam Smith",
        ],
    },
    {
        "Topic": "Entertainment_Books",
        "People": [
            "Colson Whitehead",
            "Madeline Miller",
            "Yaa Gyasi",
            "Lauren Groff",
            "George Saunders",
            "Karen Russell",
            "Jemisin",
            "Lisa Ko",
            "Emily St. John Mandel",
            "Jesmyn Ward",
            "Brandon Sanderson",
            "John Darnielle",
            "Celeste Ng",
            "Ta-Nehisi Coates",
            "Donna Tartt",
            "Erin Morgenstern",
            "Akhil Sharma",
            "Zadie Smith",
            "Patrick Rothfuss",
            "Kate Atkinson",
        ],
    },
    {
        "Topic": "Entertainment_General",
        "People": [
            "Jennifer Lawrence",
            "Chris Pratt",
            "Brie Larson",
            "Benedict Cumberbatch",
            "Phoebe Waller-Bridge",
            "Oscar Isaac",
            "Emma Stone",
            "Adam Driver",
            "Sophie Turner",
            "Donald Glover",
            "Melissa McCarthy",
            "Eddie Redmayne",
            "Amy Schumer",
            "Rami Malek",
            "Margot Robbie",
            "Andrew Garfield",
            "Karen Gillan",
            "Chris Hemsworth",
            "Millie Bobby Brown",
            "Finn Wolfhard",
        ],
    },
    {
        "Topic": "Sports",
        "People": [
            "LeBron James",
            "Serena Williams",
            "Tom Brady",
            "Simone Biles",
            "Usain Bolt",
            "Mike Trout",
            "Steph Curry",
            "Lionel Messi",
            "Michael Phelps",
            "Novak Djokovic",
            "Katie Ledecky",
            "Kevin Durant",
            "Rafael Nadal",
            "Cristiano Ronaldo",
            "Aaron Rodgers",
            "Roger Federer",
            "Sidney Crosby",
            "Clayton Kershaw",
            "Alex Ovechkin",
            "Carli Lloyd",
        ],
    },
    {
        "Topic": "Politics",
        "People": [
            # "Donald Trump",
            # "Barack Obama",
            # "Hillary Clinton",
            # "Brett Kavanaugh",
            # "Nancy Pelosi",
            # "Ted Cruz",
            # "Marco Rubio",
            # "Beto O'Rourke",
            # "Alexandria Ocasio-Cortez",
            # "Arnold Schwarzenegger",
            # "Joe Biden"
        ],
    },
    {
        "Topic": "Science_and_Technology",
        "People": [
            "Elon Musk",
            "Jeff Bezos",
            "Bill Gates",
            "Tim Timberlake",
            # "Philip Scheinfeld",
            # "Jayson Waller",
            # "Alfredo Delgado",
            # "Katie Hamilton",
            # "Billionaire Barbie",
            # "Alan Belcher",
            # "Los Silva",
            # "Van Taylor",
            # "David Granados",
            # "Randall Emmett",
            # "Rob Deutsch",
            # "Adam Weitsman",
            # "David Meltzer",
            # "Brady Bell",
            # "Andrew Andrawes",
            # "Jordan Montgomery",
            # "Eric Marcus",
            # "Ben Newman",
            # "Tai Lopez",
            # "Grant Cardone",
            # "Rudy Mawer",
            # "Paul Vigario",
            # "Amber Voight",
            # "Cesar Gomez",
        ],
    },
    {"Topic": "Phatic", "People": []},
    {"Topic": "Interactive", "People": []},
    {"Topic": "Inappropriate_Content", "People": []},
    {"Topic": "Other", "People": []},
]
205

206

207
# Topic name -> list of [occupation id, occupation label] pairs.
# `get_all_supported_occupations_lists` reads the id (first element) of each
# pair; `celebrity_from_uttr` compares those ids with the ids found in an
# entity's "occupation" annotation.
COBOT_TOPICS_TO_WIKI_OCCUPATIONS = {
    "Politics": [
        ["Q82955", "politician"],
        ["Q193391", "diplomat"],
    ],
    "Science_and_Technology": [
        ["Q131524", "entrepreneur"],
    ],
    "Entertainment_Movies": [
        ["Q33999", "actor"],
        ["Q10800557", "film actor"],
        ["Q2526255", "film director"],
        ["Q28389", "screenwriter"],
        ["Q10798782", "television actor"],
        ["Q3282637", "film producer"],
        ["Q2259451", "stage actor"],
        ["Q3455803", "director"],
        ["Q947873", "television presenter"],
        ["Q222344", "cinematographer"],
        ["Q2405480", "voice actor"],
    ],
    "Entertainment_Books": [
        ["Q36180", "writer"],
        ["Q49757", "poet"],
        ["Q6625963", "novelist"],
        ["Q214917", "playwright"],
        ["Q1607826", "editor"],
    ],
    "Entertainment_General": [
        ["Q1028181", "painter"],
        ["Q483501", "artist"],
        ["Q33231", "photographer"],
        ["Q1281618", "sculptor"],
        ["Q644687", "illustrator"],
        ["Q15296811", "drawer"],
        ["Q1930187", "journalist"],
    ],
    "Sports": [
        ["Q2066131", "athlete"],
        ["Q937857", "association football player"],
        ["Q3665646", "basketball player"],
        ["Q10871364", "baseball player"],
        ["Q12299841", "cricketer"],
        ["Q11513337", "athletics competitor"],
        ["Q19204627", "American football player"],
        ["Q11774891", "ice hockey player"],
        ["Q2309784", "sport cyclist"],
        ["Q628099", "association football manager"],
        ["Q13141064", "badminton player"],
        ["Q10873124", "chess player"],
        ["Q14089670", "rugby union player"],
        ["Q11338576", "boxer"],
        ["Q15117302", "volleyball player"],
        ["Q10843402", "swimmer"],
        ["Q12840545", "handball player"],
        ["Q10833314", "tennis player"],
    ],
    "Entertainment_Music": [
        ["Q177220", "singer"],
        ["Q36834", "composer"],
        ["Q639669", "musician"],
        ["Q753110", "songwriter"],
        ["Q486748", "pianist"],
        ["Q488205", "singer-songwriter"],
        ["Q855091", "guitarist"],
        ["Q2865819", "opera singer"],
    ],
}
270

271

272
def skill_trigger_phrases():
    """Return a new list with the gossip phrases followed by the celebrity phrases."""
    return [*GOSSIP_SKILL_TRIGGER_PHRASES, *CELEBRITY_TRIGGER_PHRASES]
274

275

276
def talk_about_gossip(human_utterance, bot_utterance):
    """Tell whether the user asked to chat about a gossip/celebrity topic.

    Delegates to ``universal_templates.if_chat_about_particular_topic`` with
    ``GOSSIP_COMPILED_PATTERN`` and returns the truthiness of its result.
    """
    return bool(
        universal_templates.if_chat_about_particular_topic(
            human_utterance,
            bot_utterance,
            compiled_pattern=GOSSIP_COMPILED_PATTERN,
        )
    )
282

283

284
def get_all_supported_occupations_lists():
    """Return the occupation ids of every topic that has people listed.

    Walks ``TOPICS_TO_PEOPLE_MAPPINGS`` in order; for each topic with a
    non-empty "People" list, appends the id (first element) of each pair
    registered for that topic in ``COBOT_TOPICS_TO_WIKI_OCCUPATIONS``.

    Returns:
        A flat list of occupation ids, in mapping order.
    """
    all_occupations = []
    for topic_mapping in TOPICS_TO_PEOPLE_MAPPINGS:
        if not topic_mapping["People"]:
            continue
        # A topic with people but no registered occupations contributes
        # nothing instead of raising KeyError.
        occupation_pairs = COBOT_TOPICS_TO_WIKI_OCCUPATIONS.get(topic_mapping["Topic"], [])
        all_occupations.extend(occupation_pair[0] for occupation_pair in occupation_pairs)
    return all_occupations
299

300

301
def celebrity_from_uttr(human_utterance):
    """Find the first entity in the utterance that has a supported occupation.

    Entities are read from
    ``human_utterance["annotations"]["wiki_parser"]["topic_skill_entities_info"]``
    (a missing "wiki_parser" or "topic_skill_entities_info" key is treated as
    no entities). An entity is considered only when it has an "occupation"
    entry, its "pos" is 0, its "conf" is above 0.5, and its lower-cased name is
    not in ``TOP_5k_FREQUENT_WORDS``. Entities lacking "pos" or "conf" are
    skipped.

    Args:
        human_utterance: dict with a "text" key and an "annotations" dict.

    Returns:
        ``(name, matching_types, mismatching_types)`` for the first qualifying
        entity with at least one supported occupation: ``name`` is the entity
        name passed through ``str.title()``, and the two lists hold the
        occupation labels whose ids are, respectively are not, among the
        supported ids. ``(None, None, None)`` when no entity qualifies.
    """
    logger.debug(f'Calling get_celebrity_from_uttr on {human_utterance["text"]}')

    # Built once per call as a set so each membership test below is O(1).
    supported_occupation_ids = set(get_all_supported_occupations_lists())

    entity_dict = human_utterance["annotations"].get("wiki_parser", {}).get("topic_skill_entities_info", {})
    logger.info(f"found entities: {entity_dict}")
    for entity_name, entity_info in entity_dict.items():
        is_candidate = (
            "occupation" in entity_info
            and entity_info.get("pos") == 0
            and entity_info.get("conf", 0) > 0.5
            and entity_name.lower() not in TOP_5k_FREQUENT_WORDS
        )
        if not is_candidate:
            continue

        matching_types = []
        mismatching_types = []
        for job in entity_info["occupation"]:
            # job[0] is the occupation id, job[1] its label.
            if job[0] in supported_occupation_ids:
                matching_types.append(job[1])
            else:
                mismatching_types.append(job[1])

        if matching_types:
            celebrity_name = entity_name.title()
            logger.warning(f"Relations {celebrity_name} {matching_types} {mismatching_types}")
            return celebrity_name, matching_types, mismatching_types

    return None, None, None
327

328

329
def check_is_celebrity_mentioned(human_utterance):
    """Return True when ``celebrity_from_uttr`` finds a celebrity name in the utterance."""
    found_name = celebrity_from_uttr(human_utterance)[0]
    return found_name is not None
334

335

336
def about_celebrities(annotated_utterance):
    """Tell whether the utterance is about celebrities.

    True when any topic from ``TOPIC_GROUPS["celebrities"]`` is among the
    topics returned by ``utils.get_topics`` for the utterance, or when the
    utterance text matches ``GOSSIP_COMPILED_PATTERN``; False otherwise.
    """
    detected_topics = utils.get_topics(annotated_utterance, probs=False, which="all")
    if any(celebrity_topic in detected_topics for celebrity_topic in TOPIC_GROUPS["celebrities"]):
        return True
    # Fall back to a keyword match on the raw text.
    return GOSSIP_COMPILED_PATTERN.search(annotated_utterance["text"]) is not None
344
# dream
#
# Cookie usage