# dream
#
# utils.py
# 317 lines · 12.8 KB
# Word wrap
1
from typing import Dict, List
2
import logging
3
from copy import deepcopy
4
import re
5

6
from common.universal_templates import if_chat_about_particular_topic
7
from common.utils import get_intents, service_intents
8
from common.grounding import BUT_PHRASE, REPEAT_PHRASE
9

10
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Default number of bot turns of history; `get_last_n_turns` falls back to it
# when no explicit `bot_last_turns` is given.
LAST_N_TURNS = 5  # number of turns to consider in annotator/skill.
12

13

14
# Any run of whitespace (spaces, tabs, newlines), collapsed to a single space by `clean_text`.
spaces_pat = re.compile(r"\s+")
# Anything that is not a Latin/Cyrillic letter, a digit, an apostrophe or a space.
# "ё" is listed explicitly because it lies outside the contiguous "а-я" code-point range.
special_symb_pat = re.compile(r"[^a-zа-яё0-9' ]", flags=re.IGNORECASE)


def clean_text(text):
    """Normalize a text: lowercase it, drop special symbols and squeeze whitespace.

    Args:
        text (str): raw text.

    Returns:
        str: lowercased text containing only Latin/Cyrillic letters, digits,
        apostrophes and single spaces, without leading or trailing spaces.
    """
    # Symbols are blanked out first and whitespace is collapsed afterwards, so that
    # a removed symbol next to a space ("a, b") does not leave a double space behind.
    without_symbols = special_symb_pat.sub(" ", text.lower())
    return spaces_pat.sub(" ", without_symbols).strip()
20

21

22
def get_last_n_turns(
    dialog: Dict,
    bot_last_turns=None,
    human_last_turns=None,
    total_last_turns=None,
    excluded_attributes=("entities",),
):
    """Build a deep-copied dialog that keeps only the most recent utterances.

    Args:
        dialog (Dict): full dialog state; must contain the key "utterances".
        bot_last_turns (int, optional): number of bot turns used to derive the
            window size. Falls back to `LAST_N_TURNS` when not given (or 0).
        human_last_turns (int, optional): accepted for interface compatibility;
            it does not influence the result.
        total_last_turns (int, optional): number of trailing utterances to keep.
            Defaults to `bot_last_turns * 2 + 1`.
        excluded_attributes: attribute names dropped from the "attributes" dict
            of every top-level dict value that has one.

    Returns:
        Dict: new dialog with "utterances" trimmed to the window,
        "human_utterances" / "bot_utterances" rebuilt from that window, and all
        other keys deep-copied. The input dialog is not modified.
    """
    bot_last_turns = bot_last_turns or LAST_N_TURNS
    total_last_turns = total_last_turns or bot_last_turns * 2 + 1

    # Not to lose history on each repeat: every utterance in the initial window
    # that contains "#repeat" widens the window by two utterances.
    repeats = sum("#repeat" in utterance["text"] for utterance in dialog["utterances"][-total_last_turns:])
    total_last_turns += 2 * repeats

    new_dialog = {}
    for key, value in dialog.items():
        if key in ("utterances", "human_utterances", "bot_utterances"):
            # The utterance lists are rebuilt from the trimmed window below.
            continue
        if isinstance(value, dict) and "attributes" in value:
            copied = {k: deepcopy(v) for k, v in value.items() if k != "attributes"}
            copied["attributes"] = {
                k: deepcopy(v) for k, v in value["attributes"].items() if k not in excluded_attributes
            }
            new_dialog[key] = copied
        else:
            new_dialog[key] = deepcopy(value)

    new_dialog["utterances"] = deepcopy(dialog["utterances"][-total_last_turns:])
    new_dialog["human_utterances"] = []
    new_dialog["bot_utterances"] = []

    # Per-speaker lists hold their own copies, independent from "utterances".
    for utt in new_dialog["utterances"]:
        user_type = utt["user"]["user_type"]
        if user_type == "human":
            new_dialog["human_utterances"].append(deepcopy(utt))
        elif user_type == "bot":
            new_dialog["bot_utterances"].append(deepcopy(utt))

    return new_dialog
60

61

62
def is_human_uttr_repeat_request_or_misheard(utt):
    """Tell whether a human utterance is a repeat request or was poorly recognized.

    Args:
        utt (dict): human utterance; its "annotations" (and any nested key) may be absent.

    Returns:
        bool: True if the `intent_catcher` annotation has the "repeat" intent
        detected (== 1) or the `asr` annotation reports "very_low" confidence.
    """
    annotations = utt.get("annotations", {})
    is_repeat_request = annotations.get("intent_catcher", {}).get("repeat", {}).get("detected", 0) == 1
    is_low_asr_conf = annotations.get("asr", {}).get("asr_confidence", "") == "very_low"
    return is_low_asr_conf or is_repeat_request
69

70

71
def is_bot_uttr_repeated_or_misheard(utt):
    """Tell whether a bot utterance is a repetition or a reaction to misheard speech.

    Args:
        utt (dict): bot utterance; "active_skill", "confidence" and "text" may be absent.

    Returns:
        bool: True if the utterance was produced by the "misheard_asr" skill with
        confidence 1.0, or its text contains the "#+#repeat" marker, or its text
        contains one of the grounding phrases `BUT_PHRASE` / `REPEAT_PHRASE`.
    """
    text = utt.get("text", "")
    is_asr = utt.get("active_skill", "") == "misheard_asr" and utt.get("confidence", 0.0) == 1.0
    is_repeated = "#+#repeat" in text
    return is_asr or is_repeated or any(phrase in text for phrase in (BUT_PHRASE, REPEAT_PHRASE))
81

82

83
def remove_clarification_turns_from_dialog(dialog):
    """Return a copy of the dialog without clarification turns.

    A clarification turn is a bot utterance (neither the first nor the last
    utterance of the dialog) for which `is_bot_uttr_repeated_or_misheard` holds
    and whose preceding utterance satisfies
    `is_human_uttr_repeat_request_or_misheard`. Such a bot utterance is skipped
    and the utterance kept right before it is dropped as well.

    Args:
        dialog (dict): dialog state; must contain the key "utterances".

    Returns:
        dict: deep copy of the dialog with filtered "utterances" and with
        "human_utterances" / "bot_utterances" rebuilt from them. No utterance of
        the result shares objects with the input dialog.
    """
    utterances = dialog["utterances"]
    last_index = len(utterances) - 1

    kept = []
    for i, utt in enumerate(utterances):
        user_type = utt["user"]["user_type"]
        if user_type == "human":
            kept.append(utt)
        elif user_type == "bot":
            is_clarification = (
                0 < i < last_index
                and is_bot_uttr_repeated_or_misheard(utt)
                and is_human_uttr_repeat_request_or_misheard(utterances[i - 1])
            )
            if is_clarification:
                # Skip the bot clarification and drop the utterance kept just before it.
                kept = kept[:-1]
            else:
                kept.append(utt)

    # The utterance lists are rebuilt below, so only the remaining keys are copied here.
    rebuilt_keys = ("utterances", "human_utterances", "bot_utterances")
    new_dialog = {key: deepcopy(value) for key, value in dialog.items() if key not in rebuilt_keys}
    # Copy the kept utterances so that the result never aliases the input dialog.
    new_dialog["utterances"] = deepcopy(kept)
    new_dialog["human_utterances"] = []
    new_dialog["bot_utterances"] = []

    for utt in new_dialog["utterances"]:
        user_type = utt["user"]["user_type"]
        if user_type == "human":
            new_dialog["human_utterances"].append(deepcopy(utt))
        elif user_type == "bot":
            new_dialog["bot_utterances"].append(deepcopy(utt))

    return new_dialog
111

112

113
def _distinct_utterances(dialog, list_names):
    """Collect utterances from the named dialog lists, keeping each object only once."""
    collected = []
    seen_ids = set()
    for name in list_names:
        for utt in dialog[name]:
            if id(utt) not in seen_ids:
                seen_ids.add(id(utt))
                collected.append(utt)
    return collected


def replace_with_annotated_utterances(dialog, mode="punct_sent"):
    """Replace utterance texts in place with a variant taken from the annotations.

    The previous text of every processed utterance is saved under "orig_text".
    An utterance object that appears in several lists is processed once, so its
    "orig_text" keeps the text it had before this call. An utterance without
    "annotations" is treated as having none.

    Args:
        dialog (dict): dialog with "utterances", "human_utterances" and (for the
            "segments" and "clean_sent" modes) "bot_utterances".
        mode (str): one of
            "punct_sent" - text becomes `sentseg.punct_sent` when available
            (utterances and human utterances);
            "segments" - text becomes a copy of `sentseg.segments`, or a
            one-element list with the text when there is no `sentseg` (all lists);
            "modified_sents" - text becomes the last `sentrewrite.modified_sents`
            item, else `sentseg.punct_sent` (utterances and human utterances);
            "clean_sent" - text is passed through `clean_text` (all lists).
            Any other value leaves the dialog untouched.

    Returns:
        dict: the same `dialog` object.
    """
    if mode == "punct_sent":
        for utt in _distinct_utterances(dialog, ("utterances", "human_utterances")):
            utt["orig_text"] = utt["text"]
            annotations = utt.get("annotations", {})
            if "sentseg" in annotations:
                utt["text"] = annotations["sentseg"]["punct_sent"]
    elif mode == "segments":
        for utt in _distinct_utterances(dialog, ("utterances", "human_utterances", "bot_utterances")):
            utt["orig_text"] = utt["text"]
            annotations = utt.get("annotations", {})
            if "sentseg" in annotations:
                utt["text"] = deepcopy(annotations["sentseg"]["segments"])
            elif isinstance(utt["text"], str):
                utt["text"] = [utt["text"]]
    elif mode == "modified_sents":
        for utt in _distinct_utterances(dialog, ("utterances", "human_utterances")):
            utt["orig_text"] = utt["text"]
            annotations = utt.get("annotations", {})
            if "sentrewrite" in annotations:
                utt["text"] = annotations["sentrewrite"]["modified_sents"][-1]
            elif "sentseg" in annotations:
                utt["text"] = annotations["sentseg"]["punct_sent"]
    elif mode == "clean_sent":
        for utt in _distinct_utterances(dialog, ("utterances", "human_utterances", "bot_utterances")):
            utt["orig_text"] = utt["text"]
            utt["text"] = clean_text(utt["text"])
    return dialog
138

139

140
def clean_up_utterances_to_avoid_unwanted_keys(
    dialog,
    wanted_keys=("text", "annotations", "active_skill", "user"),
    types_utterances=("human_utterances", "bot_utterances", "utterances"),
    used_annotations=None,
):
    """Build a reduced dialog that contains only the wanted utterance keys.

    Attention! It removes all other keys from the dialog: the result holds
    nothing but the utterance lists named in `types_utterances`.

    Args:
        dialog (dict): dialog state; a missing utterance list is treated as empty.
        wanted_keys: utterance keys to keep (those absent in an utterance are skipped).
        types_utterances: names of the utterance lists to put into the result.
        used_annotations (list, optional): when a non-empty list, only these
            annotation names are kept inside "annotations"; any other value
            keeps the annotations unfiltered.

    Returns:
        dict: mapping from every name in `types_utterances` to a list of reduced
        utterances. Kept values are not copied.
    """
    filter_annotations = isinstance(used_annotations, list) and bool(used_annotations)

    new_dialog = {}
    for key in types_utterances:
        cleaned = []
        for utter in dialog.get(key, []):
            new_utter = {}
            for wanted_key in wanted_keys:
                if wanted_key not in utter:
                    continue
                value = utter[wanted_key]
                if filter_annotations and wanted_key == "annotations":
                    value = {name: value[name] for name in used_annotations if name in value}
                new_utter[wanted_key] = value
            cleaned.append(new_utter)
        new_dialog[key] = cleaned
    return new_dialog
164

165

166
def last_n_human_utt_dialog_formatter(dialog: Dict, last_n_utts: int, only_last_sentence: bool = False) -> List:
    """Collect texts and intents of the last human utterances.

    Args:
        dialog (Dict): full dialog state
        last_n_utts (int): how many last user utterances to take
        only_last_sentence (bool, optional): take only last sentence in each utterance. Defaults to False.

    Returns:
        List: one-element list with a dict holding "sentences_batch" (a batch
        with the list of texts) and "intents" (a batch with the list of
        `get_intents(utt, which="all")` results), one item per taken utterance.
        Both lists are empty when the dialog has no human utterances.
    """
    # Only the human utterances are read and modified, so only they are copied.
    human_utterances = deepcopy(dialog["human_utterances"])
    if (
        human_utterances
        and len(human_utterances) <= last_n_utts
        and not if_chat_about_particular_topic(human_utterances[0])
    ):
        # The very first phrase of the dialog is inside the window and is not about
        # a particular topic: convert it to `hello!`.
        first_utt = human_utterances[0]
        if "sentseg" in first_utt.get("annotations", {}):
            first_utt["annotations"]["sentseg"]["punct_sent"] = "hello!"
            first_utt["annotations"]["sentseg"]["segments"] = ["hello"]
        else:
            first_utt["text"] = "hello"

    human_utts = []
    detected_intents = []
    for utt in human_utterances[-last_n_utts:]:
        if "sentseg" in utt.get("annotations", {}):
            sentseg_ann = utt["annotations"]["sentseg"]
            if only_last_sentence:
                segments = sentseg_ann["segments"]
                text = segments[-1] if segments else ""
            else:
                text = sentseg_ann["punct_sent"]
        else:
            text = utt["text"]
        human_utts.append(text)
        detected_intents.append(get_intents(utt, which="all"))
    return [{"sentences_batch": [human_utts], "intents": [detected_intents]}]
198

199

200
def stop_formatter_dialog(dialog: Dict) -> List[Dict]:
    """Join the dialog history with every hypothesis of the last utterance.

    Used by: stop annotator, conv eval annotator

    Args:
        dialog (Dict): dialog whose last utterance carries a "hypotheses" list.

    Returns:
        List[Dict]: one-element list with a dict whose "dialogs" value holds, for
        each hypothesis, the texts of all utterances followed by the hypothesis
        text, joined with " [SEP] ".
    """
    hypotheses = dialog["utterances"][-1]["hypotheses"]
    # The history is the same for every hypothesis, so it is collected only once.
    history = [utterance["text"] for utterance in dialog["utterances"]]
    utts = [" [SEP] ".join(history + [hypothesis["text"]]) for hypothesis in hypotheses]
    return [{"dialogs": utts}]
210

211

212
def count_ongoing_skill_utterances(bot_utterances: List[Dict], skill: str) -> int:
    """Count how many of the most recent bot utterances in a row came from `skill`.

    Args:
        bot_utterances (List[Dict]): bot utterances in chronological order; each
            must have the key "active_skill".
        skill (str): skill name to look for.

    Returns:
        int: length of the trailing run of utterances whose "active_skill" equals
        `skill` (0 if the last utterance belongs to another skill or the list is empty).
    """
    count = 0
    # Walk backwards from the newest utterance until another skill shows up.
    for utt in reversed(bot_utterances):
        if utt["active_skill"] != skill:
            break
        count += 1
    return count
220

221

222
def dff_formatter(
    dialog: Dict,
    service_name: str,
    bot_last_turns=1,
    human_last_turns=1,
    used_annotations=None,
    types_utterances=None,
    wanted_keys=None,
) -> List[Dict]:
    """Dialog Flow Framework formatter: build the batched request of a DFF-based service.

    Args:
        dialog (Dict): full dialog state.
        service_name (str): service name; its state is read from the human
            attribute `f"{service_name}_state"`.
        bot_last_turns (int): forwarded to the final `get_last_n_turns` call.
        human_last_turns (int): forwarded to the final `get_last_n_turns` call.
        used_annotations (list, optional): annotation names to keep, forwarded to
            `clean_up_utterances_to_avoid_unwanted_keys`.
        types_utterances (list, optional): utterance lists to keep. Defaults to
            ["human_utterances", "bot_utterances"].
        wanted_keys (list, optional): utterance keys to keep. Defaults to
            ["text", "annotations", "active_skill", "user"].

    Returns:
        List[Dict]: one-element list with a dict of one-element batches: the
        human utterance index, the reduced dialog, the service state, the shared
        DFF state, entities, used links, age group, disliked skills, prompts
        goals, the clarification request flag and the dialog id.
    """
    if types_utterances is None:
        types_utterances = ["human_utterances", "bot_utterances"]
    if wanted_keys is None:
        wanted_keys = ["text", "annotations", "active_skill", "user"]

    state_name = f"{service_name}_state"
    human_utter_index = len(dialog["human_utterances"]) - 1

    # Attributes are read from the full dialog before it gets trimmed:
    # `get_last_n_turns` drops "entities" from the attributes by default.
    human_attributes = dialog.get("human", {}).get("attributes", {})
    state = human_attributes.get(state_name, {})
    dff_shared_state = human_attributes.get("dff_shared_state", {"cross_states": {}, "cross_links": {}})
    used_links = human_attributes.get("used_links", {})
    age_group = human_attributes.get("age_group", "")
    disliked_skills = human_attributes.get("disliked_skills", {})
    entities = human_attributes.get("entities", {})
    prompts_goals = human_attributes.get("prompts_goals", {})

    # Number of human turns since the index stored in the service state.
    previous_human_utter_index = state.get("previous_human_utter_index", -1)
    checking_unclarified_n_turns = human_utter_index - previous_human_utter_index
    clarification_request_flag = False
    if 1 < checking_unclarified_n_turns <= LAST_N_TURNS and previous_human_utter_index != -1:
        # The last human and bot utterances are paired positionally.
        turns = zip(
            dialog["human_utterances"][-checking_unclarified_n_turns:],
            dialog["bot_utterances"][-checking_unclarified_n_turns:],
        )
        unclarified_turns_count = sum(
            1
            for hu, bu in turns
            if is_human_uttr_repeat_request_or_misheard(hu) and is_bot_uttr_repeated_or_misheard(bu)
        )
        # The flag is raised only when exactly one such pair is found.
        clarification_request_flag = unclarified_turns_count == 1

    # Take the default history window, drop clarification turns from it and only
    # then cut it down to the number of turns requested by the caller.
    dialog = get_last_n_turns(dialog)
    dialog = remove_clarification_turns_from_dialog(dialog)
    dialog = get_last_n_turns(dialog, bot_last_turns=bot_last_turns, human_last_turns=human_last_turns)
    dialog = replace_with_annotated_utterances(dialog, mode="punct_sent")

    # rm all except the requested utterance lists (by default human_utterances, bot_utterances)
    # we need only the wanted keys (by default text, annotations, active_skill, user)
    new_dialog = clean_up_utterances_to_avoid_unwanted_keys(
        dialog, wanted_keys=wanted_keys, types_utterances=types_utterances, used_annotations=used_annotations
    )

    return [
        {
            "human_utter_index_batch": [human_utter_index],
            "dialog_batch": [new_dialog],
            f"{state_name}_batch": [state],
            "dff_shared_state_batch": [dff_shared_state],
            "entities_batch": [entities],
            "used_links_batch": [used_links],
            "age_group_batch": [age_group],
            "disliked_skills_batch": [disliked_skills],
            "prompts_goals_batch": [prompts_goals],
            "clarification_request_flag_batch": [clarification_request_flag],
            "dialog_id_batch": [dialog["dialog_id"]],
        }
    ]
290

291

292
def programy_post_formatter_dialog(dialog: Dict) -> Dict:
    """Prepare the last human sentences for the program_y skills.

    Used by: program_y, program_y_dangerous, program_y_wide
    Look at skills/program_y*

    Args:
        dialog (Dict): full dialog state.

    Returns:
        Dict: {"sentences_batch": [sentences]} with the texts of up to five last
        human utterances, where non-question sentences carrying only a yes/no
        intent are replaced by "yes." / "no."; or with the single sentence "hi."
        when the trimmed dialog has exactly one human utterance and it is not
        about a particular topic.
    """
    dialog = get_last_n_turns(dialog, bot_last_turns=6)
    # Decided before clarification turns are removed from the dialog.
    human_utterances = dialog["human_utterances"]
    first_uttr_hi = len(human_utterances) == 1 and not if_chat_about_particular_topic(human_utterances[-1])

    dialog = remove_clarification_turns_from_dialog(dialog)
    formatted = last_n_human_utt_dialog_formatter(dialog, last_n_utts=5)[0]
    sentences = formatted["sentences_batch"][0]
    intents = formatted["intents"][0]

    # modify sentences with yes/no intents to yes/no phrase
    # todo: sent may contain multiple sentence, logic here could be improved
    prioritized_intents = service_intents - {"yes", "no"}
    for i, (sent, ints) in enumerate(zip(sentences, intents)):
        ints = set(ints)
        # Questions and sentences with any other service intent are left as they are.
        if "?" in sent or ints & prioritized_intents:
            continue
        if "yes" in ints:
            sentences[i] = "yes."
        elif "no" in ints:
            sentences[i] = "no."
    if first_uttr_hi:
        sentences = ["hi."]
    return {"sentences_batch": [sentences]}
318
# dream
#
# Use of cookies