# dream
#
# collect_midas_data.py
# 79 lines · 2.4 KB
# Word wrap
1
import pandas as pd
2

3
# Dialogue-act labels that are kept when building the single-label
# "semantic classes" dataset; any other label on a row is ignored.
CONSIDERED_LABELS = (
    "open_question_factual",
    "open_question_opinion",
    "open_question_personal",
    "yes_no_question",
    "clarifying_question",
    "command",
    "dev_command",
    "appreciation",
    "opinion",
    "complaint",
    "comment",
    "statement",
    "other_answers",
    "pos_answer",
    "neg_answer",
)

# Labels mapped to "some_question"; every other kept label is mapped to
# "some_statement".
# NOTE(review): "open_question_personal" and "clarifying_question" are not
# listed here and therefore end up as "some_statement" -- confirm intended.
QUESTION_LABELS = (
    "open_question_factual",
    "open_question_opinion",
    "yes_no_question",
)


def parse_row(row):
    """Split one raw line into its text and its list of labels.

    A usable line has the form ``<text> ## <label>;<label>;...``.

    Args:
        row: One line of the input file, without the trailing newline.

    Returns:
        ``(text, labels)`` where ``labels`` holds the non-empty
        ``;``-separated items found after the first `` ## `` separator, or
        ``None`` when the line contains no `` ## `` separator.
    """
    parts = row.split(" ## ")
    if len(parts) < 2:
        # Rows without a label section are skipped by the callers.
        return None
    return parts[0], [label for label in parts[1].split(";") if label]


def collect_joint_labels(rows):
    """Build the ``text`` / ``joint_labels`` columns with all labels kept.

    Args:
        rows: Iterable of raw input lines.

    Returns:
        A dict of two equally long lists: ``"text"`` and ``"joint_labels"``
        (the row's non-empty labels re-joined with ``;``). Lines without a
        `` ## `` separator are skipped.
    """
    table = {"text": [], "joint_labels": []}
    for row in rows:
        parsed = parse_row(row)
        if parsed is None:
            continue
        text, labels = parsed
        table["text"].append(text)
        table["joint_labels"].append(";".join(labels))
    return table


def collect_semantic_classes(rows):
    """Build the single-label dataset with a question/statement column.

    Only rows that carry exactly one label from ``CONSIDERED_LABELS`` (after
    dropping every other label) are kept.

    Args:
        rows: Iterable of raw input lines.

    Returns:
        A dict of three equally long lists: ``"text"``, ``"binary_labels"``
        (``"some_question"`` or ``"some_statement"``) and ``"joint_labels"``
        (the single kept label).
    """
    table = {"text": [], "binary_labels": [], "joint_labels": []}
    for row in rows:
        parsed = parse_row(row)
        if parsed is None:
            continue
        text, labels = parsed
        kept = [label for label in labels if label in CONSIDERED_LABELS]
        if len(kept) != 1:
            # Ambiguous (several kept labels) or unlabelled rows are dropped.
            continue
        label = kept[0]
        table["text"].append(text)
        table["joint_labels"].append(label)
        table["binary_labels"].append(
            "some_question" if label in QUESTION_LABELS else "some_statement"
        )
    return table


def main():
    """Convert ``da_data/{train,dev}.txt`` into two CSV files per split.

    For each split this prints summary statistics and writes
    ``midas_<split>.csv`` (all labels) and
    ``~/.deeppavlov/downloads/midas/midas_semantic_classes_<split>.csv``
    (single considered label plus the question/statement column).
    """
    for data_type in ["train", "dev"]:
        # The file is read once and reused for both output tables.
        with open(f"da_data/{data_type}.txt", "r") as f:
            rows = f.read().splitlines()

        print(len(rows))

        data = pd.DataFrame(collect_joint_labels(rows))
        print(data.shape)
        print(data.head())
        print(data["joint_labels"].value_counts())

        data.to_csv(f"midas_{data_type}.csv", index=False, sep=",")

        # The row count is printed a second time so the console output stays
        # the same as when the file was re-read before the second table.
        print(len(rows))

        data = pd.DataFrame(collect_semantic_classes(rows))
        print(data.shape)
        print(data.head())
        print(data["joint_labels"].value_counts())
        print(data["binary_labels"].value_counts())
        data.to_csv(f"~/.deeppavlov/downloads/midas/midas_semantic_classes_{data_type}.csv", index=False, sep=",")


if __name__ == "__main__":
    main()
80
# dream

# Use of cookies