# dream
# 79 lines · 2.4 KB
import pandas as pd

# Converts the raw dialogue-act files ``da_data/<split>.txt`` into CSV datasets.
#
# Every usable input line has the form ``<text> ## <label>;<label>;...``.
# For each split ("train", "dev") two files are written:
#   * ``midas_<split>.csv`` -- each utterance with all of its non-empty labels
#     re-joined by ";" in the ``joint_labels`` column;
#   * ``~/.deeppavlov/downloads/midas/midas_semantic_classes_<split>.csv`` --
#     only utterances left with exactly one label from ``considered``, plus a
#     coarse ``binary_labels`` column ("some_question" / "some_statement").

# Labels that are kept when building the single-label dataset.
considered = [
    "open_question_factual",
    "open_question_opinion",
    "open_question_personal",
    "yes_no_question",
    "clarifying_question",
    "command",
    "dev_command",
    "appreciation",
    "opinion",
    "complaint",
    "comment",
    "statement",
    "other_answers",
    "pos_answer",
    "neg_answer",
]

# Labels mapped to "some_question"; every other considered label becomes
# "some_statement".
questions = ["open_question_factual", "open_question_opinion", "yes_no_question"]

# Built once so the per-row membership tests do not rescan the lists.
_considered_labels = frozenset(considered)
_question_labels = frozenset(questions)


def _parse_row(row):
    """Split one raw line into ``(text, labels)``.

    Args:
        row: A single line of the input file, without the trailing newline.

    Returns:
        A ``(text, labels)`` tuple, where ``labels`` is the list of non-empty
        ";"-separated labels found after the first " ## " separator, or
        ``None`` when the line contains no " ## " separator at all.
    """
    parts = row.split(" ## ")
    if len(parts) < 2:
        # Malformed line (no label field): skip it explicitly instead of
        # relying on a blanket ``except Exception`` to hide the IndexError.
        return None
    # Only the field right after the first separator is used; empty label
    # fragments (e.g. from a trailing ";") are dropped.
    labels = [label for label in parts[1].split(";") if label]
    return parts[0], labels


for data_type in ["train", "dev"]:
    # Explicit encoding keeps decoding independent of the platform locale.
    with open(f"da_data/{data_type}.txt", "r", encoding="utf-8") as f:
        rows = f.read().splitlines()

    # Parse once; both datasets below are derived from the same parsed rows.
    parsed_rows = [parsed for parsed in map(_parse_row, rows) if parsed is not None]

    # Dataset 1: every utterance with all of its labels joined back by ";".
    print(len(rows))

    df = {
        "text": [],
        "joint_labels": [],
    }
    for text, labels in parsed_rows:
        df["text"].append(text)
        df["joint_labels"].append(";".join(labels))

    joint_data = pd.DataFrame(df)
    print(joint_data.shape)
    print(joint_data.head())
    print(joint_data["joint_labels"].value_counts())

    joint_data.to_csv(f"midas_{data_type}.csv", index=False, sep=",")

    # Dataset 2: utterances that keep exactly one label after filtering by
    # ``considered``, with an additional question/statement label.
    print(len(rows))

    df = {"text": [], "binary_labels": [], "joint_labels": []}
    for text, labels in parsed_rows:
        kept = [label for label in labels if label in _considered_labels]
        if len(kept) != 1:
            # Utterances with zero or several considered labels are left out.
            continue
        label = kept[0]
        df["text"].append(text)
        df["joint_labels"].append(label)
        df["binary_labels"].append(
            "some_question" if label in _question_labels else "some_statement"
        )

    semantic_data = pd.DataFrame(df)
    print(semantic_data.shape)
    print(semantic_data.head())
    print(semantic_data["joint_labels"].value_counts())
    print(semantic_data["binary_labels"].value_counts())
    semantic_data.to_csv(
        f"~/.deeppavlov/downloads/midas/midas_semantic_classes_{data_type}.csv",
        index=False,
        sep=",",
    )
80