dream
309 lines · 9.5 KB
1import string
2
3from nltk.tokenize import sent_tokenize, word_tokenize
4
5
6# Segmentation task
7# dataset: one sample = (list of token without punctuations, list of tags):
8# [['hi', 'alexa', 'what', 'time', 'is', 'it']]
# [['B-S', 'O', 'B-Q', 'O', 'O', 'O']]
10
11# Convert cornellmoviequotes dataset to be suitable with the segmentation task
12
13
def preprocess(raw_text):
    """Convert one raw text line into a token/tag pair for sentence segmentation.

    Args:
        raw_text: a line of text; sentence-final punctuation decides the tag
            and is then removed from the token list.

    Returns:
        (x, y): x is a flat list of tokens, y the parallel list of tags —
        "B-Q" starts a question, "B-S" starts any other sentence, "O" for the
        remaining tokens. Lines with more than three sentences yield ([], []).
    """
    sentences = sent_tokenize(raw_text)

    # Remove long lines consisting of more than three sentences.
    if len(sentences) > 3:
        return [], []

    x, y = [], []
    for sent in (word_tokenize(s) for s in sentences):
        if not sent:  # defensive: skip an empty tokenization
            continue
        # Strip the trailing punctuation token only when there actually is
        # one; the original unconditionally dropped the last token, losing
        # the final word of an unpunctuated sentence.
        content = sent[:-1] if sent[-1] in {".", "?", "!"} else sent
        if not content:
            # Punctuation-only sentence: the original appended a B-tag with
            # no matching token, leaving x and y with different lengths.
            continue
        y.append("B-Q" if sent[-1] == "?" else "B-S")
        x.extend(content)
        y.extend(["O"] * (len(content) - 1))
    return x, y
39
40
def convert_cornellmoviequotes():
    """Convert the Cornell Movie Quotes corpus into token/tag format.

    Reads moviequotes.scripts.txt, keeps the utterance field (the last
    "+++$+++"-separated column), lowercases it, runs `preprocess`, and
    writes one "word\\ttag" line per token with a blank line between samples.
    """
    with open(file="../datasets/cornellmoviequotes/moviequotes.scripts.txt", mode="r", encoding="latin-1") as f:
        lines = f.readlines()

    X, Y = [], []
    for line in lines:
        utterance = line.split("+++$+++")[-1].strip().lower()
        x, y = preprocess(utterance)
        if x:  # preprocess returns ([], []) for rejected lines
            X.append(x)
            Y.append(y)

    # NOTE: fixed output-filename typo ("cornqellmoviequotes.txt").
    with open(file="../datasets/cornellmoviequotes.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")
64
65
def convert_dailydialog():
    """Convert the plain-text DailyDialog dump into token/tag format.

    Each non-empty line is lowercased and passed through `preprocess`;
    accepted samples are written as blank-line-separated "word\\ttag" blocks.
    """
    samples_x, samples_y = [], []

    with open(file="../datasets/dailydialog.txt", mode="r", encoding="utf-8") as f:
        for raw_line in f:
            text = raw_line.strip().lower()
            if not text:
                continue
            tokens, tags = preprocess(text)
            if tokens:
                samples_x.append(tokens)
                samples_y.append(tags)

    with open(file="../datasets/dailydialog_sentseg.txt", mode="w", encoding="utf-8") as fo:
        for tokens, tags in zip(samples_x, samples_y):
            for token, tag in zip(tokens, tags):
                fo.write("{}\t{}\n".format(token, tag))
            fo.write("\n")
92
93
def data_split(x, y, dev_size, test_size):
    """Split (x, y) into train/dev/test partitions with a fixed random seed.

    dev_size and test_size are fractions of the FULL dataset; the dev
    fraction is rescaled because the second split operates only on the
    portion left over after removing the test set.
    """
    from sklearn.model_selection import train_test_split

    X_rest, X_test, y_rest, y_test = train_test_split(x, y, test_size=test_size, random_state=42)
    dev_fraction = dev_size / (1 - test_size)
    X_train, X_dev, y_train, y_dev = train_test_split(X_rest, y_rest, test_size=dev_fraction, random_state=42)
    return X_train, y_train, X_dev, y_dev, X_test, y_test
102
103
def split_dataset(dataset_name="cornellmoviequotes"):
    """Read data/<dataset_name>.txt and write 80/10/10 train/dev/test files.

    The input consists of blank-line-separated samples, one "word\\ttag"
    pair per line.
    """
    X, Y = [], []
    x, y = [], []

    with open(file=f"data/{dataset_name}.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                if x:  # guard: consecutive blank lines used to add empty samples
                    X.append(x)
                    Y.append(y)
                x, y = [], []
            else:
                items = line.split()
                x.append(items[0])
                y.append(items[1])
    # Flush the final sample when the file does not end with a blank line;
    # the original silently dropped it.
    if x:
        X.append(x)
        Y.append(y)

    xtrain, ytrain, xdev, ydev, xtest, ytest = data_split(X, Y, 0.1, 0.1)

    def write2file(sents, labels, filename):
        # One "word\ttag" line per token, blank line between samples.
        with open(file=filename, mode="w", encoding="utf-8") as fo:
            for s, l in zip(sents, labels):
                for word, tag in zip(s, l):
                    fo.write("{}\t{}\n".format(word, tag))
                fo.write("\n")

    write2file(xtrain, ytrain, f"data/{dataset_name}_train.txt")
    write2file(xdev, ydev, f"data/{dataset_name}_dev.txt")
    write2file(xtest, ytest, f"data/{dataset_name}_test.txt")
134
135
def create_dicts(inp_file, out_file):
    """Build word/char/tag vocabularies from a token/tag file and pickle them.

    Words occurring more than 3 times are kept (after <PAD>/<UNK>, in
    ascending-frequency order); the character vocabulary is derived from
    the kept words. Writes a dict with keys word2id/id2word/char2id/
    id2char/tag2id/id2tag to out_file.

    Args:
        inp_file: path to a "word\\ttag" file (blank lines separate samples).
        out_file: path for the pickled vocabulary dict.
    """
    from collections import Counter

    word_counts = Counter()
    with open(file=inp_file, mode="r", encoding="utf-8") as f:
        for line in f:
            items = line.strip().split()
            if items:
                word_counts[items[0]] += 1

    # Ascending-frequency order (stable sort keeps first-seen order on ties),
    # matching the original vocabulary layout exactly.
    words = ["<PAD>", "<UNK>"]
    for word, count in sorted(word_counts.items(), key=lambda kv: kv[1]):
        if count > 3:
            words.append(word)

    word2id = {w: i for i, w in enumerate(words)}
    id2word = {i: w for i, w in enumerate(words)}

    # Collect characters in first-seen order; the side set makes membership
    # O(1) instead of the original O(n) list scan per character.
    chars = ["<PAD>", "<UNK>"]
    seen = set(chars)
    for word in words:
        for c in word:
            if c not in seen:
                seen.add(c)
                chars.append(c)

    char2id = {c: i for i, c in enumerate(chars)}
    id2char = {i: c for i, c in enumerate(chars)}

    tag2id = {"<PAD>": 0, "B-S": 1, "B-Q": 2, "O": 3}
    id2tag = {0: "<PAD>", 1: "B-S", 2: "B-Q", 3: "O"}

    print(word2id)
    print(char2id)
    print(len(word2id), len(id2word), len(char2id), len(id2char))

    import pickle

    with open(out_file, "wb") as f:
        pickle.dump(
            {
                "word2id": word2id,
                "id2word": id2word,
                "char2id": char2id,
                "id2char": id2char,
                "tag2id": tag2id,
                "id2tag": id2tag,
            },
            f,
        )
188
189
def data_statistic(file):
    """Print and return label/token/sample counts for a token/tag file.

    Args:
        file: path to a blank-line-separated "word\\ttag" file.

    Returns:
        dict with keys "samples", "total_words", "B-S", "B-Q", "O".
    """
    stat = {"samples": 0, "total_words": 0, "B-S": 0, "B-Q": 0, "O": 0}
    with open(file=file, mode="r") as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                word, tag = stripped.split("\t")
                stat[tag] += 1
                stat["total_words"] += 1
            else:
                # A blank line terminates one sample.
                stat["samples"] += 1

    print(stat)
    return stat  # returned for programmatic use; the original only printed
202
203
def create_dailydialog_for_deeppavlov():
    """Build "<no-punct> [SEP] <with-punct>" pairs from DailyDialog dialogues.

    Utterances longer than 200 characters are skipped; "!" is normalized to
    "." before tokenization; only utterances whose final token is
    sentence-final punctuation are written.
    """
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="w",
        encoding="utf-8",
    ) as fo:
        for dialog in open(
            file="../datasets/ijcnlp_dailydialog/dialogues_text.txt", mode="r", encoding="utf-8"
        ).readlines():
            utterances = dialog.lower().replace("! ?", "!").replace("? !", "?").replace("!", ".").split("__eou__")[:-1]
            for utt in utterances:
                if len(utt) > 200:
                    continue
                x, y = "", ""
                for word in word_tokenize(utt):
                    if word in [".", "?", "!"]:
                        y += word + " "
                    elif word not in string.punctuation:
                        x += word + " "
                        y += word + " "
                # Guard against an empty y (whitespace-only utterance):
                # the original indexed y[-2] and raised IndexError.
                if len(y) >= 2 and y[-2] in [".", "?", "!"]:
                    fo.write("{} [SEP] {}\n".format(x[:-1], y[:-1]))
236
237
def split_dailydialog_for_deeppavlov():
    """Split the deeppavlov-formatted DailyDialog file 80/10/10 into train/valid/test."""
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="r",
        encoding="utf-8",
    ) as f:
        samples = f.readlines()

    total = len(samples)
    train_end = int(total * 0.8)
    val_end = int(total * 0.9)
    train = samples[:train_end]
    val = samples[train_end:val_end]
    test = samples[val_end:]
    print(len(samples), len(train), len(val), len(test))

    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/train2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(train)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/valid2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(val)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/test2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(test)
263
264
265# convert = {"Q": "?", "S": ".", "": ""}
266# def SentSegRestoreSent(x, y):
267# assert len(x) == len(y)
268# if len(y) == 0:
269# return ""
270# sent = x[0]
271# punct = "" if y[0] == "O" else convert[y[0][-1]]
272# for word, tag in zip(x[1:], y[1:]):
273# if tag != "O":
274# sent += punct
275# punct = convert[tag[-1]]
276# sent += " " + word
277# sent += punct
278
279# return sent
280
281# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/train.txt", mode="w", encoding="utf-8") as fo:
282# x, y = [], []
283# for line in open(file="models/dailydialog_811/train.txt", mode="r", encoding="utf-8").readlines():
284# items = line.strip().split()
285# if len(items) == 0:
286# if len(x) > 0:
287# xs = " ".join(x)
288# ys = SentSegRestoreSent(x, y)
289# fo.write(f"{xs} [SEP] {ys}\n")
290# x, y = [], []
291# else:
292# x.append(items[0].strip())
293# y.append(items[1].strip())
294
295
296# import pickle
297# print(pickle.load(open("models/dailydialog_811/params.pkl", "rb")))
298#
299#
300# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/test.txt", mode="w", encoding="utf-8") as fo:
301# for line in open(file="models/dailydialog_811/test.txt", mode="r", encoding="utf-8").readlines():
302# if len(line.strip()) > 0:
303# line = line.replace("B-Q", "B-?").replace("B-S", "B-.")
304# fo.write(line)
305
306
# Run the conversion pipeline only when executed as a script, so importing
# this module for its helpers does not trigger the heavy file I/O.
if __name__ == "__main__":
    create_dailydialog_for_deeppavlov()
    split_dailydialog_for_deeppavlov()
310