# dream — 341 строка · 10.4 Кб (file-viewer header left over from a copy-paste)
import string

from nltk.tokenize import sent_tokenize, word_tokenize
5
# Segmentation task
# dataset: one sample = (list of tokens without punctuation, list of tags):
# [['hi', 'alexa', 'what', 'time', 'is', 'it']]
# [['B-S', 'O', 'B-Q', 'O', 'O', 'O']]

# Convert the cornellmoviequotes dataset to be suitable for the segmentation task
12
13
def preprocess(raw_text):
    """Turn a raw text line into a (tokens, tags) pair for the segmentation task.

    Each sentence contributes its tokens minus the final punctuation mark;
    the first token of a question is tagged "B-Q", of any other sentence
    "B-S", and every remaining token "O".  Lines of more than three
    sentences are rejected and yield ([], []).
    """
    sentences = sent_tokenize(raw_text)

    # Discard overly long lines (more than three sentences).
    if len(sentences) > 3:
        return [], []

    tokens, tags = [], []
    for words in (word_tokenize(sentence) for sentence in sentences):
        # Sentence-initial tag: question mark at the end marks a question.
        # (An earlier revision also tagged '!'-sentences as 'B-E'.)
        tags.append("B-Q" if words[-1] == "?" else "B-S")
        # Keep all tokens except the trailing punctuation; the non-initial
        # ones are all tagged "O".
        tokens.extend(words[:-1])
        tags.extend(["O"] * (len(words) - 2))
    return tokens, tags
40
def convert_russian_subtitles():
    """Convert the Russian subtitles corpus into CoNLL-style sentseg data.

    Reads one utterance per line, lowercases it, tokenizes and labels it
    with preprocess(), and writes "word\tlabel" lines with a blank line
    separating samples.
    """
    # Bug fix: explicit utf-8 — the input is Russian text, and relying on
    # the platform default encoding breaks on non-UTF-8 locales.
    with open(file="data/russian_subtitles_unique_utterances.txt", mode="r", encoding="utf-8") as f:
        lines = f.readlines()

    X, Y = [], []
    for line in lines:
        x, y = preprocess(line.strip().lower())
        if x:  # preprocess returns ([], []) for rejected lines
            X.append(x)
            Y.append(y)

    # NOTE(review): split_dataset(dataset_name="ru_sentseg") at the bottom of
    # this file expects "data/ru_sentseg.txt" — confirm whether this output
    # is renamed manually or whether this path should match it.
    with open(file="./data/sentseg.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")
59
def convert_cornellmoviequotes():
    """Convert the Cornell movie quotes corpus into CoNLL-style sentseg data.

    Each input line looks like "... +++$+++ <utterance>"; only the last
    field is used.  The corpus ships in latin-1, hence the read encoding.
    """
    with open(file="../datasets/cornellmoviequotes/moviequotes.scripts.txt", mode="r", encoding="latin-1") as f:
        lines = f.readlines()

    X, Y = [], []
    for line in lines:
        # The utterance is the last "+++$+++"-separated field.
        tmp = line.split("+++$+++")[-1].strip().lower()
        x, y = preprocess(tmp)
        if x:  # preprocess returns ([], []) for rejected lines
            X.append(x)
            Y.append(y)

    # Bug fix: output filename had a typo ("cornqellmoviequotes.txt").
    with open(file="../datasets/cornellmoviequotes.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")
84
def convert_dailydialog():
    """Convert the DailyDialog text file into CoNLL-style sentseg data."""
    samples_x, samples_y = [], []
    with open(file="../datasets/dailydialog.txt", mode="r", encoding="utf-8") as f:
        for raw_line in f.readlines():
            utterance = raw_line.strip().lower()
            if not utterance:  # skip empty lines
                continue
            tokens, labels = preprocess(utterance)
            if tokens:  # preprocess returns ([], []) for rejected lines
                samples_x.append(tokens)
                samples_y.append(labels)

    # One "word\tlabel" pair per line, blank line between samples.
    with open(file="../datasets/dailydialog_sentseg.txt", mode="w", encoding="utf-8") as fo:
        for tokens, labels in zip(samples_x, samples_y):
            for word, label in zip(tokens, labels):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")
112
def data_split(x, y, dev_size, test_size):
    """Split (x, y) into train/dev/test parts with the given fractions.

    Returns (X_train, y_train, X_dev, y_dev, X_test, y_test).
    """
    from sklearn.model_selection import train_test_split

    # Carve off the test portion first, then take the dev portion out of the
    # remainder, rescaling so dev_size stays a fraction of the full set.
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=test_size, random_state=42)
    dev_fraction = dev_size / (1 - test_size)
    train_x, dev_x, train_y, dev_y = train_test_split(train_x, train_y, test_size=dev_fraction, random_state=42)
    return train_x, train_y, dev_x, dev_y, test_x, test_y
122
def split_dataset(dataset_name="cornellmoviequotes"):
    """Split data/<dataset_name>.txt into 80/10/10 train/dev/test files.

    The input is CoNLL-style: one "word\ttag" pair per line, samples
    separated by blank lines.  Writes data/<dataset_name>_{train,dev,test}.txt.
    """
    X, Y = [], []
    x, y = [], []

    with open(file=f"data/{dataset_name}.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                # Blank line terminates the current sample.
                X.append(x)
                Y.append(y)
                x, y = [], []
            else:
                items = line.split()
                x.append(items[0])
                y.append(items[1])

    # Bug fix: a final sample not followed by a terminating blank line was
    # silently dropped.
    if x:
        X.append(x)
        Y.append(y)

    xtrain, ytrain, xdev, ydev, xtest, ytest = data_split(X, Y, 0.1, 0.1)

    def write2file(sents, labels, filename):
        # One "word\ttag" pair per line, blank line between samples.
        with open(file=filename, mode="w", encoding="utf-8") as fo:
            for s, l in zip(sents, labels):
                for word, tag in zip(s, l):
                    fo.write("{}\t{}\n".format(word, tag))
                fo.write("\n")

    write2file(xtrain, ytrain, f"data/{dataset_name}_train.txt")
    write2file(xdev, ydev, f"data/{dataset_name}_dev.txt")
    write2file(xtest, ytest, f"data/{dataset_name}_test.txt")
154
def create_dicts(inp_file, out_file):
    """Build word/char/tag vocabularies from a CoNLL-style file and pickle them.

    Words occurring more than three times are kept (plus <PAD>/<UNK>); the
    character vocabulary is derived from the kept words.  Six mappings
    (word2id, id2word, char2id, id2char, tag2id, id2tag) are dumped as one
    dict to ``out_file``.
    """
    import pickle
    from collections import Counter

    # Count the first token of every non-blank line (blank lines separate samples).
    word_counts = Counter()
    with open(file=inp_file, mode="r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if parts:
                word_counts[parts[0]] += 1

    # Keep the original rare-word threshold (count > 3) and count-ascending
    # ordering so the resulting ids are unchanged.
    words = ["<PAD>", "<UNK>"]
    for word, count in sorted(word_counts.items(), key=lambda item: item[1]):
        if count > 3:
            words.append(word)

    word2id = {word: idx for idx, word in enumerate(words)}
    id2word = {idx: word for idx, word in enumerate(words)}

    # Characters in first-seen order; a set gives O(1) membership tests
    # (the original scanned the growing list, which is O(n^2)).
    chars = ["<PAD>", "<UNK>"]
    seen = set(chars)
    for word in words:
        for c in word:
            if c not in seen:
                seen.add(c)
                chars.append(c)

    char2id = {c: idx for idx, c in enumerate(chars)}
    id2char = {idx: c for idx, c in enumerate(chars)}

    tag2id = {"<PAD>": 0, "B-S": 1, "B-Q": 2, "O": 3}
    id2tag = {0: "<PAD>", 1: "B-S", 2: "B-Q", 3: "O"}

    print(word2id)
    print(char2id)
    print(len(word2id), len(id2word), len(char2id), len(id2char))

    with open(out_file, "wb") as f:
        pickle.dump(
            {
                "word2id": word2id,
                "id2word": id2word,
                "char2id": char2id,
                "id2char": id2char,
                "tag2id": tag2id,
                "id2tag": id2tag,
            },
            f,
        )
208
def data_statistic(file):
    """Print and return tag/word/sample counts for a CoNLL-style sentseg file.

    Returns a dict with keys "samples", "total_words", "B-S", "B-Q", "O".
    (Returning the dict is new but backward compatible — the original
    returned None, which no caller used.)
    """
    stat = {"samples": 0, "total_words": 0, "B-S": 0, "B-Q": 0, "O": 0}
    # Bug fix: explicit utf-8 — these files are produced with encoding="utf-8",
    # so reading with the platform default could fail.
    with open(file=file, mode="r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                word, tag = stripped.split("\t")
                stat[tag] += 1
                stat["total_words"] += 1
            else:
                # Blank line terminates one sample.
                stat["samples"] += 1
    print(stat)
    return stat
222
def create_dailydialog_for_deeppavlov():
    """Build "input [SEP] target" pairs for DeepPavlov from DailyDialog.

    For each utterance of at most 200 characters: x is the token sequence
    with punctuation removed, y the same sequence with sentence-final
    ./?/! kept.  Only utterances whose y actually ends in ., ? or ! are
    written.
    """
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="w",
        encoding="utf-8",
    ) as fo:
        # Bug fix: the input file was opened without a context manager and
        # never closed.
        with open(
            file="../datasets/ijcnlp_dailydialog/dialogues_text.txt", mode="r", encoding="utf-8"
        ) as fin:
            for dialog in fin:
                # Normalize "! ?"/"? !" combos, downgrade "!" to ".", then
                # split into utterances on the __eou__ marker (the trailing
                # empty piece after the final marker is dropped).
                utterances = (
                    dialog.lower().replace("! ?", "!").replace("? !", "?").replace("!", ".").split("__eou__")[:-1]
                )
                for utt in utterances:
                    if len(utt) > 200:
                        continue
                    x, y = "", ""
                    for word in word_tokenize(utt):
                        if word in [".", "?", "!"]:
                            y += word + " "
                        elif word not in string.punctuation:
                            x += word + " "
                            y += word + " "
                    # Bug fix: guard against an empty y (utterance with no
                    # kept tokens) before indexing y[-2].
                    if len(y) >= 2 and y[-2] in [".", "?", "!"]:
                        fo.write("{} [SEP] {}\n".format(x[:-1], y[:-1]))
256
def split_dailydialog_for_deeppavlov():
    """Split the prepared DailyDialog file into 80/10/10 train/valid/test line sets."""
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="r",
        encoding="utf-8",
    ) as f:
        samples = f.readlines()

    total = len(samples)
    train_end = int(total * 0.8)
    val_end = int(total * 0.9)
    train = samples[:train_end]
    val = samples[train_end:val_end]
    test = samples[val_end:]
    print(total, len(train), len(val), len(test))

    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/train2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(train)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/valid2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(val)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/test2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(test)
283
284# convert = {"Q": "?", "S": ".", "": ""}
285# def SentSegRestoreSent(x, y):
286# assert len(x) == len(y)
287# if len(y) == 0:
288# return ""
289# sent = x[0]
290# punct = "" if y[0] == "O" else convert[y[0][-1]]
291# for word, tag in zip(x[1:], y[1:]):
292# if tag != "O":
293# sent += punct
294# punct = convert[tag[-1]]
295# sent += " " + word
296# sent += punct
297
298# return sent
299
300# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/train.txt", mode="w", encoding="utf-8") as fo:
301# x, y = [], []
302# for line in open(file="models/dailydialog_811/train.txt", mode="r", encoding="utf-8").readlines():
303# items = line.strip().split()
304# if len(items) == 0:
305# if len(x) > 0:
306# xs = " ".join(x)
307# ys = SentSegRestoreSent(x, y)
308# fo.write(f"{xs} [SEP] {ys}\n")
309# x, y = [], []
310# else:
311# x.append(items[0].strip())
312# y.append(items[1].strip())
313
314
315# import pickle
316# print(pickle.load(open("models/dailydialog_811/params.pkl", "rb")))
317
318#
319# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/test.txt", mode="w", encoding="utf-8") as fo:
320# for line in open(file="models/dailydialog_811/test.txt", mode="r", encoding="utf-8").readlines():
321# if len(line.strip()) > 0:
322# line = line.replace("B-Q", "B-?").replace("B-S", "B-.")
323# fo.write(line)
324
# Guard the pipeline so importing this module (e.g. to reuse preprocess())
# does not trigger the file conversions as an import side effect.
if __name__ == "__main__":
    # NOTE(review): convert_russian_subtitles() writes ./data/sentseg.txt,
    # while split_dataset below reads data/ru_sentseg.txt — confirm the
    # rename step between these two stages.
    convert_russian_subtitles()

    split_dataset(dataset_name="ru_sentseg")

    create_dicts("data/ru_sentseg.txt", "data/ru_sentseg_dict.pkl")
331# data_statistic("models/dailydialog/train.txt")
332# data_statistic("models/dailydialog/dev.txt")
333# data_statistic("models/dailydialog/test.txt")
334
335# data_statistic("models/cornellmovie_811/train.txt")
336# data_statistic("models/cornellmovie_811/dev.txt")
337# data_statistic("models/cornellmovie_811/test.txt")
338
339# create_dailydialog_for_deeppavlov()
340
341# split_dailydialog_for_deeppavlov()
342