# data_preprocessing.py

import string

from nltk.tokenize import sent_tokenize, word_tokenize


# Segmentation task
# dataset: one sample = (list of tokens without punctuation, list of tags):
# [['hi', 'alexa', 'what', 'time', 'is', 'it']]
# [['B-S', 'O', 'B-Q', 'O', 'O', 'O']]

# Convert the cornellmoviequotes dataset into a format suitable for the segmentation task


def preprocess(raw_text):
    # input: raw text of one utterance (one or more punctuated sentences)
    # output: x - list of tokens with sentence-final punctuation removed, y - list of labels
    tmp = sent_tokenize(raw_text)

    # skip overly long lines consisting of more than three sentences
    if len(tmp) > 3:
        # print(tmp)
        return [], []

    tmp = [word_tokenize(sent) for sent in tmp]

    x, y = [], []

    for sent in tmp:
        if sent[-1] == "?":
            y.append("B-Q")
        # elif sent[-1].endswith('!'):
        # 	y.append('B-E')
        else:
            y.append("B-S")

        x.extend(sent[:-1])
        y.extend(["O"] * (len(sent) - 2))
    return x, y
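
# Illustrative check of preprocess() on a made-up utterance (assumes NLTK's "punkt"
# sentence tokenizer models are installed, e.g. via nltk.download("punkt")):
# preprocess("hi alexa. what time is it?")
# is intended to yield the token/tag pair shown in the sample at the top of this file:
# (['hi', 'alexa', 'what', 'time', 'is', 'it'], ['B-S', 'O', 'B-Q', 'O', 'O', 'O'])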


def convert_russian_subtitles():
    with open(file="data/russian_subtitles_unique_utterances.txt", mode="r") as f:
        lines = f.readlines()
    X, Y = [], []

    for line in lines:
        tmp = line.strip().lower()
        x, y = preprocess(tmp)
        if x != []:
            X.append(x)
            Y.append(y)

    with open(file="./data/sentseg.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")
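
# The file written above (and by the other convert_* functions) is in a CoNLL-style format:
# one tab-separated "token<TAB>tag" pair per line and a blank line between samples.
# For the illustrative utterance used earlier it would contain something like:
# hi	B-S
# alexa	O
# what	B-Q
# time	O
# is	O
# it	O

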
def convert_cornellmoviequotes():
    with open(file="../datasets/cornellmoviequotes/moviequotes.scripts.txt", mode="r", encoding="latin-1") as f:
        lines = f.readlines()
    X, Y = [], []

    for line in lines:
        # the utterance text is the last "+++$+++"-separated field of each line
        tmp = line.split("+++$+++")[-1].strip().lower()
        # print(tmp)

        x, y = preprocess(tmp)

        # print(x)
        # print(y)
        # print('\n')
        if x != []:
            X.append(x)
            Y.append(y)

    with open(file="../datasets/cornellmoviequotes.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")


def convert_dailydialog():
    X, Y = [], []
    with open(file="../datasets/dailydialog.txt", mode="r", encoding="utf-8") as f:
        lines = f.readlines()
    # print(lines[:10])
    # print(len(lines))
    for line in lines:
        tmp = line.strip().lower()
        if len(tmp) == 0:
            continue
        # print(tmp)

        x, y = preprocess(tmp)

        # print(x)
        # print(y)
        # print('\n')
        if x != []:
            X.append(x)
            Y.append(y)

    with open(file="../datasets/dailydialog_sentseg.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")


def data_split(x, y, dev_size, test_size):
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_train, y_train, test_size=dev_size / (1 - test_size), random_state=42
    )
    return X_train, y_train, X_dev, y_dev, X_test, y_test
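
# Note on data_split: the dev split is carved out of the remaining (1 - test_size) portion,
# hence the rescaling dev_size / (1 - test_size). For example, data_split(X, Y, 0.1, 0.1)
# first holds out 10% for test, then takes 0.1 / 0.9 ≈ 11.1% of the remaining 90% for dev,
# i.e. roughly an 80/10/10 split overall.

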
def split_dataset(dataset_name="cornellmoviequotes"):
    X, Y = [], []
    x, y = [], []

    with open(file=f"data/{dataset_name}.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                X.append(x)
                Y.append(y)
                x, y = [], []
            else:
                items = line.split()
                x.append(items[0])
                y.append(items[1])

    xtrain, ytrain, xdev, ydev, xtest, ytest = data_split(X, Y, 0.1, 0.1)
    # print(xtrain[:10])
    # print(ytrain[:10])
    # print(len(xtrain), len(ytrain), len(xdev), len(ydev), len(xtest), len(ytest))

    def write2file(sents, labels, filename):
        with open(file=filename, mode="w", encoding="utf-8") as fo:
            for s, l in zip(sents, labels):
                for word, tag in zip(s, l):
                    fo.write("{}\t{}\n".format(word, tag))
                fo.write("\n")

    write2file(xtrain, ytrain, f"data/{dataset_name}_train.txt")
    write2file(xdev, ydev, f"data/{dataset_name}_dev.txt")
    write2file(xtest, ytest, f"data/{dataset_name}_test.txt")
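
# Illustrative usage: split_dataset("cornellmoviequotes") reads data/cornellmoviequotes.txt
# (assuming the converted file has been placed under data/) and writes
# data/cornellmoviequotes_train.txt, _dev.txt and _test.txt with a roughly 80/10/10 split.

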
def create_dicts(inp_file, out_file):
    word_counts = {}

    with open(file=inp_file, mode="r", encoding="utf-8") as f:
        for line in f:
            words = line.strip().split()
            if len(words) > 0:
                if words[0] not in word_counts:
                    word_counts[words[0]] = 1
                else:
                    word_counts[words[0]] += 1

    listofTuples = sorted(word_counts.items(), key=lambda x: x[1])

    # keep only words that occur more than three times
    words = ["<PAD>", "<UNK>"]
    for elem in listofTuples:
        if elem[1] > 3:
            words.append(elem[0])

    word2id = {k: v for (v, k) in enumerate(words)}
    id2word = {k: v for (k, v) in enumerate(words)}

    chars = ["<PAD>", "<UNK>"]
    for word in word2id.keys():
        for c in word:
            if c not in chars:
                chars.append(c)

    char2id = {k: v for (v, k) in enumerate(chars)}
    id2char = {k: v for (k, v) in enumerate(chars)}

    tag2id = {"<PAD>": 0, "B-S": 1, "B-Q": 2, "O": 3}
    id2tag = {0: "<PAD>", 1: "B-S", 2: "B-Q", 3: "O"}

    print(word2id)
    print(char2id)
    print(len(word2id), len(id2word), len(char2id), len(id2char))

    import pickle

    with open(out_file, "wb") as f:
        pickle.dump(
            {
                "word2id": word2id,
                "id2word": id2word,
                "char2id": char2id,
                "id2char": id2char,
                "tag2id": tag2id,
                "id2tag": id2tag,
            },
            f,
        )
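
# The resulting pickle can be loaded back as a plain dict (illustrative):
# import pickle
# with open("data/ru_sentseg_dict.pkl", "rb") as f:
#     dicts = pickle.load(f)
# dicts["word2id"]["<PAD>"]  # -> 0
# dicts["tag2id"]["B-Q"]     # -> 2

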
def data_statistic(file):
    stat = {"samples": 0, "total_words": 0, "B-S": 0, "B-Q": 0, "O": 0}
    with open(file=file, mode="r") as f:
        for line in f:
            if len(line.strip()) > 0:
                word, tag = line.strip().split("\t")
                stat[tag] += 1
                stat["total_words"] += 1
            else:
                stat["samples"] += 1

    print(stat)


def create_dailydialog_for_deeppavlov():
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="w",
        encoding="utf-8",
    ) as fo:
        for dialog in open(
            file="../datasets/ijcnlp_dailydialog/dialogues_text.txt", mode="r", encoding="utf-8"
        ).readlines():
            utterances = dialog.lower().replace("! ?", "!").replace("? !", "?").replace("!", ".").split("__eou__")[:-1]
            for utt in utterances:
                if len(utt) > 200:
                    continue
                x, y = "", ""
                s = word_tokenize(utt)
                for word in s:
                    if word in [".", "?", "!"]:
                        y += word + " "
                    elif word not in string.punctuation:
                        x += word + " "
                        y += word + " "
                # guard against empty y (e.g. an utterance with no kept tokens)
                if len(y) > 1 and y[-2] in [".", "?", "!"]:
                    fo.write("{} [SEP] {}\n".format(x[:-1], y[:-1]))

            # if len(y) == 0:
            # 	continue
            # y = y.replace("!", ".").replace(",", "").replace(" ’ ", "'").replace("  ", " ").strip()
            # if y[-1] not in [".", "?"]:
            # 	print(y)
            # x = y.replace("?", "").replace(".", "").replace("!", "").replace("  ", " ").strip()
            # if len(x.strip()) > 0:
            # 	fo.write("{} [SEP] {}\n".format(x, y))
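
# Each line written by create_dailydialog_for_deeppavlov() pairs the unpunctuated and the
# punctuated version of an utterance, e.g. (illustrative, made-up utterance):
# hi alexa what time is it [SEP] hi alexa . what time is it ?

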
def split_dailydialog_for_deeppavlov():
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="r",
        encoding="utf-8",
    ) as f:
        samples = f.readlines()
    n = len(samples)
    train = samples[: int(n * 0.8)]
    val = samples[len(train) : int(n * 0.9)]
    test = samples[len(train) + len(val) :]
    print(len(samples), len(train), len(val), len(test))

    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/train2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(train)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/valid2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(val)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/test2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(test)


# convert = {"Q": "?", "S": ".", "": ""}
# def SentSegRestoreSent(x, y):
#     assert len(x) == len(y)
#     if len(y) == 0:
#         return ""
#     sent = x[0]
#     punct = "" if y[0] == "O" else convert[y[0][-1]]
#     for word, tag in zip(x[1:], y[1:]):
#         if tag != "O":
#             sent += punct
#             punct = convert[tag[-1]]
#         sent += " " + word
#     sent += punct

#     return sent

# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/train.txt", mode="w", encoding="utf-8") as fo:
# 	x, y = [], []
# 	for line in open(file="models/dailydialog_811/train.txt", mode="r", encoding="utf-8").readlines():
# 		items = line.strip().split()
# 		if len(items) == 0:
# 			if len(x) > 0:
# 				xs = " ".join(x)
# 				ys = SentSegRestoreSent(x, y)
# 				fo.write(f"{xs} [SEP] {ys}\n")
# 				x, y = [], []
# 		else:
# 			x.append(items[0].strip())
# 			y.append(items[1].strip())


# import pickle
# print(pickle.load(open("models/dailydialog_811/params.pkl", "rb")))

#
# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/test.txt", mode="w", encoding="utf-8") as fo:
# 	for line in open(file="models/dailydialog_811/test.txt", mode="r", encoding="utf-8").readlines():
# 		if len(line.strip()) > 0:
# 			line = line.replace("B-Q", "B-?").replace("B-S", "B-.")
# 		fo.write(line)

convert_russian_subtitles()

split_dataset(dataset_name="ru_sentseg")

create_dicts("data/ru_sentseg.txt", "data/ru_sentseg_dict.pkl")

# data_statistic("models/dailydialog/train.txt")
# data_statistic("models/dailydialog/dev.txt")
# data_statistic("models/dailydialog/test.txt")

# data_statistic("models/cornellmovie_811/train.txt")
# data_statistic("models/cornellmovie_811/dev.txt")
# data_statistic("models/cornellmovie_811/test.txt")

# create_dailydialog_for_deeppavlov()

# split_dailydialog_for_deeppavlov()
