# data_preprocessing.py
import pickle
import string

from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split


# Segmentation task
# dataset: one sample = (list of tokens without punctuation, list of tags):
# [['hi', 'alexa', 'what', 'time', 'is', 'it']]
# [['B-S', 'O', 'B-Q', 'O', 'O', 'O']]

# Convert the cornellmoviequotes dataset into a form suitable for the segmentation task.


def preprocess(raw_text):
    # input: raw text in which each sentence ends with a punctuation mark
    # output: x - list of tokens (end punctuation removed), y - list of labels
    tmp = sent_tokenize(raw_text)

    # skip lines made up of more than three sentences
    if len(tmp) > 3:
        # print(tmp)
        return [], []

    tmp = [word_tokenize(sent) for sent in tmp]

    x, y = [], []

    for sent in tmp:
        # tag the first token of each sentence: B-Q for questions, B-S otherwise
        if sent[-1] == "?":
            y.append("B-Q")
        # elif sent[-1].endswith('!'):
        # 	y.append('B-E')
        else:
            y.append("B-S")

        # drop the trailing punctuation token; all non-initial tokens get the O tag
        x.extend(sent[:-1])
        y.extend(["O"] * (len(sent) - 2))
    return x, y
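
# Quick sanity check (a sketch, not part of the original pipeline; uncomment to run).
# With NLTK's default English tokenizers, the example from the header comment
# should round-trip as shown:
# assert preprocess("hi alexa. what time is it?") == (
#     ["hi", "alexa", "what", "time", "is", "it"],
#     ["B-S", "O", "B-Q", "O", "O", "O"],
# )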


def convert_cornellmoviequotes():
    with open(file="../datasets/cornellmoviequotes/moviequotes.scripts.txt", mode="r", encoding="latin-1") as f:
        lines = f.readlines()
    X, Y = [], []

    for line in lines:
        # the quote text is the last "+++$+++"-delimited field of each line
        tmp = line.split("+++$+++")[-1].strip().lower()
        # print(tmp)

        x, y = preprocess(tmp)

        # print(x)
        # print(y)
        # print('\n')
        if x != []:
            X.append(x)
            Y.append(y)

    with open(file="../datasets/cornellmoviequotes.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")
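
# Output file format (one tab-separated token-tag pair per line, blank line
# between samples), using the example from the header comment:
# hi	B-S
# alexa	O
# what	B-Q
# time	O
# is	O
# it	O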


def convert_dailydialog():
    X, Y = [], []
    with open(file="../datasets/dailydialog.txt", mode="r", encoding="utf-8") as f:
        lines = f.readlines()
    # print(lines[:10])
    # print(len(lines))
    for line in lines:
        tmp = line.strip().lower()
        if len(tmp) == 0:
            continue
        # print(tmp)

        x, y = preprocess(tmp)

        # print(x)
        # print(y)
        # print('\n')
        if x != []:
            X.append(x)
            Y.append(y)

    with open(file="../datasets/dailydialog_sentseg.txt", mode="w", encoding="utf-8") as fo:
        for x, y in zip(X, Y):
            for word, label in zip(x, y):
                fo.write("{}\t{}\n".format(word, label))
            fo.write("\n")


def data_split(x, y, dev_size, test_size):
    # split off the test set first, then take the dev set from the remainder;
    # dev_size / (1 - test_size) rescales the dev fraction to the reduced pool
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_train, y_train, test_size=dev_size / (1 - test_size), random_state=42
    )
    return X_train, y_train, X_dev, y_dev, X_test, y_test
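
# For example (a sketch): with dev_size = test_size = 0.1, 100 samples split 80/10/10.
# xtr, ytr, xdv, ydv, xte, yte = data_split(list(range(100)), list(range(100)), 0.1, 0.1)
# assert len(xtr) == 80 and len(xdv) == 10 and len(xte) == 10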


def split_dataset(dataset_name="cornellmoviequotes"):
    X, Y = [], []
    x, y = [], []

    with open(file=f"data/{dataset_name}.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            # a blank line marks the end of a sample
            if line.strip() == "":
                X.append(x)
                Y.append(y)
                x, y = [], []
            else:
                items = line.split()
                x.append(items[0])
                y.append(items[1])

    xtrain, ytrain, xdev, ydev, xtest, ytest = data_split(X, Y, 0.1, 0.1)
    # print(xtrain[:10])
    # print(ytrain[:10])
    # print(len(xtrain), len(ytrain), len(xdev), len(ydev), len(xtest), len(ytest))

    def write2file(sents, labels, filename):
        with open(file=filename, mode="w", encoding="utf-8") as fo:
            for s, l in zip(sents, labels):
                for word, tag in zip(s, l):
                    fo.write("{}\t{}\n".format(word, tag))
                fo.write("\n")

    write2file(xtrain, ytrain, f"data/{dataset_name}_train.txt")
    write2file(xdev, ydev, f"data/{dataset_name}_dev.txt")
    write2file(xtest, ytest, f"data/{dataset_name}_test.txt")


def create_dicts(inp_file, out_file):
    # count word frequencies over the first column of a token-tag file
    word_counts = {}

    with open(file=inp_file, mode="r", encoding="utf-8") as f:
        for line in f:
            words = line.strip().split()
            if len(words) > 0:
                if words[0] not in word_counts:
                    word_counts[words[0]] = 1
                else:
                    word_counts[words[0]] += 1

    # sort by frequency (ascending) and keep only words seen more than 3 times
    listofTuples = sorted(word_counts.items(), key=lambda x: x[1])

    words = ["<PAD>", "<UNK>"]
    for elem in listofTuples:
        if elem[1] > 3:
            words.append(elem[0])

    word2id = {k: v for (v, k) in enumerate(words)}
    id2word = {k: v for (k, v) in enumerate(words)}

    # build the character vocabulary from the retained words
    chars = ["<PAD>", "<UNK>"]
    for word in word2id.keys():
        for c in word:
            if c not in chars:
                chars.append(c)

    char2id = {k: v for (v, k) in enumerate(chars)}
    id2char = {k: v for (k, v) in enumerate(chars)}

    tag2id = {"<PAD>": 0, "B-S": 1, "B-Q": 2, "O": 3}
    id2tag = {0: "<PAD>", 1: "B-S", 2: "B-Q", 3: "O"}

    print(word2id)
    print(char2id)
    print(len(word2id), len(id2word), len(char2id), len(id2char))

    with open(out_file, "wb") as f:
        pickle.dump(
            {
                "word2id": word2id,
                "id2word": id2word,
                "char2id": char2id,
                "id2char": id2char,
                "tag2id": tag2id,
                "id2tag": id2tag,
            },
            f,
        )
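
# Reading the dictionaries back (a sketch; the path is only an example):
# with open("data/dicts.pkl", "rb") as f:
#     dicts = pickle.load(f)
# dicts["word2id"]["<PAD>"]  # -> 0; <PAD> and <UNK> always occupy ids 0 and 1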


def data_statistic(file):
    # count samples, total words, and per-tag frequencies in a token-tag file
    stat = {"samples": 0, "total_words": 0, "B-S": 0, "B-Q": 0, "O": 0}
    with open(file=file, mode="r") as f:
        for line in f:
            if len(line.strip()) > 0:
                word, tag = line.strip().split("\t")
                stat[tag] += 1
                stat["total_words"] += 1
            else:
                stat["samples"] += 1

    print(stat)


def create_dailydialog_for_deeppavlov():
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="w",
        encoding="utf-8",
    ) as fo:
        for dialog in open(
            file="../datasets/ijcnlp_dailydialog/dialogues_text.txt", mode="r", encoding="utf-8"
        ).readlines():
            # normalize punctuation and split the dialog into utterances on the
            # "__eou__" (end-of-utterance) marker; the final split is empty
            utterances = dialog.lower().replace("! ?", "!").replace("? !", "?").replace("!", ".").split("__eou__")[:-1]
            for utt in utterances:
                if len(utt) > 200:
                    continue
                x, y = "", ""
                s = word_tokenize(utt)
                for word in s:
                    # y keeps sentence-final punctuation; x strips all punctuation
                    if word in [".", "?", "!"]:
                        y += word + " "
                    elif word not in string.punctuation:
                        x += word + " "
                        y += word + " "
                # guard against empty y (IndexError), then keep only utterances
                # that end with sentence-final punctuation
                if len(y) > 1 and y[-2] in [".", "?", "!"]:
                    fo.write("{} [SEP] {}\n".format(x[:-1], y[:-1]))

            # if len(y) == 0:
            # 	continue
            # y = y.replace("!", ".").replace(",", "").replace(" ’ ", "'").replace("  ", " ").strip()
            # if y[-1] not in [".", "?"]:
            # 	print(y)
            # x = y.replace("?", "").replace(".", "").replace("!", "").replace("  ", " ").strip()
            # if len(x.strip()) > 0:
            # 	fo.write("{} [SEP] {}\n".format(x, y))
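
# Each written line pairs the punctuation-stripped text with its punctuated
# counterpart, e.g. (a sketch): "what time is it [SEP] what time is it ?"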


def split_dailydialog_for_deeppavlov():
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/dailydialog_deeppavlov2.txt",
        mode="r",
        encoding="utf-8",
    ) as f:
        samples = f.readlines()
    # 80/10/10 train/valid/test split, taken in file order
    n = len(samples)
    train = samples[: int(n * 0.8)]
    val = samples[len(train) : int(n * 0.9)]
    test = samples[len(train) + len(val) :]
    print(len(samples), len(train), len(val), len(test))

    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/train2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(train)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/valid2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(val)
    with open(
        file="../datasets/ijcnlp_dailydialog/dailydialog_for_deeppavlov/test2.txt", mode="w", encoding="utf-8"
    ) as fo:
        fo.writelines(test)


# convert = {"Q": "?", "S": ".", "": ""}
# def SentSegRestoreSent(x, y):
#     assert len(x) == len(y)
#     if len(y) == 0:
#         return ""
#     sent = x[0]
#     punct = "" if y[0] == "O" else convert[y[0][-1]]
#     for word, tag in zip(x[1:], y[1:]):
#         if tag != "O":
#             sent += punct
#             punct = convert[tag[-1]]
#         sent += " " + word
#     sent += punct
#     return sent

# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/train.txt", mode="w", encoding="utf-8") as fo:
#     x, y = [], []
#     for line in open(file="models/dailydialog_811/train.txt", mode="r", encoding="utf-8").readlines():
#         items = line.strip().split()
#         if len(items) == 0:
#             if len(x) > 0:
#                 xs = " ".join(x)
#                 ys = SentSegRestoreSent(x, y)
#                 fo.write(f"{xs} [SEP] {ys}\n")
#                 x, y = [], []
#         else:
#             x.append(items[0].strip())
#             y.append(items[1].strip())


# print(pickle.load(open("models/dailydialog_811/params.pkl", "rb")))
#
#
# with open(file="/home/theanh/.deeppavlov/downloads/sentseg_dailydialog/test.txt", mode="w", encoding="utf-8") as fo:
#     for line in open(file="models/dailydialog_811/test.txt", mode="r", encoding="utf-8").readlines():
#         if len(line.strip()) > 0:
#             line = line.replace("B-Q", "B-?").replace("B-S", "B-.")
#         fo.write(line)


create_dailydialog_for_deeppavlov()

split_dailydialog_for_deeppavlov()