CSS-LM

extract_from_org.py
113 строк · 4.1 Кб
Перенос по словам
1
import json
2

3
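# Sample record kept for reference. NOTE(review): it uses the keys "tokens",
# "h" and "t", whereas the code below reads only the "label" and "text" keys of
# each JSON line -- this sample appears to come from a different dataset layout.
#{"label": "main subject", "tokens": "For the 1971 film \" A Blank on the Map \" , he joined the first Western expedition to a remote highland valley in New Guinea to seek out a lost tribe .", "h": ["A Blank on the Map", ["Q4655508", 20, 38, 0.5]], "t": ["lost tribe", ["Q672979", 138, 148, 0.5]]}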
4

5
# Labels whose records are dropped from every split before export.
_SKIPPED_LABELS = ('AGONIST-INHIBITOR', 'SUBSTRATE_PRODUCT-OF')


def _load_split(path):
    """Read one JSON-lines split file and convert it to the export format.

    Every non-blank line of ``path`` is parsed as a JSON object from which
    the "label" and "text" keys are read. Records whose label is listed in
    ``_SKIPPED_LABELS`` are dropped.

    Args:
        path: Path of the JSON-lines input file.

    Returns:
        A ``(labels, records, label_counts)`` tuple where ``labels`` is the
        list of kept labels in file order, ``records`` is the list of
        exported dicts (keys "sentiment", "sentence", "aspect") and
        ``label_counts`` maps each kept label to its number of occurrences.

    Raises:
        OSError: If ``path`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the "label" or "text" key.
    """
    labels = []
    records = []
    label_counts = {}
    # Read explicitly as UTF-8 rather than relying on the locale's default
    # encoding, matching the encoding used for the output files.
    with open(path, "r", encoding="utf-8") as src:
        for raw in src:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not raw.strip():
                continue
            entry = json.loads(raw)
            label = entry["label"]
            if label in _SKIPPED_LABELS:
                continue
            labels.append(label)
            label_counts[label] = label_counts.get(label, 0) + 1
            # Key insertion order is kept as sentiment / sentence / aspect so
            # the dumped JSON has the same layout as before.
            records.append({
                "sentiment": label,
                "sentence": entry["text"],
                "aspect": "chemprot",
            })
    return labels, records, label_counts


train_label, train_list, train_label_dict_num = _load_split("org/train.txt")
dev_label, dev_list, dev_label_dict_num = _load_split("org/dev.txt")
test_label, test_list, test_label_dict_num = _load_split("org/test.txt")

# Number of distinct labels kept in each split.
print(len(set(train_label)))
print(len(set(dev_label)))
print(len(set(test_label)))
print("========")

# Sorted label names per split, so the label sets can be compared by eye.
for label_name in sorted(train_label_dict_num):
    print(label_name)
print("=====")
for label_name in sorted(dev_label_dict_num):
    print(label_name)
print("=====")
for label_name in sorted(test_label_dict_num):
    print(label_name)

# Each split is written as a single JSON array of records.
for out_path, split_records in (
    ("train_all.json", train_list),
    ("dev.json", dev_list),
    ("test.json", test_list),
):
    with open(out_path, "w", encoding='utf-8') as out:
        json.dump(split_records, out)
114
CSS-LM

Использование cookies