# CSS-LM
# 113 lines · 4.1 KB
import json
from collections import Counter

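# Sample input record. NOTE(review): this sample carries "tokens"/"h"/"t",
# while the code below reads only "label" and "text" -- it looks left over
# from another dataset; confirm.
# {"label": "main subject", "tokens": "For the 1971 film \" A Blank on the Map \" , he joined the first Western expedition to a remote highland valley in New Guinea to seek out a lost tribe .", "h": ["A Blank on the Map", ["Q4655508", 20, 38, 0.5]], "t": ["lost tribe", ["Q672979", 138, 148, 0.5]]}
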
# Load the training split: one JSON object per line, read via its "label"
# and "text" keys.
train_label = []  # label of every kept example, in file order
train_list = []  # exported records (sentiment / sentence / aspect)
train_label_dict_num = Counter()  # label -> number of kept examples

# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, matching the encoding used when the outputs are written.
with open("org/train.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            continue
        record = json.loads(raw_line)
        label = record["label"]
        # These two relation classes are dropped from the exported data.
        if label in ("AGONIST-INHIBITOR", "SUBSTRATE_PRODUCT-OF"):
            continue
        train_label.append(label)
        train_label_dict_num[label] += 1
        # NOTE: head/tail entity extraction is disabled. The removed code
        # sliced token offsets from record["metadata"] ([:2] head, [2:] tail)
        # after stripping the "[[ ]]" / "<< >>" markers from the text, and
        # stored the mentions under "h" and "t".
        train_list.append(
            {
                "sentiment": label,
                "sentence": record["text"],
                "aspect": "chemprot",
            }
        )


# Load the development split: one JSON object per line, read via its
# "label" and "text" keys.
dev_label = []  # label of every kept example, in file order
dev_list = []  # exported records (sentiment / sentence / aspect)
dev_label_dict_num = Counter()  # label -> number of kept examples

# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, matching the encoding used when the outputs are written.
with open("org/dev.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            continue
        record = json.loads(raw_line)
        label = record["label"]
        # These two relation classes are dropped from the exported data.
        if label in ("AGONIST-INHIBITOR", "SUBSTRATE_PRODUCT-OF"):
            continue
        dev_label.append(label)
        dev_label_dict_num[label] += 1
        # NOTE: head/tail entity extraction is disabled. The removed code
        # sliced token offsets from record["metadata"] ([:2] head, [2:] tail)
        # after stripping the "[[ ]]" / "<< >>" markers from the text, and
        # stored the mentions under "h" and "t".
        dev_list.append(
            {
                "sentiment": label,
                "sentence": record["text"],
                "aspect": "chemprot",
            }
        )

# Load the test split: one JSON object per line, read via its "label" and
# "text" keys.
test_label = []  # label of every kept example, in file order
test_list = []  # exported records (sentiment / sentence / aspect)
test_label_dict_num = Counter()  # label -> number of kept examples

# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, matching the encoding used when the outputs are written.
with open("org/test.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            continue
        record = json.loads(raw_line)
        label = record["label"]
        # These two relation classes are dropped from the exported data.
        if label in ("AGONIST-INHIBITOR", "SUBSTRATE_PRODUCT-OF"):
            continue
        test_label.append(label)
        test_label_dict_num[label] += 1
        # NOTE: head/tail entity extraction is disabled. The removed code
        # sliced token offsets from record["metadata"] ([:2] head, [2:] tail)
        # after stripping the "[[ ]]" / "<< >>" markers from the text, and
        # stored the mentions under "h" and "t".
        test_list.append(
            {
                "sentiment": label,
                "sentence": record["text"],
                "aspect": "chemprot",
            }
        )

# Report the number of distinct labels kept in each split (train, dev, test).
for split_labels in (train_label, dev_label, test_label):
    print(len(set(split_labels)))
print("========")

# List each split's label names in sorted order, one per line, with a
# separator line between consecutive splits.
# (Compact alternative that was previously disabled: print the whole sorted
# key list of each count table on a single line.)
count_tables = (train_label_dict_num, dev_label_dict_num, test_label_dict_num)
for position, table in enumerate(count_tables):
    if position > 0:
        print("=====")
    for label in sorted(table):
        print(label)


# Serialize each split's record list as a single JSON array, one file per
# split, written in train / dev / test order.
for out_path, records in (
    ("train_all.json", train_list),
    ("dev.json", dev_list),
    ("test.json", test_list),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file)