CSS-LM
77 строк · 2.6 Кб
1import json
2
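# Example record (presumably the target format -- TODO confirm). Note that this
# script fills "h" and "t" with the mention text only (single-element lists),
# without the entity id, character offsets and score shown in this example.
#{"label": "main subject", "tokens": "For the 1971 film \" A Blank on the Map \" , he joined the first Western expedition to a remote highland valley in New Guinea to seek out a lost tribe .", "h": ["A Blank on the Map", ["Q4655508", 20, 38, 0.5]], "t": ["lost tribe", ["Q672979", 138, 148, 0.5]]}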
4
# Convert the raw training split (one JSON object per line) into a list of
# flat records; the list is serialized to train.json further down.
train_label = []  # every label in file order, used for the distinct-label count
train_list = []   # one converted dict per non-blank input line

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the output files are written as UTF-8 too).
with open("org/train.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # json.loads raises on an empty document; tolerate blank lines.
            continue
        record = json.loads(raw_line)
        train_label.append(record["label"])

        # "metadata" is sliced into two index pairs: the first two entries
        # bound the head span, the remaining entries bound the tail span.
        h_site = record["metadata"][:2]
        t_site = record["metadata"][2:]

        # The indices address whitespace tokens of the text *after* the
        # "[[", "]]", "<<", ">>" marker strings have been removed.
        words = (
            record["text"]
            .replace("[[", "")
            .replace("]]", "")
            .replace("<<", "")
            .replace(">>", "")
            .split()
        )
        # The end index is inclusive, hence the +1 on the slice bound.
        h = " ".join(words[h_site[0]:h_site[1] + 1])
        t = " ".join(words[t_site[0]:t_site[1] + 1])

        # "tokens" keeps the original text unchanged (markers included).
        train_dict = {
            "label": record["label"],
            "tokens": record["text"],
            "h": [h],
            "t": [t],
        }
        train_list.append(train_dict)
23
24
# Convert the raw development split (one JSON object per line) into a list of
# flat records; the list is serialized to dev.json further down.
dev_label = []  # every label in file order, used for the distinct-label count
dev_list = []   # one converted dict per non-blank input line

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the output files are written as UTF-8 too).
with open("org/dev.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # json.loads raises on an empty document; tolerate blank lines.
            continue
        record = json.loads(raw_line)
        dev_label.append(record["label"])

        # "metadata" is sliced into two index pairs: the first two entries
        # bound the head span, the remaining entries bound the tail span.
        h_site = record["metadata"][:2]
        t_site = record["metadata"][2:]

        # The indices address whitespace tokens of the text *after* the
        # "[[", "]]", "<<", ">>" marker strings have been removed.
        words = (
            record["text"]
            .replace("[[", "")
            .replace("]]", "")
            .replace("<<", "")
            .replace(">>", "")
            .split()
        )
        # The end index is inclusive, hence the +1 on the slice bound.
        h = " ".join(words[h_site[0]:h_site[1] + 1])
        t = " ".join(words[t_site[0]:t_site[1] + 1])

        # "tokens" keeps the original text unchanged (markers included).
        dev_dict = {
            "label": record["label"],
            "tokens": record["text"],
            "h": [h],
            "t": [t],
        }
        dev_list.append(dev_dict)
43
44
# Convert the raw test split (one JSON object per line) into a list of flat
# records; the list is serialized to test.json further down.
test_label = []  # every label in file order, used for the distinct-label count
test_list = []   # one converted dict per non-blank input line

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the output files are written as UTF-8 too).
with open("org/test.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # json.loads raises on an empty document; tolerate blank lines.
            continue
        record = json.loads(raw_line)
        test_label.append(record["label"])

        # "metadata" is sliced into two index pairs: the first two entries
        # bound the head span, the remaining entries bound the tail span.
        h_site = record["metadata"][:2]
        t_site = record["metadata"][2:]

        # The indices address whitespace tokens of the text *after* the
        # "[[", "]]", "<<", ">>" marker strings have been removed.
        words = (
            record["text"]
            .replace("[[", "")
            .replace("]]", "")
            .replace("<<", "")
            .replace(">>", "")
            .split()
        )
        # The end index is inclusive, hence the +1 on the slice bound.
        h = " ".join(words[h_site[0]:h_site[1] + 1])
        t = " ".join(words[t_site[0]:t_site[1] + 1])

        # "tokens" keeps the original text unchanged (markers included).
        test_dict = {
            "label": record["label"],
            "tokens": record["text"],
            "h": [h],
            "t": [t],
        }
        test_list.append(test_dict)
63
64
# Print the number of distinct labels per split, one per line, in the order
# train, dev, test.
for split_labels in (train_label, dev_label, test_label):
    print(len(set(split_labels)))
68
69
# Serialize each converted split to its own JSON file in the current working
# directory, in the order train, dev, test.
for out_path, records in (
    ("train.json", train_list),
    ("dev.json", dev_list),
    ("test.json", test_list),
):
    with open(out_path, "w", encoding='utf-8') as out_file:
        json.dump(records, out_file)
78