CSS-LM

Форк
0
/
extract_from_org.py 
77 строк · 2.6 Кб
1
import json
2

3
#{"label": "main subject", "tokens": "For the 1971 film \" A Blank on the Map \" , he joined the first Western expedition to a remote highland valley in New Guinea to seek out a lost tribe .", "h": ["A Blank on the Map", ["Q4655508", 20, 38, 0.5]], "t": ["lost tribe", ["Q672979", 138, 148, 0.5]]}
4

5
# Convert org/train.txt (one JSON object per line) into a list of
# {"label", "tokens", "h", "t"} records plus a parallel list of labels.
train_label = []
train_list = []
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# locale default encoding.
with open("org/train.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # record; skip it instead of letting json.loads raise on it.
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        train_label.append(record["label"])
        # The first two "metadata" entries give the head span, the entries
        # from index 2 on give the tail span; each span is used below as
        # inclusive (start, end) indices into the whitespace-split tokens.
        h_site = record["metadata"][:2]
        t_site = record["metadata"][2:]
        # Strip the "[[ ]]" / "<< >>" markers before tokenizing.
        words = (
            record["text"]
            .replace("[[", "")
            .replace("]]", "")
            .replace("<<", "")
            .replace(">>", "")
            .strip()
            .split()
        )
        h = " ".join(words[h_site[0]:h_site[1] + 1])
        t = " ".join(words[t_site[0]:t_site[1] + 1])
        train_list.append({
            "label": record["label"],
            # "tokens" keeps the original text, markers included.
            "tokens": record["text"],
            "h": [h],
            "t": [t],
        })
23

24

25
# Convert org/dev.txt (one JSON object per line) into a list of
# {"label", "tokens", "h", "t"} records plus a parallel list of labels.
dev_label = []
dev_list = []
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# locale default encoding.
with open("org/dev.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # record; skip it instead of letting json.loads raise on it.
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        dev_label.append(record["label"])
        # The first two "metadata" entries give the head span, the entries
        # from index 2 on give the tail span; each span is used below as
        # inclusive (start, end) indices into the whitespace-split tokens.
        h_site = record["metadata"][:2]
        t_site = record["metadata"][2:]
        # Strip the "[[ ]]" / "<< >>" markers before tokenizing.
        words = (
            record["text"]
            .replace("[[", "")
            .replace("]]", "")
            .replace("<<", "")
            .replace(">>", "")
            .strip()
            .split()
        )
        h = " ".join(words[h_site[0]:h_site[1] + 1])
        t = " ".join(words[t_site[0]:t_site[1] + 1])
        dev_list.append({
            "label": record["label"],
            # "tokens" keeps the original text, markers included.
            "tokens": record["text"],
            "h": [h],
            "t": [t],
        })
43

44

45
# Convert org/test.txt (one JSON object per line) into a list of
# {"label", "tokens", "h", "t"} records plus a parallel list of labels.
test_label = []
test_list = []
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# locale default encoding.
with open("org/test.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # record; skip it instead of letting json.loads raise on it.
        if not raw_line.strip():
            continue
        record = json.loads(raw_line)
        test_label.append(record["label"])
        # The first two "metadata" entries give the head span, the entries
        # from index 2 on give the tail span; each span is used below as
        # inclusive (start, end) indices into the whitespace-split tokens.
        h_site = record["metadata"][:2]
        t_site = record["metadata"][2:]
        # Strip the "[[ ]]" / "<< >>" markers before tokenizing.
        words = (
            record["text"]
            .replace("[[", "")
            .replace("]]", "")
            .replace("<<", "")
            .replace(">>", "")
            .strip()
            .split()
        )
        h = " ".join(words[h_site[0]:h_site[1] + 1])
        t = " ".join(words[t_site[0]:t_site[1] + 1])
        test_list.append({
            "label": record["label"],
            # "tokens" keeps the original text, markers included.
            "tokens": record["text"],
            "h": [h],
            "t": [t],
        })
63

64

65
# Report the number of distinct labels seen in each split, one count per
# line, in the order train, dev, test.
print(
    len(set(train_label)),
    len(set(dev_label)),
    len(set(test_label)),
    sep="\n",
)
68

69

70
# Serialize each converted split to its own JSON file in the current
# working directory.
for out_name, records in (
    ("train.json", train_list),
    ("dev.json", dev_list),
    ("test.json", test_list),
):
    with open(out_name, "w", encoding='utf-8') as f:
        json.dump(records, f)
78

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.