dream
78 строк · 2.2 Кб
1import pickle2import re3
4
def load_dictionaries(pickle_file):
    """Load and return the object stored in a pickle file.

    :param pickle_file: path of the pickle file to read
    :return: the object that was unpickled from the file
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file even if unpickling raises.
    with open(pickle_file, mode="rb") as handle:
        return pickle.load(handle)
9
10# _, _, char2id, id2char, _, _, _, _, _, _ = load_dictionaries("./datasets/conll/english/conll_eng.train_testa.pkl")
11# print(char2id, id2char)
12
13
def zeros(s):
    """Replace every digit in a string by a zero.

    :param s: input string
    :return: string after replacing all digits by zeros
    """
    # Raw string: a backslash-d in a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"\d", "0", s)
22
def get_chunk_type(tok, idx_to_tag):
    """Split the tag of a token id into its class prefix and its type.

    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}

    Returns:
        tuple: "B", "PER". A tag without "-" (ex "O") is returned as both
        elements of the tuple.
    """
    tag_name = idx_to_tag[tok]
    # Split on the first "-" only, so a type that itself contains a hyphen
    # (ex "B-WORK-OF-ART") is kept whole instead of being cut to "ART".
    parts = tag_name.split("-", 1)
    return parts[0], parts[-1]
36
def get_chunks(seq, tags):
    """Group a sequence of label ids into typed chunks.

    Args:
        seq: [4, 4, 0, 0, ...] sequence of label ids
        tags: dictionary mapping tag name to label id; must contain "O"

    Returns:
        list of (chunk_type, chunk_start, chunk_end), chunk_end exclusive

    Example:
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3, "O": 0}
        result = [("PER", 0, 2), ("LOC", 3, 4)]
    """
    outside = tags["O"]
    id_to_label = {label_id: label for label, label_id in tags.items()}

    spans = []
    open_type = None
    open_start = None

    for pos, label_id in enumerate(seq):
        if label_id == outside:
            # An "O" label closes whatever chunk is currently open.
            if open_type is not None:
                spans.append((open_type, open_start, pos))
                open_type = None
                open_start = None
            continue

        prefix, kind = get_chunk_type(label_id, id_to_label)

        if open_type is None:
            # Nothing open yet: this label starts a new chunk.
            open_type, open_start = kind, pos
            continue

        if kind == open_type and prefix != "B":
            # Same type and not a "B" label: the open chunk keeps growing.
            continue

        # Type changed or an explicit "B": close the open chunk, start anew.
        spans.append((open_type, open_start, pos))
        open_type, open_start = kind, pos

    # A chunk still open at the end runs to the end of the sequence.
    if open_type is not None:
        spans.append((open_type, open_start, len(seq)))

    return spans