CSS-LM

Форк
0
/
tokenization_longformer.py 
60 строк · 2.2 Кб
1
# coding=utf-8
2
# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
import logging
17

18
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
19

20

21
logger = logging.getLogger(__name__)
22

23

24
# vocab and merges same as roberta
25
vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
26
merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
27
_all_longformer_models = [
28
    "allenai/longformer-base-4096",
29
    "allenai/longformer-large-4096",
30
    "allenai/longformer-large-4096-finetuned-triviaqa",
31
    "allenai/longformer-base-4096-extra.pos.embd.only",
32
    "allenai/longformer-large-4096-extra.pos.embd.only",
33
]
34

35

36
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
37
    "allenai/longformer-base-4096": 4096,
38
    "allenai/longformer-large-4096": 4096,
39
    "allenai/longformer-large-4096-finetuned-triviaqa": 4096,
40
    "allenai/longformer-base-4096-extra.pos.embd.only": 4096,
41
    "allenai/longformer-large-4096-extra.pos.embd.only": 4096,
42
}
43

44

45
class LongformerTokenizer(RobertaTokenizer):
    """Slow (pure-Python) tokenizer for Longformer checkpoints.

    Longformer shares RoBERTa's byte-level BPE scheme, so this subclass
    only remaps the pretrained vocab/merges file URLs and the maximum
    model input sizes onto the Longformer checkpoint names; all actual
    tokenization behavior comes from :class:`RobertaTokenizer`.
    """

    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = {
        "vocab_file": dict.fromkeys(_all_longformer_models, vocab_url),
        "merges_file": dict.fromkeys(_all_longformer_models, merges_url),
    }
52

53

54
class LongformerTokenizerFast(RobertaTokenizerFast):
    """Fast (Rust-backed) tokenizer for Longformer checkpoints.

    Identical in spirit to :class:`LongformerTokenizer`: Longformer uses
    RoBERTa's byte-level BPE files unchanged, so only the pretrained file
    URL map and the maximum input sizes are redirected to the Longformer
    checkpoint names; tokenization logic lives in
    :class:`RobertaTokenizerFast`.
    """

    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = {
        "vocab_file": dict.fromkeys(_all_longformer_models, vocab_url),
        "merges_file": dict.fromkeys(_all_longformer_models, merges_url),
    }
61

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.