# CSS-LM
# coding=utf-8
# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16import logging17
18from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast19
20
21logger = logging.getLogger(__name__)22
23
24# vocab and merges same as roberta
25vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"26merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"27_all_longformer_models = [28"allenai/longformer-base-4096",29"allenai/longformer-large-4096",30"allenai/longformer-large-4096-finetuned-triviaqa",31"allenai/longformer-base-4096-extra.pos.embd.only",32"allenai/longformer-large-4096-extra.pos.embd.only",33]
34
35
36PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {37"allenai/longformer-base-4096": 4096,38"allenai/longformer-large-4096": 4096,39"allenai/longformer-large-4096-finetuned-triviaqa": 4096,40"allenai/longformer-base-4096-extra.pos.embd.only": 4096,41"allenai/longformer-large-4096-extra.pos.embd.only": 4096,42}
43
44
45class LongformerTokenizer(RobertaTokenizer):46# merges and vocab same as Roberta47max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES48pretrained_vocab_files_map = {49"vocab_file": {m: vocab_url for m in _all_longformer_models},50"merges_file": {m: merges_url for m in _all_longformer_models},51}52
53
54class LongformerTokenizerFast(RobertaTokenizerFast):55# merges and vocab same as Roberta56max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES57pretrained_vocab_files_map = {58"vocab_file": {m: vocab_url for m in _all_longformer_models},59"merges_file": {m: merges_url for m in _all_longformer_models},60}61