# gpt-neox / prepare_data.py (77 lines, 2.3 KB)
# Copyright (c) 2024, EleutherAI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from typing import Optional, Sequence

from tools.datasets.corpora import prepare_dataset, DATA_DOWNLOADERS

# Tokenizer type names accepted by the -t/--tokenizer option.
TOKENIZER_CHOICES = [
    "HFGPT2Tokenizer",
    "HFTokenizer",
    "GPT2BPETokenizer",
    "CharLevelTokenizer",
    "TiktokenTokenizer",
    "SPMTokenizer",
]
# Keys of DATA_DOWNLOADERS, minus the "pass" entry; these are the values
# accepted for the positional `dataset` argument.
DATASET_CHOICES = [name for name in DATA_DOWNLOADERS if name != "pass"]


def get_args(args: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the dataset preparation script.

    Args:
        args: Argument strings to parse. When ``None`` (the default) argparse
            reads them from ``sys.argv[1:]``, so existing ``get_args()``
            callers behave exactly as before.

    Returns:
        Namespace with the attributes ``dataset`` (default ``"enwik8"``),
        ``tokenizer`` (default ``"GPT2BPETokenizer"``), ``data_dir``,
        ``vocab_file``, ``merge_file`` (all default ``None``) and
        ``force_redownload`` (default ``False``).
    """
    parser = argparse.ArgumentParser(description="Download & preprocess neox datasets")
    # Optional positional: omitting it selects the default dataset.
    parser.add_argument(
        "dataset",
        nargs="?",
        default="enwik8",
        help="name of dataset to download.",
        choices=DATASET_CHOICES,
    )
    parser.add_argument(
        "-t",
        "--tokenizer",
        default="GPT2BPETokenizer",
        choices=TOKENIZER_CHOICES,
        help=f'Type of tokenizer to use - choose from {", ".join(TOKENIZER_CHOICES)}',
    )
    parser.add_argument(
        "-d",
        "--data-dir",
        default=None,
        help="Directory to which to download datasets / tokenizer "
        "files - defaults to ./data",
    )
    parser.add_argument(
        "-v", "--vocab-file", default=None, help="Tokenizer vocab file (if required)"
    )
    parser.add_argument(
        "-m", "--merge-file", default=None, help="Tokenizer merge file (if required)"
    )
    parser.add_argument(
        "-f",
        "--force-redownload",
        dest="force_redownload",
        default=False,
        action="store_true",
        help="Force a re-download of the dataset",
    )
    return parser.parse_args(args)


if __name__ == "__main__":
    # Script entry point: parse the CLI options and forward each one to
    # prepare_dataset under the keyword name it expects.
    args = get_args()
    prepare_dataset(
        dataset_name=args.dataset,
        tokenizer_type=args.tokenizer,
        data_dir=args.data_dir,
        vocab_file=args.vocab_file,
        merge_file=args.merge_file,
        force_redownload=args.force_redownload,
    )