from transformers import GPTNeoXConfig, LlamaConfig, MistralConfig, AutoConfig, AutoModelForCausalLM
from typing import List, Literal

# make the repo root importable so that `megatron` can be found
sys.path.append(
    os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)
)
from megatron.tokenizer import build_tokenizer
A script for converting saved NeoX checkpoints to Hugging Face (HF)-compatible GPT-NeoX-type models.

Note that this script does not support all NeoX features.
Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF.
(e.g. position embeddings such as ALiBi may not be supported by Hugging Face's GPT-NeoX architecture).
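# A typical invocation might look like the following sketch. The --config_file, --output_dir,
# --precision, --architecture, and --no_save_tokenizer options correspond to the argparse
# arguments defined in main() below; the script name, the --input_dir flag, and the paths are
# illustrative assumptions only:
#
#   python convert_neox_to_hf.py \
#       --input_dir /path/to/model/global_step143000 \
#       --config_file /path/to/model/config.yml \
#       --output_dir hf_output/ \
#       --precision auto \
#       --architecture neox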
    # Parameter-name mapping from NeoX checkpoints to the HF GPT-NeoX ("neox") architecture:
    "COLUMN_PARALLEL_LINEAR_KEYS": {
        "mlp.dense_h_to_4h.weight": "mlp.dense_h_to_4h.weight",
        "mlp.dense_h_to_4h.bias": "mlp.dense_h_to_4h.bias",
        "attention.query_key_value.weight": "attention.query_key_value.weight",
        "attention.query_key_value.bias": "attention.query_key_value.bias",
    },
    "ROW_PARALLEL_LINEAR_KEYS": {
        "attention.dense.weight": "attention.dense.weight",
        "mlp.dense_4h_to_h.weight": "mlp.dense_4h_to_h.weight",
    },
    "ROW_PARALLEL_BIAS_KEYS": {
        "mlp.dense_4h_to_h.bias": "mlp.dense_4h_to_h.bias",
        "attention.dense.bias": "attention.dense.bias",
    },
    "NORM_KEYS": {
        "input_layernorm.weight": "input_layernorm.weight",
        "input_layernorm.bias": "input_layernorm.bias",
        "post_attention_layernorm.weight": "post_attention_layernorm.weight",
        "post_attention_layernorm.bias": "post_attention_layernorm.bias",
    },
    "FINAL_NORM_KEYS": {
        "norm.weight": "weight",
    },
    # Parameter-name mapping from NeoX checkpoints to the HF Llama ("llama") architecture:
    "COLUMN_PARALLEL_LINEAR_KEYS": {
        "mlp.w1.weight": "mlp.gate_proj.weight",
        "mlp.w3.weight": "mlp.up_proj.weight",
    },
    "ROW_PARALLEL_LINEAR_KEYS": {
        "attention.dense.weight": "self_attn.o_proj.weight",
        "mlp.w2.weight": "mlp.down_proj.weight",
    },
    "ROW_PARALLEL_BIAS_KEYS": {},
    "NORM_KEYS": {
        "input_layernorm.scale": "input_layernorm.weight",
        "post_attention_layernorm.scale": "post_attention_layernorm.weight",
    },
    "FINAL_NORM_KEYS": {
        "norm.scale": "weight",
    },
    "GQA_QKV_KEYS": {
        # the fused NeoX QKV weight is split into separate HF Q, K, V projections
        "attention.query_key_value.weight": [
            "self_attn.q_proj.weight",
            "self_attn.k_proj.weight",
            "self_attn.v_proj.weight",
        ],
    },

# Mistral shares the Llama parameter-name mapping.
MODEL_KEYS["mistral"] = MODEL_KEYS["llama"]
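# For example, the per-layer conversion loop below renames a NeoX row-parallel attention output
# weight for the HF Llama classes via a lookup of the form:
#
#   MODEL_KEYS["llama"]["ROW_PARALLEL_LINEAR_KEYS"]["attention.dense.weight"]
#   # -> "self_attn.o_proj.weight"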
def load_partitions(
    input_checkpoint_path: str, mp_partitions: int, layer_idx: int, sequential: bool
) -> List[dict]:
    """Returns a list containing all states from a model (across MP partitions)"""
    if sequential:
        filename_format = "mp_rank_{i:02}_model_states.pt"
    else:
        filename_format = f"layer_{layer_idx:02}-model_{{i:02}}-model_states.pt"

    loaded_tp_ranks = [
        torch.load(
            os.path.join(
                input_checkpoint_path,
                filename_format.format(i=i),
            ),
            map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        )
        for i in range(mp_partitions)
    ]

    return loaded_tp_ranks
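# For example, with mp_partitions=2 this loads, relative to input_checkpoint_path:
#   sequential=True  -> mp_rank_00_model_states.pt, mp_rank_01_model_states.pt
#   sequential=False -> layer_00-model_00-model_states.pt, layer_00-model_01-model_states.pt  (layer_idx=0)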
def get_state(
    state_dicts: List[dict], key: str, layer_idx: int, sequential: bool
) -> List[torch.Tensor]:
    """Helper that returns a list containing a given weight's state from each MP partition, for a given layer in the model."""
    if sequential:
        # sequential checkpoints flatten layers into "sequential.<layer_idx>.<key>" under "module"
        key = f"sequential.{layer_idx}.{key}"
        return [state_dict["module"][key] for state_dict in state_dicts]
    else:
        return [state_dict[key] for state_dict in state_dicts]
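# For example, with layer_idx=2 and key="attention.dense.weight", a sequential checkpoint is read as
#   state_dict["module"]["sequential.2.attention.dense.weight"]
# while a pipeline (non-sequential) checkpoint is read with the bare key from its per-layer file.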
def get_key(loaded_config, key, default=None):
    """Search for a given key in a NeoX yaml; checks both hyphenated and underscored forms, else returns `default`."""
    key = key.replace("_", "-")
    try:
        return loaded_config[key]
    except KeyError:
        key = key.replace("-", "_")
        try:
            return loaded_config[key]
        except KeyError:
            return default
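# For example, get_key({"hidden-size": 4096}, "hidden_size") and
# get_key({"hidden_size": 4096}, "hidden-size") both return 4096; a key missing under both
# spellings falls back to `default`.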
def create_config(neox_config, architecture="neox"):
    """Take in a loaded NeoX yaml and assign relevant values to a HF config.
    Returns: a GPTNeoXConfig, LlamaConfig, or MistralConfig object, depending on `architecture`.
    """

    def gated_size(hidden_dim):
        # LLaMA-style gated-MLP width: 8/3 * hidden_dim, rounded up to the next multiple of 256
        ff_dim = int(2 * hidden_dim * 4 / 3)
        ff_dim = 256 * ((ff_dim + 256 - 1) // 256)
        return ff_dim
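    # Worked example: gated_size(4096) = 256 * ((int(2 * 4096 * 4 / 3) + 255) // 256)
    #                 = 256 * ((10922 + 255) // 256) = 256 * 43 = 11008,
    # which is the familiar LLaMA-7B intermediate size.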
    class TokenizerArgs:
        # minimal stand-in for the args object expected by build_tokenizer()
        def __init__(self, neox_config):
            self.make_vocab_size_divisible_by = get_key(
                neox_config, "make-vocab-size-divisible-by", default=128
            )
            self.model_parallel_size = get_key(neox_config, "model-parallel-size")
            self.vocab_file = get_key(neox_config, "vocab-file")
            self.merge_file = get_key(neox_config, "merge-file")
            self.tokenizer_type = get_key(neox_config, "tokenizer-type")

    args = TokenizerArgs(neox_config)
    tokenizer = build_tokenizer(args)
    pad_token = tokenizer.pad

    use_tied_lns = get_key(neox_config, "gpt-j-tied", False)
    if use_tied_lns:
        raise NotImplementedError(
            """ERROR: Hugging Face Transformers does not yet support a single shared layernorm
            per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals.
            See https://github.com/EleutherAI/gpt-neox/pull/481 for further details."""
        )
        "vocab_size": args.padded_vocab_size,
        "hidden_size": get_key(neox_config, "hidden-size"),
        "num_hidden_layers": get_key(neox_config, "num-layers"),
        "num_attention_heads": get_key(neox_config, "num-attention-heads"),
        "max_position_embeddings": get_key(neox_config, "max-position-embeddings"),
        "initializer_range": get_key(neox_config, "init-method-std", 0.02),
        "tie_word_embeddings": (not get_key(neox_config, "no-weight-tying", False)),
    if architecture == "mistral" or architecture == "llama":
            "intermediate_size": get_key(
                gated_size(get_key(neox_config, "hidden-size")),
            "num_key_value_heads": get_key(
                get_key(neox_config, "num-attention-heads"),
            "hidden_act": get_key(neox_config, "activation", default="silu"),
            "rms_norm_eps": get_key(neox_config, "rms-norm-epsilon", 1.0e-6),
            "bos_token_id": tokenizer.eod,
            "eos_token_id": tokenizer.eod,
            "rope_theta": get_key(neox_config, "rotary-emb-base", 10000.0),

        if architecture == "mistral":
                "sliding_window": get_key(
                    neox_config, "sliding-window-width", 4096
            hf_config = MistralConfig(**args)

        elif architecture == "llama":
                "attention_bias": get_key(
                    neox_config, "use_bias_in_attn_linear", True
            hf_config = LlamaConfig(**args)
    else:
        # GPT-NeoX architecture settings
            "rotary_pct": get_key(neox_config, "rotary-pct", default=1.0),
            "rotary_emb_base": get_key(
                neox_config, "rotary-emb-base", default=10000.0
            ),
            "use_parallel_residual": get_key(neox_config, "gpt-j-residual", False),
            "layer_norm_eps": get_key(neox_config, "layernorm-epsilon", 1e-5),

        hf_config = GPTNeoXConfig(**args)
def reshard_and_split_qkv(
    param_mapping: dict,  # maps a NeoX QKV key -> list of HF [q, k, v] keys
    hf_config: AutoConfig,
    loaded_tp_ranks: List[dict],
    layer_idx: int,
    sequential: bool,
):
    """
    A helper function that reshapes and reshards the fused NeoX QKV projection so it is compatible
    with HF Llama-style models, splitting it into separate Q, K, and V projections, including when
    grouped-query attention (GQA) is used.
    """
    for key, hf_keys in param_mapping.items():
        assert (
            isinstance(hf_keys, list) and len(hf_keys) == 3
        ), "Must map QKV to precisely 3 resulting weight matrices."
    for key, hf_keys in param_mapping.items():
        # stack the TP-sharded, fused QKV weight from every MP partition
        sharded_qkv = torch.stack(
            get_state(loaded_tp_ranks, key, layer_idx, sequential), dim=0
        )

        sharded_qkv = sharded_qkv.view(
            len(loaded_tp_ranks),
            hf_config.num_attention_heads // len(loaded_tp_ranks),
                hf_config.hidden_size
                // hf_config.num_attention_heads
                + 2 * hf_config.num_key_value_heads / hf_config.num_attention_heads
            hf_config.hidden_size,
        )

        # split the fused projection into per-head Q, K, V chunks
        q, k, v = torch.split(
                hf_config.hidden_size // hf_config.num_attention_heads,
                (hf_config.num_key_value_heads / hf_config.num_attention_heads)
                * hf_config.hidden_size
                // hf_config.num_attention_heads
                (hf_config.num_key_value_heads / hf_config.num_attention_heads)
                * hf_config.hidden_size
                // hf_config.num_attention_heads
        )

        q, k, v = q.squeeze(dim=2), k.squeeze(dim=2), v.squeeze(dim=2)

        # merge the head dimension back: Q becomes (hidden_size, hidden_size);
        # K and V become (hidden_size // num_attention_heads * num_key_value_heads, hidden_size)
        q = q.reshape(
            hf_config.num_attention_heads,
            hf_config.hidden_size // hf_config.num_attention_heads,
            hf_config.hidden_size,
        ).reshape(hf_config.hidden_size, hf_config.hidden_size)
        k = k.reshape(
            hf_config.num_key_value_heads,
            hf_config.hidden_size // hf_config.num_attention_heads,
            hf_config.hidden_size,
        ).reshape(
            hf_config.hidden_size
            // hf_config.num_attention_heads
            * hf_config.num_key_value_heads,
            hf_config.hidden_size,
        )
        v = v.reshape(
            hf_config.num_key_value_heads,
            hf_config.hidden_size // hf_config.num_attention_heads,
            hf_config.hidden_size,
        ).reshape(
            hf_config.hidden_size
            // hf_config.num_attention_heads
            * hf_config.num_key_value_heads,
            hf_config.hidden_size,
        )

        for hf_key, proj in zip(hf_keys, [q, k, v]):
            state_dict[hf_key] = proj.clone()
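        # Shape check with illustrative numbers (not from any particular config): for
        # hidden_size=4096, num_attention_heads=32, num_key_value_heads=8, q ends up with shape
        # (4096, 4096) while k and v each end up (4096 // 32 * 8, 4096) = (1024, 4096),
        # matching the HF Llama q_proj/k_proj/v_proj weight shapes.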
def convert(
    input_checkpoint_path,
    loaded_config,
    output_checkpoint_path,
    sequential: bool = True,
    precision: Literal["auto", "fp16", "bf16", "fp32"] = "auto",
    architecture: Literal["neox", "llama", "mistral"] = "neox",
):
    """Convert a NeoX checkpoint to HF model format.

    Should perform model-parallel merging correctly, but only supports features allowed by the
    HF GPT-NeoX implementation (e.g. rotary embeddings).
    """
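    # Checkpoint layout assumed by the load_partitions/get_state calls below: the word embedding
    # is stored at layer_idx 0, transformer block i at layer_idx i + 2, the final layernorm at
    # num-layers + 3, and the LM head ("final_linear") at num-layers + 4.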
    ARCH = MODEL_KEYS[architecture]

    hf_config = create_config(loaded_config, architecture=architecture)

    hf_model = AutoModelForCausalLM.from_config(hf_config)

    if architecture == "neox":
        hf_transformer = hf_model.gpt_neox
    else:
        hf_transformer = hf_model.model

    if precision == "auto":
        print("Auto-detecting precision to save model into...")
        fp16 = get_key(loaded_config, "fp16")
            print("Saving weights in fp16 precision...")
            bf16 = get_key(loaded_config, "bf16")
                hf_model.to(dtype=torch.bfloat16)
                print("Saving weights in bf16 precision...")
                hf_model.to(dtype=torch.float)
                    "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..."
    else:
        name_to_dtype = {
            "bf16": torch.bfloat16,
            "fp16": torch.float16,
            "fp32": torch.float,
        }
        print(f"Saving model into specified {precision} precision...")
        hf_model.to(dtype=name_to_dtype[precision])
    mp_partitions = get_key(loaded_config, "model-parallel-size")

    # Word embeddings are stored at layer_idx 0; merge them across MP partitions.
    loaded_tp_ranks = load_partitions(
        input_checkpoint_path, mp_partitions, layer_idx=0, sequential=sequential
    )

    if architecture == "neox":
        embed_in = hf_transformer.embed_in
    else:
        embed_in = hf_transformer.embed_tokens
    embed_in.load_state_dict(
            "word_embeddings.weight",
            sequential=sequential,
    )
    assert (
        hf_config.vocab_size == embed_in.weight.shape[0]
    ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {embed_in.weight.shape[0]}"
    for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))):
        hf_layer = hf_transformer.layers[layer_i]

        loaded_tp_ranks = load_partitions(
            input_checkpoint_path,
            mp_partitions,
            layer_idx=layer_i + 2,
            sequential=sequential,
        )

        state_dict = {}
        # Row-parallel weights are sharded along the input dim; concatenate them along dim 1.
        for key, hf_key in ARCH["ROW_PARALLEL_LINEAR_KEYS"].items():
            state_dict[hf_key] = torch.cat(
                get_state(
                    loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential
                ),
                dim=1,
            )

        for key, hf_key in ARCH["NORM_KEYS"].items():
            state_dict[hf_key] = sum(
                get_state(
                    loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential
                )
            ) / len(loaded_tp_ranks)
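        # Layernorm parameters are replicated (not sharded) across model-parallel ranks, so summing
        # the copies and dividing by the number of ranks simply recovers the shared value.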
        # Column-parallel weights are sharded along the output dim; concatenate them along dim 0.
        for key, hf_key in ARCH["COLUMN_PARALLEL_LINEAR_KEYS"].items():
            state_dict[hf_key] = torch.cat(
                get_state(
                    loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential
                ),
                dim=0,
            )

        for key, hf_key in ARCH["ROW_PARALLEL_BIAS_KEYS"].items():
            state_dict[hf_key] = sum(
                get_state(
                    loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential
                )
            )

        # Keep the HF layer's own causal-mask buffers, if the HF implementation registers them.
        if "attention.bias" in hf_layer.state_dict():
            state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"]
        if "attention.masked_bias" in hf_layer.state_dict():
            state_dict["attention.masked_bias"] = hf_layer.state_dict()[
                "attention.masked_bias"
            ]

        if "GQA_QKV_KEYS" in ARCH:
            # fused QKV must be resharded and split into separate Q/K/V projections for HF
            reshard_and_split_qkv(
                param_mapping=ARCH["GQA_QKV_KEYS"],
                hf_config=hf_config,
                loaded_tp_ranks=loaded_tp_ranks,
                layer_idx=layer_i + 2,
                sequential=sequential,
            )

        hf_layer.load_state_dict(state_dict)
    # Load the final layer norm, stored at checkpoint index num-layers + 3.
    loaded_tp_ranks = load_partitions(
        input_checkpoint_path,
        mp_partitions,
        get_key(loaded_config, "num-layers") + 3,
        sequential=sequential,
    )

    if architecture == "neox":
        lm_head = hf_model.embed_out
    else:
        lm_head = hf_model.lm_head

    norm_state_dict = {}
    for key, hf_key in ARCH["FINAL_NORM_KEYS"].items():
        norm_state_dict[hf_key] = sum(
            get_state(
                loaded_tp_ranks,
                key,
                layer_idx=get_key(loaded_config, "num-layers") + 3,
                sequential=sequential,
            )
        ) / len(loaded_tp_ranks)

    if architecture == "neox":
        final_layer_norm = hf_transformer.final_layer_norm
    else:
        final_layer_norm = hf_transformer.norm

    final_layer_norm.load_state_dict(norm_state_dict)
    # Load the LM head, stored at checkpoint index num-layers + 4.
    loaded_tp_ranks = load_partitions(
        input_checkpoint_path,
        mp_partitions,
        get_key(loaded_config, "num-layers") + 4,
        sequential=sequential,
    )

    if architecture == "neox":
        lm_head = hf_model.embed_out
    else:
        lm_head = hf_model.lm_head

    lm_head.load_state_dict(
            "final_linear.weight",
            layer_idx=get_key(loaded_config, "num-layers") + 4,
            sequential=sequential,
    )
def main(input_args=None, overwrite_values=None):
    from huggingface_hub import create_repo, HfApi

    parser = argparse.ArgumentParser(
        description="Merge MP partitions and convert to HF Model."
    )
        help="Path to NeoX checkpoint, e.g. /path/to/model/global_step143000",
        help="Path to config file for the input NeoX checkpoint.",
        help="Output dir, where to save the HF Model, tokenizer, and configs",
        help="What precision to save the model into. Defaults to auto, which auto-detects which 16-bit dtype to save into, or falls back to fp32.",
        "--no_save_tokenizer",
        help="Whether to skip saving the tokenizer alongside a model.",
        help="What HF model class type to export into.",

    args = parser.parse_args(input_args)
    assert args.precision in [
        "auto",
        "fp16",
        "bf16",
        "fp32",
    ], f"expected --precision to be one of 'auto', 'fp16', 'bf16', 'fp32' but got '{args.precision}' !"
    assert args.architecture in [
        "neox",
        "mistral",
        "llama",
    ], f"expected --architecture to be one of 'neox', 'mistral', 'llama', but got '{args.architecture}' !"

    with open(args.config_file) as f:
        loaded_config = yaml.full_load(f)
        if overwrite_values:
            loaded_config.update(overwrite_values)
    pipeline_world_size = get_key(loaded_config, "pipe-parallel-size", 1)
    if pipeline_world_size == 0:
        sequential = True
        print(
            f"Detected 'pipe-parallel-size' of {pipeline_world_size}, assuming model is saved as Sequential..."
        )
    else:
        sequential = False
        print(
            f"Detected 'pipe-parallel-size' of {pipeline_world_size}, assuming model is saved as PipelineModule..."
        )

        sequential=sequential,
        architecture=args.architecture,
    hf_model.save_pretrained(args.output_dir)

    if not args.no_save_tokenizer:
        # also save the tokenizer alongside the model, if a HF tokenizer was used
        tokenizer_type = get_key(loaded_config, "tokenizer-type")

        if tokenizer_type == "HFTokenizer":
            print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}")
            print(
                "Warning: please check that your model config and tokenizer end with the correct special tokens (EOS, BOS)."
            )
            from transformers import PreTrainedTokenizerFast

            tokenizer = PreTrainedTokenizerFast(
                tokenizer_file=get_key(loaded_config, "vocab-file")
            )
            print("loaded tokenizer: ", tokenizer)
            tokenizer.save_pretrained(args.output_dir)
            print("tokenizer saved!")


if __name__ == "__main__":
    main()