#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

from dataclasses import dataclass
import logging
import argparse
import os
import sys
import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast

import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model

logger = logging.getLogger("lora-to-gguf")


@dataclass
class PartialLoraTensor:
    A: Tensor | None = None
    B: Tensor | None = None


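# Background for the wrapper below: a PEFT LoRA adapter stores each weight
# delta in factored form, so the effective weight is
#     W' = W + (alpha / rank) * (B @ A)
# where A has shape (rank, row_size) and B has shape (col_size, rank), making
# B @ A the same shape as W. Illustrative numbers (not taken from this file):
# a 4096x4096 projection at rank 16 has A of shape (16, 4096) and B of shape
# (4096, 16), about 131K adapter values instead of ~16.8M full-weight values.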
# magic to support tensor shape modifications and splitting
class LoraTorchTensor:
    _lora_A: Tensor  # (n_rank, row_size)
    _lora_B: Tensor  # (col_size, n_rank)
    _rank: int

    def __init__(self, A: Tensor, B: Tensor):
        assert len(A.shape) == len(B.shape)
        assert A.shape[-2] == B.shape[-1]
        if A.dtype != B.dtype:
            A = A.to(torch.float32)
            B = B.to(torch.float32)
        self._lora_A = A
        self._lora_B = B
        self._rank = B.shape[-1]

    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
        return (self._lora_A, self._lora_B)

    def __getitem__(
        self,
        indices: (
            SupportsIndex
            | slice
            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
        ),
    ) -> LoraTorchTensor:
        shape = self.shape
        if isinstance(indices, SupportsIndex):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                raise NotImplementedError  # can't return a vector
        elif isinstance(indices, slice):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
        elif isinstance(indices, tuple):
            assert len(indices) > 0
            if indices[-1] is Ellipsis:
                return self[indices[:-1]]
            # expand ellipsis
            indices = tuple(
                u
                for v in (
                    (
                        (slice(None, None) for _ in range(len(indices) - 1))
                        if i is Ellipsis
                        else (i,)
                    )
                    for i in indices
                )
                for u in v
            )

            if len(indices) < len(shape):
                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))

            # TODO: make sure this is correct
            indices_A = (
                *(
                    (
                        j.__index__() % self._lora_A.shape[i]
                        if isinstance(j, SupportsIndex)
                        else slice(None, None)
                    )
                    for i, j in enumerate(indices[:-2])
                ),
                slice(None, None),
                indices[-1],
            )
            indices_B = indices[:-1]
            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
        else:
            raise NotImplementedError  # unknown index type

    @property
    def dtype(self) -> torch.dtype:
        assert self._lora_A.dtype == self._lora_B.dtype
        return self._lora_A.dtype

    @property
    def shape(self) -> tuple[int, ...]:
        assert len(self._lora_A.shape) == len(self._lora_B.shape)
        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])

    def size(self, dim=None):
        assert dim is None
        return self.shape

    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
        if isinstance(shape[0], tuple):
            new_shape: tuple[int, ...] = shape[0]
        else:
            new_shape = cast(tuple[int, ...], shape)
        orig_shape = self.shape
        if len(new_shape) < 2:
            raise NotImplementedError  # can't become a vector

        # expand -1 in the shape
        if any(dim == -1 for dim in new_shape):
            n_elems = prod(orig_shape)
            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
            assert n_elems % n_new_elems == 0
            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)

        if new_shape[-1] != orig_shape[-1]:
            raise NotImplementedError  # can't reshape the row size trivially

        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
        shape_B = (*new_shape[:-1], self._rank)
        return LoraTorchTensor(
            self._lora_A.reshape(shape_A),
            self._lora_B.reshape(shape_B),
        )

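    # Worked example for reshape above (hypothetical shapes): a LoRA pair for
    # a (4096, 4096) weight at rank 16 reshaped to (32, 128, 4096) keeps the
    # row size, so A becomes (1, 16, 4096) and B goes from (4096, 16) to
    # (32, 128, 16); only B carries the split dimensions, A is broadcast.
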
    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
        return self.reshape(*other.shape)

    def view(self, *size: int) -> LoraTorchTensor:
        return self.reshape(*size)

    def permute(self, *dims: int) -> LoraTorchTensor:
        shape = self.shape
        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
        if dims[-1] == -1:
            # TODO: support higher dimensional A shapes bigger than 1
            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
        else:
            # TODO: compose the above two
            raise NotImplementedError

    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
        shape = self.shape
        dims = [i for i in range(len(shape))]
        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
        return self.permute(*dims)

    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
        return self.transpose(axis0, axis1)

    def to(self, *args, **kwargs):
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

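    # __torch_function__ below hooks torch's standard function-override
    # protocol: module-level calls such as torch.stack([t0, t1]) or
    # torch.cat([t0, t1]) on LoraTorchTensors (illustrative call sites) are
    # dispatched here and applied to the A and B factors separately.
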
    @classmethod
    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
        del types  # unused

        if kwargs is None:
            kwargs = {}

        if func is torch.permute:
            return type(args[0]).permute(*args, **kwargs)
        elif func is torch.reshape:
            return type(args[0]).reshape(*args, **kwargs)
        elif func is torch.stack:
            assert isinstance(args[0], Sequence)
            dim = kwargs.get("dim", 0)
            assert dim == 0
            return LoraTorchTensor(
                torch.stack([a._lora_A for a in args[0]], dim),
                torch.stack([b._lora_B for b in args[0]], dim),
            )
        elif func is torch.cat:
            assert isinstance(args[0], Sequence)
            dim = kwargs.get("dim", 0)
            assert dim == 0
            if len(args[0][0].shape) > 2:
                return LoraTorchTensor(
                    torch.cat([a._lora_A for a in args[0]], dim),
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
                return LoraTorchTensor(
                    args[0][0]._lora_A,
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError


def get_base_tensor_name(lora_tensor_name: str) -> str:
    base_name = lora_tensor_name.replace("base_model.model.", "")
    base_name = base_name.replace(".lora_A.weight", ".weight")
    base_name = base_name.replace(".lora_B.weight", ".weight")
    return base_name

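# Example of the mapping above (typical PEFT tensor name, illustrative):
#   "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
#   -> "model.layers.0.self_attn.q_proj.weight"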

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "--no-lazy", action="store_true",
        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="only print out what will be done, without writing any new files",
    )
    parser.add_argument(
        "--base", type=Path, required=True,
        help="directory containing base model file",
    )
    parser.add_argument(
        "lora_path", type=Path,
        help="directory containing LoRA adapter file",
    )

    return parser.parse_args()


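# Example invocation (hypothetical paths; the flags are defined in parse_args above):
#   python convert_lora_to_gguf.py --base ./base_model --outtype f16 ./lora_adapter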
if __name__ == '__main__':
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

    ftype = ftype_map[args.outtype]

    dir_base_model: Path = args.base
    dir_lora: Path = args.lora_path
    lora_config = dir_lora / "adapter_config.json"
    input_model = dir_lora / "adapter_model.safetensors"

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_lora

    if os.path.exists(input_model):
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file

        lora_model = load_file(input_model, device="cpu")
    else:
        input_model = os.path.join(dir_lora, "adapter_model.bin")
        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

    # load base model
    logger.info(f"Loading base model: {dir_base_model.name}")
    hparams = Model.load_hparams(dir_base_model)
    with torch.inference_mode():
        try:
            model_class = Model.from_model_architecture(hparams["architectures"][0])
        except NotImplementedError:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)

        class LoraModel(model_class):
            model_arch = model_class.model_arch

            lora_alpha: float

            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):

                super().__init__(*args, **kwargs)

                self.dir_model_card = dir_lora_model
                self.lora_alpha = float(lora_alpha)

            def set_type(self):
                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

            def set_gguf_parameters(self):
                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                super().set_gguf_parameters()

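            # Note on set_gguf_parameters above: only alpha is written to the
            # GGUF metadata; the rank is implicit in the lora_a/lora_b tensor
            # shapes, so a loader can presumably recover the usual LoRA scale
            # alpha / rank from the file alone.
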
            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                tensor_map: dict[str, PartialLoraTensor] = {}

                for name, tensor in lora_model.items():
                    if self.lazy:
                        tensor = LazyTorchTensor.from_eager(tensor)
                    base_name = get_base_tensor_name(name)
                    is_lora_a = ".lora_A.weight" in name
                    is_lora_b = ".lora_B.weight" in name
                    if not is_lora_a and not is_lora_b:
                        if ".base_layer.weight" in name:
                            continue
                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                        sys.exit(1)

                    if base_name in tensor_map:
                        if is_lora_a:
                            tensor_map[base_name].A = tensor
                        else:
                            tensor_map[base_name].B = tensor
                    else:
                        if is_lora_a:
                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
                        else:
                            tensor_map[base_name] = PartialLoraTensor(B=tensor)

                for name, tensor in tensor_map.items():
                    assert tensor.A is not None
                    assert tensor.B is not None
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

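            # modify_tensors below reuses the base model's tensor-name mapping
            # and then splits each merged LoRA pair back into two GGUF tensors,
            # "<name>.lora_a" and "<name>.lora_b", as emitted by its yields.
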
            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                dest = list(super().modify_tensors(data_torch, name, bid))
                # some archs may have the same tensor for lm_head and output (tie word embeddings)
                # in this case, adapters targeting lm_head will fail when using llama-export-lora
                # therefore, we ignore them for now
                # see: https://github.com/ggerganov/llama.cpp/issues/9065
                if name == "lm_head.weight" and len(dest) == 0:
                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()

                    yield (dest_name + ".lora_a", lora_a)
                    yield (dest_name + ".lora_b", lora_b)

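        # adapter_config.json is the PEFT adapter's metadata; the field used
        # here looks like (illustrative values):
        #   { "lora_alpha": 32, "r": 16, ... }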
        with open(lora_config, "r") as f:
            lparams: dict[str, Any] = json.load(f)

        alpha: float = lparams["lora_alpha"]

        model_instance = LoraModel(
            dir_base_model,
            ftype,
            fname_out,
            is_big_endian=args.bigendian,
            use_temp_file=False,
            eager=args.no_lazy,
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
            is_lora=True,
        )

        logger.info("Exporting model...")
        model_instance.write()
        logger.info(f"Model successfully exported to {model_instance.fname_out}")