#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

from dataclasses import dataclass
import logging
import argparse
import os
import sys
import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast

import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model

logger = logging.getLogger("lora-to-gguf")


@dataclass
class PartialLoraTensor:
    A: Tensor | None = None
    B: Tensor | None = None


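# Background for the wrapper below: a PEFT LoRA adapter stores each weight
# delta in factored form, so the effective weight is
#     W' = W + (alpha / rank) * (B @ A)
# where A has shape (rank, row_size) and B has shape (col_size, rank), making
# B @ A the same shape as W. Illustrative numbers (not taken from this file):
# a 4096x4096 projection at rank 16 has A of shape (16, 4096) and B of shape
# (4096, 16), about 131K adapter values instead of ~16.8M full-weight values.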
# magic to support tensor shape modifications and splitting
class LoraTorchTensor:
    _lora_A: Tensor  # (n_rank, row_size)
    _lora_B: Tensor  # (col_size, n_rank)
    _rank: int

    def __init__(self, A: Tensor, B: Tensor):
        assert len(A.shape) == len(B.shape)
        assert A.shape[-2] == B.shape[-1]
        if A.dtype != B.dtype:
            A = A.to(torch.float32)
            B = B.to(torch.float32)
        self._lora_A = A
        self._lora_B = B
        self._rank = B.shape[-1]

    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
        return (self._lora_A, self._lora_B)

    def __getitem__(
        self,
        indices: (
            SupportsIndex
            | slice
            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
        ),
    ) -> LoraTorchTensor:
        shape = self.shape
        if isinstance(indices, SupportsIndex):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                raise NotImplementedError  # can't return a vector
        elif isinstance(indices, slice):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
        elif isinstance(indices, tuple):
            assert len(indices) > 0
            if indices[-1] is Ellipsis:
                return self[indices[:-1]]
            # expand ellipsis
            indices = tuple(
                u
                for v in (
                    (
                        (slice(None, None) for _ in range(len(indices) - 1))
                        if i is Ellipsis
                        else (i,)
                    )
                    for i in indices
                )
                for u in v
            )

            if len(indices) < len(shape):
                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))

            # TODO: make sure this is correct
            indices_A = (
                *(
                    (
                        j.__index__() % self._lora_A.shape[i]
                        if isinstance(j, SupportsIndex)
                        else slice(None, None)
                    )
                    for i, j in enumerate(indices[:-2])
                ),
                slice(None, None),
                indices[-1],
            )
            indices_B = indices[:-1]
            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
        else:
            raise NotImplementedError  # unknown index type

    @property
    def dtype(self) -> torch.dtype:
        assert self._lora_A.dtype == self._lora_B.dtype
        return self._lora_A.dtype

    @property
    def shape(self) -> tuple[int, ...]:
        assert len(self._lora_A.shape) == len(self._lora_B.shape)
        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])

    def size(self, dim=None):
        assert dim is None
        return self.shape

    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
        if isinstance(shape[0], tuple):
            new_shape: tuple[int, ...] = shape[0]
        else:
            new_shape = cast(tuple[int, ...], shape)
        orig_shape = self.shape
        if len(new_shape) < 2:
            raise NotImplementedError  # can't become a vector

        # expand -1 in the shape
        if any(dim == -1 for dim in new_shape):
            n_elems = prod(orig_shape)
            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
            assert n_elems % n_new_elems == 0
            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)

        if new_shape[-1] != orig_shape[-1]:
            raise NotImplementedError  # can't reshape the row size trivially

        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
        shape_B = (*new_shape[:-1], self._rank)
        return LoraTorchTensor(
            self._lora_A.reshape(shape_A),
            self._lora_B.reshape(shape_B),
        )

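    # Worked example for reshape above (hypothetical shapes): a LoRA pair for
    # a (4096, 4096) weight at rank 16 reshaped to (32, 128, 4096) keeps the
    # row size, so A becomes (1, 16, 4096) and B goes from (4096, 16) to
    # (32, 128, 16); only B carries the split dimensions, A is broadcast.
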
    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
        return self.reshape(*other.shape)

    def view(self, *size: int) -> LoraTorchTensor:
        return self.reshape(*size)

    def permute(self, *dims: int) -> LoraTorchTensor:
        shape = self.shape
        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
        if dims[-1] == -1:
            # TODO: support higher dimensional A shapes bigger than 1
            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
        else:
            # TODO: compose the above two
            raise NotImplementedError

    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
        shape = self.shape
        dims = [i for i in range(len(shape))]
        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
        return self.permute(*dims)

    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
        return self.transpose(axis0, axis1)

    def to(self, *args, **kwargs):
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

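    # __torch_function__ below hooks torch's standard function-override
    # protocol: module-level calls such as torch.stack([t0, t1]) or
    # torch.cat([t0, t1]) on LoraTorchTensors (illustrative call sites) are
    # dispatched here and applied to the A and B factors separately.
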
    @classmethod
    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
        del types  # unused

        if kwargs is None:
            kwargs = {}

        if func is torch.permute:
            return type(args[0]).permute(*args, **kwargs)
        elif func is torch.reshape:
            return type(args[0]).reshape(*args, **kwargs)
        elif func is torch.stack:
            assert isinstance(args[0], Sequence)
            dim = kwargs.get("dim", 0)
            assert dim == 0
            return LoraTorchTensor(
                torch.stack([a._lora_A for a in args[0]], dim),
                torch.stack([b._lora_B for b in args[0]], dim),
            )
        elif func is torch.cat:
            assert isinstance(args[0], Sequence)
            dim = kwargs.get("dim", 0)
            assert dim == 0
            if len(args[0][0].shape) > 2:
                return LoraTorchTensor(
                    torch.cat([a._lora_A for a in args[0]], dim),
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
                return LoraTorchTensor(
                    args[0][0]._lora_A,
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError


def get_base_tensor_name(lora_tensor_name: str) -> str:
    base_name = lora_tensor_name.replace("base_model.model.", "")
    base_name = base_name.replace(".lora_A.weight", ".weight")
    base_name = base_name.replace(".lora_B.weight", ".weight")
    return base_name

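# Example of the mapping above (typical PEFT tensor name, illustrative):
#   "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
#   -> "model.layers.0.self_attn.q_proj.weight"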

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "--no-lazy", action="store_true",
        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="only print out what will be done, without writing any new files",
    )
    parser.add_argument(
        "--base", type=Path, required=True,
        help="directory containing base model file",
    )
    parser.add_argument(
        "lora_path", type=Path,
        help="directory containing LoRA adapter file",
    )

    return parser.parse_args()


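# Example invocation (hypothetical paths; the flags are defined in parse_args above):
#   python convert_lora_to_gguf.py --base ./base_model --outtype f16 ./lora_adapter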
if __name__ == '__main__':
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

    ftype = ftype_map[args.outtype]

    dir_base_model: Path = args.base
    dir_lora: Path = args.lora_path
    lora_config = dir_lora / "adapter_config.json"
    input_model = dir_lora / "adapter_model.safetensors"

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_lora

    if os.path.exists(input_model):
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file

        lora_model = load_file(input_model, device="cpu")
    else:
        input_model = os.path.join(dir_lora, "adapter_model.bin")
        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

    # load base model
    logger.info(f"Loading base model: {dir_base_model.name}")
    hparams = Model.load_hparams(dir_base_model)
    with torch.inference_mode():
        try:
            model_class = Model.from_model_architecture(hparams["architectures"][0])
        except NotImplementedError:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)

        class LoraModel(model_class):
            model_arch = model_class.model_arch

            lora_alpha: float

            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):

                super().__init__(*args, **kwargs)

                self.dir_model_card = dir_lora_model
                self.lora_alpha = float(lora_alpha)

            def set_type(self):
                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

            def set_gguf_parameters(self):
                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                super().set_gguf_parameters()

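            # Note on set_gguf_parameters above: only alpha is written to the
            # GGUF metadata; the rank is implicit in the lora_a/lora_b tensor
            # shapes, so a loader can presumably recover the usual LoRA scale
            # alpha / rank from the file alone.
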
            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                tensor_map: dict[str, PartialLoraTensor] = {}

                for name, tensor in lora_model.items():
                    if self.lazy:
                        tensor = LazyTorchTensor.from_eager(tensor)
                    base_name = get_base_tensor_name(name)
                    is_lora_a = ".lora_A.weight" in name
                    is_lora_b = ".lora_B.weight" in name
                    if not is_lora_a and not is_lora_b:
                        if ".base_layer.weight" in name:
                            continue
                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                        sys.exit(1)

                    if base_name in tensor_map:
                        if is_lora_a:
                            tensor_map[base_name].A = tensor
                        else:
                            tensor_map[base_name].B = tensor
                    else:
                        if is_lora_a:
                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
                        else:
                            tensor_map[base_name] = PartialLoraTensor(B=tensor)

                for name, tensor in tensor_map.items():
                    assert tensor.A is not None
                    assert tensor.B is not None
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

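            # modify_tensors below reuses the base model's tensor-name mapping
            # and then splits each merged LoRA pair back into two GGUF tensors,
            # "<name>.lora_a" and "<name>.lora_b", as emitted by its yields.
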
            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                dest = list(super().modify_tensors(data_torch, name, bid))
                # some archs may have the same tensor for lm_head and output (tie word embeddings)
                # in this case, adapters targeting lm_head will fail when using llama-export-lora
                # therefore, we ignore them for now
                # see: https://github.com/ggerganov/llama.cpp/issues/9065
                if name == "lm_head.weight" and len(dest) == 0:
                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()

                    yield (dest_name + ".lora_a", lora_a)
                    yield (dest_name + ".lora_b", lora_b)

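        # adapter_config.json is the PEFT adapter's metadata; the field used
        # here looks like (illustrative values):
        #   { "lora_alpha": 32, "r": 16, ... }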
        with open(lora_config, "r") as f:
            lparams: dict[str, Any] = json.load(f)

        alpha: float = lparams["lora_alpha"]

        model_instance = LoraModel(
            dir_base_model,
            ftype,
            fname_out,
            is_big_endian=args.bigendian,
            use_temp_file=False,
            eager=args.no_lazy,
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
            is_lora=True,
        )

        logger.info("Exporting model...")
        model_instance.write()
        logger.info(f"Model successfully exported to {model_instance.fname_out}")