# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Siglip model."""
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added

import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn.init import _calculate_fan_in_and_fan_out

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import (
from transformers.utils import logging

logger = logging.get_logger(__name__)
class SiglipVisionConfig(PretrainedConfig):
    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel
    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipVisionConfig()
    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipVisionModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config

    model_type = "siglip_vision_model"

        intermediate_size=3072,
        num_attention_heads=12,
        hidden_act="gelu_pytorch_tanh",
        attention_dropout=0.0,
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
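

# Minimal usage sketch (illustrative, not part of the original file; it assumes the
# helper's elided return of (indices, cu_seqlens, max_seqlen_in_batch), as in the
# upstream transformers implementation): for a right-padded batch mask the helper
# yields the flat indices of the real tokens and the cumulative sequence lengths
# that flash-attention varlen kernels expect.
def _example_unpad_data():
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
    # indices -> [0, 1, 2, 4, 5]; cu_seqlens -> [0, 3, 5]; max_seqlen -> 3
    return indices, cu_seqlens, max_seqlen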


def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
        og_dtype = tensor.dtype
        tensor = tensor.to(torch.float32)
        tensor = tensor.to(og_dtype)

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))

    # Clamp to ensure it's in the proper range
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not (yet?) defined in float16+cpu
        tensor = tensor.to(torch.float32)
        tensor.clamp_(min=a, max=b)
        tensor = tensor.to(torch.float16)
        tensor.clamp_(min=a, max=b)
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \\text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to the TensorFlow / JAX implementations, where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0,
    and the result is subsequently scaled and shifted by the mean and std args.

        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)
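

# Minimal usage sketch (illustrative, not part of the original file): initialize a
# weight with the TF/JAX-style truncated normal described above (trunc_normal_tf_,
# the name under which it is called later in this file) -- sample in [a, b] with
# mean=0/std=1, then rescale by std and shift by mean.
def _example_trunc_normal_tf():
    w = torch.empty(768, 768)
    trunc_normal_tf_(w, mean=0.0, std=0.02)  # values end up roughly within [-0.04, 0.04]
    return w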


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    elif mode == "fan_out":
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
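

# Minimal usage sketch (illustrative, not part of the original file): how the
# fan-based initializers above are typically applied. lecun_normal_ draws from a
# truncated normal with variance 1 / fan_in; default_flax_embed_init uses a plain
# normal with the same variance, matching Flax's default embedding init.
def _example_variance_scaling():
    linear = nn.Linear(1152, 4304)
    lecun_normal_(linear.weight)          # std is about sqrt(1 / 1152)
    embed = nn.Embedding(32000, 1152)
    default_flax_embed_init(embed.weight)
    return linear, embed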


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
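        # Illustrative note (not part of the original file): with the config defaults
        # (image_size=224, patch_size=16) this yields 224 // 16 = 14 patches per side
        # and 14 ** 2 = 196 patch positions, one learned position embedding per patch.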


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
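        # Illustrative note (not part of the original file): with the config defaults
        # (hidden_size=768, num_attention_heads=12) each head attends over
        # head_dim = 768 // 12 = 64 channels, so attention logits are scaled by
        # 64 ** -0.5 = 0.125 before the softmax.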


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    def __init__(self, config):
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
            SiglipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)


class SiglipPreTrainedModel(PreTrainedModel):
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


SIGLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage

        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.


# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    def __init__(self, config: SiglipVisionConfig):
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False


class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embedding


from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig

VISION = "clip.vision"


def add_key_str(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    if has_minicpmv and name in ["visual_projection.weight"]:
    if name.startswith("v") and not has_vision:
    if name.startswith("t") and not has_text:


def get_tensor_name(name: str) -> str:
    if "projection" in name:
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)

    return (
        name.replace("text_model", "t").replace("vision_model", "v")
        .replace("encoder.layers", "blk").replace("embeddings.", "")
        .replace("_proj", "").replace("self_attn.", "attn_")
        .replace("layer_norm", "ln").replace("layernorm", "ln")
        .replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up")
        .replace("embedding", "embd").replace("final", "post")
        .replace("layrnorm", "ln")
    )
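

# Illustrative examples (not part of the original script) of what the renaming chain
# above produces for typical HF checkpoint keys:
#   "vision_model.encoder.layers.0.self_attn.q_proj.weight" -> "v.blk.0.attn_q.weight"
#   "vision_model.embeddings.patch_embedding.weight"        -> "v.patch_embd.weight"
#   "vision_model.encoder.layers.0.mlp.fc1.weight"          -> "v.blk.0.ffn_down.weight"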


def bytes_to_unicode():
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    It also avoids mapping to whitespace/control characters that the bpe code barfs on.
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    for b in range(2**8):
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
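

# Minimal usage sketch (illustrative, not part of the original script): the mapping is
# a bijection from every byte value 0..255 to a printable unicode character; printable
# ASCII maps to itself and the remaining bytes are shifted past 255 (e.g. byte 0 maps
# to chr(256)).
def _example_bytes_to_unicode():
    byte_encoder = bytes_to_unicode()
    assert len(byte_encoder) == 256
    assert byte_encoder[ord("A")] == "A"
    byte_decoder = {v: k for k, v in byte_encoder.items()}  # inverse lookup
    return byte_decoder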


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)

args = ap.parse_args()


if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")

    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        tokens = [key for key in vocab]

# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]

# if args.clip_model_is_vision or args.clip_model_is_openclip:
#     model = CLIPVisionModel.from_pretrained(dir_model)
#     model = CLIPModel.from_pretrained(dir_model)
#     processor = CLIPProcessor.from_pretrained(dir_model)

minicpmv_version = args.minicpmv_version
if minicpmv_version == 1:
elif minicpmv_version == 2:
elif minicpmv_version == 3:

default_vision_config = {
    "intermediate_size": 4304,
    "model_type": "idefics2",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,

vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)

# if model.attn_pool is not None:
#     model.attn_pool = torch.nn.Identity()
# model.blocks = model.blocks[:-1]
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))

has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False

    fname_middle = "text-"
    has_vision_encoder = False
elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_minicpmv_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False

output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
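# Illustrative example (not part of the original script): with the default f16 output
# and --minicpmv-projector given, fname_middle is "mmproj-" and ftype_str[ftype] is
# "f16", so the file written below is "<output_dir>/mmproj-model-f16.gguf".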
fout = GGUFWriter(path=fname_out, arch="clip")

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)

    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
    fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
    fout.add_description("image encoder for MiniCPM-V")
    fout.add_string("clip.projector_type", "resampler")
    fout.add_int32("clip.minicpmv_version", minicpmv_version)
    fout.add_description("two-tower CLIP model")

if has_vision_encoder:
    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", 448)
    fout.add_uint32("clip.vision.patch_size", 14)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

fout.add_bool("clip.use_gelu", use_gelu)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
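
# Worked example (illustrative, not part of the original script): for embed_dim=4 and
# positions [0, 1], omega is [1, 1/100], so each row is
# [sin(p), sin(p / 100), cos(p), cos(p / 100)]; position 0 encodes to [0, 0, 1, 1].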


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)


# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    grid_size: int of the grid height and width
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
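

# Minimal usage sketch (illustrative, not part of the original script; it assumes the
# function's elided `return pos_embed` as in the upstream MAE helper): the resampler
# key table built later with get_2d_sincos_pos_embed(emb_dim, (70, 70)) has one fixed
# sin/cos embedding per cell of the 70x70 grid, i.e. shape (4900, emb_dim).
def _example_2d_pos_embed():
    pe = get_2d_sincos_pos_embed(64, (4, 6))  # toy grid: 4 rows x 6 cols
    # pe.shape == (24, 64): one 64-dim embedding per grid cell
    return pe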


def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
    if re.match("resampler.proj", s):
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
    if re.match("resampler.attn.in_proj_.*", s):
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],

if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict

    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
            if name[-7:] == ".weight" and n_dims == 2:
                print(" Converting to float16")
                data = data.astype(np.float16)
                print(" Converting to float32")
                data = data.astype(np.float32)
            if data.dtype != np.float32:
                print(" Converting to float32")
                data = data.astype(np.float32)

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")


def _replace_name(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):


state_dict = model.state_dict()
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict

for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        print(f"skipping parameter: {name}")

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        if name[-7:] == ".weight" and n_dims == 2:
            print(" Converting to float16")
            data = data.astype(np.float16)
            print(" Converting to float32")
            data = data.astype(np.float32)
        if data.dtype != np.float32:
            print(" Converting to float32")
            data = data.astype(np.float32)

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)

fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()

print("Done. Output file: " + fname_out)