# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Siglip model."""
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added

import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn.init import _calculate_fan_in_and_fan_out

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import (
from transformers.utils import logging

logger = logging.get_logger(__name__)
class SiglipVisionConfig(PretrainedConfig):
    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel
    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipVisionConfig()
    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipVisionModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config

    model_type = "siglip_vision_model"

        intermediate_size=3072,
        num_attention_heads=12,
        hidden_act="gelu_pytorch_tanh",
        attention_dropout=0.0,
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
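

# Minimal usage sketch (illustrative, not part of the original file; it assumes the
# helper's elided return of (indices, cu_seqlens, max_seqlen_in_batch), as in the
# upstream transformers implementation): for a right-padded batch mask the helper
# yields the flat indices of the real tokens and the cumulative sequence lengths
# that flash-attention varlen kernels expect.
def _example_unpad_data():
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
    # indices -> [0, 1, 2, 4, 5]; cu_seqlens -> [0, 3, 5]; max_seqlen -> 3
    return indices, cu_seqlens, max_seqlen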


def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
        og_dtype = tensor.dtype
        tensor = tensor.to(torch.float32)
        tensor = tensor.to(og_dtype)

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))

    # Clamp to ensure it's in the proper range
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not (yet?) defined in float16+cpu
        tensor = tensor.to(torch.float32)
        tensor.clamp_(min=a, max=b)
        tensor = tensor.to(torch.float16)
        tensor.clamp_(min=a, max=b)
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \\text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to the TensorFlow / JAX implementations, where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0,
    and the result is subsequently scaled and shifted by the mean and std args.

        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)
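

# Minimal usage sketch (illustrative, not part of the original file): initialize a
# weight with the TF/JAX-style truncated normal described above (trunc_normal_tf_,
# the name under which it is called later in this file) -- sample in [a, b] with
# mean=0/std=1, then rescale by std and shift by mean.
def _example_trunc_normal_tf():
    w = torch.empty(768, 768)
    trunc_normal_tf_(w, mean=0.0, std=0.02)  # values end up roughly within [-0.04, 0.04]
    return w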


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    elif mode == "fan_out":
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
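

# Minimal usage sketch (illustrative, not part of the original file): how the
# fan-based initializers above are typically applied. lecun_normal_ draws from a
# truncated normal with variance 1 / fan_in; default_flax_embed_init uses a plain
# normal with the same variance, matching Flax's default embedding init.
def _example_variance_scaling():
    linear = nn.Linear(1152, 4304)
    lecun_normal_(linear.weight)          # std is about sqrt(1 / 1152)
    embed = nn.Embedding(32000, 1152)
    default_flax_embed_init(embed.weight)
    return linear, embed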


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
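        # Illustrative note (not part of the original file): with the config defaults
        # (image_size=224, patch_size=16) this yields 224 // 16 = 14 patches per side
        # and 14 ** 2 = 196 patch positions, one learned position embedding per patch.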


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
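        # Illustrative note (not part of the original file): with the config defaults
        # (hidden_size=768, num_attention_heads=12) each head attends over
        # head_dim = 768 // 12 = 64 channels, so attention logits are scaled by
        # 64 ** -0.5 = 0.125 before the softmax.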


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    def __init__(self, config):
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
            SiglipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)


class SiglipPreTrainedModel(PreTrainedModel):
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


SIGLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage

        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.


# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    def __init__(self, config: SiglipVisionConfig):
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False


class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embedding


from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig

VISION = "clip.vision"


def add_key_str(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    if has_minicpmv and name in ["visual_projection.weight"]:
    if name.startswith("v") and not has_vision:
    if name.startswith("t") and not has_text:


def get_tensor_name(name: str) -> str:
    if "projection" in name:
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)

    return (
        name.replace("text_model", "t").replace("vision_model", "v")
        .replace("encoder.layers", "blk").replace("embeddings.", "")
        .replace("_proj", "").replace("self_attn.", "attn_")
        .replace("layer_norm", "ln").replace("layernorm", "ln")
        .replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up")
        .replace("embedding", "embd").replace("final", "post")
        .replace("layrnorm", "ln")
    )
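

# Illustrative examples (not part of the original script) of what the renaming chain
# above produces for typical HF checkpoint keys:
#   "vision_model.encoder.layers.0.self_attn.q_proj.weight" -> "v.blk.0.attn_q.weight"
#   "vision_model.embeddings.patch_embedding.weight"        -> "v.patch_embd.weight"
#   "vision_model.encoder.layers.0.mlp.fc1.weight"          -> "v.blk.0.ffn_down.weight"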


def bytes_to_unicode():
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    It also avoids mapping to whitespace/control characters that the bpe code barfs on.
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    for b in range(2**8):
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
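

# Minimal usage sketch (illustrative, not part of the original script): the mapping is
# a bijection from every byte value 0..255 to a printable unicode character; printable
# ASCII maps to itself and the remaining bytes are shifted past 255 (e.g. byte 0 maps
# to chr(256)).
def _example_bytes_to_unicode():
    byte_encoder = bytes_to_unicode()
    assert len(byte_encoder) == 256
    assert byte_encoder[ord("A")] == "A"
    byte_decoder = {v: k for k, v in byte_encoder.items()}  # inverse lookup
    return byte_decoder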


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)

args = ap.parse_args()


if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")

    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        tokens = [key for key in vocab]

# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]

# if args.clip_model_is_vision or args.clip_model_is_openclip:
#     model = CLIPVisionModel.from_pretrained(dir_model)
#     model = CLIPModel.from_pretrained(dir_model)
#     processor = CLIPProcessor.from_pretrained(dir_model)

minicpmv_version = args.minicpmv_version
if minicpmv_version == 1:
elif minicpmv_version == 2:
elif minicpmv_version == 3:

default_vision_config = {
    "intermediate_size": 4304,
    "model_type": "idefics2",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,

vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)

# if model.attn_pool is not None:
#     model.attn_pool = torch.nn.Identity()
# model.blocks = model.blocks[:-1]
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))

has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False

    fname_middle = "text-"
    has_vision_encoder = False
elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_minicpmv_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False

output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
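# Illustrative example (not part of the original script): with the default f16 output
# and --minicpmv-projector given, fname_middle is "mmproj-" and ftype_str[ftype] is
# "f16", so the file written below is "<output_dir>/mmproj-model-f16.gguf".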
fout = GGUFWriter(path=fname_out, arch="clip")

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)

    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
    fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
    fout.add_description("image encoder for MiniCPM-V")
    fout.add_string("clip.projector_type", "resampler")
    fout.add_int32("clip.minicpmv_version", minicpmv_version)
    fout.add_description("two-tower CLIP model")

if has_vision_encoder:
    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", 448)
    fout.add_uint32("clip.vision.patch_size", 14)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

fout.add_bool("clip.use_gelu", use_gelu)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
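
# Worked example (illustrative, not part of the original script): for embed_dim=4 and
# positions [0, 1], omega is [1, 1/100], so each row is
# [sin(p), sin(p / 100), cos(p), cos(p / 100)]; position 0 encodes to [0, 0, 1, 1].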


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)


# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    grid_size: int of the grid height and width
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
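

# Minimal usage sketch (illustrative, not part of the original script; it assumes the
# function's elided `return pos_embed` as in the upstream MAE helper): the resampler
# key table built later with get_2d_sincos_pos_embed(emb_dim, (70, 70)) has one fixed
# sin/cos embedding per cell of the 70x70 grid, i.e. shape (4900, emb_dim).
def _example_2d_pos_embed():
    pe = get_2d_sincos_pos_embed(64, (4, 6))  # toy grid: 4 rows x 6 cols
    # pe.shape == (24, 64): one 64-dim embedding per grid cell
    return pe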


def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
    if re.match("resampler.proj", s):
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
    if re.match("resampler.attn.in_proj_.*", s):
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],

if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict

    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
            if name[-7:] == ".weight" and n_dims == 2:
                print(" Converting to float16")
                data = data.astype(np.float16)
                print(" Converting to float32")
                data = data.astype(np.float32)
            if data.dtype != np.float32:
                print(" Converting to float32")
                data = data.astype(np.float32)

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")


def _replace_name(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):


state_dict = model.state_dict()
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict

for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        print(f"skipping parameter: {name}")

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        if name[-7:] == ".weight" and n_dims == 2:
            print(" Converting to float16")
            data = data.astype(np.float16)
            print(" Converting to float32")
            data = data.astype(np.float32)
        if data.dtype != np.float32:
            print(" Converting to float32")
            data = data.astype(np.float32)

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)

fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()

print("Done. Output file: " + fname_out)