1"""
2This code is copied from https://huggingface.co/THUDM/chatglm-6b/resolve/main/configuration_chatglm.py
3"""
4
5""" ChatGLM model configuration """
6
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class ChatGLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`~ChatGLMModel`].
    It is used to instantiate a ChatGLM model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of
    the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 130528):
            Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`~ChatGLMModel`] or [`~TFChatGLMModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the encoder layers and the pooler layer.
        num_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        inner_hidden_size (`int`, *optional*, defaults to 16384):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder, four times
            `hidden_size` by default (4 * 4096 = 16384).
        max_sequence_length (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/value attentions (not used by all models).

    Example:

    ```python
    >>> from configuration_chatglm import ChatGLMConfig
    >>> from modeling_chatglm import ChatGLMModel

    >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
    >>> configuration = ChatGLMConfig()

    >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
    >>> model = ChatGLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "chatglm"

    def __init__(
        self,
        vocab_size=130528,
        hidden_size=4096,
        num_layers=28,
        num_attention_heads=32,
        layernorm_epsilon=1e-5,
        use_cache=True,
        bos_token_id=130004,
        eos_token_id=130005,
        mask_token_id=130000,
        gmask_token_id=130001,
        pad_token_id=3,
        max_sequence_length=2048,
        inner_hidden_size=16384,
        position_encoding_2d=True,
        quantization_bit=0,
        pre_seq_len=None,
        prefix_projection=False,
        **kwargs,
    ):
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.max_sequence_length = max_sequence_length
        self.layernorm_epsilon = layernorm_epsilon
        self.inner_hidden_size = inner_hidden_size
        self.use_cache = use_cache
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.mask_token_id = mask_token_id
        self.gmask_token_id = gmask_token_id
        # ChatGLM-specific options: 2D position encoding, INT4/INT8 weight quantization
        # (0 disables it), and P-Tuning v2 prefix-tuning settings.
        self.position_encoding_2d = position_encoding_2d
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection

        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
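

# A minimal usage sketch, assuming only the standard `PretrainedConfig` API from
# transformers (`save_pretrained`, `from_pretrained`, `to_dict`); the values below are
# illustrative overrides of the defaults defined above.
if __name__ == "__main__":
    import tempfile

    # Override a couple of defaults, e.g. a longer context window and INT8 weight quantization.
    config = ChatGLMConfig(max_sequence_length=4096, quantization_bit=8)
    print(config.model_type, config.num_layers, config.max_sequence_length)

    # Round-trip through disk the same way the Hub stores config.json.
    with tempfile.TemporaryDirectory() as tmp_dir:
        config.save_pretrained(tmp_dir)
        reloaded = ChatGLMConfig.from_pretrained(tmp_dir)
        assert reloaded.max_sequence_length == config.max_sequence_length
        assert reloaded.quantization_bit == config.quantization_bit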