| | |
| | |
| | |
| | |
| | |
| | import copy |
| |
|
| | from transformers import LlamaConfig |
| | from transformers.configuration_utils import PretrainedConfig |
| | from transformers.utils import logging |
| |
|
| | from .configuration_intern_vit import InternVisionConfig |
| |
|
# Module-level logger, created through the `transformers.utils.logging` helper
# and named after this module; used by `InternVLConfig.__init__` for its
# "falling back to defaults" info messages.
logger = logging.get_logger(__name__)
| |
|
| |
|
class InternVLConfig(PretrainedConfig):
    r"""
    [`InternVLConfig`] is the configuration class to store the configuration of an
    [`InternVLModel`]. It is used to instantiate an InternVLModel according to the specified
    arguments, defining the InternViT-6B and QLLaMA configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the InternVL architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InternVisionConfig`].
            When `None`, an empty dictionary is used, i.e. the sub-config is built from its defaults.
        qllama_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`LlamaConfig`].
            When `None`, an empty dictionary is used, i.e. the sub-config is built from its defaults.
        clip_embed_dim (`int`, *optional*, defaults to 768):
            Size of the embeddings from the CLIP model.
        attn_pool_num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads used in the attention pooling layers.
        num_query_token (`int`, *optional*, defaults to 96):
            Number of query tokens used in the transformer. Stored on this config and also
            written onto `qllama_config`.
        label_smoothing (`float`, *optional*, defaults to 0.0):
            The amount of label smoothing to apply.
        cross_attention_frequency (`int`, *optional*, defaults to 2):
            The frequency of cross-attention layers in the model. Stored on this config and also
            written onto `qllama_config`.
        use_backbone_lora (`int`, *optional*, defaults to 0):
            If non-zero, indicates the use of LoRA in the backbone of the model.
        use_qllama_lora (`int`, *optional*, defaults to 0):
            If non-zero, indicates the use of LoRA in the QLLaMA of the model.
        force_image_size (`int` or `None`, *optional*):
            If not None, forces the model to use this specific image size.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        kwargs (*optional*):
            Dictionary of additional keyword arguments, forwarded to `PretrainedConfig.__init__`.
    """

    model_type = 'internvl'
    is_composition = True

    def __init__(
            self,
            vision_config=None,
            qllama_config=None,
            clip_embed_dim=768,
            attn_pool_num_heads=16,
            num_query_token=96,
            label_smoothing=0.0,
            cross_attention_frequency=2,
            use_backbone_lora=0,
            use_qllama_lora=0,
            force_image_size=None,
            initializer_range=0.02,
            **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info('vision_config is None. initializing the InternVisionConfig with default values.')

        if qllama_config is None:
            qllama_config = {}
            logger.info(
                'qllama_config is None. Initializing the InternTextConfig config with default values (`LlamaConfig`).')

        # Build the two sub-configurations from their option dictionaries.
        self.vision_config = InternVisionConfig(**vision_config)
        self.qllama_config = LlamaConfig(**qllama_config)

        # The explicit arguments always win over whatever `qllama_config`
        # carried for these two fields.
        self.qllama_config.num_query_token = num_query_token
        self.qllama_config.cross_attention_frequency = cross_attention_frequency

        # Expose the QLLaMA hidden size on the composite config.
        self.hidden_size = self.qllama_config.hidden_size

        self.clip_embed_dim = clip_embed_dim
        self.attn_pool_num_heads = attn_pool_num_heads
        self.num_query_token = num_query_token
        self.label_smoothing = label_smoothing
        # Keep the value on the composite config as well (like `num_query_token`).
        # Otherwise `to_dict()` has no top-level entry for it, and a config
        # rebuilt by feeding that dictionary back into `__init__` would fall back
        # to the default and overwrite the value stored inside `qllama_config`.
        self.cross_attention_frequency = cross_attention_frequency
        self.use_backbone_lora = use_backbone_lora
        self.use_qllama_lora = use_qllama_lora
        self.force_image_size = force_image_size
        self.initializer_range = initializer_range

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        The nested `vision_config` and `qllama_config` objects are replaced by their own
        `to_dict()` output, and the class-level `model_type` is added.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        # Deep copy so that callers mutating the result cannot affect this instance.
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['qllama_config'] = self.qllama_config.to_dict()
        output['model_type'] = self.__class__.model_type
        return output
| |
|