From b8f8dd8cbfc4249b0a2d282bba34d76b4dcb6196 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 17 Nov 2025 18:23:18 +0100 Subject: [PATCH 1/2] tmp --- src/transformers/modeling_rope_utils.py | 101 +++++++----------- .../models/bamba/configuration_bamba.py | 3 +- .../configuration_efficientloftr.py | 21 ++-- .../efficientloftr/modeling_efficientloftr.py | 2 +- .../models/fuyu/configuration_fuyu.py | 9 +- .../models/glm/configuration_glm.py | 4 +- src/transformers/models/glm/modeling_glm.py | 2 +- src/transformers/models/glm/modular_glm.py | 2 +- .../models/glm4/configuration_glm4.py | 5 +- src/transformers/models/glm4/modeling_glm4.py | 2 +- .../models/glm4_moe/configuration_glm4_moe.py | 5 +- .../models/glm4_moe/modeling_glm4_moe.py | 2 +- .../models/glm4_moe/modular_glm4_moe.py | 5 +- .../glm4v/convert_glm4v_mgt_weights_to_hf.py | 8 +- .../models/glm4v/modeling_glm4v.py | 2 +- .../glm4v_moe/configuration_glm4v_moe.py | 4 +- .../convert_glm4v_moe_mgt_weights_to_hf.py | 7 +- .../models/glm4v_moe/modeling_glm4v_moe.py | 2 +- .../models/glm4v_moe/modular_glm4v_moe.py | 6 +- .../models/gpt_neox/configuration_gpt_neox.py | 12 +-- .../models/gpt_neox/modeling_gpt_neox.py | 2 +- .../models/gpt_neox/modular_gpt_neox.py | 2 +- .../configuration_gpt_neox_japanese.py | 9 +- .../moonshine/configuration_moonshine.py | 5 +- .../models/moonshine/modeling_moonshine.py | 2 +- .../models/moonshine/modular_moonshine.py | 5 +- .../models/nemotron/configuration_nemotron.py | 4 +- .../models/nemotron/modeling_nemotron.py | 2 +- .../persimmon/configuration_persimmon.py | 5 +- .../models/persimmon/modeling_persimmon.py | 2 +- .../models/phi/configuration_phi.py | 5 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi/modular_phi.py | 2 +- .../models/phi3/configuration_phi3.py | 5 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../configuration_phi4_multimodal.py | 5 +- .../modeling_phi4_multimodal.py | 2 +- .../qwen3_next/configuration_qwen3_next.py | 5 +- .../models/qwen3_next/modeling_qwen3_next.py | 2 +- .../models/qwen3_next/modular_qwen3_next.py | 2 +- .../configuration_recurrent_gemma.py | 5 +- .../modeling_recurrent_gemma.py | 2 +- .../models/stablelm/configuration_stablelm.py | 5 +- .../models/stablelm/modeling_stablelm.py | 2 +- 44 files changed, 104 insertions(+), 184 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index c6a66ba1c4b3..5d9bfc936e81 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -16,7 +16,7 @@ from functools import wraps from typing import Optional, TypedDict -from .configuration_utils import PreTrainedConfig +from .configuration_utils import ALLOWED_LAYER_TYPES, PreTrainedConfig from .utils import is_torch_available, logging @@ -27,57 +27,6 @@ import torch -def standardize_rope_params(config, rope_theta: float | dict[str, float] | None = None): - """ - Helper to standardize the config's rope params field by ensuring the params are defined for each - later type. 
For old model the fn will duplicate a single rope param in each layer type (backward compatibility) - """ - rope_parameters = getattr(config, "rope_parameters", None) - layer_types = getattr(config, "layer_types", None) - if rope_theta is None: - rope_theta = getattr(config, "rope_theta", None) - - # Case 1: one RoPE theat = one RoPE param per model without nesting - if not isinstance(rope_theta, dict): - if rope_parameters is None: - rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - else: - # BC: if there is a 'type' field, copy it it to 'rope_type'. - rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) - rope_theta = rope_parameters.get("rope_theta") or rope_theta - rope_parameters.update({"rope_theta": rope_theta, "rope_type": rope_type}) - config.rope_parameters = rope_parameters - - # Case 2: different RoPE for each layer as nested dict - else: - rope_parameters_per_layer_type = {} - for layer_type in layer_types: - if rope_parameters is None: - rope_parameters_per_layer_type[layer_type] = { - "rope_type": "default", - "rope_theta": rope_theta[layer_type], - } - else: - is_field_in_new_format = any(layer_type in rope_parameters for layer_type in layer_types) - if not is_field_in_new_format: - curr_rope_type = rope_parameters.get("rope_type", rope_parameters.get("type")) - rope_parameters_per_layer_type[layer_type] = { - **rope_parameters, - "rope_type": curr_rope_type, - "rope_theta": rope_theta[layer_type], - } - else: - curr_rope_type = rope_parameters[layer_type].get( - "rope_type", rope_parameters[layer_type].get("type") - ) - rope_parameters_per_layer_type[layer_type] = { - **rope_parameters[layer_type], - "rope_type": curr_rope_type, - "rope_theta": rope_theta[layer_type], - } - config.rope_parameters = rope_parameters_per_layer_type - - def dynamic_rope_update(rope_forward): """ Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE @@ -214,7 +163,7 @@ def _compute_linear_scaling_rope_parameters( # Gets the default RoPE parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) attention_factor = 1.0 # Unused in this type of RoPE @@ -277,7 +226,7 @@ def _compute_dynamic_ntk_parameters( rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) max_position_embeddings = config.max_position_embeddings @@ -364,7 +313,7 @@ def _compute_yarn_parameters( rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = 
int(head_dim * partial_rotary_factor) @@ -494,7 +443,7 @@ def _compute_longrope_parameters( rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) @@ -579,7 +528,7 @@ def _compute_llama3_parameters( # Gets the default RoPE parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) attention_factor = 1.0 # Unused in this type of RoPE @@ -760,7 +709,7 @@ def _validate_longrope_parameters(rope_parameters: dict, config: PreTrainedConfi rope_type = rope_parameters["rope_type"] _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) @@ -862,9 +811,7 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] if rope_parameters_dict is None: return - if getattr(config, "layer_types", None) is not None and all( - key in config.layer_types for key in rope_parameters_dict.keys() - ): + if set(rope_parameters_dict.keys()).issubset(ALLOWED_LAYER_TYPES): pass else: rope_parameters_dict = {"full_attention": rope_parameters_dict} @@ -885,7 +832,7 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] ) -class RopeParameters(TypedDict): +class RopeParameters(TypedDict, total=False): """ Args: rope_theta (`float`): @@ -893,6 +840,8 @@ class RopeParameters(TypedDict): rope_type (`str`, *optional*, defaults to "default"): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. + partial_rotary_factor (`float`, *optional*): + Percentage of the query and keys which will have rotary embedding. factor (`float`, *optional*): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In most scaling types, a `factor` of x will enable the model to handle sequences of length x * @@ -926,6 +875,7 @@ class RopeParameters(TypedDict): rope_theta: float rope_type: Optional[str] + partial_rotary_factor: Optional[float] factor: Optional[float] original_max_position_embeddings: Optional[int] attention_factor: Optional[float] @@ -935,3 +885,30 @@ class RopeParameters(TypedDict): long_factor: Optional[list[float]] low_freq_factor: Optional[float] high_freq_factor: Optional[float] + + +def standardize_rope_params(config): + """ + Helper to standardize the config's rope params field by ensuring the params are defined for each + later type. 
For old models, the function will duplicate a single rope param in each layer type (backward compatibility)
+    """
+    rope_parameters = getattr(config, "rope_parameters", {})
+
+    # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet
+    rope_theta = getattr(config, "rope_theta", None)
+    partial_rotary_factor = getattr(config, "partial_rotary_factor", None)
+
+    # Case 1: one RoPE theta = one RoPE param per model without nesting
+    if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
+        rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default"))
+        rope_parameters.setdefault("rope_theta", rope_theta)
+        rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor)
+    # Case 2: different RoPE for each layer as nested dict
+    else:
+        for layer_type in config.layer_types:
+            rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default"))
+            rope_parameters[layer_type].setdefault("rope_theta", rope_theta)
+            rope_parameters[layer_type].setdefault("partial_rotary_factor", partial_rotary_factor)
+
+    config.rope_parameters = rope_parameters
+    rope_config_validation(config)
diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py
index 07fd3eaa1aab..f849fbb3cefa 100644
--- a/src/transformers/models/bamba/configuration_bamba.py
+++ b/src/transformers/models/bamba/configuration_bamba.py
@@ -171,10 +171,11 @@ def __init__(
         self.num_logits_to_keep = num_logits_to_keep
         self.attn_layer_indices = attn_layer_indices
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        self.partial_rotary_factor = 0.5
         rope_scaling = kwargs.pop("rope_scaling", None)
         self.rope_parameters = rope_scaling or rope_parameters
+        self.rope_parameters["partial_rotary_factor"] = 0.5
         # Validate the correctness of rotary position embeddings parameters
         rope_theta = kwargs.get("rope_theta", 10000.0)
diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py
index 8b57c903dde8..8ba99bd02d23 100644
--- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py
+++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py
@@ -14,7 +14,7 @@
 from typing import Optional

 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import standardize_rope_params


 class EfficientLoFTRConfig(PreTrainedConfig):
@@ -67,10 +67,7 @@ class EfficientLoFTRConfig(PreTrainedConfig):
         fine_kernel_size (`int`, *optional*, defaults to 8):
             Kernel size used for the fine feature matching
         batch_norm_eps (`float`, *optional*, defaults to 1e-05):
-            The epsilon used by the batch normalization layers.
-        partial_rotary_factor (`float`, *optional*, defaults to 4.0):
-            Dim factor for the RoPE embeddings, in EfficientLoFTR, frequencies should be generated for
-            the whole hidden_size, so this factor is used to compensate.
+            The epsilon used by the batch normalization layers.
         rope_parameters (`RopeParameters`, *optional*):
             Dictionary containing the configuration parameters for the RoPE embeddings.
The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE @@ -121,7 +118,6 @@ def __init__( coarse_matching_border_removal: int = 2, fine_kernel_size: int = 8, batch_norm_eps: float = 1e-5, - partial_rotary_factor: float = 4.0, rope_parameters: Optional[dict] = None, fine_matching_slice_dim: int = 8, fine_matching_regress_temperature: float = 10.0, @@ -176,17 +172,16 @@ def __init__( self.fine_matching_regress_temperature = fine_matching_regress_temperature self.num_key_value_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_scaling or rope_parameters or {} + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 4.0) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + # Standardize and validate the correctness of rotary position embeddings parameters + standardize_rope_params(self) super().__init__(**kwargs) diff --git a/src/transformers/models/efficientloftr/modeling_efficientloftr.py b/src/transformers/models/efficientloftr/modeling_efficientloftr.py index bdf6dd67ae48..1774bc48e0dd 100644 --- a/src/transformers/models/efficientloftr/modeling_efficientloftr.py +++ b/src/transformers/models/efficientloftr/modeling_efficientloftr.py @@ -125,7 +125,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index bbe4a5ec22d8..e1315ad9cbca 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -77,9 +77,6 @@ class FuyuConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio after computing the attention scores. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - Percentage of the query and keys which will have rotary embedding. - pad_token_id (`int`, *optional*): The id of the *padding* token. 
bos_token_id (`int`, *optional*, defaults to 1): @@ -122,7 +119,6 @@ def __init__( qk_layernorm: Optional[bool] = True, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.5, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -131,6 +127,8 @@ def __init__( **kwargs, ): if text_config is None: + rope_parameters = rope_parameters if rope_parameters is not None else {} + rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) text_config = { "vocab_size": vocab_size, "max_position_embeddings": max_position_embeddings, @@ -146,7 +144,6 @@ def __init__( "qk_layernorm": qk_layernorm, "hidden_dropout": hidden_dropout, "attention_dropout": attention_dropout, - "partial_rotary_factor": partial_rotary_factor, "pad_token_id": pad_token_id, "bos_token_id": bos_token_id, "eos_token_id": eos_token_id, @@ -172,11 +169,11 @@ def __init__( self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor self.image_token_id = image_token_id # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 25000.0) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index e0d2c3d6492a..7f7509730d8f 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -48,7 +48,6 @@ class GlmConfig(PreTrainedConfig): by meanpooling all the original heads within that group. For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. head_dim (`int`, *optional*, defaults to 128): The attention head dimension. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -112,7 +111,6 @@ def __init__( num_hidden_layers: Optional[int] = 40, num_attention_heads: Optional[int] = 32, num_key_value_heads: Optional[int] = 2, - partial_rotary_factor: Optional[float] = 0.5, head_dim: Optional[int] = 128, hidden_act: Optional[str] = "silu", attention_dropout: Optional[float] = 0.0, @@ -134,7 +132,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -146,6 +143,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index a4880c0145e9..19d08d9fa44e 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -101,7 +101,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 059cb296c972..97b47a9b8f7f 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -60,7 +60,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 43e6323b0060..caf6194bba94 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -48,8 +48,6 @@ class Glm4Config(PreTrainedConfig): by meanpooling all the original heads within that group. For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. head_dim (`int`, *optional*, defaults to 128): The attention head dimension. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -113,7 +111,6 @@ def __init__( num_hidden_layers: Optional[int] = 40, num_attention_heads: Optional[int] = 32, num_key_value_heads: Optional[int] = 2, - partial_rotary_factor: Optional[float] = 0.5, head_dim: Optional[int] = 128, hidden_act: Optional[str] = "silu", attention_dropout: Optional[float] = 0.0, @@ -135,7 +132,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -147,6 +143,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index ba07da7cab54..a1f6a01c1cc5 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -305,7 +305,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index aa1a16a95b37..10da1ba90aac 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -47,8 +47,6 @@ class Glm4MoeConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -144,7 +142,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 131072, @@ -173,7 +170,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -185,6 +181,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4_moe/modeling_glm4_moe.py b/src/transformers/models/glm4_moe/modeling_glm4_moe.py index 1c6575f3420a..13ab166a6312 100644 --- a/src/transformers/models/glm4_moe/modeling_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modeling_glm4_moe.py @@ -82,7 +82,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index 0912f2289f2f..b8c4cd222475 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -61,8 +61,6 @@ class Glm4MoeConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -158,7 +156,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 131072, @@ -187,7 +184,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -199,6 +195,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py index fb57f66a9ae0..dd9f2fba17d3 100644 --- a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py +++ b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py @@ -702,9 +702,13 @@ def offset_layer(x, offset=llm_layer_offset): "dtype": text_config.get("torch_dtype", "bfloat16"), "use_cache": text_config.get("use_cache", True), "vocab_size": text_config.get("vocab_size", 151552), - "partial_rotary_factor": 0.5, "tie_word_embeddings": False, - "rope_parameters": {"rope_type": "default", "rope_theta": 10000.0, "mrope_section": [8, 12, 12]}, + "rope_parameters": { + "rope_type": "default", + "rope_theta": 10000.0, + "mrope_section": [8, 12, 12], + "partial_rotary_factor": 0.5, + }, } hf_config["text_config"] = txt_config diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index 47ad72ac96ce..835fd47cc48e 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -425,7 +425,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 3d158e1b19cb..44004afd6c9c 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -139,7 +139,6 @@ class Glm4vMoeTextConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. 
num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -231,7 +230,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 65536, @@ -261,7 +259,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -273,6 +270,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py b/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py index d8b08716b6c4..54a9564b69c5 100644 --- a/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py +++ b/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py @@ -707,7 +707,12 @@ def offset_layer(x, offset=llm_layer_offset): "n_shared_experts": text_config.get("n_shared_experts", 1), "norm_topk_prob": text_config.get("norm_topk_prob", True), "num_experts_per_tok": text_config.get("num_experts_per_tok", 8), - "rope_parameters": {"rope_type": "default", "rope_theta": 10000.0, "mrope_section": [8, 12, 12]}, + "rope_parameters": { + "rope_type": "default", + "rope_theta": 10000.0, + "mrope_section": [8, 12, 12], + "partial_rotary_factor": 0.5, + }, } hf_config["text_config"] = txt_config diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 631505562bc6..b93d0b03bfe7 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -129,7 +129,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index c69ca8439315..a0b494664ab1 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -88,7 +88,6 @@ class Glm4vMoeTextConfig(Glm4MoeConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. 
-        partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
         num_key_value_heads (`int`, *optional*, defaults to 8):
             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
@@ -177,7 +176,6 @@ def __init__(
         intermediate_size: Optional[int] = 10944,
         num_hidden_layers: Optional[int] = 46,
         num_attention_heads: Optional[int] = 96,
-        partial_rotary_factor: Optional[float] = 0.5,
         num_key_value_heads: Optional[int] = 8,
         hidden_act: Optional[str] = "silu",
         max_position_embeddings: Optional[int] = 65536,
@@ -207,7 +205,6 @@ def __init__(
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
-        self.partial_rotary_factor = partial_rotary_factor
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
@@ -219,6 +216,7 @@ def __init__(
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
         rope_scaling = kwargs.pop("rope_scaling", None)
         self.rope_parameters = rope_scaling or rope_parameters
+        self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5)
         # Validate the correctness of rotary position embeddings parameters
         rope_theta = kwargs.get("rope_theta", 10000.0)
@@ -376,7 +374,7 @@ def compute_default_rope_parameters(
             post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
     base = config.rope_parameters["rope_theta"]
-    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
+    partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0)
     head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
     dim = int(head_dim * partial_rotary_factor)
diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
index 744e0316146c..8c2c77a8deb3 100644
--- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
@@ -50,8 +48,6 @@ class GPTNeoXConfig(PreTrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        rotary_pct (`float`, *optional*, defaults to 0.25):
-            percentage of hidden dimensions to allocate to rotary embeddings
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio probability of the attention score.
         hidden_dropout (`float`, *optional*, defaults to 0.0):
             hidden states.
         classifier_dropout (`float`, *optional*, defaults to 0.1):
             Argument used when doing token classification, used in the model [`GPTNeoXForTokenClassification`].
-            The dropout ratio for the hidden layer.
+            The dropout ratio for the classifier head.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
             The maximum sequence length that this model might ever be used with. Typically set this to something
             large just in case (e.g., 512 or 1024 or 2048).
@@ -119,7 +116,6 @@ def __init__(
         num_attention_heads: Optional[int] = 64,
         intermediate_size: Optional[int] = 24576,
         hidden_act: Optional[str] = "gelu",
-        rotary_pct: Optional[float] = 0.25,
         attention_dropout: Optional[float] = 0.0,
         hidden_dropout: Optional[float] = 0.0,
         classifier_dropout: Optional[float] = 0.1,
@@ -143,8 +139,6 @@ def __init__(
         self.num_attention_heads = num_attention_heads
         self.intermediate_size = intermediate_size
         self.hidden_act = hidden_act
-        self.rotary_pct = rotary_pct
-        self.partial_rotary_factor = rotary_pct
         self.attention_dropout = attention_dropout
         self.hidden_dropout = hidden_dropout
         self.classifier_dropout = classifier_dropout
@@ -156,10 +150,8 @@ def __init__(
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
         rope_scaling = kwargs.pop("rope_scaling", None)
         self.rope_parameters = rope_scaling or rope_parameters
+        self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25)
         self.attention_bias = attention_bias
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
         # Validate the correctness of rotary position embeddings parameters
         rope_theta = kwargs.get("rotary_emb_base", 10000.0)
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index fc7d6fd40a80..e3cd7c7d4d39 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -88,7 +88,7 @@ def compute_default_rope_parameters(
             post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
     base = config.rope_parameters["rope_theta"]
-    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
+    partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0)
     head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
     dim = int(head_dim * partial_rotary_factor)
diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py
index c267753db350..a2baca515668 100644
--- a/src/transformers/models/gpt_neox/modular_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py
@@ -62,7 +62,7 @@ def compute_default_rope_parameters(
             post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
     base = config.rope_parameters["rope_theta"]
-    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
+    partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0)
     head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
     dim = int(head_dim * partial_rotary_factor)
diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
index 409232145f2a..077b6429f510 100644
--- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
@@ -49,8 +49,6 @@ class GPTNeoXJapaneseConfig(PreTrainedConfig):
             intermediate_multiple_size.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler.
- rotary_pct (`float`, *optional*, defaults to 1.00): - percentage of hidden dimensions to allocate to rotary embeddings max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. initializer_range (`float`, *optional*, defaults to 0.02): @@ -93,7 +91,6 @@ def __init__( num_attention_heads: Optional[int] = 32, intermediate_multiple_size: Optional[int] = 4, hidden_act: Optional[str] = "gelu", - rotary_pct: Optional[float] = 1.00, max_position_embeddings: Optional[int] = 2048, initializer_range: Optional[float] = 0.02, layer_norm_eps: Optional[int] = 1e-5, @@ -113,19 +110,15 @@ def __init__( self.num_attention_heads = num_attention_heads self.intermediate_multiple_size = intermediate_multiple_size self.hidden_act = hidden_act - self.rotary_pct = rotary_pct - self.partial_rotary_factor = rotary_pct self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rotary_emb_base", 10000.0) diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index e04909e1f7eb..59513e50477d 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -87,8 +87,6 @@ class MoonshineConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.9): - Percentage of the query and keys which will have rotary embedding. is_encoder_decoder (`bool`, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -142,7 +140,6 @@ def __init__( decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, @@ -174,13 +171,13 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 0840c1623489..e54541c91b2b 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -118,7 +118,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 38314c4535a6..ab379620345c 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -110,8 +110,6 @@ class MoonshineConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.9): - Percentage of the query and keys which will have rotary embedding. is_encoder_decoder (`bool`, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -165,7 +163,6 @@ def __init__( decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, @@ -197,13 +194,13 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index c5f888ac6d36..f9d57bf46b73 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -80,7 +80,6 @@ class NemotronConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -123,7 +122,6 @@ def __init__( eos_token_id: Optional[int] = 3, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.5, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, @@ -141,13 +139,13 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index af1d14ee2da0..751305b0a0ea 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -132,7 +132,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index f9dbe11580b2..3760519d4266 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -70,8 +70,6 @@ class PersimmonConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, default to 0.0): The dropout ratio after computing the attention scores. - partial_rotary_factor (`float`, *optional*, default to 0.5): - Percentage of the query and keys which will have rotary embedding. Example: @@ -102,7 +100,6 @@ def __init__( qk_layernorm: Optional[bool] = True, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.5, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -121,10 +118,10 @@ def __init__( self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 25000.0) diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 4b09a2dd75bf..094b26dbabc0 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -99,7 +99,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index 5476cb1b6c7c..d386b30c6959 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -79,8 +79,6 @@ class PhiConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - Percentage of the query and keys which will have rotary embedding. qk_layernorm (`bool`, *optional*, defaults to `False`): Whether or not to normalize the Queries and Keys after projecting the hidden states. 
bos_token_id (`int`, *optional*, defaults to 1): @@ -138,7 +136,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.5, qk_layernorm: Optional[bool] = False, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -162,11 +159,11 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.qk_layernorm = qk_layernorm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 4a1530b78564..5e91a9f70265 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -70,7 +70,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi/modular_phi.py b/src/transformers/models/phi/modular_phi.py index 3ecc9ba9d4f7..a06e3b16cfc6 100644 --- a/src/transformers/models/phi/modular_phi.py +++ b/src/transformers/models/phi/modular_phi.py @@ -54,7 +54,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 35eb2df30c9d..6581a2ce9b36 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -81,8 +81,6 @@ class Phi3Config(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 1.0): - Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 1): The id of the "beginning-of-sequence" token. 
eos_token_id (`int`, *optional*, defaults to 32000): @@ -140,7 +138,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 1.0, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 32000, pad_token_id: Optional[int] = 32000, @@ -166,10 +163,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 29b3d2847ed1..3f98bb1b0042 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -104,7 +104,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 46c104d027a7..53f686234fd2 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -296,8 +296,6 @@ class Phi4MultimodalConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to `1.0`): - Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 199999): The id of the "beginning-of-sequence" token. 
eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`): @@ -367,7 +365,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[int] = 1, bos_token_id: Optional[int] = 199999, eos_token_id: Optional[list[int]] = [199999, 200020], pad_token_id: Optional[int] = 199999, @@ -407,10 +404,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index eab15068d252..304d89e6a5f0 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -1481,7 +1481,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 83eb062cb6f8..0527148166c0 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -71,8 +71,6 @@ class Qwen3NextConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.25): - Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
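The Qwen3Next hunk below keeps an explicit `head_dim` (256) next to a 0.25 rotary fraction; the default-RoPE helpers prefer that explicit value over `hidden_size // num_attention_heads` before applying the fraction. A small sketch of that lookup order, with a dict standing in for the config; only `head_dim` and the 0.25 default come from the hunk, the other sizes are made up:

# Dict standing in for a patched Qwen3Next-style config.
config = {
    "hidden_size": 2048,           # made-up size
    "num_attention_heads": 16,     # made-up size
    "head_dim": 256,               # explicit head_dim from the hunk below
    "rope_parameters": {"rope_theta": 10000.0, "partial_rotary_factor": 0.25},
}

# Mirrors `getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads`:
# an explicit head_dim wins, the quotient is only a fallback.
head_dim = config.get("head_dim") or config["hidden_size"] // config["num_attention_heads"]
rotary_dim = int(head_dim * config["rope_parameters"].get("partial_rotary_factor", 1.0))
print(head_dim, rotary_dim)  # 256 64 -> only the first 64 dims of each 256-dim head are rotated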
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -166,7 +164,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.25, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, head_dim: Optional[int] = 256, @@ -199,13 +196,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) self.layer_types = layer_types if self.layer_types is None: diff --git a/src/transformers/models/qwen3_next/modeling_qwen3_next.py b/src/transformers/models/qwen3_next/modeling_qwen3_next.py index 362c8fab007f..d0bf37e64de2 100644 --- a/src/transformers/models/qwen3_next/modeling_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modeling_qwen3_next.py @@ -213,7 +213,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/qwen3_next/modular_qwen3_next.py b/src/transformers/models/qwen3_next/modular_qwen3_next.py index 7deedb9c868b..d4f9d017d2d3 100644 --- a/src/transformers/models/qwen3_next/modular_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modular_qwen3_next.py @@ -203,7 +203,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 130044ee099d..54b482141b42 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -75,8 +75,6 @@ class RecurrentGemmaConfig(PreTrainedConfig): Beginning of stream token id. hidden_activation (``str` or `function``, *optional*, defaults to `"gelu_pytorch_tanh"`): The hidden activation used in the recurrent block as well as the MLP layer of the decoder layers. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The partial rotary factor used in the initialization of the rotary embeddings. rope_parameters (`RopeParameters`, *optional*): Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE @@ -119,7 +117,6 @@ def __init__( eos_token_id: Optional[int] = 1, bos_token_id: Optional[int] = 2, hidden_activation: Optional[str] = "gelu_pytorch_tanh", - partial_rotary_factor: Optional[float] = 0.5, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, block_types: Optional[list[str]] = ("recurrent", "recurrent", "attention"), attention_dropout: Optional[float] = 0.0, @@ -139,7 +136,6 @@ def __init__( self.logits_soft_cap = logits_soft_cap self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.block_types = list(block_types) self.hidden_activation = hidden_activation self.head_dim = self.hidden_size // self.num_attention_heads @@ -153,6 +149,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index dc1a3d4951e2..8f3061d495a0 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -102,7 +102,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 0efdcd94adcd..a0ddcc33a79c 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -86,8 +86,6 @@ class StableLmConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - partial_rotary_factor (`float`, *optional*, defaults to 0.25): - Percentage of the query and keys which will have rotary embedding. bos_token_id (int, *optional*, defaults to 0): The id of the `BOS` token in the vocabulary. 
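The modeling-side edits in these hunks are not cosmetic: once `partial_rotary_factor` is deleted from the config object, the old `getattr(config, "partial_rotary_factor", 1.0)` lookup would silently fall back to 1.0 and rotate every head dimension of a 0.25-factor model such as StableLM. A tiny sketch of the two lookups against a stand-in config:

from types import SimpleNamespace

# Stand-in for a patched StableLM-style config: the factor now lives only inside the dict.
config = SimpleNamespace(rope_parameters={"rope_theta": 10000.0, "partial_rotary_factor": 0.25})

old_lookup = getattr(config, "partial_rotary_factor", 1.0)             # 1.0, the attribute is gone
new_lookup = config.rope_parameters.get("partial_rotary_factor", 1.0)  # 0.25, what the checkpoint expects
print(old_lookup, new_lookup)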
eos_token_id (int, *optional*, defaults to 0): @@ -125,7 +123,6 @@ def __init__( use_parallel_residual: Optional[bool] = False, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.25, bos_token_id: Optional[int] = 0, eos_token_id: Optional[int] = 0, **kwargs, @@ -148,10 +145,10 @@ def __init__( self.use_parallel_residual = use_parallel_residual self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 3b091726fab4..fd56e5642cf0 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -98,7 +98,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) From 2ee00d06bcb60b917760d21c062678855b74f196 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 18 Nov 2025 16:02:44 +0100 Subject: [PATCH 2/2] batch push --- src/transformers/modeling_rope_utils.py | 181 +++++++++--------- .../models/apertus/configuration_apertus.py | 11 +- .../models/apertus/modular_apertus.py | 7 +- .../models/arcee/configuration_arcee.py | 10 +- .../models/aria/configuration_aria.py | 10 +- .../models/bamba/configuration_bamba.py | 10 +- .../models/bitnet/configuration_bitnet.py | 10 +- .../models/blt/configuration_blt.py | 39 ++-- .../chameleon/configuration_chameleon.py | 9 +- .../models/cohere/configuration_cohere.py | 9 +- .../models/cohere2/configuration_cohere2.py | 10 +- .../models/cohere2/modular_cohere2.py | 11 +- .../models/csm/configuration_csm.py | 18 +- .../models/cwm/configuration_cwm.py | 10 +- src/transformers/models/cwm/modular_cwm.py | 6 +- .../models/dbrx/configuration_dbrx.py | 9 +- .../deepseek_v2/configuration_deepseek_v2.py | 10 +- .../deepseek_v3/configuration_deepseek_v3.py | 10 +- .../open_llama/configuration_open_llama.py | 3 +- .../models/dia/configuration_dia.py | 18 +- .../diffllama/configuration_diffllama.py | 10 +- .../models/doge/configuration_doge.py | 10 +- src/transformers/models/doge/modular_doge.py | 10 +- .../models/dots1/configuration_dots1.py | 10 +- .../configuration_efficientloftr.py | 7 +- .../models/emu3/configuration_emu3.py | 10 +- .../models/ernie4_5/configuration_ernie4_5.py | 10 +- .../configuration_ernie4_5_moe.py | 10 +- .../models/evolla/configuration_evolla.py | 10 +- .../models/exaone4/configuration_exaone4.py | 10 +- .../models/exaone4/modular_exaone4.py | 10 +- .../models/falcon/configuration_falcon.py | 10 +- .../falcon_h1/configuration_falcon_h1.py | 10 +- .../flex_olmo/configuration_flex_olmo.py | 10 +- 
.../models/flex_olmo/modular_flex_olmo.py | 7 +- .../models/fuyu/configuration_fuyu.py | 10 +- .../models/gemma/configuration_gemma.py | 10 +- .../models/gemma/modular_gemma.py | 10 +- .../models/gemma2/configuration_gemma2.py | 10 +- .../models/gemma2/modular_gemma2.py | 11 +- .../models/gemma3/configuration_gemma3.py | 29 ++- .../models/gemma3/modular_gemma3.py | 28 ++- .../models/gemma3n/configuration_gemma3n.py | 28 +-- .../models/gemma3n/modular_gemma3n.py | 24 ++- .../models/glm/configuration_glm.py | 10 +- .../models/glm4/configuration_glm4.py | 10 +- .../models/glm4_moe/configuration_glm4_moe.py | 10 +- .../models/glm4_moe/modular_glm4_moe.py | 10 +- .../models/glm4v/configuration_glm4v.py | 10 +- .../models/glm4v/modular_glm4v.py | 10 +- .../glm4v_moe/configuration_glm4v_moe.py | 10 +- .../models/glm4v_moe/modular_glm4v_moe.py | 10 +- .../models/gpt_neox/configuration_gpt_neox.py | 10 +- .../configuration_gpt_neox_japanese.py | 10 +- .../models/gpt_oss/configuration_gpt_oss.py | 10 +- .../models/granite/configuration_granite.py | 12 +- .../granitemoe/configuration_granitemoe.py | 12 +- .../configuration_granitemoehybrid.py | 10 +- .../configuration_granitemoeshared.py | 12 +- .../models/helium/configuration_helium.py | 10 +- .../configuration_hunyuan_v1_dense.py | 10 +- .../configuration_hunyuan_v1_moe.py | 10 +- .../models/jetmoe/configuration_jetmoe.py | 10 +- .../configuration_kyutai_speech_to_text.py | 10 +- .../models/lfm2/configuration_lfm2.py | 10 +- .../models/lfm2_moe/configuration_lfm2_moe.py | 10 +- .../models/llama/configuration_llama.py | 10 +- .../models/llama4/configuration_llama4.py | 18 +- .../configuration_longcat_flash.py | 10 +- .../models/mimi/configuration_mimi.py | 10 +- .../models/minimax/configuration_minimax.py | 10 +- .../models/minimax/modular_minimax.py | 10 +- .../ministral/configuration_ministral.py | 10 +- .../models/ministral/modular_ministral.py | 10 +- .../models/mistral/configuration_mistral.py | 10 +- .../models/mixtral/configuration_mixtral.py | 10 +- .../models/mllama/configuration_mllama.py | 10 +- .../modernbert/configuration_modernbert.py | 25 ++- .../models/modernbert/modular_modernbert.py | 25 ++- .../configuration_modernbert_decoder.py | 25 ++- .../modular_modernbert_decoder.py | 25 ++- .../moonshine/configuration_moonshine.py | 10 +- .../models/moonshine/modular_moonshine.py | 10 +- .../models/moshi/configuration_moshi.py | 10 +- .../models/nemotron/configuration_nemotron.py | 10 +- .../models/olmo/configuration_olmo.py | 10 +- .../models/olmo2/configuration_olmo2.py | 10 +- .../models/olmo3/configuration_olmo3.py | 10 +- .../models/olmo3/modular_olmo3.py | 10 +- .../models/olmoe/configuration_olmoe.py | 10 +- .../persimmon/configuration_persimmon.py | 10 +- .../models/phi/configuration_phi.py | 10 +- .../models/phi3/configuration_phi3.py | 10 +- .../configuration_phi4_multimodal.py | 10 +- .../models/phimoe/configuration_phimoe.py | 11 +- .../models/pixtral/configuration_pixtral.py | 10 +- .../models/qwen2/configuration_qwen2.py | 10 +- .../configuration_qwen2_5_omni.py | 26 +-- .../qwen2_5_omni/modular_qwen2_5_omni.py | 26 +-- .../qwen2_5_vl/configuration_qwen2_5_vl.py | 10 +- .../qwen2_moe/configuration_qwen2_moe.py | 10 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 10 +- .../models/qwen3/configuration_qwen3.py | 10 +- .../qwen3_moe/configuration_qwen3_moe.py | 10 +- .../qwen3_next/configuration_qwen3_next.py | 10 +- .../configuration_qwen3_omni_moe.py | 35 ++-- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 15 +- 
.../models/qwen3_vl/configuration_qwen3_vl.py | 10 +- .../models/qwen3_vl/modular_qwen3_vl.py | 10 +- .../configuration_qwen3_vl_moe.py | 10 +- .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 10 +- .../configuration_recurrent_gemma.py | 10 +- .../models/seed_oss/configuration_seed_oss.py | 10 +- .../models/smollm3/configuration_smollm3.py | 7 +- .../models/smollm3/modular_smollm3.py | 7 +- .../models/stablelm/configuration_stablelm.py | 10 +- .../starcoder2/configuration_starcoder2.py | 10 +- .../models/t5gemma/configuration_t5gemma.py | 10 +- .../vaultgemma/configuration_vaultgemma.py | 10 +- .../models/zamba2/configuration_zamba2.py | 10 +- 120 files changed, 806 insertions(+), 790 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 5d9bfc936e81..aebf3a65b777 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -27,6 +27,87 @@ import torch +class RopeParameters(TypedDict, total=False): + """ + Args: + rope_theta (`float`): + The base period of the RoPE embeddings. + rope_type (`str`, *optional*, defaults to "default"): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + partial_rotary_factor (`float`, *optional*): + Percentage of the query and keys which will have rotary embedding. + factor (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + original_max_position_embeddings (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + attention_factor (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + beta_fast (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + beta_slow (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + short_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + long_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + low_freq_factor (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + high_freq_factor (`float`, *optional*): + Only used with 'llama3'. 
Scaling factor applied to high frequency components of the RoPE + """ + + rope_theta: float + rope_type: Optional[str] + partial_rotary_factor: Optional[float] + factor: Optional[float] + original_max_position_embeddings: Optional[int] + attention_factor: Optional[float] + beta_fast: Optional[float] + beta_slow: Optional[float] + short_factor: Optional[list[float]] + long_factor: Optional[list[float]] + low_freq_factor: Optional[float] + high_freq_factor: Optional[float] + + +def get_standardized_rope_params(config): + """ + Helper to standardize the config's rope params field by ensuring the params are defined for each + later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility) + """ + rope_parameters = getattr(config, "rope_parameters", {}) + + # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet + rope_theta = getattr(config, "rope_theta", None) + partial_rotary_factor = getattr(config, "partial_rotary_factor", None) + + # Case 1: one RoPE theat = one RoPE param per model without nesting + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) + rope_parameters.setdefault("rope_theta", rope_theta) + rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor) + # Case 2: different RoPE for each layer as nested dict + else: + for layer_type in config.layer_types: + rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) + rope_parameters[layer_type].setdefault("rope_theta", rope_theta) + rope_parameters[layer_type].setdefault("partial_rotary_factor", partial_rotary_factor) + + return rope_parameters + + def dynamic_rope_update(rope_forward): """ Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE @@ -157,8 +238,8 @@ def _compute_linear_scaling_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) - rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters + rope_parameters_dict = get_standardized_rope_params(config) + rope_parameters_dict = rope_parameters_dict[layer_type] if layer_type is not None else rope_parameters_dict factor = rope_parameters_dict["factor"] # Gets the default RoPE parameters @@ -222,7 +303,7 @@ def _compute_dynamic_ntk_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -309,7 +390,7 @@ def _compute_yarn_parameters( post-processing scaling factor applied to the computed cos/sin. 
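The `get_standardized_rope_params` helper added above returns the same normalized shape whether a config stores one RoPE setup for the whole model or one per layer type. A self-contained sketch of both cases; the simplified copy below and the two-entry `ALLOWED_LAYER_TYPES` tuple are stand-ins for illustration only (the real constant lives in configuration_utils):

from types import SimpleNamespace

ALLOWED_LAYER_TYPES = ("full_attention", "sliding_attention")  # illustrative stand-in


def get_standardized_rope_params(config):
    """Simplified copy of the helper above, kept here only to show its behaviour."""
    rope_parameters = getattr(config, "rope_parameters", {}) or {}
    rope_theta = getattr(config, "rope_theta", None)
    partial_rotary_factor = getattr(config, "partial_rotary_factor", None)

    if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
        # Case 1: a single flat dict shared by every layer.
        rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default"))
        rope_parameters.setdefault("rope_theta", rope_theta)
        rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor)
    else:
        # Case 2: one dict per entry of config.layer_types.
        for layer_type in config.layer_types:
            params = rope_parameters[layer_type]
            params.setdefault("rope_type", params.get("type", "default"))
            params.setdefault("rope_theta", rope_theta)
            params.setdefault("partial_rotary_factor", partial_rotary_factor)
    return rope_parameters


# Old-style config: flat attributes plus a minimal dict.
legacy = SimpleNamespace(rope_theta=500000.0, partial_rotary_factor=0.5,
                         rope_parameters={"rope_type": "default"})
print(get_standardized_rope_params(legacy))
# {'rope_type': 'default', 'rope_theta': 500000.0, 'partial_rotary_factor': 0.5}

# Hybrid-attention config: one RoPE dict per layer type, keys drawn from layer_types.
hybrid = SimpleNamespace(
    rope_theta=None,
    partial_rotary_factor=None,
    layer_types=["full_attention", "sliding_attention"],
    rope_parameters={
        "full_attention": {"rope_theta": 1000000.0},
        "sliding_attention": {"rope_theta": 10000.0},
    },
)
print(get_standardized_rope_params(hybrid))
# each nested dict now carries an explicit rope_type; rope_theta and
# partial_rotary_factor are filled from the flat attributes when present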
""" # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -439,7 +520,7 @@ def _compute_longrope_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -523,7 +604,7 @@ def _compute_llama3_parameters( post-processing scaling factor applied to the computed cos/sin. """ # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters # Gets the default RoPE parameters @@ -803,14 +884,16 @@ def _validate_llama3_parameters(rope_parameters: dict, config: PreTrainedConfig, } -def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] = None): +def rope_config_standardize_and_validate(config: PreTrainedConfig, ignore_keys: Optional[set] = None): """ Validate the RoPE config arguments, given a `PreTrainedConfig` object """ - rope_parameters_dict = getattr(config, "rope_parameters", None) # not a default parameter in `PreTrainedConfig` + rope_parameters_dict = get_standardized_rope_params(config) if rope_parameters_dict is None: return + # Update the config with correctly formatted RoPE parameters + config.rope_parameters = rope_parameters_dict if set(rope_parameters_dict.keys()).issubset(ALLOWED_LAYER_TYPES): pass else: @@ -830,85 +913,3 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] logger.warning( f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" ) - - -class RopeParameters(TypedDict, total=False): - """ - Args: - rope_theta (`float`): - The base period of the RoPE embeddings. - rope_type (`str`, *optional*, defaults to "default"): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - partial_rotary_factor (`float`, *optional*): - Percentage of the query and keys which will have rotary embedding. - factor (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - original_max_position_embeddings (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - attention_factor (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - beta_fast (`float`, *optional*): - Only used with 'yarn'. 
Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - beta_slow (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - short_factor (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - long_factor (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - low_freq_factor (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - high_freq_factor (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - """ - - rope_theta: float - rope_type: Optional[str] - partial_rotary_factor: Optional[float] - factor: Optional[float] - original_max_position_embeddings: Optional[int] - attention_factor: Optional[float] - beta_fast: Optional[float] - beta_slow: Optional[float] - short_factor: Optional[list[float]] - long_factor: Optional[list[float]] - low_freq_factor: Optional[float] - high_freq_factor: Optional[float] - - -def standardize_rope_params(config): - """ - Helper to standardize the config's rope params field by ensuring the params are defined for each - later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility) - """ - rope_parameters = getattr(config, "rope_parameters", {}) - - # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet - rope_theta = getattr(config, "rope_theta", None) - partial_rotary_factor = getattr(config, "partial_rotary_factor", None) - - # Case 1: one RoPE theat = one RoPE param per model without nesting - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): - rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) - rope_parameters.setdefault("rope_theta", rope_theta) - rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor) - # Case 2: different RoPE for each layer as nested dict - else: - for layer_type in config.layer_types: - rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) - rope_parameters[layer_type].setdefault("rope_theta", rope_theta) - rope_parameters[layer_type].setdefault("partial_rotary_factor", partial_rotary_factor) - - config.rope_parameters = rope_parameters - rope_config_validation(config) diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 98fe8e157016..4cf305b19726 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ApertusConfig(PreTrainedConfig): @@ -160,14 +160,15 @@ def __init__( 
self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 12000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + rope_parameters["rope_theta"] = kwargs.get("rope_theta", 12000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index b52e8bd82344..15498aaf1ec9 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -20,7 +20,7 @@ from torch import nn from ...cache_utils import Cache -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -180,9 +180,8 @@ def __init__( del self.head_dim # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 12000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 12000000.0) + rope_config_standardize_and_validate(self) class ApertusMLP(NemotronMLP): diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index b9892eaf8b61..26e36e123ab9 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ArceeConfig(PreTrainedConfig): @@ -165,12 +165,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 7e11d4d99d11..6bcd0df6d9fa 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ 
-21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ..auto import CONFIG_MAPPING, AutoConfig @@ -170,12 +170,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index f849fbb3cefa..943ffb38102d 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -174,13 +174,13 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = 0.5 # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 0473ad6ac407..0918b6470723 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -140,12 +140,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings 
parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index 7459346645ea..2c2992e3ce25 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -67,12 +67,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -122,12 +122,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -169,11 +169,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -249,11 +250,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - 
self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -377,11 +379,12 @@ def __init__( self.monotonicity = kwargs.get("monotonicity", False) # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Cross attention configurations self.cross_attn_k = cross_attn_k diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index bfa8a9f33469..003265a6cec5 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -232,11 +232,12 @@ def __init__( self.swin_norm = swin_norm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) if vq_config is None: vq_config = {} diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 18afd5fd32e9..23656ae0ad5e 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -167,11 +167,12 @@ def __init__( self.use_qk_norm = use_qk_norm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + 
self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 910dc6dcb80a..12126d4f7b22 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Cohere2Config(PreTrainedConfig): @@ -168,7 +168,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads @@ -193,9 +194,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["Cohere2Config"] diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index af9fa871f391..f769bf7c204b 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -27,8 +27,7 @@ from ...modeling_rope_utils import ( RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, + rope_config_standardize_and_validate, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -192,7 +191,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads @@ -217,9 +217,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class 
Cohere2RotaryEmbedding(CohereRotaryEmbedding): diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index ce1ad2dd5993..eac38977457a 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -163,12 +163,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) class CsmConfig(PreTrainedConfig): @@ -350,12 +350,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 765f7f713247..5673657c6fa2 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate class CwmConfig(PreTrainedConfig): @@ -179,12 +179,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1_000_000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + 
self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index df2a003438a8..ac093fd13733 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -21,7 +21,7 @@ from ...configuration_utils import layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ..llama.configuration_llama import LlamaConfig @@ -183,8 +183,8 @@ def __init__( del self.attention_bias # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1_000_000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_config_standardize_and_validate(self) class CwmRotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 82182c49bd3f..193ba0909711 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -17,7 +17,7 @@ from typing import Any, Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -223,11 +223,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters["rope_theta"] = 10000.0 # Validate the correctness of rotary position embeddings parameters - standardize_rope_params(self, rope_theta=10000.0) - rope_config_validation(self) + rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 101d699194fd..75ed18f6d3f7 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class DeepseekV2Config(PreTrainedConfig): @@ -211,12 +211,12 @@ def __init__( self.head_dim = qk_rope_head_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if 
rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 928a0e1fcf7a..19804559fcdf 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -19,7 +19,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} @@ -227,17 +227,17 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: self.rope_parameters[key] = float(self.rope_parameters[key]) - rope_config_validation(self) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py index 64545d7abcf6..96d23881617a 100644 --- a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py @@ -132,7 +132,8 @@ def __init__( self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self._rope_parameters_validation() super().__init__( diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index b54b5620e524..076e9c42c528 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -94,12 +94,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = 
rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -200,12 +200,12 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index cbfb5fea5160..b946d57c0f7d 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -20,7 +20,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class DiffLlamaConfig(PreTrainedConfig): @@ -147,12 +147,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index b1058db36a72..250fb272045c 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class DogeConfig(PreTrainedConfig): @@ -191,12 +191,12 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = 
rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 008466fbf4ac..0090ccaafd68 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -31,7 +31,7 @@ from ...integrations.flex_attention import compile_friendly_flex_attention from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import AttentionInterface, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, is_torch_flex_attn_available, logging @@ -220,12 +220,12 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index 71393a7844ba..b645d8df9652 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -15,7 +15,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -194,7 +194,8 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -207,9 +208,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 8ba99bd02d23..0643b35dd6ed 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate class EfficientLoFTRConfig(PreTrainedConfig): @@ -176,12 +176,13 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters or {} + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} or {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 4.0) self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) # Standardize and validate the correctness of rotary position embeddings parameters - standardize_rope_params(self) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 634efd227f9e..7d4aa1d29449 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Emu3VQVAEConfig(PreTrainedConfig): @@ -228,12 +228,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 346eff50e9f2..f3439fa4b248 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Ernie4_5Config(PreTrainedConfig): @@ -150,12 +150,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None 
else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index 66a299b04c00..c6286cca3089 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -183,12 +183,12 @@ def __init__( self.use_bias = use_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 4dab03fb9314..ad3248543725 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -251,12 +251,12 @@ def __init__( self.resampler_ff_mult = resampler_ff_mult # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Subconfig if protein_encoder_config is None: diff --git 
a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index a968bcc6f07b..9c9f044878f9 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Exaone4Config(PreTrainedConfig): @@ -166,7 +166,8 @@ def __init__( self.sliding_window_pattern = sliding_window_pattern # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.sliding_window is None: @@ -183,9 +184,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 4ddc3466ffd9..56a7b7f12a7b 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -30,7 +30,7 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -199,7 +199,8 @@ def __init__( self.sliding_window_pattern = sliding_window_pattern # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.sliding_window is None: @@ -216,9 +217,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 3e7b437954dc..fce62fa5a929 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ 
b/src/transformers/models/falcon/configuration_falcon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -164,12 +164,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index 6ba590f15025..b02fe9d46466 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -199,12 +199,12 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.projectors_bias = projectors_bias mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 635d398b46d9..bd4832be6636 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class FlexOlmoConfig(PreTrainedConfig): @@ -177,12 +177,12 @@ def __init__( self.norm_topk_prob = norm_topk_prob # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters 
= rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index a25362a71f35..062d8ad141d9 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache, DynamicCache from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import OutputRecorder, check_model_inputs @@ -192,9 +192,8 @@ def __init__( del self.clip_qkv # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # FlexOlmo RMS norm reuses Olmo2 RMS norm, which handles low precision slightly differently than the original Olmoe. diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index e1315ad9cbca..40e6eed2ca65 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -172,13 +172,13 @@ def __init__( self.image_token_id = image_token_id # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 25000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 25000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index a2c6ac12f008..926ed9c406a5 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import 
PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class GemmaConfig(PreTrainedConfig): @@ -154,12 +154,12 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 1445baef96bf..bb4d219d54aa 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -24,7 +24,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...tokenization_utils import AddedToken, PreTrainedTokenizer @@ -182,12 +182,12 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 460fb7000354..a37b79930226 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Gemma2Config(PreTrainedConfig): @@ -182,7 +182,8 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -191,9 +192,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["Gemma2Config"] diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 4e36cc22e030..f62769e66ad0 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -30,8 +30,7 @@ ROPE_INIT_FUNCTIONS, RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, + rope_config_standardize_and_validate, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -211,7 +210,8 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -220,9 +220,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class Gemma2RMSNorm(GemmaRMSNorm): diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index d549bfffddf1..64c699deda5a 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..siglip import SiglipVisionConfig @@ -187,16 +187,18 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - if rope_parameters is None: - rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} - elif "full_attention" in rope_parameters: - rope_parameters["full_attention"].update(rope_scaling) - else: - rope_parameters.update(rope_scaling) - - self.rope_parameters = rope_parameters + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -212,12 +214,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1_000_000.0) - rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 47e1b49ac7bb..b8aae16ddc4d 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -30,8 +30,7 @@ ROPE_INIT_FUNCTIONS, RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, + rope_config_standardize_and_validate, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack @@ -203,16 +202,18 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - if rope_parameters is None: - rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} - elif "full_attention" in rope_parameters: - rope_parameters["full_attention"].update(rope_scaling) - else: - rope_parameters.update(rope_scaling) + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) - self.rope_parameters = rope_parameters self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -228,12 +229,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1_000_000.0) - rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 796822cf4e37..76981434317b 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -23,7 +23,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import is_timm_available, logging, requires_backends @@ -225,9 +225,21 @@ def __init__( self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + + # Validate the correctness of rotary position embeddings parameters + rope_config_standardize_and_validate(self) if layer_types is None: self.layer_types = [ @@ -238,14 +250,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) - self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 375bd93f2723..60d760c3f651 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -236,9 +236,6 @@ def __init__( self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if layer_types is None: self.layer_types = [ @@ -249,13 +246,20 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 7f7509730d8f..324e779151cc 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class GlmConfig(PreTrainedConfig): @@ -142,13 +142,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index caf6194bba94..33e850e07893 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4Config(PreTrainedConfig): @@ -142,13 +142,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = 
kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 10da1ba90aac..b64435c28faf 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4MoeConfig(PreTrainedConfig): @@ -180,13 +180,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index b8c4cd222475..5e470938d05c 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -20,7 +20,7 @@ from torch import nn from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..cohere.modeling_cohere import CohereAttention from ..deepseek_v3.modeling_deepseek_v3 import ( @@ -194,13 +194,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4v/configuration_glm4v.py 
b/src/transformers/models/glm4v/configuration_glm4v.py index 7370a80b52f2..ff63708923b3 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4vVisionConfig(PreTrainedConfig): @@ -240,12 +240,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 2df8b6f9d04a..1892343d79d6 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -31,7 +31,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -277,12 +277,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 44004afd6c9c..a63c6c664fa6 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params 
+from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4vMoeVisionConfig(PreTrainedConfig): @@ -269,13 +269,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index a0b494664ab1..04b140772939 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -23,7 +23,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging @@ -215,13 +215,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 8c2c77a8deb3..376f2f0189fd 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -149,14 +149,14 @@ def __init__( self.use_parallel_residual = use_parallel_residual # Try to set `rope_scaling` if 
available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) self.attention_bias = attention_bias # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rotary_emb_base", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rotary_emb_base", 10000.0) + rope_config_standardize_and_validate(self) if self.hidden_size % self.num_attention_heads != 0: raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 077b6429f510..517f2a3eacf6 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -115,15 +115,15 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rotary_emb_base", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rotary_emb_base", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["GPTNeoXJapaneseConfig"] diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index d7e714079e39..ef249ff05d6f 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class GptOssConfig(PreTrainedConfig): @@ -111,12 +111,12 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings 
parameters - rope_theta = kwargs.get("rope_theta", 150000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 150000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 97d3eca0aafe..5a5abdbd13e1 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -177,12 +177,12 @@ def __init__( self.attention_multiplier = attention_multiplier # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, @@ -192,7 +192,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteConfig"] diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index 98460ec8a363..615a5b558044 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -161,12 +161,12 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -189,7 +189,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteMoeConfig"] diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 9a58272ec428..016b94544364 100644 --- 
a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -200,12 +200,12 @@ def __init__( self.shared_intermediate_size = shared_intermediate_size # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index b94545710e35..c5922ef7c6ce 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -166,12 +166,12 @@ def __init__( self.position_embedding_type = "rope" # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -195,7 +195,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteMoeSharedConfig"] diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index 3f3ee841991f..eb6a07d53134 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class HeliumConfig(PreTrainedConfig): @@ -149,12 +149,12 @@ def __init__( self.mlp_bias = mlp_bias # Try to set `rope_scaling` if available, 
otherwise use `rope_parameters`
         rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
+        rope_parameters = rope_scaling or rope_parameters
+        self.rope_parameters = rope_parameters if rope_parameters is not None else {}
         # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 100000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 100000.0)
+        rope_config_standardize_and_validate(self)
         super().__init__(
             pad_token_id=pad_token_id,
diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py
index 3dfa5388d1f7..fde580e13223 100644
--- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py
+++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py
@@ -17,7 +17,7 @@
 from typing import Optional
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate
 from ...utils import logging
@@ -143,12 +143,12 @@ def __init__(
         self.attention_dropout = attention_dropout
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
         rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
+        rope_parameters = rope_scaling or rope_parameters
+        self.rope_parameters = rope_parameters if rope_parameters is not None else {}
         # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)  # TODO needs model-specific validation?
+        self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0)
+        rope_config_standardize_and_validate(self)  # TODO needs model-specific validation?
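The change repeated across these configuration files follows one small pattern: any legacy `rope_scaling` / `rope_theta` (or model-specific `rotary_emb_base`, `rotary_pct`, `theta`) kwargs are folded into a single `rope_parameters` dict, and the old `standardize_rope_params` + `rope_config_validation` pair is collapsed into one `rope_config_standardize_and_validate` call. The sketch below is illustrative only: the helper is stubbed with an assumed signature (its real implementation lives in `modeling_rope_utils.py`, not in these hunks), and `ToyConfig` is a hypothetical name, not part of the patch.

# Illustrative sketch of the config-side pattern used throughout this patch.
# `rope_config_standardize_and_validate` is a stand-in with an assumed signature,
# not the helper added in modeling_rope_utils.py.
from typing import Optional


def rope_config_standardize_and_validate(config, ignore_keys=None):
    # Stand-in: only checks the minimal invariants the configs rely on
    # (a dict carrying "rope_type" and "rope_theta").
    config.rope_parameters.setdefault("rope_type", "default")
    assert "rope_theta" in config.rope_parameters


class ToyConfig:
    def __init__(self, rope_parameters: Optional[dict] = None, **kwargs):
        # Prefer the legacy `rope_scaling` kwarg if a checkpoint still passes it,
        # otherwise fall back to the new `rope_parameters` argument.
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters
        self.rope_parameters = rope_parameters if rope_parameters is not None else {}

        # Legacy flat kwargs are folded into the dict instead of becoming attributes.
        self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0)

        # Single call replaces the old standardize + validate pair.
        rope_config_standardize_and_validate(self)


config = ToyConfig(rope_scaling={"rope_type": "linear", "factor": 2.0}, rope_theta=50000.0)
print(config.rope_parameters)  # {'rope_type': 'linear', 'factor': 2.0, 'rope_theta': 50000.0}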
super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 5ee86b218ae0..1a9edbd96b9e 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -159,12 +159,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 43a7b069a32e..660e47d137da 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -149,12 +149,12 @@ def __init__( self.rms_norm_eps = rms_norm_eps # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 05c901d96dd4..5091117a1ea0 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import 
RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -185,12 +185,12 @@ def __init__( self.sliding_window = sliding_window # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 6ee32698cc85..28e72b7aa7f2 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Lfm2Config(PreTrainedConfig): @@ -150,7 +150,8 @@ def __init__( self.block_auto_adjust_ff_dim = block_auto_adjust_ff_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -158,9 +159,8 @@ def __init__( self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)] # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) + rope_config_standardize_and_validate(self) tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( diff --git a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py index f65af16d77b6..8d60a8637f92 100644 --- a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Lfm2MoeConfig(PreTrainedConfig): @@ -138,7 +138,8 @@ def __init__( self.num_hidden_layers = num_hidden_layers # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = 
rope_scaling or rope_parameters
+        self.rope_parameters = rope_parameters if rope_parameters is not None else {}
         self.max_position_embeddings = max_position_embeddings
         self.use_cache = use_cache
         self.norm_eps = norm_eps
@@ -162,9 +163,8 @@ def __init__(
         self.layer_types = layer_types
         # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0))
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters["rope_theta"] = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0))
+        rope_config_standardize_and_validate(self)
         tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings)  # to fit original config keys
         super().__init__(
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py
index add6c8ee2f74..150f54b4b9b9 100644
--- a/src/transformers/models/llama/configuration_llama.py
+++ b/src/transformers/models/llama/configuration_llama.py
@@ -22,7 +22,7 @@
 from typing import Optional
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate
 class LlamaConfig(PreTrainedConfig):
@@ -173,12 +173,12 @@ def __init__(
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
         rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
+        rope_parameters = rope_scaling or rope_parameters
+        self.rope_parameters = rope_parameters if rope_parameters is not None else {}
         # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0)
+        rope_config_standardize_and_validate(self)
         super().__init__(
             pad_token_id=pad_token_id,
diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py
index 88a0f4f82e53..7de3c3e6a830 100644
--- a/src/transformers/models/llama4/configuration_llama4.py
+++ b/src/transformers/models/llama4/configuration_llama4.py
@@ -17,7 +17,7 @@
 from typing import Optional
 from ...configuration_utils import PreTrainedConfig, layer_type_validation
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate
 from ...utils import logging
@@ -126,12 +126,12 @@ def __init__(
         self.vision_feature_select_strategy = vision_feature_select_strategy
         # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
         rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
+        rope_parameters = rope_scaling or rope_parameters
+        self.rope_parameters = rope_parameters if rope_parameters is not None else {}
         # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
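Configs with several attention layer types use the same entry point but key `rope_parameters` per layer type; the ModernBert-family hunks further down build exactly such a nested dict (`full_attention` / `sliding_attention`, each with its own `rope_theta`). A minimal sketch of that nested shape, again with the validator stubbed out and all class and argument names hypothetical:

# Illustrative only: the nested rope_parameters layout used by configs with
# several layer types. The validation helper below is a stand-in, not the one
# added by this patch.
def rope_config_standardize_and_validate(config, ignore_keys=None):
    for layer_type, params in config.rope_parameters.items():
        params.setdefault("rope_type", "default")
        assert "rope_theta" in params, f"missing rope_theta for {layer_type}"


class ToyDualRopeConfig:
    def __init__(self, rope_parameters=None, **kwargs):
        self.layer_types = ["sliding_attention", "full_attention"]

        # One sub-dict per layer type; a legacy flat `rope_scaling` kwarg, if given,
        # is merged into every entry, mirroring the ModernBert configs below.
        default_rope_params = {layer_type: {"rope_type": "default"} for layer_type in self.layer_types}
        rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params
        if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None:
            for layer_type in self.layer_types:
                rope_parameters[layer_type].update(rope_scaling)

        # Each layer type can carry its own base frequency.
        rope_parameters["full_attention"]["rope_theta"] = kwargs.get("global_rope_theta", 160_000.0)
        rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("local_rope_theta", 10_000.0)

        self.rope_parameters = rope_parameters
        rope_config_standardize_and_validate(self)


cfg = ToyDualRopeConfig(rope_scaling={"rope_type": "linear", "factor": 4.0})
print(cfg.rope_parameters["sliding_attention"])  # {'rope_type': 'linear', 'factor': 4.0, 'rope_theta': 10000.0}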
super().__init__(**kwargs) @@ -318,7 +318,8 @@ def __init__( self.use_qk_norm = use_qk_norm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.num_experts_per_tok = num_experts_per_tok self.num_local_experts = num_local_experts @@ -353,9 +354,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) class Llama4Config(PreTrainedConfig): diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index e99c2b8265c2..80868a6ca1d4 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class LongcatFlashConfig(PreTrainedConfig): @@ -212,17 +212,17 @@ def __init__( self.routed_scaling_factor = routed_scaling_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: self.rope_parameters[key] = float(self.rope_parameters[key]) - rope_config_validation(self) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 5453817e3ea4..52956f181c0c 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -20,7 +20,7 @@ import numpy as np from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -223,12 +223,12 @@ def __init__( self.attention_bias = attention_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position 
embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # Handle backward compatibility for frame_rate: # If frame_rate is explicitly provided, use it (backward compatibility) diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 1e582de1bff8..badbf8bbb485 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class MiniMaxConfig(PreTrainedConfig): @@ -223,7 +223,8 @@ def __init__( self.mlp_beta_factor = mlp_beta_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -232,9 +233,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 2f459f770998..b7e341416afc 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -28,7 +28,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ...utils.generic import OutputRecorder, check_model_inputs @@ -248,7 +248,8 @@ def __init__( self.mlp_beta_factor = mlp_beta_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -257,9 +258,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + 
self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 3f286cd69a9f..6cc23d9bea76 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -7,7 +7,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class MinistralConfig(PreTrainedConfig): @@ -159,7 +159,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -167,9 +168,8 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["MinistralConfig"] diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index f79600e82974..309149f4b2de 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -7,7 +7,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import check_model_inputs @@ -161,7 +161,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -169,9 +170,8 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class MinistralMLP(Qwen2MLP): diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index 0fac55d26e2a..a8021d3080bf 100644 --- 
a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -168,12 +168,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index adc86a035bed..caeb3b530b75 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -188,12 +188,12 @@ def __init__( self.router_jitter_noise = router_jitter_noise # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 85be3701a84a..2b0263acdca3 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate from ...utils import logging @@ -249,12 +249,12 @@ def __init__( self.max_position_embeddings = max_position_embeddings # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position 
embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index b3a045ae324a..19260a95177d 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -22,7 +22,7 @@ from typing import Literal, Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ModernBertConfig(PreTrainedConfig): @@ -206,9 +206,6 @@ def __init__( self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( @@ -227,13 +224,21 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) def to_dict(self): output = super().to_dict() diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 8783517edb8a..b276ca0faef2 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -35,7 +35,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, is_flash_attn_2_available, logging from ...utils.import_utils import is_triton_available @@ -234,9 +234,6 @@ def 
__init__( self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( @@ -255,13 +252,21 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) def to_dict(self): output = super().to_dict() diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index be60950fa593..1d8903ecd45d 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ModernBertDecoderConfig(PreTrainedConfig): @@ -187,9 +187,6 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -204,13 +201,21 @@ def __init__( else: self.layer_types.append("full_attention") + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 24c63f499bb2..09ddc064bbf4 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -208,9 +208,6 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -225,13 +222,21 @@ def __init__( else: self.layer_types.append("full_attention") + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 59513e50477d..cd542ffaf9b4 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class MoonshineConfig(PreTrainedConfig): @@ -176,13 +176,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index ab379620345c..1964820bb482 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -35,7 +35,7 @@ Seq2SeqLMOutput, Seq2SeqModelOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -199,13 +199,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use 
`rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index fea1a7cff985..8d77b25cc63a 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -284,12 +284,12 @@ def __init__( self.num_codebooks = num_codebooks # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index f9d57bf46b73..4de7f35c83d9 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -144,13 +144,13 @@ def __init__( self.mlp_bias = mlp_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index f01e33ead00a..30ba9d1863db 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -160,12 +160,12 @@ def __init__( self.clip_qkv = clip_qkv # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 3ba97d4f162b..533b6848e065 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -27,7 +27,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Olmo2Config(PreTrainedConfig): @@ -160,12 +160,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 6e3f5594cbb5..179394093f36 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Olmo3Config(PreTrainedConfig): @@ -170,7 +170,8 @@ def __init__( self.rms_norm_eps = rms_norm_eps # Try to set `rope_scaling` if available, otherwise 
use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.sliding_window = sliding_window self.layer_types = layer_types @@ -181,9 +182,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["Olmo3Config"] diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index d8bec6e9f15d..b01b56b4b382 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -25,7 +25,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding @@ -186,7 +186,8 @@ def __init__( self.rms_norm_eps = rms_norm_eps # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.sliding_window = sliding_window self.layer_types = layer_types @@ -197,9 +198,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class Olmo3RMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index efc04e8a56bb..2d030f312b69 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class OlmoeConfig(PreTrainedConfig): @@ -160,12 +160,12 @@ def __init__( self.norm_topk_prob = norm_topk_prob # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings 
parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 3760519d4266..28e0301f1999 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -120,13 +120,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 25000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 25000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index d386b30c6959..bd8be29e1b80 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -162,13 +162,13 @@ def __init__( self.qk_layernorm = qk_layernorm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 6581a2ce9b36..c064ad628a39 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import 
PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -165,13 +165,13 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() self.sliding_window = sliding_window diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 53f686234fd2..fbae60d90d4f 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Phi4MultimodalVisionConfig(PreTrainedConfig): @@ -406,13 +406,13 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() self.sliding_window = sliding_window diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index f7a9b528211f..687010b38290 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -169,11 +169,12 @@ def __init__( self.input_jitter_noise = input_jitter_noise # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = 
rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) if self.rope_parameters["rope_type"] != "default": if "original_max_position_embeddings" in self.rope_parameters: @@ -189,8 +190,6 @@ def __init__( f"`rope_parameters`'s long_mscale field must be a number, got {rope_parameters_long_mscale}" ) - rope_config_validation(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 62c179b20edc..b39db091a3bf 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -105,12 +105,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["PixtralVisionConfig"] diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index bda8bb8abfc7..4d0eaeeba280 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -159,7 +159,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -172,9 +173,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + 
self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index af96e9a3163f..4fdac248f53a 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -356,7 +356,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -369,9 +370,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -699,7 +699,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -719,9 +720,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -824,12 +824,12 @@ def __init__( self.enc_se_channels = enc_se_channels # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + 
rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 5b26bef72601..448af15b8bf3 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -45,7 +45,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, ModelOutput -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -389,7 +389,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -402,9 +403,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -732,7 +732,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -752,9 +753,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -857,12 +857,12 @@ def __init__( self.enc_se_channels = enc_se_channels # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 77ce5556c6cf..3e0195193dca 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -27,7 +27,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Qwen2_5_VLVisionConfig(PreTrainedConfig): @@ -204,7 +204,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -217,11 +218,10 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) if self.rope_parameters["rope_type"] == "mrope": self.rope_parameters["rope_type"] = "default" - rope_config_validation(self, ignore_keys={"mrope_section"}) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index 256d663d3114..a287417b3594 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -187,7 +187,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -210,9 +211,8 @@ def __init__( layer_type_validation(self.layer_types) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py 
b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 58e80e2011d3..2244585fe198 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -192,7 +192,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -205,11 +206,10 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) if self.rope_parameters["rope_type"] == "mrope": self.rope_parameters["rope_type"] = "default" - rope_config_validation(self, ignore_keys={"mrope_section"}) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index a1cf6a1ea861..c6ce972f1e88 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -167,7 +167,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -180,9 +181,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index 8bc756a17267..c766815e4ba1 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -17,7 +17,7 @@ from typing 
import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -181,12 +181,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 0527148166c0..de96a59dd918 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -201,7 +201,8 @@ def __init__( self.head_dim = head_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) self.layer_types = layer_types @@ -214,9 +215,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # linear attention part self.linear_conv_kernel_dim = linear_conv_kernel_dim diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index c7746f420514..d03ba4c8ba8b 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -326,12 +326,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` 
rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -347,7 +347,6 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) - rope_config_validation(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): @@ -591,7 +590,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -604,9 +604,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, @@ -770,12 +769,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -1039,12 +1038,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index ea6ac6860133..3617eab80448 100644 --- 
a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -42,7 +42,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...processing_utils import ProcessorMixin, Unpack from ...tokenization_utils_base import TextInput @@ -217,9 +217,8 @@ def __init__( self.sliding_window = sliding_window # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) class Qwen3OmniMoeThinkerConfig(Qwen2_5OmniThinkerConfig): @@ -674,12 +673,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 546a3da5bb7b..78cf4d35bb01 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Qwen3VLVisionConfig(PreTrainedConfig): @@ -172,12 +172,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 8cbac42f13c3..0e3ca62e7ca1 100644 --- 
a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -30,7 +30,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, dynamic_rope_update, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, dynamic_rope_update, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -213,12 +213,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index eab77fa368a2..f9b7a786fa5a 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Qwen3VLMoeTextConfig(PreTrainedConfig): @@ -168,12 +168,12 @@ def __init__( self.head_dim = head_dim or hidden_size // num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index 006fa186fe44..bdb54bd179a3 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -23,7 +23,7 @@ from ...activations import ACT2FN from ...cache_utils 
import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -190,12 +190,12 @@ def __init__( self.head_dim = head_dim or hidden_size // num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 54b482141b42..48a0c34bebe6 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -148,13 +148,13 @@ def __init__( self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 240cb03bac77..2ce745b42966 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -16,7 +16,7 @@ from typing import Optional from transformers.configuration_utils import PreTrainedConfig -from transformers.modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from transformers.modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class SeedOssConfig(PreTrainedConfig): @@ -172,12 +172,12 @@ def __init__( self.head_dim = head_dim if 
head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index 04e8e78e575c..d6a2741e9f29 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class SmolLM3Config(PreTrainedConfig): @@ -202,9 +202,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 2000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 2000000.0) + rope_config_standardize_and_validate(self) __all__ = ["SmolLM3Config"] diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index e5551d414c1b..f8b140d9def6 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import logging @@ -219,9 +219,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 2000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 2000000.0) + rope_config_standardize_and_validate(self) class SmolLM3RotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index a0ddcc33a79c..9ebde6e422ed 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, 
rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -147,13 +147,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index cb34ad1d9157..04fea7033776 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -157,12 +157,12 @@ def __init__( self.embedding_dropout = embedding_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index ae51ab07269b..1ac2be91c247 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class T5GemmaModuleConfig(PreTrainedConfig): @@ -178,7 +178,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -187,9 +188,8 @@ def __init__( 
layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class T5GemmaConfig(PreTrainedConfig): diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 0a784c02c1e6..4edbed15065a 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class VaultGemmaConfig(PreTrainedConfig): @@ -178,7 +178,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -187,9 +188,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["VaultGemmaConfig"] diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 4d6c92439da5..385091866c65 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Zamba2Config(PreTrainedConfig): @@ -197,12 +197,12 @@ def __init__( self.use_long_context = use_long_context # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.mamba_d_state = mamba_d_state self.mamba_d_conv = mamba_d_conv
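# Illustrative sketch (not part of the patch): the per-config pattern applied in the
# hunks above, condensed into a minimal, self-contained example. `MiniRopeConfig` and
# `standardize_and_validate` are hypothetical stand-ins, not the actual transformers
# classes or helpers; the 10000.0 default is an assumption (it varies per model above).
from typing import Optional


def standardize_and_validate(config) -> None:
    # Stand-in for the combined helper called at the end of each __init__ above:
    # fill in a default rope_type and check that rope_theta has been set.
    config.rope_parameters.setdefault("rope_type", "default")
    if config.rope_parameters.get("rope_theta") is None:
        raise ValueError("`rope_theta` must be set before validation")


class MiniRopeConfig:
    def __init__(self, rope_parameters: Optional[dict] = None, **kwargs):
        # Try `rope_scaling` first for backward compatibility, otherwise fall back
        # to `rope_parameters`; an empty dict keeps the later key assignments safe.
        rope_scaling = kwargs.pop("rope_scaling", None)
        rope_parameters = rope_scaling or rope_parameters
        self.rope_parameters = rope_parameters if rope_parameters is not None else {}

        # `rope_theta` (and, for some models above, `partial_rotary_factor`) now
        # lives inside the dict instead of being a top-level config attribute.
        self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0)
        standardize_and_validate(self)


# Legacy kwargs still resolve into the nested dict:
cfg = MiniRopeConfig(rope_scaling={"rope_type": "linear", "factor": 2.0}, rope_theta=500000.0)
assert cfg.rope_parameters == {"rope_type": "linear", "factor": 2.0, "rope_theta": 500000.0}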
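# Rough mental model only (an assumption, not the shipped implementation): the call
# sites above pass a config and, in the multimodal models, an `ignore_keys` set to
# `rope_config_standardize_and_validate`, so its shape is roughly the old standardize
# and validate steps folded into one function. A hypothetical sketch of that shape:
from typing import Optional


def standardize_and_validate_sketch(config, ignore_keys: Optional[set] = None) -> None:
    ignore_keys = ignore_keys or set()
    params = config.rope_parameters

    # Standardize: honor the legacy "type" key and guarantee a "rope_type" entry.
    params["rope_type"] = params.get("rope_type", params.pop("type", "default"))

    # Validate: flag keys that neither the rope implementation nor the caller
    # (via ignore_keys, e.g. {"mrope_section"} in the configs above) expects.
    # The `expected` set here is illustrative, not the real allowed-key list.
    expected = {"rope_type", "rope_theta", "factor", "partial_rotary_factor"}
    unexpected = set(params) - expected - ignore_keys
    if unexpected:
        raise ValueError(f"Unexpected RoPE parameter keys: {sorted(unexpected)}")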