+
    ~j                       a  R  t0 t ^ RIt^ RIt^ RIHt ^ RIHt ^ RIHtH	t	H
t
 ^RIHtHt ]P                  ! ]4      t]! 4       '       d   ^ RIt]'       d   ^RIHt R tR!R R	 lltR"R
 R lltR!R R lltR#R R lltR#R R lltR#R R lltR]R]R]R]R]R]/t] ^ k  ! R R]
4      t ! R R4      tR$R R lltR# )%    N)Callablewraps)TYPE_CHECKINGOptional	TypedDict)is_torch_availablelogging)PreTrainedConfigc                P   a aa RR loRR lo\        S 4      RVVV 3R ll4       pV# )aD  
Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
(i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

Args:
    rope_forward (Callable):
        The forward pass of the RoPE implementation.

Returns:
    The decorated forward pass.
c                   \         P                  ! V4      ^,           pVf9   V P                  pV P                  pRpV P                  P
                  R,          pMJV P                  V,          p\        W R24      pV R2pV P                  P
                  V,          R,          pWH8  di   \        W R24      '       g-   \        V,          p	V	! V P                  VV^,           VR7      w  rV P                  V R2X
R	R
7       \        W R2V
4       R# VP                  V4      pV P                  V R2VR	R
7       \        W R2V4       R# )zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.N  original_max_position_embeddings_original_inv_freq__long_inv_freqseq_len
layer_typeinv_freqF
persistentlong_inv_freqoriginal_inv_freq)torchmax	rope_typer   configrope_parametersgetattrhasattrROPE_INIT_FUNCTIONSregister_buffersetattrto)selfposition_idsdevicer   r   r   r   prefixr   rope_init_fnr   r   s   &&&&        u/Users/mitch_tango/dev/rabbit-r1-livekit/agent/.venv/lib/python3.14/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update6dynamic_rope_update.<locals>.longrope_frequency_update/   sO   ))L)A-I $ 6 6F/3{{/J/JKm/n,z2I '<N.O P"|1%F/3{{/J/J:/V20, 54<~!>??29=#/KK<q@)	$    F88!4mPU VDHM2MB !2 4 4V <  F88!46GTY ZDH$568IJ    c                   \         P                  ! V4      ^,           pVf(   V P                  pV P                  pV P                  pRpM?V P                  V,          p\        W R2V P                  4      p\        W R24      pV R2pWF8  dQ   \        V,          p	V	! V P                  VVVR7      w  qn        V P                  V R2V
RR	7       \        W R2V4       W@P                  8  de   W`P                  8  dS   VP                  V4      pV P                  V R2VRR	7       \        W R
2V4       \        W R2V P                  4       R# R# R# )z
dynamic RoPE layers should recompute `inv_freq` in the following situations:
1 - growing beyond the cached sequence length (allow scaling)
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
Nr   _max_seq_len_cachedr   r   r   r   Fr   r   )r   r   r   max_seq_len_cachedr   r    r"   r   attention_scalingr#   r$   original_max_seq_lenr%   )r&   r'   r(   r   r   r   r1   r   r)   r*   r   s   &&&&       r+   dynamic_frequency_update5dynamic_rope_update.<locals>.dynamic_frequency_updateR   si    ))L)A-I!%!8!8 $ 6 6Fz2I!(=P/QSWSjSj!k '<N.O P"|1%F'.y9L/;%	0,H,   F88!4h5 QDL(;<gF...3EHaHa3a !2 4 4V <  F88!46GTY ZDH$568IJDL(;<d>W>WX 4b.r.   c                    < Vf   V P                   MV P                   V,          pVe   RV/M/ pRV9   d   S! W3RVP                  /VB  MVR8X  d   S! W3RVP                  /VB  S! WV3/ VB # )Nr   dynamicr(   longrope)r   r(   )	r&   xr'   r   r   kwargsr4   r,   rope_forwards	   &&&&  r+   wrapper$dynamic_rope_update.<locals>.wrapperx   s{    &0&8DNNdnnZ>X	/9/E,
+2	!$TSSFS*$%dTTVTD\<V<<r.   Nr   )r;   r<   r4   r,   s   f @@r+   dynamic_rope_updater?   "   s6    !KF$YL <= = = Nr.   c                    V ^8  d   QhR\         R,          R\         R,          R\        R,          R\        R,          R\        R	\        3,          /# 
   r   r   r(   torch.devicer   Nr   returntorch.Tensorr   intstrtuplefloat)formats   "r+   __annotate__rL      sW     3& 3&'(3&^$3& 4Z3& d
	3&
 >5 !3&r.   c           	        V P                  4        Ve   V P                  V,          MV P                  pVR,          pVR,          pVP                  RR4      p\        V RR4      ;'       g    V P                  V P
                  ,          p\        W,          4      p	Rp
RV\        P                  ! ^ V	^\        P                  R7      P                  V\        P                  R7      V	,          ,          ,          pW,          pW3# )	a  
Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
Nfactor
rope_thetapartial_rotary_factor      ?head_dimdtyper(   rT   )standardize_rope_paramsr   getr    hidden_sizenum_attention_headsrG   r   arangeint64r%   rJ   )r   r(   r   r   rope_parameters_dictrN   baserP   rR   dimattention_factorr   s   &&&&        r+   '_compute_linear_scaling_rope_parametersr`      s    B ""$AKAW611*=]c]s]s!(+F  -D0445LcRvz40ddF4F4F&JdJd4dH
h.
/C du||AsAU[[ILLTZbgbmbmLnqttuvH
 H%%r.   c                    V ^8  d   QhR\         R,          R\         R,          R\        R,          R\        R,          R\        R	\        R
\        3,          /# )rB   r   r   r(   rC   r   Nr   head_dim_keyrD   rE   rF   )rK   s   "r+   rL   rL      si     C& C&'(C&^$C& 4ZC& d
	C&
 C& >5 !C&r.   c           	        V P                  4        Ve   V P                  V,          MV P                  p\        WR4      ;'       g    V P                  V P                  ,          pVR,          pVP                  RR4      pVP                  RR4      p	Rp
\        W,          ^,          4      pRV\        P                  ! ^ ^V,          ^\        P                  R7      P                  V\        P                  R7      V,          ,          ,          pV^,          V,
          pV^ 8  dA   \        P                  ! V\        P                  ! V\        P                  VR7      3^ R	7      pMTpW,          pW3# )
a  
Computes the inverse frequencies with proportional RoPE.

Args:
    config ([`~transformers.PretrainedConfig`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): The proportion of the embedding dimension
            to apply rotary positional encoding, e.g., [0.0, 0.25, 0.5, 0.75, 1.0]. Unlike other RoPE functions
            that use this parameter, proportional RoPE will always return an encoding that is the size of
            `head_dim`.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
NrO   rN   rQ   rP   rS   rU   rT   r(   )r^   )rV   r   r    rX   rY   rW   rG   r   rZ   r[   r%   rJ   catzerosfloat32)r   r(   r   r   rb   r\   rR   r]   rN   rope_proportionr_   rope_anglesinv_freq_rotatednope_anglesr   s   &&&&&          r+   %_compute_proportional_rope_parametersrl      sE   J ""$AKAW611*=]c]s]svT2fff6H6HFLfLf6fH-D!%%h4F*../FLOo0A56KLLAOQekkBEEV[`[f[fEgjrr	t
 a-+-KQ99 Ku}}VL 
 $H%%r.   c                    V ^8  d   QhR\         R,          R\         R,          R\        R,          R\        R,          R\        R	\        3,          /# rA   rF   )rK   s   "r+   rL   rL     s^     C& C&'(C&^$C& 4ZC& d
	C&
 >5 !C&r.   c           	        V P                  4        Ve   V P                  V,          MV P                  pVR,          pVP                  RR4      p\        V RV P                  V P
                  ,          4      p\        Wv,          4      pVR,          p	Rp
Vf   V P                  pM\        V\        P                  4      '       dN   \        P                  ! V\        P                  ! V P                  VP                  VP                  R7      4      pM\        W P                  4      pWYV,          V P                  ,          V	^,
          ,
          W^,
          ,          ,          ,          pRV\        P                   ! ^ V^\        P"                  R7      P%                  V\        P&                  R7      V,          ,          ,          pW3# )	a	  
Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
            inference time
        *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
            will be accessed. The value of `factor` is used to determine the new base frequency, along with the
            current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
            computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
            factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
            context window using an exponent derived from `dim`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
        max_position_embeddings, this value will be overridden by max_position_embeddings.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
rO   rP   rQ   rR   rN   rd   rS   rU   )rV   r   rW   r    rX   rY   rG   max_position_embeddings
isinstancer   TensormaximumtensorrT   r(   r   rZ   r[   r%   rJ   )r   r(   r   r   r\   r]   rP   rR   r^   rN   r_   r   s   &&&&        r+   _compute_dynamic_ntk_parametersrt     sm   V ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h.
/C!(+F 00	GU\\	*	*--LL77w}}U\UcUcd

 g==> W$v'E'EE&ST*U[^hibi[jkkDdu||AsAU[[ILLTZbgbmbmLnqttuvH%%r.   c                    V ^8  d   QhRRR\         R,          R\        R,          R\        R,          R\        R	\        3,          /# rA   rF   )rK   s   "r+   rL   rL   G  sX     D& D&D&^$D& 4ZD& d
	D&
 >5 !D&r.   c                  a V P                  4        Ve   V P                  V,          MV P                  pVR,          pVP                  RR4      p\        V RV P                  V P
                  ,          4      p\        Wv,          4      pVR,          p	VP                  R4      p
VP                  R4      pVP                  R4      pVR	,          pV	f   V P                  V,          p	RR
 lpV
f8   V'       d(   V'       d    \        V! W4      V! W4      ,          4      p
MV! V	4      p
VP                  R4      ;'       g    ^ pVP                  R4      ;'       g    ^pR oV3R lpR pV\        P                  ! ^ V^4      P                  V\        P                  R7      V,          ,          pRV,          pRV	V,          ,          pV P                  P                  RR4      pV! VVWVV4      w  pp^V! VVV^,          4      P                  V\        P                  R7      ,
          pV^V,
          ,          VV,          ,           pVV
3# )a  
Computes the inverse frequencies with NTK scaling. Please refer to the
[original paper](https://huggingface.co/papers/2309.00071)

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
        *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
            keys will be accessed:
            *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available.
            *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                (only) in the linear ramp function.
            *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                (only) in the linear ramp function.
            *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                `mscale_all_dim`, if provided.
            *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                calculated based on `factor` only.
            *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                will be calculated based on `factor` only.
            *   `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
            *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
            will be returned for the first fraction of the head_dim.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
rO   rP   rQ   rR   rN   r_   mscalemscale_all_dimr   c                 j    V ^8:  d   R# RV,          \         P                  ! V 4      ,          R,           # )   rQ   g?)mathlog)scalerw   s   &&r+   
get_mscale,_compute_yarn_parameters.<locals>.get_mscale  s(    A:V|dhhuo-33r.   	beta_fast	beta_slowc                    V\         P                  ! W0^,          \         P                  ,          ,          4      ,          ^\         P                  ! V4      ,          ,          # )zPInverse dimension formula to find the dimension based on the number of rotations)r{   r|   pi)num_rotationsr^   r]   ro   s   &&&&r+   find_correction_dim5_compute_yarn_parameters.<locals>.find_correction_dim  s@    dhh6!:Kdgg:UVWW\]`d`h`him`n\noor.   c                   < S! WW44      pS! WW44      pV'       d-   \         P                  ! V4      p\         P                  ! V4      p\        V^ 4      \	        Wr^,
          4      3# )z.Find dimension range bounds based on rotations)r{   floorceilr   min)	low_rothigh_rotr^   r]   ro   truncatelowhighr   s	   &&&&&&  r+   find_correction_range7_compute_yarn_parameters.<locals>.find_correction_range  sR    !'N"8$P**S/C99T?D3{CAg...r.   c                     W8X  d
   VR ,          p\         P                  ! V\         P                  R7      V ,
          W,
          ,          p\         P                  ! V^ ^4      pV# )gMbP?rS   )r   rZ   rg   clamp)r   r   r^   linear_func	ramp_funcs   &&&  r+   linear_ramp_factor4_compute_yarn_parameters.<locals>.linear_ramp_factor  sH    :5LC||Cu}}=C	RKKQ2	r.   rU   r   T)rz   )rV   r   rW   r    rX   rY   rG   ro   rJ   r   rZ   r%   )r   r(   r   r   r\   r]   rP   rR   r^   rN   r_   rw   rx   r   r~   r   r   r   r   	pos_freqsinv_freq_extrapolationinv_freq_interpolationr   r   r   inv_freq_extrapolation_factorr   r   s   &&&&                       @r+   _compute_yarn_parametersr   G  s5   t ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h.
/C!(+F+//0BC!%%h/F)--.>?N';<^'_$
 ~//2RR4 n$Z%?*VBd%de)&1 %((5;;I$((5::Ip/ aa03363UX[[\I 9_ FY$67%%))*d;H%iCGgiqrIC %&(:3cQh(O(R(RZ`hmhshs(R(t$t!!&C"CD
 #@
@	A  %%%r.   c                    V ^8  d   QhRRR\         R,          R\        R,          R\        R,          R\        R	\        3,          /# rA   rF   )rK   s   "r+   rL   rL     sX     U& U&U&^$U& 4ZU& d
	U&
 >5 !U&r.   c                   V P                  4        Ve   V P                  V,          MV P                  pVR,          pVP                  RR4      p\        V RV P                  V P
                  ,          4      p\        Wv,          4      pVR,          p	VR,          p
VP                  R4      pVP                  R4      pVR	,          pVf   V P                  V,          pVfW   VR8:  d   RpML\        P                  ! ^\        P                  ! V4      \        P                  ! V4      ,          ,           4      pV'       d/   W-8  d)   \        P                  ! V	\        P                  VR
7      pM'\        P                  ! V
\        P                  VR
7      p\        P                  ! ^ V^\        P                  VR
7      P!                  4       V,          pRWV,          ,          ,          pVV3# )ay  
Computes the inverse frequencies with LongRoPE scaling. Please refer to the
[original implementation](https://github.com/microsoft/LongRoPE)

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
        *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
            pretraining. If not provided, defaults to `max_position_embeddings`.
        *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
            will be accessed:
            *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                the value of `factor`.
            *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                overridden s the ratio between those values.
            *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
            *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
            will be returned for the first fraction of the head_dim.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length.

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
rO   rP   rQ   rR   long_factorshort_factorrN   r_   r   rd   )rV   r   rW   r    rX   rY   rG   ro   r{   sqrtr|   r   rs   rg   rZ   r[   rJ   )r   r(   r   r   r\   r]   rP   rR   r^   r   r   rN   r_   r   ext_factorsinv_freq_shaper   s   &&&&             r+   _compute_longrope_parametersr     s   d ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h.
/C&}5K'7L!%%h/F+//0BC';<^'_$
 ~//2RR S="#yyTXXf-=Ii@j-j)jk 7=ll;emmFSll<u}}VT\\!S!5;;vNTTVY\\Nk.$889H%%%r.   c                    V ^8  d   QhRRR\         R,          R\        R,          R\        R,          R\        R	\        3,          /# rA   rF   )rK   s   "r+   rL   rL   &  sX     L, L,L,^$L, 4ZL, d
	L,
 >5 !L,r.   c           	        V P                  4        Ve   V P                  V,          MV P                  pVR,          pVP                  RR4      p\        V RR4      ;'       g    V P                  V P
                  ,          p\        Wv,          4      pRp	RV\        P                  ! ^ V^\        P                  R7      P                  V\        P                  R7      V,          ,          ,          p
VR,          pVR	,          pVR
,          pVR,          pW,          pW,          p^\        P                  ,          V
,          p\        P                  ! VV8  W,          V
4      pVV,          V,
          W,
          ,          p^V,
          V,          V,          VV,          ,           pVV8  ( VV8  ( ,          p\        P                  ! VVV4      pVV	3# )a,
  
Computes the inverse frequencies for llama 3.1.

Args:
    config ([`~transformers."PreTrainedConfig"`]):
        The model configuration. This function assumes that the config will provide at least the following
        properties:

        *   rope_theta (`float`, *optional*): The base wavelength from which the inverse frequencies will be derived. Defaults to `config.default_theta` if omitted.
        *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
        *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
        *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
            keys will be accessed:
            *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                during smoothing.
            *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
            *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                the shift applied to the numerator and denominator of the smoothing factor.
                frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
            *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                during pretraining. If not provided, the function falls back to `max_position_embeddings`.

        Additionally, this function will make use of the following properties if they are found in the config:

        *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
            derived as hidden_size // num_attention_heads.
        *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
            the first fraction of the head_dim. Defaults to 1.0.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin.
NrO   rP   rQ   rR   rS   rU   rN   low_freq_factorhigh_freq_factorr   )rV   r   rW   r    rX   rY   rG   r   rZ   r[   r%   rJ   r{   r   where)r   r(   r   r   r\   r]   rP   rR   r^   r_   r   rN   r   r   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs   &&&&                  r+   _compute_llama3_parametersr   &  s   Z ""$AKAW611*=]c]s]s  -D0445LcRvz40ddF4F4F&JdJd4dH
h.
/C du||AsAU[[ILLTZbgbmbmLnqttuvH!(+F*+<=O+,>?*+MNO&8':$''kH$G [[+;!;X=NPXYN$w.@EUEghM]*n<vEXfHff!223BR8R6SSN[[1BNSN+++r.   linearr7   yarnr8   llama3proportionalc                   0   a  ] tR tRt o RtV 3R ltRtV tR# )RopeParametersi  uu
  
Args:
    rope_theta (`float`, *optional*, defaults to `RotaryEmbeddingConfigMixin.default_theta`):
        The base period of the RoPE embeddings. Optional in serialized configs — if omitted,
        the model's `default_theta` (typically 10000.0) is used.
    rope_type (`str`, *optional*, defaults to "default"):
        The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
        'llama3'], with 'default' being the original RoPE implementation.
    partial_rotary_factor (`float`, *optional*):
        The percentage of the query and key head embedding on which RoPE will be applied.
    factor (`float`, *optional*):
        Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
        most scaling types, a `factor` of x will enable the model to handle sequences of length x *
        original maximum pre-trained length.
    original_max_position_embeddings (`int`, *optional*):
        Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
        pretraining.
    attention_factor (`float`, *optional*):
        Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
        computation. If unspecified, it defaults to value recommended by the implementation, using the
        `factor` field to infer the suggested value.
    beta_fast (`float`, *optional*):
        Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
        ramp function. If unspecified, it defaults to 32.
    beta_slow (`float`, *optional*):
        Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
        ramp function. If unspecified, it defaults to 1.
    short_factor (`list[float]`, *optional*):
        Only used with 'longrope'. The scaling factor to be applied to short contexts (<
        `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
        size divided by the number of attention heads divided by 2
    long_factor (`list[float]`, *optional*):
        Only used with 'longrope'. The scaling factor to be applied to long contexts (<
        `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
        size divided by the number of attention heads divided by 2
    low_freq_factor (`float`, *optional*):
        Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
    high_freq_factor (`float`, *optional*):
        Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
c                r  < V ^8  d   Qh/ S[ R,          ;R&   S[R,          ;R&   S[ R,          ;R&   S[ R,          ;R&   S[R,          ;R&   S[ R,          ;R&   S[ R,          ;R&   S[ R,          ;R	&   S[S[ ,          R,          ;R
&   S[S[ ,          R,          ;R&   S[ R,          ;R&   S[ R,          ;R&   # )rB   NrO   r   rP   rN   r   r_   r   r   r   r   r   r   )rJ   rH   rG   list)rK   __classdict__s   "r+   rL   RopeParameters.__annotate__  s     T U V TzW X !4<'Y Z DL[ \ '*Dj0] ^ dl"_ ` t|a b t|c d u+$$e f et##g h T\!i j dl"k r.    N)__name__
__module____qualname____firstlineno____doc____annotate_func____static_attributes____classdictcell__r   s   @r+   r   r     s     '  r.   r   c                     a  ] tR tRt o RtRt]! 4       tR tR t	V 3R lR lt
RV 3R	 lR
 lltRV 3R lR lltRV 3R lR lltRV 3R lR lltRV 3R lR lltRV 3R lR lltRV 3R lR llt]RV 3R lR ll4       tRtV tR# )RotaryEmbeddingConfigMixini  zS
A Mixin containing the functionality to standardize and validate RoPE parameters.
g     @c                   VP                  R R4      pT;'       g    V P                  V n        V P                  e   V P                  M/ V n        VP                  R\        V RV P                  4      4      pV P                  P	                  RV4       VP                  R\        V RR4      4      pVe6   V P                  P	                  RV4       V P                  R0,          V n        V P                  4        V# )rope_scalingNrO   rP   )popr   r    default_theta
setdefaultrW   ignore_keys_at_rope_validationrV   )r&   r:   r   rO   rP   s   &,   r+   convert_rope_params_to_dict6RotaryEmbeddingConfigMixin.convert_rope_params_to_dict  s    zz.$7+CCt/C/C7;7K7K7Wt33]_ ZZgdL$J\J\.]^
''jA &

+BGDRikoDp q ,  ++,CEZ[262U2UYpXq2qD/$$&r.   c                   \        V RR4      p\        V RR4      p\        V RR4      ;'       g    / p\        V RR4      pV'       g    V'       g   \        P                  R4       R# Ve6   V/ 8X  g/   \        VP	                  4       4      P                  V4      '       g   VP                  RVP                  RR	4      4       VP                  RV4       Ve   W#R&   VR,          R9   dS   \        V R
4      '       d   V P                  V P                  R
&   MV P                  P                  R
V P                  4       M\        V4       F  pW5,          P                  RW5,          P                  RR	4      4       W5,          P                  RV4       Ve   W#V,          R&   W5,          R,          R9   g   Kn  V P                  V,          P                  R
V P                  4       K  	  W0n
        R# )z
Helper to standardize the config's rope params field by ensuring the params are defined for each
later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility)
rO   NrP   r   layer_typeszG`standardize_rope_params` was called but no RoPE parameters were found.r   typedefaultr   )r   r   r8   )r    loggerwarningsetkeysissubsetr   rW   r!   r   r   ro   )r&   rO   rP   r   r   r   s   &     r+   rV   2RotaryEmbeddingConfigMixin.standardize_rope_params  s    T<6
 '.Et L!$(94@FFBdM48  :NNde Or$9_EYEYE[A\AeAefqArAr&&{O4G4GPY4Z[&&|Z@$0;P 78 {+/MM4!CDD PTOtOtD(()KL((334VX\XtXtu "+.
+66{OD_DcDcdjluDvw+66|ZP(4K`J/0GH".{;?]]((4??:D<X<X /  /r.   c                   < V ^8  d   QhRR/# )rB   r&   r   r   )rK   r   s   "r+   rL   'RotaryEmbeddingConfigMixin.__annotate__  s      . r.   c                   \        V RR4      pV'       g   R# \        V RR4      e:   \        VP                  4       4      P                  V P                  4      '       d   MRV/pVP                  4        Fl  pVP                  RVP                  RR4      4      p\        V RV R	2R4      pW2R&   Ve   V! W P                  R
7       KS  \        P                  RV R24       Kn  	  R# )zI
Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
r   Nr   full_attentionr   r   r   
_validate__rope_parametersignore_keyszMMissing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='')
r    r   r   r   r   valuesrW   r   r   r   )r&   r\   r   r   validation_fns   &    r+   validate_rope(RotaryEmbeddingConfigMixin.validate_rope  s      't->E#4-9cBVB[B[B]>^>g>g?
 ?
 $46J#K 3::<O'++K9L9LVU^9_`I#DJykAQ*RTXYM+4K((o;^;^_cdmcnnop  =r.   Nc                4   < V ^8  d   QhRS[ RS[R,          /# rB   r   r   Ndictr   )rK   r   s   "r+   rL   r   $  s"     
 
 
TWZ^T^ 
r.   c                ~    R 0pR0p\        VP                  4       4      pVR ,          pV P                  WeW4VR7       R# )r   rO   optional_keysr   N)r   r   _check_received_keys)r&   r   r   required_keysr   received_keysr   s   &&&    r+   !_validate_default_rope_parameters<RotaryEmbeddingConfigMixin._validate_default_rope_parameters$  sH    $%O0023#K0	!!m^i 	" 	
r.   c                4   < V ^8  d   QhRS[ RS[R,          /# r   r   )rK   r   s   "r+   rL   r   -  s&     j j jSVY]S] jr.   c                   R R0pR0p\        VP                  4       4      pVR ,          pV P                  WeW4VR7       VR,          pVe$   \        V\        \
        34      '       d   VR8  d   \        P                  RV 24       R# R# r   rN   rO   r   NrQ   B`rope_parameters`'s factor field must be a float or int >= 1, got r   r   r   rp   rJ   rG   r   r   r&   r   r   r   r   r   r   rN   s   &&&     r+    _validate_linear_rope_parameters;RotaryEmbeddingConfigMixin._validate_linear_rope_parameters-      $h/%O0023#K0	!!m^i 	" 	
 !*>FUCL!A!AVc\NN_`f_ghi FRr.   c                4   < V ^8  d   QhRS[ RS[R,          /# r   r   )rK   r   s   "r+   rL   r   :  s&     j j jTWZ^T^ jr.   c                   R R0pR0p\        VP                  4       4      pVR ,          pV P                  WeW4VR7       VR,          pVe$   \        V\        \
        34      '       d   VR8  d   \        P                  RV 24       R# R# r   r   r   s   &&&     r+   !_validate_dynamic_rope_parameters<RotaryEmbeddingConfigMixin._validate_dynamic_rope_parameters:  r   r.   c                4   < V ^8  d   QhRS[ RS[R,          /# r   r   )rK   r   s   "r+   rL   r   G  s"     2 2d 2QTW[Q[ 2r.   c           	        0 Rmp0 Rmp\        VP                  4       4      pVR ,          pV P                  WeW4VR7       VR,          pVe$   \        V\        \
        34      '       d   VR8  d   \        P                  R	V 24       VP                  R4      pVe6   \        V\        4      '       d   V^ 8  d   \        P                  R
V 24       VP                  R4      p	V	e5   \        V	\        \
        34      '       g   \        P                  RV	 24       VP                  R4      p
V
e5   \        V
\        \
        34      '       g   \        P                  RV
 24       T	;'       g    ^ T
;'       g    ^8  d   \        P                  RV	 RV
 R24       VR,          pV P                  V,          pW8w  d+   V^8w  d"   \        P                  RV RV RV R24       R# R# R# )r   rN   r   r_   r   r   r   NrQ   r   zO`rope_parameters`'s attention_factor field must be a float greater than 0, got z@`rope_parameters`'s beta_fast field must be a float or int, got z@`rope_parameters`'s beta_slow field must be a float or int, got zR`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)zKThe explicitly set RoPE scaling factor (config.rope_parameters['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config.>   rN   r   r   >   rw   r   r   r   rO   rx   r_   )r   r   r   rp   rJ   rG   r   r   rW   ro   warning_once)r&   r   r   r   r   r   r   rN   r_   r   r   r   implicit_factors   &&&          r+   _validate_yarn_rope_parameters9RotaryEmbeddingConfigMixin._validate_yarn_rope_parametersG  s   S
 O0023#K0	!!)Mfq!r *>FUCL!A!AVc\NN_`f_ghi*../AB'<Le1T1TXhklXlNNabrast $''4	 Is|)L)LNN]^g]hij#''4	 Is|)L)LNN]^g]hijOO	Q/NNdendo p::CD\^ ,;;]+^(669YY$A)=]^d]e fq ###A& J~	~ *>$r.   c                4   < V ^8  d   QhRS[ RS[R,          /# r   r   )rK   r   s   "r+   rL   r   {  s"     0 0$ 0UX[_U_ 0r.   c                   0 Rmp0 Rmp\        VP                  4       4      pVR ,          pV P                  WeW4VR7       VP                  RR4      p\	        V R	V P
                  V P                  ,          4      p\        W,          4      p	VP                  R4      p
\        V
\        4      '       d;   \        ;QJ d    R
 V
 4       F  '       d   K   RM	  RM! R
 V
 4       4      '       g   \        P                  RV
 24       \        V
4      V	^,          8w  d,   \        P                  RV	^,           R\        V
4       24       VP                  R4      p\        V\        4      '       d;   \        ;QJ d    R V 4       F  '       d   K   RM	  RM! R V 4       4      '       g   \        P                  RV 24       \        V4      V	^,          8w  d,   \        P                  RV	^,           R\        V4       24       VP                  R4      pVR,          pVf   Ve   \        P                  R4       MYVf   Vf   \        P                  R4       M;\        V\        \        34      '       d   VR8  d   \        P                  RV 24       VP                  R4      pVe@   \        V\        \        34      '       d   VR8  d   \        P                  RV 24       R# R# R# )r   r   r   r   r_   rN   r   rP   rQ   rR   c              3   N   "   T F  p\        V\        \        34      x  K  	  R # 5ir>   rp   rG   rJ   .0r9   s   & r+   	<genexpr>PRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>  s!     6i\hWXz!c5\7R7R\h   #%FTzF`rope_parameters`'s short_factor field must be a list of numbers, got z8`rope_parameters`'s short_factor field must have length z, got c              3   N   "   T F  p\        V\        \        34      x  K  	  R # 5ir>   r
  r  s   & r+   r  r    s!     5g[fVWjS%L6Q6Q[fr  zE`rope_parameters`'s long_factor field must be a list of numbers, got z7`rope_parameters`'s long_factor field must have length Nav  This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.z4Missing required keys in `rope_parameters`: 'factor'r   g        zV`rope_parameters`'s attention_factor field must be a float or int greater than 0, got >   r   r   r   r   >   rN   rO   r_   )r   r   r   rW   r    rX   rY   rG   rp   r   allr   r   lenr  rJ   )r&   r   r   r   r   r   r   rP   rR   r^   r   r   rN   r   r_   s   &&&            r+   "_validate_longrope_rope_parameters=RotaryEmbeddingConfigMixin._validate_longrope_rope_parameters{  s|   hDO0023#K0	!!)Mfq!r / 3 34KS Q4T-=-=AYAY-YZ(23&**>:<..336i\h6i3336i\h6i3i3iNNcdpcqrs|q(NNJ3RS8*TZ[^_k[lZmn &))-8;--##5g[f5g###5g[f5g2g2gNNbcnbopq{sax'NNI#QR(SYZ]^iZjYkl !$$X.+:;]+^( >>JE ^ @ HNNQRFUCL11Vc\NN_`f_ghi*../AB'<LuVYl1[1[_oru_uNNhiyhz{ `v'r.   c                4   < V ^8  d   QhRS[ RS[R,          /# r   r   )rK   r   s   "r+   rL   r     s"     ) ) )SVY]S] )r.   c                2   0 RmpVR ,          p\        VP                  4       4      pV P                  WEW2R7       VR,          pVe$   \        V\        \
        34      '       d   VR8  d   \        P                  RV 24       VR,          pVR,          pVe   \        V\        \
        34      '       g   \        P                  R	V 24       Ve   \        V\        \
        34      '       g   \        P                  R
V 24       W8:  d   \        P                  RV RV 24       VR,          p	V	e   \        V	\
        4      '       g   \        P                  RV	 24       WP                  8  d(   \        P                  RV	 RV P                   24       R# R# )r   rN   r   r   r   r   NrQ   r   zF`rope_parameters`'s low_freq_factor field must be a float, or int got zG`rope_parameters`'s high_freq_factor field must be a float or int, got zf`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=zS`rope_parameters`'s original_max_position_embeddings field must be an integer, got zj`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=>   rN   r   rO   r   r   r   )	r   r   r   rp   rJ   rG   r   r   ro   )
r&   r   r   r   r   r   rN   r   r   r   s
   &&&       r+    _validate_llama3_rope_parameters;RotaryEmbeddingConfigMixin._validate_llama3_rope_parameters  s   
 $K0	O0023!!)M!c *>FUCL!A!AVc\NN_`f_ghi)*;<*+=>"*_ucl*S*SNNcdsctuv#:6FPS+U+UNNYZjYkl .NNx#$$9/9JL
 ,;;]+^(+3:Ffhk;l;lNNe346 ,/K/KKNN|344QRVRnRnQoq Lr.   c                4   < V ^8  d   QhRS[ RS[R,          /# r   r   )rK   r   s   "r+   rL   r     s"      d Y\_cYc r.   c                    R R0pVR ,          p\        VP                  4       4      pV P                  WEW2R7       VP                  R4      pVf   \        P                  R4       R# R# )r   rO   r   rP   Nz`rope_parameters`'s partial_rotary_factor is None. This will default to 1.0 in the computation, making this equivalent to the linear_scaling RoPE type. Provide a value in the range [0.0, 1.0) to make use of the proportional RoPE funcitonality.)r   r   r   rW   r   r   )r&   r   r   r   r   r   rP   s   &&&    r+   &_validate_proportional_rope_parametersARotaryEmbeddingConfigMixin._validate_proportional_rope_parameters  sk    $l3#K0	O0023!!)M!c / 3 34K L (NNC )r.   c                T   < V ^8  d   QhRS[ RS[RS[RS[R,          RS[R,          /# )rB   r   r   r   r   Nr   )rH   r   )rK   r   s   "r+   rL   r     sJ     s sss s Tz	s
 4Zsr.   c                x   RV9   d   VR0,          pVP                  R4       T;'       g    \        4       pRV9  d   VP                  R4       Ve   V\        V4      ,          pW!,
          pV'       d   \        RV  RV 24      hW,
          V,
          pV'       d   \        P	                  RV  RV 24       R# R# )z\Compare the received keys in `config.rope_parameters` against the expected and optional keysr   r   rP   Nz<Missing required keys in `rope_parameters` for 'rope_type'='z': z8Unrecognized keys in `rope_parameters` for 'rope_type'=')addr   KeyErrorr   r   )r   r   r   r   r   missing_keysunused_keyss   &&&&&  r+   r   /RotaryEmbeddingConfigMixin._check_received_keys  s     ]"fX%Mk*%.."-756 "S--M$4YZcYddghtguvww#3mCNNUV_U``cdocpqr r.   )r   r   r>   )NN)r   r   r   r   r   r   r   r   r   rV   r   r   r   r   r  r  r  r  staticmethodr   r   r   r   s   @r+   r   r     s      M%(U"*./` :
 
j jj j2 2h0 0d) )V  s s sr.   r   c                >    V ^8  d   QhR\         R\        R,          /# )rB   r   r   N)r   r   )rK   s   "r+   rL   rL     s       #= CRVJ r.   c                |    \         P                  ! R\        4       V P                  4        V P	                  4        R# )ze
This is a deprecated function.
It has been kept for backward compatibility with custom code models.
aX  `rope_config_validation` is deprecated and has been removed. Its functionality has been moved to RotaryEmbeddingConfigMixin.validate_rope method. PreTrainedConfig inherits this class, so please call self.validate_rope() instead. Also, make sure to use the new rope_parameters syntax. You can call self.standardize_rope_params() in the meantime.N)warningswarnFutureWarningrV   r   )r   r   s   &&r+   rope_config_validationr*    s5    
 MM	G
 	 ""$
r.   c                    V ^8  d   Qh/ ^ \         9   d5   \        \        \        R\        R\
        3,          3,          3,          ;R&   # )rB   .rE   r"   )__conditional_annotations__r   rH   r   rI   rJ   )rK   s   "r+   rL   rL      s<    R T#xU>53H-I(IJJK S r.   )NNNN)NNNNrR   )NNNr>   ) r,  r{   r'  collections.abcr   	functoolsr   typingr   r   r   utilsr	   r
   
get_loggerr   r   r   configuration_utilsr   r?   r`   rl   rt   r   r   r   r"   r   r   r*  rL   )r,  s   @r+   <module>r3     s      $  5 5 . 
		H	% 5`F3&lC&LC&LD&NU&pL,f 5.
$,(9O  5#Y 5#pHs HsV
 r.   