+
    ~j,                       R t ^ RIt^ RIt^ RIt^ RIHt ^ RIHt ^ RIH	t	 ^ RI
Ht ^ RIHt ^ RIHt ^ RIHtHt ^ RIHt ^ R	IHt ^ R
IHt ^ RIHtHt ^ RIHtH t H!t!H"t" ^ RI#H$t$ ^RI%H&t& ^RI'H(t( ^RI)H*t* ^RI+H,t,H-t-H.t.H/t/H0t0H1t1H2t2 ^RI3H4t4H5t5H6t6 ]6Pn                  ! ]84      t9Rt:Rt;Rt<Rt=Rt>],R,          t,R]R] R]!R]"/t?R]:R]=/t@]5! ],4       ! R R ]/4      4       tA]AtBR# )!z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainercached_file)SpmConverter)convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
r   r   	WordLevel	WordPiecetokenizer_file
vocab_filec                   r  a a ] tR t^St oRt]tRtRt]	RBR l4       t
V 3R lt]V3R lR l4       t]V3R lR	 l4       tRCV3R
 lR lltR t]R 4       t]R 4       t]P&                  R 4       t]P&                  R 4       tR t]V3R lR l4       tV3R lR lt]V3R lR l4       t]V3R lR l4       t]V3R lR l4       t]t]tV3R lR ltV3R lR ltV3R  lR! lt]V3R" lR# l4       t]V3R$ lR% l4       t RDV3R' lR( llt!V3R) lR* lt"V3R+ lR, lt#RBV3R- lR. llt$RBV3R/ lR0 llt%RBV3R1 lR2 llt&REV3R3 lR4 llt'V3R5 lR6 lt(RR&])PT                  ]+PX                  R^ RRRRRRRRRRR&R3V3R7 lR8 llt-V3R9 lR: lt.RFV3R; lR< llt/RGV3R= lR> llt0RHR? lt1]	RIR@ l4       t2RAt3Vt4V ;t5# )JTokenizersBackenda5  
Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

Handles all the shared methods for tokenization and special tokens, as well as methods for
downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
NFc                  a( \        V4      pVP                  RR4      pVed   \        P                  P	                  V4      '       d?   V \
        J g   RV P                  9  g	   V'       d   \        P                  ! V4      VR&   V# VEe?   \        P                  P	                  V4      '       Ed   \        VRR7      ;_uu_ 4       p\        P                  ! V4      pRRR4       XP                  R/ 4      P                  R4      pVR:9  dc   \        V4      p\        VR,          4      p	/ V	R
&   VR8X  d   . V	R&   WR&   . VR&   \        P                  ! \        P                  ! V4      4      p
M\        P                  ! V4      p
V
P                  VR&   V
P                   VR&   V
P"                  VR&   V
P"                  e   V
P"                  VR&   V
P                   e   V
P                   VR&   VP                  R4      pV'       d   VP                  RR4      R8X  d   VR,          pM\%        V\&        4      '       g   V.pV FC  pVP                  R4      R8X  g   K  RV9   g   K$  ^ RIpVP+                  VR,          4      VR&    M	  VP                  R/ 4      P                  R
R4      pV P,                  f2   \%        V\&        4      '       d   \'        \/        \0        V4      4      pEM>V P,                  P2                  R	8X  d]   \%        V\&        4      '       dF   V'       d>   \%        V^ ,          \&        \0        34      '       d   V Uu. uF  p\1        V4      NK  	  ppMV P,                  P2                  R8X  d"   \5        V4       UUu/ uF	  w  ppVVbK  	  pppMV P,                  P2                  R8X  g   V P,                  P2                  R8X  dV   \%        V\&        4      '       d@   \5        V4       UUu/ uF(  w  pp\%        V\&        4      '       d
   V^ ,          MTVbK*  	  pppWR
&   \7        V RR4      pRVP                  R/ 4      9   dy   V'       dq   VP2                  R8X  d`   VR,          R,          pV Uu. uF>  p\%        V\8        4      '       d   \1        VP;                  R4      4      M
\1        V4      NK@  	  ppVVR&   V# VP                  R4      pVP                  R4      pVP                  R
4      pVP                  R4      p\%        V\8        4      '       df   VP=                  R4      '       dO   \        P                  P	                  V4      '       d*   ^RIH p V! VR 7      PC                  V4      w  VR
&   VR&   V# \%        V\8        4      '       Ed   \        P                  P	                  V4      '       Ed   VP=                  R!4      '       Ed    ^R"IH"p V! V4      pVPF                  ! V P,                  3/ VB p ^R#IH$p VP                  V P2                  4      pVe%   \K        VR$4      '       d   VPL                  ! R;/ VB p\K        T R(4      '       d   T PT                  ! R;/ TB pRT9  Ed   T \
        J g   RT P                  9  Ed   TP                  R
R4      pTP                  RR4      pTP                  R)4      ;'       g    / pTe   T'       d   TPW                  4        UUu/ uF	  w  ppTTbK  	  pppTPW                  4        Fb  w  pp\Y        T4      p\9        T4      pTP                  T4      p T '       g   K7  T T8w  g   K@  TT9  g   KI  TP                  T 4      TT&   TTT&   Kd  	  \Z        P\                  ! TP^                  TTR*7      p!T!e   T!TR&   TP^                  P`                  p"T"Pb                  ^ 8  d'   TPe                  R+T"Pf                  ;'       g    R,4       T"Ph                  ^ 8  d'   TPe                  R-T"Pj                  ;'       g    R.4       T"Pl                  ^ 8  d'   TPe                  R/T"Pn                  ;'       g    R04       T# VfJ   \%        V\8        4      '       d4   \        P                  P	                  V4      '       d   VVR
&   VR
,          pVfJ   \%        V\8        4      '       d4   \        P                  P	                  V4      '       d   VVR&   VR,          pVf   V P,                  e   V P,                  P2                  R8X  dq   \%        V\         4      '       d[   R7 V(3R8 llo(. R<Op%\u        4       p&V% F+  p'V'V9   g   K  V&Pw                  S(! VV',          .4      4       K-  	  \y        VV&R97      pVVR&   V#   + '       g   i     EL; iu upi u uppi u uppi u upi   \N         d2   p\P        PS                  R%T P2                   R&T R'24        Rp?ELRp?ii ; iu uppi   \N         dZ   p\P        PS                  R1T R2T R324       ^R4IH8p# T#! TTP                  R54      R67      p$T$Ps                  4       TR&    Rp?T# Rp?ii ; i)=z
Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
models, tekken.json, vocab/merges).
r#   N__init__tokenizer_objectutf-8encodingmodeltyper   vocabr   mergesadded_tokenspost_processortokenizer_paddingtokenizer_truncation_json_truncation_json_padding
normalizerSequencenormalizersPrecompiledprecompiled_charsmap_spm_precompiled_charsmapr!   r"    r$   merges_fileztekken.json)MistralConverter)r$   .model)SentencePieceExtractor)SLOW_TO_FAST_CONVERTERSconvert_from_spmz,Could not reorder vocab using converter for z due to z/. Falling back to raw SentencePiece extraction.convert_from_spm_modeladded_tokens_decoder)protor/   r0   	bos_token<s>	eos_token</s>	unk_tokenz<unk>z+Could not extract SentencePiece model from z$ using sentencepiece library due to z%. Falling back to TikToken extractor.)TikTokenConverterextra_special_tokens)r$   rM   c                \    V ^8  d   QhR\         \        ,          R\        \        ,          /# )   valuesreturn)r   r   liststr)formats   "/Users/mitch_tango/dev/rabbit-r1-livekit/agent/.venv/lib/python3.14/site-packages/transformers/tokenization_utils_tokenizers.py__annotate__@TokenizersBackend.convert_to_native_format.<locals>.__annotate__)  s"     	! 	!Xc] 	!tCy 	!    c                    < . pV  FX  pVf   K	  \        V\        \        34      '       d   VP                  S! V4      4       K>  VP	                  \        V4      4       KZ  	  V# N)
isinstancerR   tupleextendappendrS   )rP   	collectedval_iter_special_tokenss   &  rU   ra   HTokenizersBackend.convert_to_native_format.<locals>._iter_special_tokens)  sY    ')	!C{ !#e}55!(()=c)BC!((S2 " ! rX   )skip_tokens)Nr    )		pad_tokenrK   rG   rI   	sep_token	cls_token
mask_tokenadditional_special_tokensrM   )=dictpopospathisfiler&   __dict__TokenizerFast	from_fileopenjsonloadgetfrom_strdumpsr2   padding
truncationr[   rR   base64	b64decoder-   mapr\   __name__	enumerategetattrrS   splitendswithconvert_slow_tokenizerr?   extract_vocab_merges_from_modelrA   extractrB   hasattrrC   	ExceptionloggerwarningrD   itemsintr   build_tokenizer_from_spm_protorF   trainer_specbos_id
setdefault	bos_pieceeos_id	eos_pieceunk_id	unk_piecerL   	convertedsetupdater   ))clstrust_remote_codekwargslocal_kwargsfast_tokenizer_filetokenizer_handletokenizer_json
model_typeminimal_tokenizer_jsonminimal_modeltok_from_filenormalizer_configr7   rz   r/   itemitokenr0   merger$   r>   r?   rA   	extractorrB   converter_classerE   token_idid_to_token	new_tokencurrent_tokenr)   
proto_specrL   	converterspecial_tokens_keysrc   keyra   s)   &&,                                     @rU   convert_to_native_format*TokenizersBackend.convert_to_native_formate   sI	    F|*../?F  +233))Zs||-KO`/</F/FGZ/[L+, ,@S1T1T )G<<@P!%+;!< = (++GR8<<VDJ!22)-n)=& $^G%< =)+g&&.0M(+2?w/9;&~6 - 6 6tzzBX7Y Z - 7 78K L-:-I-IL)*0=0E0EL,-3@3K3KL/0 ''33@3K3K/0$$00=0E0E_- !/ 2 2< @ $((6*D(9-(H%#$5t<<):(;%"3J!~~f->CY]gCg%DJDTDT&'=>E%@A  #4 #&&w377FEyy eT** UE!23E##y0eT**uE!HtUZm9\9\5:;UTU4[UE;##{22;E2BC2Bha2BC##u,		0B0Bk0QeT**_hin_op_oS[STV[E4)@)@U1XeQN_oEp$)! gt4J>--gr::
zObObfkOk'0:kqrkqbgZs5K5K%C 01QVW\Q]]kqr)/X&!%%l3
"&&}5  )!!(+ j#&&:+>+>}+M+MRTRYRYR`R`akRlRl@<L%=--j9 :L!<#9   j#&&277>>*+E+E*J]J]^fJgJgFIJ 3:>	(00KlK	O&=&A&A#,,&OO&2wPb7c7c'6'G'G'W,'W
 3 899#&#=#=#M#ML
 &\9,,
#,,0N(,,Wd;E)--h=F ,8+;+;<R+S+Y+YWY((-ANSkkm&\m?5(xm&\3G3M3M3O/Hi'*8}H(+II,7OOH,EM,})1KPYafPf3899]3Ki 08AH 5 4P (4'R'R'oo#%($
 (3;K%78 &/__%A%A
%,,1(33KAUAUA^A^Y^_%,,1(33KAUAUA_A_Y_`%,,1(33KAUAUA`A`Y`a   =Z
C88RWW^^J=W=W$.L! )E>jc::rww~~k?Z?Z%0L"!(+F >cii3		8J8Je8SXbchjnXoXo	! 	!
# %(EK*,&&&';\#=N<O'PQ + %UDF%+L"[ =<<x <C q sF ! NNFs||nT\]^\_  `O  P & ']6  
IA*Mqrsqt u: : F-)@P@PQg@h	 4=3F3F3H/0
Is   f#	f7	f<6.g<Ag+h 
A	g Bh h &h 9hAh h h $Bh 36h *6h !h #f4	h	&h>h h			h i6Ai11i6c           
     .  < VP                  R R4      pVP                  RR4      pVP                  RR4       VP                  RR4      pVP                  RR4      pVP                  RR4      pVP                  R/ 4      pVP                  RR	4      p	VP                  R
4      p
VP                  R4      pVP                  R4      pRpVe   \        P                  ! V4      pEMVe>   \        P
                  P                  V4      '       d   \        P                  ! V4      pEMVe   \        VP                  RR4      V3/ VB p\        V4      pVR,          R,          pVR,          pVR,          p\        VV4      w  ppVP                  V4       \        V4      ^ 8  d   VP                  V4       EMV P                  f   Ve   VeX   \        V\         4      '       d   TM%\#        V4       UUUu/ uF  w  pw  ppVVbK  	  upppp\        \%        VVRRR7      4      pM\        V\         4      '       d   \        \%        V. RRR7      4      pM\        V\&        4      '       dQ   V'       dI   \        V^ ,          \(        \&        34      '       d&   \        \+        WP                  R^ 4      R7      4      pMV P                  f   \-        R4      hVf7   Vf3   V P                  f%   VP/                  RR4       VP/                  RR4       Ve   Wn        V P                  f   \-        R4      hVP                  RR4      ;'       g!    V P                  P0                  ;'       g    TpVe   V P                  P2                  ! R7/ VB  VP/                  RVR,          4       VP/                  RVR ,          4       VP/                  R!VR!,          4       VP/                  R"VR#,          4       MV P                  P5                  4        VP                  R$R4      ;'       g!    V P                  P6                  ;'       g    TpVe   V P                  P8                  ! R7/ VB  VP/                  R%VR%,          4       VP/                  R&VR',          4       VP/                  R(VR ,          4       VP/                  RVR),          4       VP/                  R*VR*,          4       R+V9  d   R,VR+&   R-V9   ;'       g    R.V9   pVP                  R-R	4      V n        VP                  R.R	4      V n        VP                  R/R4      ;p'       d   VV P                  n        T;'       g    V P                  P>                  RJ V n         \B        S&V `  ! R7/ VB  V
e   Wn#        Wn$        V PJ                  V P                  n&        V PN                   Uu0 uF  p\Q        \S        V4      4      kK  	  pp\U        VPW                  4       R0 R17       UUu. uF#  w  pp\Q        \S        V4      4      V9  g   K!  VNK%  	  ppp\'        V PX                  P[                  4       4      V Uu. uF  p\]        V4      NK  	  up,           p V P^                  Pa                  4        F5  p!V!f   K	  \]        V!4      V 9  g   K  V!V9  g   K$  VPc                  V!4       K7  	  V Pd                   F/  p\]        V4      V 9  g   K  VV9  g   K  VPc                  V4       K1  	  \        V4      ^ 8  d   . p"V P^                  Pa                  4        U#u. uF  p#V#'       g   K  \]        V#4      NK  	  p$p#V Fw  p\        V\\        4      '       d   \g        VRR27      pM?\        V\f        4      '       d*   VPh                  '       g   \]        V4      V$9   d   RVn4        V"Pc                  V4       Ky  	  V"'       d   V Pk                  V"4        V P                  Pm                  4       p%V%R38  d   \q        V P                  R4R4      en   VP                  RR4       V Pr                  ! V P                  V Pt                  P                  RR4      3R5V Pt                  R6VP                  R6R4      /VB V n        V P@                  ;'       g    V P                  P>                  RJ V n         V P@                  '       d   V Pw                  4        R# R# u upppi u upi u uppi u upi u up#i   \n         d    ^ p% ELi ; i)8r5   Nr6   r<   r)   	gguf_filer#   rE   add_prefix_spaceFr$   r/   r0   name_or_path configr   	tokenizertokenizer_configT)r/   r0   fuse_unkdropoutr   )r/   r   a9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.rG   rH   rI   rJ   z3The backend tokenizer is not correctly initialized.r4   
max_lengthtruncation_side	directionstridetruncation_strategystrategyr3   re   pad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofbackend
tokenizersadd_bos_tokenadd_eos_tokenr2   c                     V ^ ,          #     rd   )xs   &rU   <lambda>,TokenizersBackend.__init__.<locals>.<lambda>  s    STUVSWrX   r   )speciali pre_tokenizerinit_kwargsfix_mistral_regexrd   )<rk   ru   copydeepcopyrl   rm   rn   rp   rq   r   r   r   r   len
_tokenizerr[   rj   r~   r   rR   r\   r   
ValueErrorr   ry   enable_truncationno_truncationrx   enable_padding_add_bos_token_add_eos_tokenr2   _should_update_post_processorsuperr(   r$   r   split_special_tokensencode_special_tokensrE   hashreprsortedr   added_tokens_encoderkeysrS   _special_tokens_maprP   r^   _extra_special_tokensr   r   
add_tokensget_vocab_sizeNotImplementedErrorr   _patch_mistral_regexr   update_post_processor)'selfargsr   r5   r6   r)   r   r   rE   r   r$   r/   r0   fast_tokenizer	gguf_path
gguf_paramarchitecturetokenizer_dictr   additional_kwargsr   w_
vocab_dict_truncation_paddingexplicit_bos_eos_in_kwargsr2   r   added_tokens_decoder_hashindextokens_to_addencoderspecial_token_valuetokenstall_named_tokens
vocab_size	__class__s'   &*,                                   rU   r(   TokenizersBackend.__init__H  s    "::&8$?

?D9 	

.5!::&8$?JJ{D1	$jj)94@%zz*@"E!::&8%@ZZ-


7#H%'!]]+;<N ,@S1T1T*445HIN"#FJJ~r$BIXQWXI-i8J%h/=L'4N)*<=0F|Uc0d-N-MM*+$%)/0__$):!&0&=&=UZcdiZjCkZjYQPVQRTUAqDZjCk
!.sF]ako/p!qE4((!.srTXbf/g!hE4((Uz%(UTXM7Z7Z!.wU::V^`aKb/c!d__$r  &+;+CH_k51k62%,O??"RSSjj!7>pp$//B\B\pp`p"OO--<<lK,EF/[1IJhH(=>3[5LMOO))+::148ddDOO<S<SddWdOO**6X6k8K+@A18M3JKnh{.CDlHX,>?2H=Q4RS F" ,F9%4%>%[%[/U[B["$jj%@$jj%@#ZZ(8$??>?-;DOO*-G-q-q4??KiKimqKq*"6"!(O 0040I0I-DHD]D]$^D]5T$u+%6D]!$^ !'';'A'A'C X
 XuDK (AA E X 	 

 t005578Ta;bTa5CJTa;bb $(#;#;#B#B#D"*&'w6;NVc;c$$%89	 $E //E5z(U--G$$U+ 0 }!F040H0H0O0O0QW0Q1UVA0QW&eS))&ud;Ez22 ===SZ;K-K(,e$ ' '	779J
 74??OT#R#^JJ{D)"77  $$^T: !,, #)**-@$"G	
 DO ..XX$//2P2PTX2X 	* ---&&( .o Dl~ %_

 <c"  X  # 	J	s<   2c'c.c3&c3c92
c>c>)d ddc                    < V ^8  d   QhRS[ /# rO   rQ   bool)rT   __classdict__s   "rU   rV   TokenizersBackend.__annotate__  s       rX   c                    R # )Trd   r   s   &rU   is_fastTokenizersBackend.is_fast  s    rX   c                    < V ^8  d   QhRS[ /# r  r  )rT   r  s   "rU   rV   r    s       rX   c                   RV P                   9   dx   V P                   R,          P                  R4      '       dP   \        V R4      '       d<   V P                  '       d*   \        P
                  P                  V P                  4      # R# R# )z
`bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
can only be `True` if the original `"sentencepiece.model"` was not deleted.
r$   r@   FT)vocab_files_namesr   r   r$   rl   rm   rn   r  s   &rU   can_save_slow_tokenizer)TokenizersBackend.can_save_slow_tokenizer  s`     4111d6L6L\6Z6c6cdl6m6mt\**tww~~doo66rX   c                J   < V ^8  d   QhRS[ RS[ R,          RS[S[ ,          /# )rO   save_directoryfilename_prefixNrQ   )rS   r\   )rT   r  s   "rU   rV   r    s-     ! !c !C$J !Z_`cZd !rX   c                   \         P                  P                  V4      '       g   \        P	                  R V R24       R# \         P                  P                  Y'       d
   VR,           MR\        R,          ,           4      p\         P                  P                  V P                  4      \         P                  P                  V4      8w  d   \        V P                  V4       V3# )zVocabulary path (z) should be a directoryN-r   r$   )
rl   rm   isdirr   errorjoinVOCAB_FILES_NAMESabspathr$   r   )r   r  r  out_vocab_files   &&& rU   save_vocabulary!TokenizersBackend.save_vocabulary  s    ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5  rX   c                   V P                   pV P                  pVf   V P                  '       d   RV n        V P                  pV P                  pVf   V P
                  '       d   RV n        V P                  '       d
   VR,           MR RV P
                  '       d   RV,           R,           MR 2pT V P                  '       d   RV,           R,           MR R	V P
                  '       d   RV,           R,           MR 2p. pV P                  '       d   VP                  W34       V P
                  '       d   VP                  W434       \        P                  ! WVVR
7      V P                  n
        R# )zU
Updates the underlying post processor with the current `bos_token` and `eos_token`.
NFz:0 r   z$A:0r=   z:0z:1z $B:1)singlepairspecial_tokens)rG   bos_token_idr   rI   eos_token_idr   r^   r   TemplateProcessingr   r2   )r   bosr+  eosr,  r(  r)  r*  s   &       rU   r   'TokenizersBackend.update_post_processor
  sF    nn((;4---!&Dnn((;4---!&D%)%7%7%7S5[R@[_[m[m[mcCiRVFVsuDvw0B0B0B39t+K5gkgygygyRUX[R[^bRb  @B  QC  D!!3"56!!3"56)3)F)F^*
&rX   c                    \        V R R4      # )r   Fr   r  s   &rU   r   TokenizersBackend.add_eos_token$      t-u55rX   c                    \        V R R4      # )r   Fr2  r  s   &rU   r   TokenizersBackend.add_bos_token(  r4  rX   c                T    \         P                  V R V4       V P                  4        R# )r   Nobject__setattr__r   r   values   &&rU   r   r3  ,  !    4!159""$rX   c                T    \         P                  V R V4       V P                  4        R# )r   Nr8  r;  s   &&rU   r   r6  1  r=  rX   c           	        . pV P                   P                  4        Fg  pVf   K	  \        V\        4      '       d   VP	                  V4       K2  \        V\
        4      '       g   KJ  VP	                  \        VRRR7      4       Ki  	  V P                   Fa  p\        V\        4      '       d   VP	                  V4       K,  \        V\
        4      '       g   KD  VP	                  \        VRRR7      4       Kc  	  V'       d   V P                  VRR7       \        V RR4      '       g   V P                  P                  f   V P                  4        R# R# )a3  
Post-initialization hook that runs after the tokenizer is fully set up.
This is called by from_pretrained() after loading the tokenizer, which allows
us to add any special tokens that may have been passed as AddedToken objects.

Child classes should call super()._post_init() if they override this method.
NTF)r   
normalized)r*  r   )r   rP   r[   r   r^   rS   r   r   r   r   r2   r   )r   r  token_valuer   s   &   rU   
_post_initTokenizersBackend._post_init6  s    33::<K"+z22$$[1K--$$ZTV[%\] = //E%,,$$U+E3''$$ZtPU%VW	 0 OOM$O?48$??4??CaCaCi&&( DjrX   c                    < V ^8  d   QhRS[ /# r  r   )rT   r  s   "rU   rV   r  W  s     G GC GrX   c                :    V P                   P                  RR7      # )z@
`int`: Size of the base vocabulary (without the added tokens).
Fwith_added_tokensr   r   r  s   &rU   r  TokenizersBackend.vocab_sizeV  s    
 ---FFrX   c                6   < V ^8  d   QhRS[ S[S[3,          /# r  rj   rS   r   )rT   r  s   "rU   rV   r  ]  s     A A4S> ArX   c                :    V P                   P                  R R7      # )TrG  )r   	get_vocabr  s   &rU   rN  TokenizersBackend.get_vocab]  s    ((4(@@rX   c                6   < V ^8  d   QhRS[ S[S[3,          /# r  rL  )rT   r  s   "rU   rV   r  a  s        tCH~  rX   c                "    V P                  4       # rZ   )rN  r  s   &rU   r/   TokenizersBackend.vocab`  s    ~~rX   c                6   < V ^8  d   QhRS[ S[S[3,          /# r  rL  )rT   r  s   "rU   rV   r  e  s     n nd38n nrX   c                    \        V P                  P                  4       R R7       UUu/ uF  w  rVP                  VbK  	  upp# u uppi )z
Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
optimisation in `self._added_tokens_encoder` for the slow tokenizers.
c                     V ^ ,          # r   rd   r   s   &rU   r   8TokenizersBackend.added_tokens_encoder.<locals>.<lambda>j      dhijdkrX   r   r   rE   r   contentr   vks   &  rU   r   &TokenizersBackend.added_tokens_encoderd  s?     *00I0I0O0O0QWk)lm)l		1)lmmm   Ac                6   < V ^8  d   QhRS[ S[S[3,          /# r  )rj   r   r   )rT   r  s   "rU   rV   r  m  s     : :d3
?&; :rX   c                6    V P                   P                  4       # )z
Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

Returns:
    `dict[str, int]`: The added tokens.
)r   get_added_tokens_decoderr  s   &rU   rE   &TokenizersBackend.added_tokens_decoderl  s     7799rX   c                6   < V ^8  d   QhRS[ S[S[3,          /# r  rL  )rT   r  s   "rU   rV   r  {  s     n nc3h nrX   c                    \        V P                  P                  4       R R7       UUu/ uF  w  rVP                  VbK  	  upp# u uppi )z
Returns the added tokens in the vocabulary as a dictionary of token to index.

Returns:
    `dict[str, int]`: The added tokens.
c                     V ^ ,          # r   rd   rV  s   &rU   r   3TokenizersBackend.get_added_vocab.<locals>.<lambda>  rX  rX   r   rY  r[  s   &  rU   get_added_vocab!TokenizersBackend.get_added_vocab{  s?     *00I0I0O0O0QWk)lm)l		1)lmmmr_  c                    < V ^8  d   QhRS[ /# r  r  )rT   r  s   "rU   rV   r    s      $ rX   c                    R# )z>
Returns True, to avoid expensive `assert tokenizer` gotchas.
Trd   r  s   &rU   __bool__TokenizersBackend.__bool__  s     rX   c                    < V ^8  d   QhRS[ /# r  rE  )rT   r  s   "rU   rV   r    s     F F FrX   c                :    V P                   P                  RR7      # )z4
Size of the full vocabulary with the added tokens.
TrG  rI  r  s   &rU   __len__TokenizersBackend.__len__  s     ---EErX   c                    < V ^8  d   QhRS[ /# r  )rp   )rT   r  s   "rU   rV   r    s      = rX   c                    V P                   # )zS
`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
)r   r  s   &rU   backend_tokenizer#TokenizersBackend.backend_tokenizer  s    
 rX   c                    < V ^8  d   QhRS[ /# r  )DecoderFast)rT   r  s   "rU   rV   r    s     ' ' 'rX   c                .    V P                   P                  # )zE
`tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
)r   decoderr  s   &rU   ry  TokenizersBackend.decoder  s    
 &&&rX   Tc                   < V ^8  d   QhRS[ RS[R,          RS[R,          RS[RS[RS[RS[R	S[R
S[S[S[S[3,          S[S[ ,          3,          /	# )rO   r,   return_token_type_idsNreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverboserQ   )EncodingFastr  r\   rj   rS   r   rR   )rT   r  s   "rU   rV   r    s     -( -(-(  $d{-(  $d{	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-(rX   c	                   Vf   RV P                   9   pVf   RV P                   9   pV'       d$   VP                  e   V.VP                  ,           p	MV.p	\        \        4      p
V	 EF  pV
R,          P	                  VP
                  4       V'       d#   V
R,          P	                  VP                  4       V'       d#   V
R,          P	                  VP                  4       V'       d#   V
R,          P	                  VP                  4       V'       d#   V
R,          P	                  VP                  4       V'       g   K  V
R,          P	                  \        VP
                  4      4       EK  	  W3# )ar  
Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
of encodings, take care of building a batch from overflowing tokens.

Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
lists (overflows) of lists (tokens).

Output shape: (overflows, sequence length)
token_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingr   )model_input_namesoverflowingr   rR   r^   idstype_idsr  r  offsetsr   )r   r,   r|  r}  r~  r  r  r  r  	encodingsencoding_dictr   s   &&&&&&&&&   rU   _convert_encoding#TokenizersBackend._convert_encoding  s   ( !($48N8N$N! ($48N8N$N!$)=)=)I!
X%9%99I!
I#D)A+&--aee4$./66qzzB$./66q7G7GH)34;;A<Q<QR%./66qyyA}h'..s155z:  ''rX   c                &   < V ^8  d   QhRS[ RS[/# )rO   r   rQ   )rS   r   )rT   r  s   "rU   rV   r    s        rX   c                \    V P                   P                  V4      pVf   V P                  # V# rZ   )r   token_to_idunk_token_id)r   r   r  s   && rU   #_convert_token_to_id_with_added_voc5TokenizersBackend._convert_token_to_id_with_added_voc  s,    ++E2=$$$rX   c                4   < V ^8  d   QhRS[ RS[R,          /# )rO   r  rQ   N)r   rS   )rT   r  s   "rU   rV   r    s     7 7# 7#* 7rX   c                J    V P                   P                  \        V4      4      # rZ   )r   r   r   )r   r  s   &&rU   _convert_id_to_token&TokenizersBackend._convert_id_to_token  s    **3u:66rX   c                F   < V ^8  d   QhRS[ S[S[,          ,          RS[/# )rO   
new_tokensrQ   )rR   rS   r   r   )rT   r  s   "rU   rV   r    s&     6 6d3+;&< 6WZ 6rX   c                ~    V'       d   V P                   P                  V4      # V P                   P                  V4      # rZ   )r   add_special_tokensr   )r   r  r*  s   &&&rU   _add_tokensTokenizersBackend._add_tokens  s/    ??55jAA))*55rX   c                &   < V ^8  d   QhRS[ RS[/# )rO   r)  rQ   )r  r   )rT   r  s   "rU   rV   r    s     ? ?d ?s ?rX   c                8    V P                   P                  V4      # )a  
Returns the number of added tokens when encoding a sequence with special tokens.

<Tip>

This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
this inside your training loop.

</Tip>

Args:
    pair (`bool`, *optional*, defaults to `False`):
        Whether the number of added tokens should be computed in the case of a sequence pair or a single
        sequence.

Returns:
    `int`: Number of special tokens added to sequences.
)r   num_special_tokens_to_add)r   r)  s   &&rU   r  +TokenizersBackend.num_special_tokens_to_add  s    & 88>>rX   c                l   < V ^8  d   QhRS[ S[S[ ,          ,          RS[RS[S[S[,          ,          /# )rO   r  skip_special_tokensrQ   r   rR   r  rS   )rT   r  s   "rU   rV   r    s7      tCy t `cfjknfo`o rX   c                F   \        V\        4      '       d   V P                  P                  V4      # . pV'       d   \	        V P
                  4      M	\	        4       pV F@  p\        V4      pWT9   d   K  VP                  V P                  P                  V4      4       KB  	  V# )a  
Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
added tokens.

Args:
    ids (`int` or `list[int]`):
        The token id (or token ids) to convert to tokens.
    skip_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to remove special tokens in the decoding.

Returns:
    `str` or `list[str]`: The decoded token(s).
)r[   r   r   r   r   all_special_idsr^   )r   r  r  r  ids_to_skipr  s   &&&   rU   convert_ids_to_tokens'TokenizersBackend.convert_ids_to_tokens  s     c3??..s333Fc$../CEEJE#MM$//55e<=	 
 rX   c          	      P   < V ^8  d   QhRS[ RS[ R,          RS[RS[S[ ,          /# )rO   textr)  Nr  rQ   )rS   r  rR   )rT   r  s   "rU   rV   r    s:     v vS vd
 vt vjnorjs vrX   c           	     N    V P                   ! RR VRVRV/VB P                  4       # )r  	text_pairr  rd   )_encode_plusr  )r   r  r)  r  r   s   &&&&,rU   tokenizeTokenizersBackend.tokenize  s0      ldldlOaleklssuurX   c                Z   < V ^8  d   QhRS[ RS[RS[RS[RS[R,          RS[R,          /# )rO   padding_strategyr   r   r   r   Nr   )r   r   r   rS   )rT   r  s   "rU   rV   r    sW     I9 I9)I9 0I9 	I9
 I9  $JI9 DjI9rX   c                    V P                   P                  pV P                   P                  pV\        P                  8X  d    Ve   V P                   P                  4        MhRVRVRVP                  RV P                  /p	Vf   Rp
M!V	 Uu/ uF  qVP                  VR4      bK  	  p
pW8w  d   V P                   P                  ! R/ V	B  V\        P                  8X  d#   Ve   V P                   P                  4        R# R# V\        P                  8X  d   TMRpRTRVe   TMV P                  RV P                  RV P                   R	V P"                  R
V/p	W8w  d   V P                   P$                  ! R/ V	B  R# R# u upi )a  
Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
library) and restore the tokenizer settings afterwards.

The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
section.

Args:
    padding_strategy ([`~utils.PaddingStrategy`]):
        The kind of padding that will be applied to the input
    truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
        The kind of truncation that will be applied to the input
    max_length (`int`):
        The maximum size of a sequence.
    stride (`int`):
        The stride to use when handling overflow.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
        the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
    padding_side (`str`, *optional*):
        The side on which the model should have padding applied. Should be selected between ['right', 'left'].
        Default value is picked from the class attribute of the same name.
Nr   r   r   r   r   pad_idre   r   r   rd   )r   ry   rx   r   DO_NOT_TRUNCATEr   r<  r   ru   r   r   
DO_NOT_PAD
no_padding
MAX_LENGTHr   pad_token_idre   r   r   )r   r  r   r   r   r   r   r   r   targetcurrentr]  r   s   &&&&&&&      rU   set_truncation_and_padding,TokenizersBackend.set_truncation_and_padding  sh   B oo00??**"4"D"DD&--/ j&/55T11	F "@FG1kooa66G 11;F;999#**, $ $47Q7Q#QZW[F&\-E\4K\K\$++T^^t55$&8F !..88 "% Hs   
E;c          (        < V ^8  d   QhRS[ S[,          S[S[ ,          ,          S[S[,          ,          RS[ S[,          S[S[ ,          ,          S[S[,          ,          R,          RS[RS[RS[RS[R,          RS[R	S[R
S[R,          RS[R,          RS[R,          RS[R,          RS[R,          RS[RS[RS[RS[RS[RS[R,          RS[/# )rO   r  r  Nr  r  r   r   r   is_split_into_wordsr   r   return_tensorsr|  r}  r~  r  r  r  r  r   rQ   )	r   r   rR   r  r   r   r   rS   r   )rT   r  s   "rU   rV   r  Y  s?    X X++d9o=EV@WWX 004	?BTJ[E\\_ccX !	X
 *X 0X $JX X "X  $JX DjX tX  $d{X  $d{X $(X  %)!X" !%#X$ %X& 'X( #Tk)X, 
-XrX   c                   R  pV! V4      '       g   \        R4      hVe   V! V4      '       g   \        R4      hV'       dG   \        V\        \        34      ;'       d(    T;'       d    \        V^ ,          \        \        34      pM\        V\        \        34      pV'       d~   \        V\        4      '       d   \        R4      hVe>   \        V4      \        V4      8w  d%   \        R\        V4       R\        V4       R24      hVe   \        \        W4      4      MTpMV'       d   W3.MV.p\        V\        \        34      '       g   \        R\        V4       R24      hV P                  VVVVV	V
R7       Vf   V P                  pV P                  P                  V8w  d   VV P                  n        V P                  P                  VVVR	7      pV Uu. uF  pV P                  VVVVVVVVR
7      NK  	  pp/ pV^ ,          ^ ,           F0  pV UUUu. uF  w  ppVV,           F  pVNK  	  K  	  p pppV VV&   K2  	  V UUUu. uF  w  ppV F  pVNK  	  K  	  p!pppV'       dA   . p"\        V4       F*  w  p#w  p$pV"V#.\        V$R,          4      ,          ,          p"K,  	  V"VR&   VR,           F  p%V P!                  V%VV4       K  	  \#        VV!VR7      p&V'       g}   Vfy   V'       gq   \#        V&P%                  4        UU'u/ uF?  w  pp'T\        V'4      ^ 8  d'   \        V'^ ,          \        4      '       d
   V'^ ,          MT'bKA  	  up'pV&P&                  4      p&V&# u upi u upppi u upppi u up'pi )c                    \        V \        4      '       d   R # \        V \        \        34      '       Ed	   \	        V 4      ^ 8X  d   R # \        V ^ ,          \        4      '       d   R # \        V ^ ,          \        \        34      '       d   \	        V ^ ,          4      ^ 8X  g%   \        V ^ ,          ^ ,          \        4      '       d   R # \        V ^ ,          ^ ,          \        \        34      '       dJ   \	        V ^ ,          ^ ,          4      ^ 8H  ;'       g&    \        V ^ ,          ^ ,          ^ ,          \        4      # R# R# R# )TF)r[   rS   rR   r\   r   )r  s   &rU   _is_valid_text_input<TokenizersBackend._encode_plus.<locals>._is_valid_text_inputq  s    !S!!Ae}--q6Q;!c**!tUm441Q4yA~AaDGS)A)A##AaDGdE];;"1Q47|q0OOJqtAwqz34OO$ rX   ztext input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs).zdwhen tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.zbatch length of `text`: z- does not match batch length of `text_pair`: .z:batch_text_or_text_pairs has to be a list or a tuple (got ))r  r   r   r   r   r   )r  is_pretokenized)r,   r|  r}  r~  r  r  r  r  r  overflow_to_sample_mapping)tensor_type)r   r[   rR   r\   rS   	TypeErrorr   zipr.   r  r   r   r   encode_batchr  r~   &_eventual_warn_about_too_long_sequencer   r   r  )(r   r  r  r  r  r   r   r   r  r   r   r  r|  r}  r~  r  r  r  r  r   r   r  
is_batchedbatch_text_or_text_pairsr  r,   tokens_and_encodingssanitized_tokensr   r   r   r   stacksanitized_encodingsr  r   toksr  batched_outputr<  s(   &&&&&&&&&&&&&&&&&&&&,                   rU   r  TokenizersBackend._encode_plusY  s   0	( $D))W 
  )=i)H)HW  #D4-8hhThhjQUVWQX[_afZgFhJ#D4-8J)S))  $Tc)n)D .s4yk :I'q*  FOEZtC,@'A`d$ ?H(9':dV$ 2UDMBBLTRjMkLllmn  	''- 3!1% 	( 	
  '#'#<#< ??004HH4HDOO1 OO00$1/ 1 
	$ & 
 & ""!&;&;*C+E'=+ # 	 & 	  
 '*1--C&:N&:74DIIqQIQ&:EN$)S! . 1ES0DWQdqdq0DS %)+& )*> ?9D!*qcC[8I4J.JJ* !@=W9:)+66I77	:wW 7 ''79LZhi n4=V* '5&:&:&<&<
U c%j1nE!Hd9S9S%(Y^^&< ((N W 
" OS"s   "M%M*
M1AM8
c                6   < V ^8  d   QhRS[ S[,          RS[/# )rO   r  rQ   )rR   rS   )rT   r  s   "rU   rV   r    s     
 
tCy 
S 
rX   c                    V P                   P                  e&   V P                   P                  P                  V4      # RP                  V4      # )Nr=   )rt  ry  decoder!  )r   r  s   &&rU   convert_tokens_to_string*TokenizersBackend.convert_tokens_to_string  sJ     %%--9 ""**11&9	
 &!	
rX   c                `   < V ^8  d   QhRS[ S[S[ ,          ,          RS[RS[R,          RS[/# )rO   	token_idsr  clean_up_tokenization_spacesNrQ   r  )rT   r  s   "rU   rV   r    s?     ) )c?) ") '+Tk	) 
)rX   c                   VP                  R R4       \        V\        4      '       d   V.p\        V\        4      '       d
   VR,          pV P                  P                  WR7      pVe   TMV P                  pV'       d   \        V P                  P                  4      P                  R8X  dB   V P                  '       g0   \        P                  RV P                  P                   R24       V# V P                  V4      pV# )use_source_tokenizerNr  )r  r   z=Ignoring clean_up_tokenization_spaces=True for BPE tokenizer aE  . The clean_up_tokenization post-processing step is designed for WordPiece tokenizers and is destructive for BPE (it strips spaces before punctuation). Set clean_up_tokenization_spaces=False to suppress this warning, or set clean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_output=True to force cleanup anyway.)rk   r[   r   rj   r   r  r  r.   rt  r-   r}   Gclean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_outputr   warning_oncer	  clean_up_tokenization)r   r  r  r  r   r  s   &&&&, rU   _decodeTokenizersBackend._decode  s     	

)40i%%"Ii&&!+.I%%i%Y ,7 )22 	%
 (
 T++112;;uDddd##//0 1--  11$7rX   c                   < V ^8  d   QhRS[ S[P                  ,          RS[S[ R3,          RS[R,          RS[ R,          RS[S[ R3,          /# )rO   r  
file_names.legacy_formatNr  rQ   )rS   rl   PathLiker\   r  )rT   r  s   "rU   rV   r  %  s^      bkk) #s(O d{	
 t 
sCxrX   c                    \        V4      p\        P                  P                  Y'       d
   VR ,           MR\        ,           4      pV P
                  P                  V4       W%3,           pV# )r  r   )rS   rl   rm   r!  TOKENIZER_FILErt  save)r   r  r  r  r  r#   s   &&&&& rU   _save_pretrained"TokenizersBackend._save_pretrained%  s[     ^,o_s22Q__
 	##N3"33
rX   c           
        \         P                  ! V P                  P                  4       4      pVP	                  R4      pVP	                  R4      p	Rp
VR,          R,          R8X  d   / VR,          R&   . VR,          R&   MVR,          R,          R	8X  do   VR,          R
,          e\   VR,          R
,          pVR,          R,          V,          ^ ,          p
Ve   W9   d	   WZ,          p
^ VR,          R
&   V
R..VR,          R&   M?VR,          R,          R&9   d   / VR,          R&   M\        RVR,          R,           R24      hVeD   RVR,          9   d6   VR,          R,          V9   d!   WWR,          R,          ,          VR,          R&   \        P                  ! \         P                  ! V4      4      p. pV F  pVP	                  RR4      pVP	                  RR4      pVR,          R,          R	8w  d   V'       g   KF  Ve!   VR,          V9   d   W^R,          ,          VR&   VP                  \        R'/ VB 4       K  	  Ve   VP                  V4       VR,          R,          R8X  d-   RV9  d&   VR,          R,          e   VR,          R,          VR&   VR,          R,          R8X  d-   RV9  d&   VR,          R,          e   VR,          R,          VR&   VR,          R,          R	8X  d	   V
e   WR&   VR,          e   VR,          R,          R8X  gz   VR,          R,          R8X  d   RVR,          9   dx   \        ;QJ d*    R VR,          R,           4       F  '       g   K   RM	  RM! R VR,          R,           4       4      '       d"   \        P                  P                  4       VR&   \         VR,          R,          ,          pV! R'RVRV/VB pVP#                  WVR7       V	Een   \         P                  ! VP                  4       4      pRV	9   d   V	R,           F  pV	R,          V,          R,          pVe"   V Uu. uF  pVP%                  VV4      NK  	  ppVV	R,          V,          R&   V F#  pVP'                  V4      pVe   K  \        R 4      h	  V Uu. uF  pVP'                  V4      NK  	  upV	R,          V,          R!&   K  	  R( FS  pVV	9   g   K  V	V,          w  ppVe   VV9   d
   VV,          pVP'                  V4      pVf   \        R 4      hVV.V	V&   KU  	  V	VR&   \        P                  ! \         P                  ! V4      4      pV P(                  P+                  4       p\,        P.                   F  p\1        V V4      f   K  \1        V V4      pVe   VV9   d
   VV,          pV P2                  P%                  VR4      p\5        V\        4      '       d?   \        VVP6                  VP8                  VP:                  VP<                  RR"7      VV&   K  VVV&   K  	  V P>                  '       d   V P>                  P+                  4       M. pVe   VP                  V4       \A        V4      ^ 8  d   VVR#&   WR$&    V PB                  ! R'/ VB # u upi u upi   \D         dI   pR%\G        T4      9   d3   TP	                  R$R4       T PB                  ! R'/ TB pTTn        Tu Rp?# h Rp?ii ; i))u  
Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
as the current one.

Args:
    text_iterator (generator of `list[str]`):
        The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
        if you have everything in memory.
    vocab_size (`int`):
        The size of the vocabulary you want for your tokenizer.
    length (`int`, *optional*):
        The total number of sequences in the iterator. This is used to provide meaningful progress tracking
    new_special_tokens (list of `str` or `AddedToken`, *optional*):
        A list of new special tokens to add to the tokenizer you are training.
    special_tokens_map (`dict[str, str]`, *optional*):
        If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
        token name to new special token name in this argument.
    kwargs (`dict[str, Any]`, *optional*):
        Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

Returns:
    [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
    `text_iterator`.

r1   r2   Nr-   r.   r   r/   r0   r   r   g        z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.rK   r   idrZ  continuing_subword_prefixend_of_word_suffixr   	ByteLevelr8   pretokenizersc              3   :   "   T F  pVR ,          R8H  x  K  	  R# 5i)r.   r  Nrd   ).0pretokenizers   & rU   	<genexpr><TokenizersBackend.train_new_from_iterator.<locals>.<genexpr>  s!      (X !(K7(Xs   TFinitial_alphabetr  r*  )r   trainerr  zQAttempted to set a token in the post processor that does not exist in the mappingr  )single_wordlstriprstripr@  r   rM   r)   z7multiple values for keyword argument 'tokenizer_object')r!   r"   rd   )r   sep)$rs   loadsr   to_strrk   r   rp   rv   rw   r^   r   r]   anypre_tokenizers_fastr  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorru   r  r   r   r   SPECIAL_TOKENS_ATTRIBUTESr   r   r[   r  r  r  r@  rM   r   r	  r  rS   )r   text_iteratorr  r   new_special_tokensspecial_tokens_mapr   r   r1   r2   rK   r   r   r*  added_tokenr   r   trainer_classr   trained_tokenizer_jsonr   r  r   r   special_tokenspecial_token_fullrM   r   new_tokenizers   &&&&&&,                      rU   train_new_from_iterator)TokenizersBackend.train_new_from_iterator6  s   D DOO$:$:$<=%)).9'++,<=	'"6*e3/1N7#G,02N7#H-G$V,	9g&x0<'0:*73G<VDQG	%1i6U 2 =I45w'15>4D3Ew'0G$V,0JJ/1N7#G,Mn]dNeflNmMn o> >  *~g66w'48JJ3EU\F]^iFj3kN7#K0!**4::n+EF	 'K!ooi6Gd+Ag&v.);G!-+i2HL^2^);	<R)SI&!!*";{";< ( )!!"45 7#F+u4+69w'(CDP2@2IJe2fF./7#F+u4$F2w'(<=I+9'+BCW+XF'('"6*i7I<Q"+;/*6/7;F!/26:jH#~o'FFC (6(G(XCCC (6(G(X  
 .A-J-J-S-S-U)*01H1PQ_:_n_X^_%%mG%T%%)ZZ	0@0@0B%C">1)*:;;C+,<=cB8LF)5TZ![TZ5"4"8"8"FTZ![FLN#34S9(C!'#,#8#8#?#+", s#  "( ouCuntejIDYDYZ_D`ntCuN#34S9%@ < "0 N2-m<HE1)5%CU:U 25 9(44U;H'(o  6;H4EN=1 "0 8F"#34%..tzz:P/QRI!!&&(,FFEtU#/ 'e 4%1mGY6Y$6}$EM%)%=%=%A%A%%N"0*==$.%$6$B$B188188#5#@#@ $%F5M %2F5M% G* DHC\C\C\t88==?bd) ''(:;#$q(-AF)* &/!"	>>+F++{ "\ Dvj  	HCPQFR 

-t4 $ 8 8+4($$ 	s0   Z Z$Z) )[<4<[70[<6[77[<c
                  aa ^ RI o^ RIHp ^ RIHo ^ RIHp ^ RIHp V! ^R7      R VV3R ll4       pV'       g   \        4       '       d   R	pVEe   V'       g   V'       Egp   V! V4      '       Eda   V! VR
VVVRRVR7      pRpVe   \        VRR7      ;_uu_ 4       p\        P                  ! V4      pRRR4       XP                  R4      pVP                  R4      pV'       d<   VP                  V4      VP                  R4      8  d   V'       d   Ve
   VR9  d   V# M/V'       d(   VP                  V4      VP                  R4      8  d   V# R	pV'       g   V'       Egp   V! V4      '       Eda   V'       d   RV9   d   \        VRVR,          4       V	f<   \!        VRR4      '       g)   \        VRR4       \"        P%                  RV R24       V# V	R	J g   \!        VRR4      '       d   \        VRR	4       ^ RIpVP(                  P+                  VP-                  R4      RR7      pVP.                  p\1        VVP(                  P2                  4      '       d   VVP.                  ^ &   V# \1        VVP(                  P4                  4      '       d   VP(                  P7                  RRR7      pVP(                  P3                  VV.4      Vn        V#   + '       g   i     EL); i)a6  
Patches mistral related tokenizers with incorrect regex if detected
    1) Local file with an associated config saved next to it
        >> Model type one of the mistral models (on older versions)
    2) Remote models on the hub from official mistral models
        >> Tags including `base_model:.*mistralai`
N)	lru_cache)
model_info)versionr   )maxsizec                0    V ^8  d   QhR\         R\        /# )rO   model_idrQ   )rS   r  )rT   s   "rU   rV   <TokenizersBackend._patch_mistral_regex.<locals>.__annotate__  s     		 		c 		d 		rX   c                    <  S! V 4      pTP                  e4   SP                  RRP                  TP                  4      4      '       d   R# R #   \          d     R # i ; i)Fzbase_model:.*mistralair   T)r   tagssearchr!  )r  r-   r  res   & rU   is_base_mistral?TokenizersBackend._patch_mistral_regex.<locals>.is_base_mistral  sW    "8, zz%995rwwuzz7JKK  s   A AATzconfig.jsonF)	cache_dirr   local_files_only%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors_commit_hashr*   r+   transformers_versionr   z5.0.0r   z$The tokenizer you are loading from 'a  ' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.z[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+isolated)patternbehavior)r   	use_regex)mistralmistral3voxtral	ministralpixtral)r#  	functoolsr  huggingface_hubr  	packagingr  transformers.utils.hubr   r   rr   rs   rt   ru   parsesetattrr   r   r   r   pre_tokenizersSplitRegexr   r[   r8   	Metaspacer  )r   r   pretrained_model_name_or_pathr   r&  r'  r*  is_localr   r   r   r  r  r   r$  _config_filemistral_config_detectedf_configr+  transformers_model_typer   split_pretokenizercurrent_pretokenizerr  r#  s   &&&&&&&&&&,             @@rU   r   &TokenizersBackend._patch_mistral_regex  s   * 	'.%6	3			 		 
 		 00H(4XX/:W*X*X&-#!16;8=)	L ',#',99Q"iilG :'.{{3I'J$*1++l*C'
 (GMM:N,ORYR_R_`gRh,h 3?3   )()gmm<P.QU\UbUbcjUk.k$$*.'&xxOLi<j<j#6+#EI':KH[<\] %,WYH[]b5c5cI':EBNN>?\>] ^e eH ? '$.')EXZ_2`2`I':DA%)3)B)B)H)H * 0 0 s! ",	 *I *& ,5+B+B(!"6
8Q8Q8Z8Z[[5G	//2"  &&:J<U<U<_<_``3=3L3L3V3V16% 4W 40
 3=2K2K2T2T 2 43	/ O :99s   KK'	)r   r   r   r   r   r   r   r$   )FrZ   )NNFFFFT)NF)FN)NN)NNN)NNFNFNN)6r}   
__module____qualname____firstlineno____doc__r"  r  r-   r   classmethodr   r(   propertyr  r  r%  r   r   r   setterrB  r  rN  r/   r   rE   _added_tokens_encoder_added_tokens_decoderrh  rl  rp  rt  ry  r  r  r  r  r  r  r  r  r   r  r   r  r  r  r  r  r  r   __static_attributes____classdictcell____classcell__)r	  r  s   @@rU   r&   r&   S   st    
 *EJ` `Da)F    ! !
4 6 6 6 6 % % % %)@ G GA A     n n : : 10n n F F   ' '-( -(^ 7 76 6? ?* 4v vI9 I9\ gk#',;,F,F2D2T2T!%$))-#'&*-1-1*/+0',#,0)X Xt
 
) )V "CJ C C CrX   r&   )CrL  r   rs   rl   collectionsr   collections.abcr   shutilr   typingr   tokenizers.pre_tokenizersr;  r  r6  r   r   r   r   r	   r  r
   rp   tokenizers.decodersr   rw  tokenizers.modelsr   r   tokenizers.trainersr   r   r   r   r8  r   r   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utils_baser   r   r   r   r   r   r   utilsr   r   r    
get_loggerr}   r   r  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEADDED_TOKENS_FILEr
  r"  r&   PreTrainedTokenizerFastrd   rX   rU   <module>rg     s  
   	 # $   7 + - / 1 6 * ^ ^ . 0 5 =   @ ? 
		H	% "3 / '  (      
:~!!	  &~|EXY  ,-k/ k .k^) , rX   