+
    ~jO1             	       P   R t ^ RIt^ RIHt ^ RIHt ^ RIHt ^ RIH	t	H
t
HtHtHtHtHt ^ RIHtHtHt ^ RIHt ^RIHtHtHtHt ^R	IHt ]P8                  ! ]4      t. ROt]. RO,           t RR
 lt!R R lt"RR R llt# ! R R4      t$ ! R R]$4      t%R R lt& ! R R4      t' ! R R]'4      t( ! R R]'4      t) ! R R]'4      t* ! R R]'4      t+ ! R R ]'4      t, ! R! R"]'4      t- ! R# R$]'4      t. ! R% R&]'4      t/ ! R' R(]'4      t0 ! R) R*]'4      t1 ! R+ R,]'4      t2 ! R- R.]'4      t3 ! R/ R0]34      t4 ! R1 R2]34      t5 ! R3 R4]34      t6 ! R5 R6]34      t7 ! R7 R8]34      t8 ! R9 R:]34      t9 ! R; R<]34      t: ! R= R>]34      t; ! R? R@]34      t< ! RA RB]34      t= ! RC RD]34      t> ! RE RF]34      t? ! RG RH]34      t@ ! RI RJ]34      tA ! RK RL]34      tB ! RM RN]34      tC ! RO RP]'4      tD ! RQ RR]34      tE ! RS RT]'4      tF ! RU RV]'4      tG ! RW RX]'4      tH ! RY RZ]34      tI ! R[ R\]34      tJ ! R] R^]34      tK ! R_ R`]'4      tL ! Ra Rb]34      tM ! Rc Rd]34      tN ! Re Rf]34      tORg tP ! Rh Ri4      tQ ! Rj Rk4      tR/ Rl]4bRm]0bRn]5bRo](bRp]EbRq]HbRr]6bRs]FbRt]-bRu](bRv]2bRw]7bRx](bRy](bRz](bR{](bR|](b/ R}]4bR~]*bR]-bR].bR](bR](bR]0bR]<bR]0bR]0bR](bR]LbR]8bR]9bR]+bR](bR]0bC/ R]:bR],bR]AbR]/bR]>bR]?bR]0bR]1bR];bR](bR]BbR]CbR]DbR]<bR]=bR])bR]IbCR]KR]KR]JR]K/CtSRR R lltTR# )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
N)
Collection)	lru_cache)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm)is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERRORc                 >   \        4       '       d	   ^ RIHp V# \        4       '       dX   ^ RIp\
        P                  ! VP                  P                  4      \
        P                  ! R4      8  d	   ^ RI	Hp V# ^ RI	H
p V# \        \        P                  ! V 4      4      h)    )sentencepiece_model_pb2Nz4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles   &  x/Users/mitch_tango/dev/rabbit-r1-livekit/agent/.venv/lib/python3.14/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr$   _   sr    !##9&&==445g8NNB '& b&&/66}EFF    c                0    V ^8  d   QhR\         R\        /# )   add_prefix_spacereturn)boolstr)r    s   "r#   __annotate__r,   p   s      $ s r%   c                 L    V '       d   R p\        VRR4      '       g   RpV# RpV# )alwayslegacyTfirstnever)getattr)r(   original_tokenizerprepend_schemes   && r#   _get_prepend_schemer5   p   s4    !)8T::$N  !r%   c                H    V ^8  d   QhR\         \        ,          R,          /# )r'   skip_tokensN)r   r+   )r    s   "r#   r,   r,   z   s      jo6L r%   c                   a  Ve   \        V4      M	\        4       pVR JpV'       d   \        V4      MS p. pVP                  4        F  w  rVWR9   d   K  . p\        ^\	        V4      4       F>  pVR V WXR  rW9   g   W9   d   K  V	S 9   g   K"  V
S 9   g   K+  VP                  WV34       K@  	  \        VV 3R lR7      pVP                  V4       K  	  \        VR VR7      pV Uu. uF  q^ ,          V^,          3NK  	  ppV# u upi )Nc                 D   < SV ^ ,          ,          SV ^,          ,          3# r    )xvocabs   &r#   <lambda>!generate_merges.<locals>.<lambda>   s    U1Q4[%!+,Fr%   keyc                 Z    V ^,          \        V ^ ,          4      \        V ^,          4      3# r'   )lenvals   &r#   r>   r?      s    SVSQ[#c!f+,Nr%   rA   reverse)setdictitemsrangerD   appendsortedextend)r=   vocab_scoresr7   rH   mergesmergepiece_scorelocalindexpiece_lpiece_rrF   s   f&&         r#   generate_mergesrX   z   s   &1&=#k"35K$&G)04%eLF*0021c%j)E$Ve}eFmW%)?%Gu$4g<= * u"FGe 3 F NX_`F*01&31vs1v&F1M 2s   %Dc                   H   a  ] tR t^t o RtV 3R lR ltV 3R lR ltRtV tR# )SentencePieceExtractorzd
Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
c                    < V ^8  d   QhRS[ /# )r'   modelr+   )r    __classdict__s   "r#   r,   #SentencePieceExtractor.__annotate__   s     
 
c 
r%   c                   \        V R 4       \        V R4       \        4       pVP                  4       p\        VR4      ;_uu_ 4       pVP	                  VP                  4       4       RRR4       W0n        R#   + '       g   i     L; i)r   r   rbN)r   r$   
ModelProtoopenParseFromStringreadproto)selfr\   	model_pb2mfs   &&   r#   __init__SentencePieceExtractor.__init__   sd    $0$
+ $%	  "%!affh' 
 s    A77B	c                \   < V ^8  d   QhRS[ S[S[S[3,          S[S[ ,          3,          /# r'   r)   tuplerJ   r+   intlist)r    r^   s   "r#   r,   r_      s*     ! !uT#s(^T%[5P/Q !r%   c                   V P                   P                  P                   Vf2   ^ RIHpHp V P                   P                  P                  ^8X  d   TMTpV P                   P                   Uu. uF  qUP                  VP                  3NK  	  ppVP                  R8w  d)   V P                   P                  P                  VR&   WbR&   M9^RIHp \        V4       UU	U
u/ uF
  w  pw  rWbK  	  pp	pp
V! V4      pWbR&   WR&   \        V P                   P                  4       UUu. uF3  w  rVP                  R9   g   K  WP                  VP                  ^8H  3NK5  	  ppp\        VR R	7       UUUu. uF  w  rp\!        VR
VR7      NK  	  upppVR&   \#        V P                   P$                  RR4      VR&   V# u upi u up
p	pi u uppi u upppi )
By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
order the merges with respect to the piece scores instead.
N)r   r   r   unk_idr=   )rX   rQ   c                     V ^ ,          # r:   r;   r<   s   &r#   r>   0SentencePieceExtractor.extract.<locals>.<lambda>   s    QqTr%   r@   F
normalizedspecialadditional_special_tokensprecompiled_charsmap_spm_precompiled_charsmap      )rf   trainer_specru   tokenizers.modelsr   r   
model_typepiecespiecescore__name__tokenization_utils_baserX   	enumeratetyperN   r   r2   normalizer_spec)rg   r   kwargsr   r   r   r=   rX   iwordr   rQ   idpspm_added_tokenstokenr{   s   &&,              r#   extractSentencePieceExtractor.extract   s   
 	

&&6$(JJ$;$;$F$F!$KQTJ9=9J9JK9J++u{{+9JK%'#zz66==F8#7O@5>u5EF5E!1MTTW5EEF$U+F#7O%8 ENdjjN_N_D`uD`52dedjdjntdt6R!&&A+6D`u '--=>&R/
&R"7 u@&R/
*+ /6djj6P6PRhjn.o*+/ L G v/
s   . F:F?$G>G2Grf   N)	r   
__module____qualname____firstlineno____doc__rk   r   __static_attributes____classdictcell__r^   s   @r#   rZ   rZ      s     
 
! !r%   rZ   c                   6   a  ] tR t^t o RV 3R lR lltRtV tR# )GemmaSentencePieceExtractorNc                \   < V ^8  d   QhRS[ S[S[S[3,          S[S[ ,          3,          /# rn   ro   )r    r^   s   "r#   r,   (GemmaSentencePieceExtractor.__annotate__   s*      E$sCx.$u+2M,N r%   c                    V P                   p\        VP                  4       4       Uu/ uF  q2P                  V4      VbK  	  ppRV9  d   VP	                  R4      VR&   \        WA4      pWE3# u upi )rt   	<0x09>)sprL   GetPieceSizeid_to_piecegetrX   )rg   rP   r   rU   r=   rQ   s   &&    r#   r   #GemmaSentencePieceExtractor.extract   so    
 WW;@AR;ST;S%&-;ST u))H-E$K 5} Us   A-r;   N)r   r   r   r   r   r   r   r   s   @r#   r   r      s       r%   r   c                0    V ^8  d   QhR\         R\        /# )r'   r   r)   )r+   r*   )r    s   "r#   r,   r,      s     I Ic Id Ir%   c                     \        V 4      ^8  ;'       g1    V R,          R8g  ;'       g    V R,          P                  4       '       * # )r'   ,)rD   isdigit)r   s   &r#   check_number_commar      s9    u:>HHU2Y#-HHU2Y5F5F5H1HHr%   c                   8   a  ] tR t^t o R tV 3R lR ltRtV tR# )	Converterc                    Wn         R # r   r3   )rg   r3   s   &&r#   rk   Converter.__init__   s    "4r%   c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   Converter.__annotate__   s     $ $9 $r%   c                    \        4       hr   )NotImplementedErrorrg   s   &r#   	convertedConverter.converted   s    !##r%   r   N)r   r   r   r   rk   r   r   r   r   s   @r#   r   r      s     5$ $r%   r   c                   2   a  ] tR t^t o V 3R lR ltRtV tR# )BertConverterc                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   BertConverter.__annotate__        # #9 #r%   c           	     |   V P                   P                  p\        \        V\	        V P                   P
                  4      R 7      4      pRpRpRp\        V P                   R4      '       da   V P                   P                  P                  pV P                   P                  P                  pV P                   P                  P                  p\        P                  ! RVVVR7      Vn        \        P                  ! 4       Vn        \	        V P                   P"                  4      p\	        V P                   P$                  4      pV P                   P&                  pV P                   P(                  p	\*        P,                  ! V RV R2V RV RV R2Wh3Wy3.R	7      Vn        \0        P                  ! R
R7      Vn        V# )	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr3   r=   r   r   r+   r   hasattrr   tokenize_chinese_charsr   do_lower_caser	   BertNormalizer
normalizerr
   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr   decoder
rg   r=   	tokenizerr   r   r   clssepr   r   s
   &         r#   r   BertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r%   r;   Nr   r   r   r   r   r   r   r   s   @r#   r   r           # #r%   r   c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )SplinterConverteri  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   SplinterConverter.__annotate__  s     . .9 .r%   c           
        V P                   P                  p\        \        V\	        V P                   P
                  4      R 7      4      pRpRpRp\        V P                   R4      '       da   V P                   P                  P                  pV P                   P                  P                  pV P                   P                  P                  p\        P                  ! RVVVR7      Vn        \        P                  ! 4       Vn        \	        V P                   P"                  4      p\	        V P                   P$                  4      p\	        V P                   P&                  4      pRp	V P                   P(                  p
V P                   P*                  pV P                   P,                  pV P                   P/                  R4      pV P                   P0                  R8X  d   V RV RV	 RV R	V R
2
pMV RV R	V RV	 RV R
2
p\2        P4                  ! V RV R2VWj3W{3W3W3.R7      Vn        \8        P                  ! RR7      Vn        V# )r   Fr   Tr   .rightr    r   r   r   r   r   r   )r3   r=   r   r   r+   r   r   r   r   r   r   r	   r   r   r
   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r   r   )rg   r=   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s   &              r#   r   SplinterConverter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*##-#		$
	  %..d;	r%   r;   Nr   r   s   @r#   r   r     s     . .r%   r   c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )FunnelConverteri=  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   FunnelConverter.__annotate__>  r   r%   c           	     |   V P                   P                  p\        \        V\	        V P                   P
                  4      R 7      4      pRpRpRp\        V P                   R4      '       da   V P                   P                  P                  pV P                   P                  P                  pV P                   P                  P                  p\        P                  ! RVVVR7      Vn        \        P                  ! 4       Vn        \	        V P                   P"                  4      p\	        V P                   P$                  4      pV P                   P&                  pV P                   P(                  p	\*        P,                  ! V RV R2V RV RV R2Wh3Wy3.R	7      Vn        \0        P                  ! R
R7      Vn        V# )r   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
   &         r#   r   FunnelConverter.converted>  r   r%   r;   Nr   r   s   @r#   r   r   =  r   r%   r   c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )MPNetConverterid  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   MPNetConverter.__annotate__e  r   r%   c                   V P                   P                  p\        \        V\	        V P                   P
                  4      R 7      4      pRpRpRp\        V P                   R4      '       da   V P                   P                  P                  pV P                   P                  P                  pV P                   P                  P                  p\        P                  ! RVVVR7      Vn        \        P                  ! 4       Vn        \	        V P                   P"                  4      p\	        V P                   P$                  4      pV P                   P&                  pV P                   P(                  p	\*        P,                  ! V RV R2V RV RV RV R	2Wh3Wy3.R
7      Vn        \0        P                  ! RR7      Vn        V# )r   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
   &         r#   r   MPNetConverter.convertede  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=##$
	  %..d;	r%   r;   Nr   r   s   @r#   r  r  d  r   r%   r  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )OpenAIGPTConverteri  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   OpenAIGPTConverter.__annotate__  s      9 r%   c                   V P                   P                  p\        V P                   P                  P	                  4       4      pV P                   P
                  p\        \        VVR \        V4      RRR7      4      pVP                  \        V4      4      e   VP                  \        V4      .4       \        P                  ! RR7      Vn        \        P                  ! 4       Vn        \"        P$                  ! RR7      Vn        V# )N</w>F)r=   rQ   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r3   encoderrr   	bpe_rankskeysr   r   r   r+   token_to_idadd_special_tokensr	   r   r   r
   r   r   r   
BPEDecoderr   rg   r=   rQ   r   r   s   &    r#   r   OpenAIGPTConverter.converted  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	r%   r;   Nr   r   s   @r#   r  r    s      r%   r  c                   6   a  ] tR tRt o RV 3R lR lltRtV tR# )GPT2Converteri  Nc                   < V ^8  d   QhRS[ S[S[3,          R,          RS[S[S[S[3,          ,          R,          RS[/# r'   r=   NrQ   r)   rJ   r+   rq   rr   rp   r   )r    r^   s   "r#   r,   GPT2Converter.__annotate__  sF     " "tCH~4 "T%PSUXPX/EZ]aEa "mv "r%   c                x   V'       g   V P                   P                  pV'       g    \        V P                   P                  4      p\	        \        VVR RRRR7      4      p\        V P                   RR4      p\        P                  ! VR7      Vn	        \        P                  ! 4       Vn        \        V P                   RR4      '       dU   V P                   P                  pV P                   P                  p\        P                  ! V R2V R2WV3.R	7      Vn        V# \        P                  ! RR
7      Vn        V# )N Fr=   rQ   r  continuing_subword_prefixr  r  r(   r(   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r3   r  rr   r  r   r   r2   r
   	ByteLevelr   r   r   	bos_tokenbos_token_idr   r   r   )rg   r=   rQ   r   r(   bosr.  s   &&&    r#   r   GPT2Converter.converted  s   ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUCC))33C22??L'1'D'DguL)' (I$  (2';';'OI$r%   r;   NNr   r   s   @r#   r  r    s     " " "r%   r  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )HerbertConverteri  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   HerbertConverter.__annotate__        9 r%   c           
        R pRpV P                   P                  p\        V P                   P                  P	                  4       4      pW^ ,          ^ ,          9   d
   VR,          p\        \        VVRV P                   P                  VR7      4      p\        P                  ! RRR7      Vn
        \        P                  ! 4       Vn        \        P                  ! VR7      Vn        \"        P$                  ! V P                   P&                  V P                   P(                  3V P                   P*                  V P                   P,                  3R7      Vn        V# )	z	#version:r     NNN)r  r   r  F)r   r   r  )r   r   )r3   r  rr   r  r  r   r   r   r	   r   r   r
   r   r   r   r  r   r   BertProcessingr   r   r   r   r   )rg   tokenizer_info_strtoken_suffixr=   rQ   r   s   &     r#   r   HerbertConverter.converted  s   (''//d--77<<>?1-BZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 r%   r;   Nr   r   s   @r#   r3  r3          r%   r3  c                   6   a  ] tR tRt o RV 3R lR lltRtV tR# )Qwen2Converteri  Nc                   < V ^8  d   QhRS[ S[S[3,          R,          RS[S[S[S[3,          ,          R,          RS[/# r!  r"  )r    r^   s   "r#   r,   Qwen2Converter.__annotate__  sF     ( (tCH~4 (T%PSUXPX/EZ]aEa (mv (r%   c                T   V'       g   V P                   P                  pV'       g.   \        V P                   P                  P	                  4       4      p\        \        VVR R RRRRR7      4      p\        P                  ! 4       Vn	        \        P                  ! \        P                  ! \        R4      RRR7      \        P                  ! \        V P                   RR4      RR7      .4      Vn        \"        P                  ! 4       Vn        \&        P                  ! RR	7      Vn        V# )
Nr%  F)r=   rQ   r  r   r'  r  r  byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr(   r(   	use_regexr*  )r3   r  rr   r  r  r   r   r	   NFCr   r
   SequenceSplitr   r,  r2   r   r   r   r   r   )rg   r=   rQ   r   s   &&& r#   r   Qwen2Converter.converted  s   ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 r%   r;   r1  r   r   s   @r#   r@  r@    s     ( ( (r%   r@  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )RobertaConverteri  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   RobertaConverter.__annotate__        9 r%   c                   V P                   pVP                  p\        VP                  P	                  4       4      p\        \        VVR RRRR7      4      p\        P                  ! VP                  R7      Vn
        \        P                  ! 4       Vn        \        P                  ! VP                  VP                   3VP"                  VP$                  3VP                  RR7      Vn        V# )Nr%  Fr&  r(  Tr   r   r(   r+  )r3   r  rr   r  r  r   r   r
   r,  r(   r   r   r   r   RobertaProcessingr   r   r   r   r   rg   otr=   rQ   r   s   &    r#   r   RobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  r%   r;   Nr   r   s   @r#   rP  rP          r%   rP  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )RoFormerConverteri4  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   RoFormerConverter.__annotate__5  r   r%   c           	     d   ^RI Hp V P                  P                  p\	        \        V\        V P                  P                  4      R7      4      pRpRp\        V P                  R4      '       dA   V P                  P                  P                  pV P                  P                  P                  p\        P                  ! RRVVR7      Vn        \        P                   P#                  V! V4      4      Vn        \        V P                  P&                  4      p\        V P                  P(                  4      pV P                  P*                  pV P                  P,                  p	\.        P0                  ! V RV R2V RV R	V R
2Wh3Wy3.R7      Vn        \4        P
                  ! RR7      Vn        V# )r9  )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr`  r3   r=   r   r   r+   r   r   r   r   r   r	   r   r   r
   PreTokenizercustomr   r   r   r   r   r   r   r   r   r   )
rg   r`  r=   r   r   r   r   r   r   r   s
   &         r#   r   RoFormerConverter.converted5  sx   I''--iT=T=T=^=^9_`a	4**,=>> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r%   r;   Nr   r   s   @r#   r\  r\  4  r   r%   r\  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )DebertaConverteri[  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   DebertaConverter.__annotate__\  r6  r%   c                   V P                   pVP                  p\        VP                  P	                  4       4      p\        \        VVR RRRR7      4      p\        P                  ! VP                  R7      Vn
        \        P                  ! 4       Vn        \        P                  ! RRRV P                   P                  R4      3RV P                   P                  R4      3.R	7      Vn        V# )
Nr%  Fr&  r(  [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r3   r  rr   r  r  r   r   r
   r,  r(   r   r   r   r   r   r   r   rW  s   &    r#   r   DebertaConverter.converted\  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  r%   r;   Nr   r   s   @r#   rf  rf  [  r>  r%   rf  c                      a a ] tR tRt oRt]t/ t]RR l4       t	]
RR l4       tV 3R ltR tR tR tR	 tR
 tR tR tV3R lR ltRtVtV ;t# )SpmConverteriz  Fc                   V P                   P                  pV P                   P                  pV P                  P                  p\        V\        4      '       d&   \        \        TT;'       g    . VRVRR7      4      pMt\        V\        4      '       d]   V'       dU   \        V^ ,          \        \        ,          4      '       d-   \        \        VV P                   P                  VR7      4      pMR# \        P                  ! RR4      .pV'       d'   VP                  ^ \        P                   ! V4      4       \        P"                  ! V4      Vn        V'       d\   \&        P"                  ! \&        P                  ! RR4      \&        P(                  ! 4       \&        P*                  ! 4       .4      Vn        V# \&        P"                  ! \&        P                  ! RR4      .4      Vn        V# )z
Similar to convert_from_spm method, but used only when there is no `model_type` class, i.e. there is no matching class in `TOKENIZERS_MAPPING` and we just create a tokenizer instead of extracting stuff from the sentencepiece file
TN)r=   rQ   r   r  rD  r  )r=   ru   rD  r      ▁)r   rD  	unk_piecer   r}   
isinstancerJ   r   r   rr   rp   r   ru   r	   ReplaceinsertPrecompiledrL  r   r   ByteFallbackFuser   )rf   r=   rQ   rD  rs  r}   r   _normalizerss   &&&     r#   build_tokenizer_from_spm_proto+SpmConverter.build_tokenizer_from_spm_proto  s}   
 **88&&00	$44II eT""!!<<R'!"/ 	I t$$:eAhPT3U3U! --44"/I  $++C78;#:#:;O#PQ*33LA	  ( 1 1!!%-x/D/D/FX!I  !) 1 183C3CE33O2P QIr%   c                    Ve   WR&   V# )z
Hook used when converting directly from a SentencePiece model without a slow tokenizer instance.
By default, return kwargs unchanged.
r=   r;   )r   r=   r   s   &&,r#   convert_from_spmSpmConverter.convert_from_spm  s     #7Or%   c                  < \        V R 4       \        SV `  ! V!   \        4       pVP	                  4       p\        V P                  P                  R4      ;_uu_ 4       pVP                  VP                  4       4       RRR4       W0n
        V P                  P                  P                  '       d-   V P                  '       g   \        P                  ! R4       R# R# R#   + '       g   i     Lj; i)r   ra   Na  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrk   r$   rb   rc   r3   
vocab_filerd   re   rf   r   rD  handle_byte_fallbackwarningswarn)rg   argsrh   ri   rj   	__class__s   &*   r#   rk   SpmConverter.__init__  s    $
+$ $%	  "$))44d;;qaffh' <
::""0009R9R9RMMe :S0	 <;s    C  C0	c                n    VP                    Uu. uF  q"P                  VP                  3NK  	  up# u upi r   r   r   r   rg   rf   r   s   && r#   r=   SpmConverter.vocab  s)    8=Euekk*EEEs    2c                .    VP                   P                  # r   )r   ru   rg   rf   s   &&r#   ru   SpmConverter.unk_id  s    !!(((r%   c                   VP                   P                  pV P                  V4      pV^8X  d2   \        \	        VV P                  V4      V P                  R7      4      pMV^8X  d   V P                  V P                  P                  4      P                  V4      w  rV\        V4       UUU	u/ uF
  w  pw  rWbK  	  p
ppp	\        \        V
VVP                   P                  RV P                  RR7      4      pM\        R4      h\        VP                  4       UUu. uFT  w  rVP                   R
9   g   K  YP"                  VP                   ^8H  ;'       g    VP"                  V P$                  9   3NKV  	  pppTP'                  \)        VR R7       UUUu. uF  w  rp\+        VRVR	7      NK  	  uppp4       V# u up	ppi u uppi u upppi )r9  ru   rD  TNr   r  rD  r  z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                     V ^ ,          # r:   r;   rw   s   &r#   r>   (SpmConverter.tokenizer.<locals>.<lambda>       QRSTQUr%   r@   Fry   r   )r   r   r=   r   r   ru   r  SpmExtractorr3   r  r   r   r   rs  	Exceptionr   r   r   r   
add_tokensrN   r   )rg   rf   r   rP   r   _rQ   r   r   r   	bpe_vocabr   r   r   r   r{   s   &&              r#   r   SpmConverter.tokenizer  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIA9B<9PQ9P%5Q9PIQ!#00::!"&";"; 	I o  #5<<0
0vv IR!&&A+GGD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGD*V	
 C R*
s   ,F;G6 GGGc                >   VP                   P                  p\        P                  ! R RR7      \        P                  ! \        R4      R4      .pV'       g   \        P                  ! V4      # \        P                  ! \        P                  ! V4      .V,           4      # )FT)leftr    {2,}rr  )r   r}   r	   Stripru  r   rL  rw  rg   rf   r}   rz  s   &&  r#   r   SpmConverter.normalizer  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggr%   c                Z    \        W P                  4      p\        P                  ! WR 7      # replacementr4   )r5   r3   r
   	Metaspacerg   r  r(   r4   s   &&& r#   r   SpmConverter.pre_tokenizer  s$    ,-=?V?VW''K__r%   c                    R # r   r;   r   s   &r#   r   SpmConverter.post_processor  s    r%   c                Z    \        W P                  4      p\        P                  ! WR 7      # r  )r5   r3   r   r  r  s   &&& r#   r   SpmConverter.decoder  s$    ,-=?V?VW!!kYYr%   c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   SpmConverter.__annotate__  s      9 r%   c                   V P                  V P                  4      pV P                  V P                  4      pVe   W!n        RpRp\        V P                  R4      '       d   V P                  P
                  pV P                  W44      pVe   WQn        V P                  W44      Vn        V P                  4       pV'       d   Wan        V# )Nrr  Tr(   )	r   rf   r   r   r3   r(   r   r   r   )rg   r   r   r  r(   r   r   s   &      r#   r   SpmConverter.converted  s    NN4::.	 __TZZ0
!#- 4**,>??#66GG**;I$&3# LLG	,,.'5$r%   r   r   )r   r   r   r   r  rZ   r  r   staticmethodr{  classmethodr~  rk   r=   ru   r   r   r   r   r   r   r   r   __classcell__)r  r^   s   @@r#   rp  rp  z  sx      )LN- -^  *F)0d	h`Z  r%   rp  c                   2   a  ] tR tRt o R tR tR tRtV tR# )AlbertConverteri5  c                    VP                    Uu. uFU  p\        VP                  4      '       d   VP                  VP                  3MVP                  VP                  ^d,
          3NKW  	  up# u upi d   r   r   r   r   r  s   && r#   r=   AlbertConverter.vocab6  f     
% +=U[[*I*IU[[%++&PUP[P[]b]h]hkn]nOoo%
 	
 
   AA-c                   \         P                  ! R R4      \         P                  ! RR4      .pV P                  P                  '       gI   VP	                  \         P
                  ! 4       4       VP	                  \         P                  ! 4       4       V P                  P                  '       d%   VP	                  \         P                  ! 4       4       VP                  P                  pV'       d&   VP	                  \         P                  ! V4      4       VP	                  \         P                  ! \        R4      R4      4       \         P                  ! V4      # z``"z''r  r   r	   ru  r3   keep_accentsrM   NFKDStripAccentsr   	Lowercaser   r}   rw  r   rL  rg   rf   list_normalizersr}   s   &&  r#   r   AlbertConverter.normalizer<      c*c*
 &&333##K$4$4$67##K$<$<$>?""000##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # rj  rk  rl  rm  r   r   r   r3   r   r   s   &r#   r   AlbertConverter.post_processorO  R    ,,)4$11GGPQ$11GGPQ
 	
r%   r;   N	r   r   r   r   r=   r   r   r   r   r   s   @r#   r  r  5       
6&
 
r%   r  c                   ,   a  ] tR tRt o R tR tRtV tR# )BarthezConverteriZ  c                
    ^pV# r   r;   rg   rf   ru   s   && r#   ru   BarthezConverter.unk_id[      r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # z<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   s   &r#   r   BarthezConverter.post_processor_  R    ,, +//EEeLM00FFvNO
 	
r%   r;   N)r   r   r   r   ru   r   r   r   r   s   @r#   r  r  Z  s     
 
r%   r  c                   F   a  ] tR tRt o R tR tR t]RR l4       tRt	V t
R# )	CamembertConverterij  c                    . ROpY!P                   R,           Uu. uF  q3P                  VP                  3NK  	  up,          pVR.,          pV# u upi )
<s>NOTUSEDr8  )r          <pad>r  z</s>NOTUSEDr  <unk>r  )<unk>NOTUSEDi<mask>r  r  rg   rf   r=   r   s   &&  r#   r=   CamembertConverter.vocabk  sP    
 	,,r:JK:J;;,:JKK/"" L    Ac                    ^# r  r;   r  s   &&r#   ru   CamembertConverter.unk_idx  s    r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # r  r  r   s   &r#   r   !CamembertConverter.post_processor|  r  r%   Nc                6   \        VP                  R R4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      pR	VR3R
VR3R.pVe"   VP                  \        V4      R,          4       VP	                  VR34       WbR&   V# )	pad_tokenr  r   r  
mask_tokenr  r  r8  r=   r  r  )r        Yr+   r   rO   rr   rM   )r   r=   r   r  r   r  
vocab_lists   &&,    r#   r~  #CamembertConverter.convert_from_spm  s    

;89	

;89	L(;<
   $

 d5k"o.:s+,$wr%   r;   r   r   r   r   r   r=   ru   r   r  r~  r   r   r   s   @r#   r  r  j  s(     
  r%   r  c                   2   a  ] tR tRt o R tR tR tRtV tR# )DebertaV2Converteri  c                ,   . pV P                   P                  '       d'   VP                  \        P                  ! R R7      4       \        W P                   4      pVP                  \        P                  ! WR7      4       \        P                  ! V4      # )rE  )rG  r  )r3   split_by_punctrM   r
   Punctuationr5   r  rL  )rg   r  r(   list_pretokenizersr4   s   &&&  r#   r    DebertaV2Converter.pre_tokenizer  so    ""111%%n&@&@*&UV,-=?V?VW!!.":":{"rs&&'9::r%   c                   . pV P                   P                  '       d%   VP                  \        P                  ! 4       4       VP                  \        P
                  ! 4       4       VP                  P                  pV'       d&   VP                  \        P                  ! V4      4       VP                  \        P                  ! \        R 4      R4      4       \        P                  ! V4      # )r  r   )r3   r   rM   r	   r  r  r   r}   rw  ru  r   rL  r  s   &&  r#   r   DebertaV2Converter.normalizer  s    ""000##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # r  r  r   s   &r#   r   !DebertaV2Converter.post_processor  r  r%   r;   N)	r   r   r   r   r   r   r   r   r   r   s   @r#   r  r    s     ;6
 
r%   r  c                   F   a  ] tR tRt o R tR tR t]RR l4       tRt	V t
R# )	MBartConverteri  c                    . ROpY!P                   R,           Uu. uF  q3P                  VP                  3NK  	  up,          pV. RO,          pVR.,          pV# u upi )r  r   NNr  r  r  r  r  r  )ar_ARr  cs_CZr  de_DEr  en_XXr  es_XXr  et_EEr  fi_FIr  fr_XXr  gu_INr  hi_INr  it_ITr  ja_XXr  kk_KZr  ko_KRr  lt_LTr  lv_LVr  my_MMr  ne_NPr  nl_XXr  ro_ROr  ru_RUr  si_LKr  tr_TRr  vi_VNr  zh_CNr  r  r  r  s   &&  r#   r=   MBartConverter.vocab  sa    
 	,,r:JK:J;;,:JKK 
 	
6 	/""; Ls    Ac                    ^# r  r;   r  s   &&r#   ru   MBartConverter.unk_id      r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # )z$A </s> en_XXz$A $B </s> en_XXr  r  r   r  r   s   &r#   r   MBartConverter.post_processor  R    ,,"#$11GGPQ00FFvNO
 	
r%   Nc                   \        VP                  R R4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR	4      4      pVR
3VR
3VR
3VR
3.pVe"   VP                  \        V4      R,          4       VP                  R \         4       4       VP                  VR
34       WR&   V# )r-  r  r  r  	eos_tokenr  r   r  r  r  r  r  c              3   (   "   T F  qR 3x  K
  	  R# 5ir  Nr;   .0	lang_codes   & r#   	<genexpr>2MBartConverter.convert_from_spm.<locals>.<genexpr>  s     LOyc*O   r=   )r+   r   rO   rr   MBART_LANGUAGESrM   	r   r=   r   r-  r  rF  r   r  r  s	   &&,      r#   r~  MBartConverter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k"o.LOLL:s+,$wr%   r;   r   r  r   s   @r#   r  r    s)     $L
  r%   r  c                   F   a  ] tR tRt o R tR tR t]RR l4       tRt	V t
R# )	MBart50Converteri  c                   . ROpY!P                   R,           Uu. uF  q3P                  VP                  3NK  	  up,          pV. RNRNRNRNRNRNR	NR
NRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNR NR!NR"NR#NR$NR%NR&NR'NR(NR)NR*NR+NR,NR-NR.NR/NR0NR1NR2NR3NR4NR5NR6N,          pVR7.,          pV# u upi )8r  r  r	  r  r  r  r  r  r  r  r  r  r  r   r"  r$  r&  r(  r*  r,  r.  r0  r2  r4  r6  r8  r:  r<  )af_ZAr  )az_AZr  )bn_INr  )fa_IRr  )he_ILr  )hr_HRr  )id_IDr  )ka_GEr  )km_KHr  )mk_MKr  )ml_INr  )mn_MNr  )mr_INr  )pl_PLr  )ps_AFr  )pt_XXr  )sv_SEr  )sw_KEr  )ta_INr  )te_INr  )th_THr  )tl_XXr  )uk_UAr  )ur_PKr  )xh_ZAr  )gl_ESr  )sl_SIr  r  r  r  s   &&  r#   r=   MBart50Converter.vocab  s   
 	,,r:JK:J;;,:JKK  R.  R.  R.  R.  RR`  Rbp  R  sA  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  Sa  R  cq  R  sA	  R  C	Q	  R  S	a	  R  c	q	  R  s	A
  R  C
Q
  R  S
a
  R  c
q
  R  s
A  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  Sa  R  cq  R  sA  R  CQ  R  	R/"" Ls    C c                    ^# r  r;   r  s   &&r#   ru   MBart50Converter.unk_id  rA  r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # )zen_XX $A </s>zen_XX $A $B </s>r  r  r   r  r   s   &r#   r   MBart50Converter.post_processor  rD  r%   Nc                   \        VP                  R R4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR	4      4      pVR
3VR
3VR
3VR
3.pVe"   VP                  \        V4      R,          4       VP                  R \         4       4       VP                  VR
34       WR&   V# )r   r  r  r  rF  r  r   r  r  r  r  r  c              3   (   "   T F  qR 3x  K
  	  R# 5irH  r;   rI  s   & r#   rL  4MBart50Converter.convert_from_spm.<locals>.<genexpr>/  s     N<Myc*<MrN  r=   )r+   r   rO   rr   MBART50_LANGUAGESrM   )	r   r=   r   r   r  rF  r   r  r  s	   &&,      r#   r~  !MBart50Converter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k"o.N<MNN:s+,$wr%   r;   r   r  r   s   @r#   rS  rS    s(     

  r%   rS  c                   F   a  ] tR tRt o R tR tR t]RR l4       tRt	V t
R# )	NllbConverteri5  c                    . ROpY!P                   R,           Uu. uF  q3P                  VP                  3NK  	  up,          pV# u upi )r  r  r	  r  r  s   &&  r#   r=   NllbConverter.vocab6  C    
 	,,r:JK:J;;,:JKK L    Ac                    ^# r  r;   r  s   &&r#   ru   NllbConverter.unk_id@  rA  r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # )zeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   s   &r#   r   NllbConverter.post_processorC  sR    ,,%&T44JJ:VW00FFvNO
 	
r%   Nc                   \        VP                  R R4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      pV^ V^V^V^/pVe[   \        V\        4      '       d   VP	                  4       MV UU	u. uF  w  rVNK	  	  up	pp
V
 F  pW9   d   K  \        V4      W{&   K  	  WrR&   V# u up	pi )	r-  r  r  r  rF  r  r   r  r=   )r+   r   rt  rJ   r  rD   )r   r=   r   r-  r  rF  r   reordered_vocabtokr  tokensr   s   &&,         r#   r~  NllbConverter.convert_from_spmM  s    

;67	

;89	

;78	

;89	 qqqq	
 %/t%<%<UZZ\UZB[UZ633UZB[F+),_)=&   *w C\s   &Cr;   r   r  r   s   @r#   r{  r{  5  s(     
  r%   r{  c                   2   a  ] tR tRt o R tR tR tRtV tR# )SeamlessM4TConverterid  c                    . ROpY!P                   R,           Uu. uF  q3P                  VP                  3NK  	  up,          pV# u upi )r  r  )r  r  r
  r  r  r  s   &&  r#   r=   SeamlessM4TConverter.vocabe  r~  r  c                .    V P                   P                  # r   )r3   unk_token_idr  s   &&r#   ru   SeamlessM4TConverter.unk_ido  s    &&333r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # )z__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   s   &r#   r   #SeamlessM4TConverter.post_processorr  sR    ,,$%D33II)TU00FFvNO
 	
r%   r;   N	r   r   r   r   r=   ru   r   r   r   r   s   @r#   r  r  d  s     4
 
r%   r  c                   F   a  ] tR tRt o R tR tR t]RR l4       tRt	V t
R# )	XLMRobertaConverteri}  c                    . ROpY!P                   R,           Uu. uF  q3P                  VP                  3NK  	  up,          pVR.,          pV# u upi )r  r  r	  r  r  r  s   &&  r#   r=   XLMRobertaConverter.vocab~  sP    
 	,,r:JK:J;;,:JKK/"" Lr  c                
    ^pV# r  r;   r  s   && r#   ru   XLMRobertaConverter.unk_id  r  r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # r  r  r   s   &r#   r   "XLMRobertaConverter.post_processor  r  r%   Nc                   \        VP                  R R4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR	4      4      pVR
3VR
3VR
3VR
3.pVe"   VP                  \        V4      R,          4       VP	                  VR
34       WR&   V# )r-  r  r  r  rF  r  r   r  r  r  r  r  r=   r  rP  s	   &&,      r#   r~  $XLMRobertaConverter.convert_from_spm  s    

;67	

;89	

;78	

;89	L(;<
 	

 d5k"o.:s+,$wr%   r;   r   r  r   s   @r#   r  r  }  s(     	
  r%   r  c                   2   a  ] tR tRt o R tR tR tRtV tR# )XLNetConverteri  c                    VP                    Uu. uFU  p\        VP                  4      '       d   VP                  VP                  3MVP                  VP                  ^d,
          3NKW  	  up# u upi r  r  r  s   && r#   r=   XLNetConverter.vocab  r  r  c                   \         P                  ! R R4      \         P                  ! RR4      .pV P                  P                  '       gI   VP	                  \         P
                  ! 4       4       VP	                  \         P                  ! 4       4       V P                  P                  '       d%   VP	                  \         P                  ! 4       4       VP                  P                  pV'       d&   VP	                  \         P                  ! V4      4       VP	                  \         P                  ! \        R4      R4      4       \         P                  ! V4      # r  r  r  s   &&  r#   r   XLNetConverter.normalizer  r  r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # )z$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   s   &r#   r   XLNetConverter.post_processor  r  r%   r;   Nr  r   s   @r#   r  r    r  r%   r  c                       ] tR tRtRtR# )ReformerConverteri  r;   Nr   r   r   r   r   r;   r%   r#   r  r        r%   r  c                   ,   a  ] tR tRt o R tR tRtV tR# )RemBertConverteri  c                   \         P                  ! R R4      \         P                  ! RR4      \         P                  ! \        R4      R4      .pV P                  P                  '       gI   VP                  \         P                  ! 4       4       VP                  \         P                  ! 4       4       V P                  P                  '       d%   VP                  \         P                  ! 4       4       VP                  P                  pV'       d&   VP                  \         P                  ! V4      4       \         P                  ! V4      # r  )r	   ru  r   r3   r  rM   r  r  r   r  r   r}   rw  rL  r  s   &&  r#   r   RemBertConverter.normalizer  s    c*c*g4

 &&333##K$4$4$67##K$<$<$>?""000##K$9$9$;<$44II##K$;$;<P$QR##$455r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # r  r  r   s   &r#   r   RemBertConverter.post_processor  r  r%   r;   N)r   r   r   r   r   r   r   r   r   s   @r#   r  r    s     6&
 
r%   r  c                       ] tR tRtRtR# )BertGenerationConverteri  r;   Nr  r;   r%   r#   r  r    r  r%   r  c                   L   a  ] tR tRt o R t]R	R l4       tR tR tR t	Rt
V tR# )
PegasusConverteri  c                   V P                   P                  R 3V P                   P                  R 3.pV P                   P                  e    W P                   P                  R 3.,          pV P                   P                  eO   V P                   P
                  V P                   P                  8  d    W P                   P                  R 3.,          pT\        ^V P                   P                  4       Uu. uF  pRV R2R3NK  	  up,          pY!P                  R,           Uu. uF  qDP                  VP                  3NK  	  up,          pV# u upi u upi )r  <unk_>r'   NNr  )r3   r  rF  mask_token_sentr  mask_token_idoffsetrL   r   r   r   )rg   rf   r=   r   r   s   &&   r#   r=   PegasusConverter.vocab  s'   $$..4$$..4

 ""22>..>>DEEE ##..:''558O8O8V8VV..993?@@E%4;R;R;Y;Y2Z[2ZQU1#Q<(2Z[[,,r:JK:J;;,:JKK \Ks   1E  ENc           	     @   \        VP                  R R4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      p\        VP                  RR4      4      pVR3VR3.pVR	8w  d   VP                  VR34       VR	8w  d   We8w  d   VP                  VR34       TP                  \	        ^VP                  R
^g4      4       Uu. uF  pRV R2R3NK  	  up4       Ve"   VP                  \        V4      R,          4       WrR&   V# u upi )r  r  rF  r  r  z<mask_1>r  z<mask_2>r  Noner  r  r  r  r=   r  )r+   r   rM   rO   rL   rr   )	r   r=   r   r  rF  r  r  r  r   s	   &&,      r#   r~  !PegasusConverter.convert_from_spm  s   

;89	

;78	L*=>
fjj):JGH 

 z3/0f$)F455FJJxY\D];^_;^aeA3aL&1;^_`d5k"o.$w	 `s   Dc                d    VP                   P                  V P                  P                  ,           # r   )r   ru   r3   r  r  s   &&r#   ru   PegasusConverter.unk_id#  s%    !!((4+B+B+I+IIIr%   c                    \        W P                  4      p\        P                  ! \        P                  ! 4       \        P
                  ! WR 7      .4      # r  )r5   r3   r
   rL  WhitespaceSplitr  r  s   &&& r#   r   PegasusConverter.pre_tokenizer&  sE    ,-=?V?VW&&..0(([`
 	
r%   c                    V P                   P                  pWP                   P                  3.p\        P                  ! R V.R RV.VR7      # )$A$Br   )r3   rF  eos_token_idr   r   )rg   eosr   s   &  r#   r   PegasusConverter.post_processor/  sP    %%//))667
 ,,T3KtTSVFWhvwwr%   r;   r   )r   r   r   r   r=   r  r~  ru   r   r   r   r   r   s   @r#   r  r    s5     &  *J
x xr%   r  c                   @   a  ] tR tRt o R tR t]RR l4       tRtV t	R# )T5Converteri7  c                   V P                   P                  pVP                   Uu. uF  q3P                  VP                  3NK  	  ppT\        V^,
          RR4       Uu. uF  pRV R2R3NK  	  up,          pV# u upi u upi )r9  
<extra_id_r  r  r   )r3   
_extra_idsr   r   r   rL   )rg   rf   num_extra_idsr   r=   r   s   &&    r#   r=   T5Converter.vocab8  s|    //::9>F++u{{+FE-!:KRQS4TU4TqZs!$c*4TUU GUs    A:A?c                t    \         P                  ! R R.. RORV P                  P                  R4      3.R7      # r  r  r   )r  r  r  r  r  r   s   &r#   r   T5Converter.post_processor>  =    ,,&>-00FFvNO
 	
r%   Nc                    VP                  R ^d4      p\        V^,
          RR4       Uu. uF	  pRV R2NK  	  ppVe   \        V4      M. pVP                  R V 4       4       VP	                  RV4       WbR&   V# u upi )	extra_idsr  r  c              3   (   "   T F  qR 3x  K
  	  R# 5irH  r;   )rJ  r   s   & r#   rL  /T5Converter.convert_from_spm.<locals>.<genexpr>L  s     AL5#,LrN  r|   r=   r   )r   rL   rr   rO   
setdefault)r   r=   r   r  r   extra_tokensr  s   &&,    r#   r~  T5Converter.convert_from_spmG  s    JJ{C0	38QB3OP3Oa*QCq)3OP$)$5T%[2
ALAA5|D$w Qs   A;r;   r   )
r   r   r   r   r=   r   r  r~  r   r   r   s   @r#   r  r  7  s#     
  r%   r  c                   &   a  ] tR tRt o R tRtV tR# )UdopConverteriS  c                t    \         P                  ! R R.. RORV P                  P                  R4      3.R7      # r  r  r   s   &r#   r   UdopConverter.post_processorT  r  r%   r;   Nr   r   r   r   r   r   r   r   s   @r#   r  r  S  s     
 
r%   r  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )WhisperConverteri^  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   WhisperConverter.__annotate___  s        9  r%   c                   V P                   P                  p\        V P                   P                  P	                  4       4      p\        \        VVR RRRR7      4      p\        P                  ! V P                   P                  R7      Vn
        \        P                  ! 4       Vn        V P                   P                  pV P                   P                  V4      pV P                   P                  pV P                   P                   pRP#                  V Uu. uF  q R2NK	  	  up4      p	\$        P&                  ! V	 RV R2V	 RV R	2Wg3.\)        WT4      OR
7      Vn        V# u upi )Nr%  Fr&  r(  r   r   z $A:0 z $A:0 $B:1 r   r   )r3   r  rr   r  r  r   r   r
   r,  r(   r   r   r   prefix_tokensconvert_ids_to_tokensrF  r  joinr   r   zipr   )
rg   r=   rQ   r   prefix_token_idsprefixesr  r  r   prefix_templates
   &         r#   r   WhisperConverter.converted_  sO   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GhUgRLh#GH#-#@#@%&fSE4#$KuB7#X0$
	   $Hs   E!r;   Nr   r   s   @r#   r  r  ^  s        r%   r  c                   &   a  ] tR tRt o R tRtV tR# )BigBirdConverteri  c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # r  r  r   s   &r#   r   BigBirdConverter.post_processor  r  r%   r;   Nr  r   s   @r#   r  r    s     
 
r%   r  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )CLIPConverteri  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   CLIPConverter.__annotate__  s     ' '9 'r%   c                   V P                   P                  p\        V P                   P                  P	                  4       4      pV P                   P
                  p\        \        VVR RRR\        V4      R7      4      p\        P                  ! \        P                  ! 4       \        P                  ! \        R4      R4      \        P                  ! 4       .4      Vn        \         P                  ! \         P"                  ! \        R4      RR	R
7      \         P$                  ! RR7      .4      Vn        \(        P$                  ! 4       Vn        \,        P.                  ! V P                   P0                  V P                   P2                  3V P                   P4                  V P                   P6                  3RRR7      Vn        V# )Nr%  r  Fr=   rQ   r  r'  r  r  r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTrF  r(  rU  )r3   r  rr   r  r  r   r   r   r+   r	   rL  rK  ru  r   r  r   r
   rM  r,  r   r   r   r   rV  rF  r  r-  r.  r   r  s   &    r#   r   CLIPConverter.converted  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  r%   r;   Nr   r   s   @r#   r  r    s     ' 'r%   r  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )LayoutLMv2Converteri  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,    LayoutLMv2Converter.__annotate__  r   r%   c           	     |   V P                   P                  p\        \        V\	        V P                   P
                  4      R 7      4      pRpRpRp\        V P                   R4      '       da   V P                   P                  P                  pV P                   P                  P                  pV P                   P                  P                  p\        P                  ! RVVVR7      Vn        \        P                  ! 4       Vn        \	        V P                   P"                  4      p\	        V P                   P$                  4      pV P                   P&                  pV P                   P(                  p	\*        P,                  ! V RV R2V RV RV R2Wh3Wy3.R	7      Vn        \0        P                  ! R
R7      Vn        V# )r   FTr   r   r   r   r   r   r   r   r   r   r   s
   &         r#   r   LayoutLMv2Converter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r%   r;   Nr   r   s   @r#   r  r    r   r%   r  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )BlenderbotConverteri  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,    BlenderbotConverter.__annotate__  rS  r%   c                   V P                   pVP                  p\        VP                  P	                  4       4      p\        \        VVR RRRR7      4      p\        P                  ! VP                  R7      Vn
        \        P                  ! 4       Vn        \        P                  ! RVP                   R2VP                  VP                   3.R7      Vn        V# )Nr%  Fr&  r(  z$A:0 r   )r   r   )r3   r  rr   r  r  r   r   r
   r,  r(   r   r   r   r   r   rF  r  r   rW  s   &    r#   r   BlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  r%   r;   Nr   r   s   @r#   r  r    rZ  r%   r  c                   2   a  ] tR tRt o R tR tR tRtV tR# )XGLMConverteri  c                    . ROpY!P                   R,           Uu. uF  q3P                  VP                  3NK  	  up,          pV. RO,          pV# u upi )r  r  r	  ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  r  r  s   &&  r#   r=   XGLMConverter.vocab  sT    
 	,,r:JK:J;;,:JKK  z  	z Ls    Ac                
    ^pV# r  r;   r  s   && r#   ru   XGLMConverter.unk_id	  r  r%   c           	         \         P                  ! R RRV P                  P                  R4      3RV P                  P                  R4      3.R7      # )z</s> $Az</s> $A </s> </s> $Br  r  r   r  r   s   &r#   r   XGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
r%   r;   Nr  r   s   @r#   r  r    s     	
 
r%   r  c                   P   a  ] tR tRt o Rt]tRR0t R tR t	R t
R tR	 tR
tV tR# )GemmaConverteri  Tz<start_of_turn>z<end_of_turn>c                0    \         P                  ! R R4      # r   rr  )r	   ru  r  s   &&r#   r   GemmaConverter.normalizer(  s    ""3..r%   c                   V P                   P                  R 3V P                   P                  R 3V P                   P                  R 3.pY!P                  R,           Uu. uF  q3P
                  VP                  3NK  	  up,          p\        ;QJ d    R V 4       F  '       g   K   RM	  RM! R V 4       4      '       g%   \        R \        V4       4       R4      pVe   RW$&   V# u upi )r  r  c              3   8   "   T F  q^ ,          R8H  x  K  	  R# 5i)r   r   Nr;   )rJ  r<   s   & r#   rL  'GemmaConverter.vocab.<locals>.<genexpr>4  s     /AQ44<s   TFc              3   J   "   T F  w  rV^ ,          R8X  g   K  Vx  K  	  R# 5i)r   r   Nr;   )rJ  r   r<   s   &  r#   rL  r  5  s!     "V1AQqTXEU111As   #
#N)r   r  )
r3   r  rF  r-  r   r   r   anynextr   )rg   rf   r=   r   override_indexs   &&   r#   r=   GemmaConverter.vocab+  s    $$..4$$..4$$..4

 	,,r:JK:J;;,:JKK s//sss////!"V51A"VX\]N)(3% Ls    C&c                0    \         P                  ! R R4      # )r   merged_with_previous)r
   rM  rg   r  r(   s   &&&r#   r   GemmaConverter.pre_tokenizer;  s    ##C)?@@r%   c                
    ^pV# r  r;   r  s   && r#   ru   GemmaConverter.unk_id>  r  r%   c                    \         P                  ! \         P                  ! R R4      \         P                  ! 4       \         P                  ! 4       .4      # )rr  r   )r   rL  ru  rx  ry  r  s   &&&r#   r   GemmaConverter.decoderB  s?        ,%%'
 	
r%   r;   N)r   r   r   r   r  r   r  r   r   r=   r   ru   r   r   r   r   s   @r#   r  r    s>     .L'9N/ A
 
r%   r  c                   H   a  ] tR tRt o RtR tR tR tR tR t	R t
R	tV tR
# )LlamaConverteriL  Tc                8   V P                   P                  ^ 4      R3V P                   P                  ^4      R3V P                   P                  ^4      R3.pY!P                  R,           Uu. uF  q3P                  VP                  3NK  	  up,          pV# u upi )r   r  r  )r3   r  r   r   r   r  s   &&  r#   r=   LlamaConverter.vocabO  s    $$::1=sC$$::1=sC$$::1=sC

 	,,r:JK:J;;,:JKK Ls   , Bc                
    ^ pV# r:   r;   r  s   && r#   ru   LlamaConverter.unk_idX  r  r%   c                    \         P                  ! R R4      \         P                  ! 4       \         P                  ! 4       .pV'       d!   V\         P                  ! R^R7      .,          p\         P
                  ! V4      # rr  r   )contentr  r   ru  rx  ry  r  rL  rg   r  r(   sequences   &&& r#   r   LlamaConverter.decoder\  \    UC(!!#MMO

 !<==H  **r%   c                (   \        V P                  R R4      '       du   . p\        V P                  RR4      '       d    V\        P                  ! RR7      .,          pV\        P                  ! RRR7      .,          p\        P
                  ! V4      # R# )r/   Tr(   rr  )prependr   )patternr,  N)r2   r3   r	   Prependru  rL  )rg   rf   r/  s   && r#   r   LlamaConverter.normalizerf  sx    4**Hd;;Ht..0BDII[00?@@,,S%HIIH''11r%   c                    \        V P                  R R4      '       g.   \        W P                  4      p\        P                  ! WRR7      # R# )r/   TFr  r4   splitN)r2   r3   r5   r
   r  r  s   &&& r#   r   LlamaConverter.pre_tokenizero  s?    t..$??01ACZCZ[N!++joppr%   c                    R # r   r;   r   s   &r#   r   LlamaConverter.post_processoru  s    r%   r;   N)r   r   r   r   r  r=   ru   r   r   r   r   r   r   r   s   @r#   r%  r%  L  s.     + r%   r%  c                   2   a  ] tR tRt o V 3R lR ltRtV tR# )MarkupLMConverteriz  c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   MarkupLMConverter.__annotate__{  s     " "9 "r%   c                   V P                   pVP                  p\        VP                  P	                  4       4      p\        \        VVR RRRV P                   P                  R7      4      p\        P                  ! VP                  R7      Vn        \        P                  ! 4       Vn        \        V P                   P                  4      p\        V P                   P                   4      pV P                   P"                  pV P                   P$                  p\&        P(                  ! V RV 2V RV RV 2WW3Wh3.R7      Vn        V# )Nr%  Fr  r(  z $A z $B r   )r3   r  rr   r  r  r   r   r   r
   r,  r(   r   r   r   r+   r   r   r   r   r   r   r   )	rg   rX  r=   rQ   r   r   r   r   r   s	   &        r#   r   MarkupLMConverter.converted{  s(   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+##$
	  r%   r;   Nr   r   s   @r#   r>  r>  z  s     " "r%   r>  c                   <   a  ] tR tRt o RtR tR tR tR tRt	V t
R# )	MoshiConverteri  Tc                &   \        V R 4       \        P                  W4       \        4       pVP	                  4       p\        VR4      ;_uu_ 4       pVP                  VP                  4       4       RRR4       W@n        R#   + '       g   i     L; ir   ra   N	r   r   rk   r$   rb   rc   rd   re   rf   rg   r  r   rh   ri   rj   s   &&,   r#   rk   MoshiConverter.__init__  sh    $
+4, $%	  "*d##qaffh' $
 $#    B  B	c                    VP                   P                  p\        P                  ! R R4      .pV'       g   \        P                  ! V4      # \        P                  ! \        P
                  ! V4      .V,           4      # r  )r   r}   r	   ru  rL  rw  r  s   &&  r#   r   MoshiConverter.normalizer  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggr%   c                    \         P                  ! R R4      \         P                  ! 4       \         P                  ! 4       .pV'       d!   V\         P                  ! R^R7      .,          p\         P
                  ! V4      # r+  r-  r.  s   &&& r#   r   MoshiConverter.decoder  r1  r%   c                6    R p\         P                  ! WRR7      # )r0   Fr8  )r
   r  r  s   &&& r#   r   MoshiConverter.pre_tokenizer  s     ''Kfkllr%   r   N)r   r   r   r   r  rk   r   r   r   r   r   r   s   @r#   rD  rD    s'     h+m mr%   rD  c                   X   a  ] tR tRt o RtRR ltR tR tR tR t	R	 t
R
 tR tRtV tR# )HeliumConverteri  TNc                &   \        V R 4       \        P                  W4       \        4       pVP	                  4       p\        VR4      ;_uu_ 4       pVP                  VP                  4       4       RRR4       W@n        R#   + '       g   i     L; irF  rG  rH  s   &&,   r#   rk   HeliumConverter.__init__  sf    $
+4,#%	  "*d##qaffh' $
 $#rJ  c                p   V P                  V4      p\        \        VV P                  V4      V P                  R 7      4      p\        VP                  4       UUu. uFT  w  rEVP                  R
9   g   K  YEP                  VP                  ^8H  ;'       g    VP                  V P                  9   3NKV  	  pppTP                  \        VR R7       UUUu. uF  w  rGp\        VRVRR7      NK  	  uppp4       VP                  \        RRRR7      .4       VP                  R^R	7       V# u uppi u upppi )r  c                     V ^ ,          # r:   r;   rw   s   &r#   r>   +HeliumConverter.tokenizer.<locals>.<lambda>  r  r%   r@   FT)rz   r{   single_word
ry   r  )r  pad_idr   )r=   r   r   ru   r  r   r   r   r   r   r  rN   r   enable_padding)	rg   rf   rP   r   r   r   r   r   r{   s	   &&       r#   r   HeliumConverter.tokenizer  s#   zz%({{5)"77
	 #5<<0
0vv IR!&&A+GGD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGQUV*V	
 	j%OPQ  71 =
s   D+4 D+D+D1c                    . pVP                    FK  pVP                  R 8X  d   VRVP                  3.,          pK,  W#P                  VP                  3.,          pKM  	  V# )z<0x0A>rY  r  r  s   &&  r#   r=   HeliumConverter.vocab  sV    \\E{{h&4-..;;455	 "
 r%   c                
    ^ pV# r:   r;   r  s   && r#   ru   HeliumConverter.unk_id  r  r%   c                    \         P                  ! R R4      \         P                  ! 4       \         P                  ! 4       .pV\         P                  ! R^R7      .,          p\         P
                  ! V4      # r+  r-  r.  s   &&& r#   r   HeliumConverter.decoder  sY    UC(!!#MMO

 	X^^Ca899  **r%   c                    \         P                  ! \         P                  ! R 4      \         P                  ! R R4      .4      # r  )r	   rL  r5  ru  r  s   &&r#   r   HeliumConverter.normalizer
  s2    ##[%8%8%={?R?RSWY^?_$`aar%   c                Z    \         P                  ! \         P                  ! R R4      .4      # )rY  
contiguous)r
   rL  rM  r  s   &&&r#   r   HeliumConverter.pre_tokenizer  s#    &&(<(<T<(P'QRRr%   c                >    \         P                  ! R R.. ROR.R7      # )r  r  r   )r  r  r  r  )r  r9  )r   r   r   s   &r#   r   HeliumConverter.post_processor  s/    ,, 
 	
r%   r   r   )r   r   r   r   r  rk   r   r=   ru   r   r   r   r   r   r   r   s   @r#   rR  rR    s:     
8+bS
 
r%   rR  c                   4   a  ] tR tRt o RtRR ltR tRtV tR# )ParakeetConverteri"  TNc                2   Wn         \        V R 4       \        P                  W4       \	        4       pVP                  4       p\        VR4      ;_uu_ 4       pVP                  VP                  4       4       RRR4       W@n	        R#   + '       g   i     L; irF  )
r  r   r   rk   r$   rb   rc   rd   re   rf   )rg   r  r  rh   ri   rj   s   &&*   r#   rk   ParakeetConverter.__init__%  sk    $$
+4,#%	  "*d##qaffh' $
 $#s    BB	c                   V P                  V4      p\        V4       UUUu/ uF
  w  pw  rEWCbK  	  pppp\        Wb4      p\        \	        VVVP
                  P                  R V P                  RR7      4      p\        VP                  4       U	U
u. uFT  w  rV
P                  R9   g   K  YP                  V
P                  ^8H  ;'       g    V
P                  V P                  9   3NKV  	  pp	p
TP                  \        VR R7       U	UUu. uF  w  rp\        VRVR7      NK  	  uppp	4       V# u upppi u up
p	i u uppp	i )TNr  c                     V ^ ,          # r:   r;   rw   s   &r#   r>   -ParakeetConverter.tokenizer.<locals>.<lambda>K  r  r%   r@   Fry   r   )r=   r   rX   r   r   r   rs  r  r   r   r   r   r  rN   r   )rg   rf   rP   r   r   r   r  rQ   r   r   r   r   r   r{   s   &&            r#   r   ParakeetConverter.tokenizer2  s3   zz%(5>|5LM5L!1MTTW5L	M 9,,66"77	
	 #5<<0
0vv IR!&&A+GGD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGD*V	
 5 N
s   D/D6* D6D6
D<)rf   r  r   )	r   r   r   r   r  rk   r   r   r   r   s   @r#   rk  rk  "  s      r%   rk  c            	     >   \        \        \        R4      \        R4      ^,           4      4      \        \        \        R4      \        R4      ^,           4      4      ,           \        \        \        R4      \        R4      ^,           4      4      ,           p V R,          p^ p\        R4       F=  pW09  g   K  V P                  V4       VP                  RV,           4       V^,          pK?  	  V Uu. uF  p\	        V4      NK  	  pp\        \        W4      4      # u upi )	a  
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
!~   ¡   ¬   ®   ÿ:NNN   )rr   rL   ordrM   chrrJ   r  )bscsnbs       r#   bytes_to_unicoder  R  s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[;IIaLIIdQhFA	 
 	"Q#a&"B	B 
s   0Dc                   X   a  ] tR tRt o RtRR ltV 3R lR ltR tV 3R lR	 ltR
t	V t
R# )TikTokenConverterij  z
A general tiktoken converter.
Nc                    Wn         W n        W0n        \        V\        4      '       d   VP                  4       V n        R # TV n        R # r   )r  r4  r(   rt  rJ   r  extra_special_tokens)rg   r  r4  r(   r  r   s   &&&&&,r#   rk   TikTokenConverter.__init__o  sA     % 0+56JD+Q+Q %%' 	!Wk 	!r%   c                    < V ^8  d   QhRS[ /# r'   tiktoken_urlr]   )r    r^   s   "r#   r,   TikTokenConverter.__annotate__~  s      C r%   c                j  aa  ^ RI Hp T! T4      o\	        4       oT3R lp. p/ pSP                  4        F  w  rgYuT! T4      &   \        T4      ^8X  d   K!  . p\        ^\        T4      4       F?  p	TRT	 YiR rT
S9   g   K  TS9   g   K  Y,           S9   g   K,  TP                  YT34       KA  	  \        TT3R lRR7      pTP                  T4       K  	  \        TR RR7      pT Uu. uF   q! T^ ,          4      T! T^,          4      3NK"  	  ppYT3#   \         d    \        R4      hi ; iu upi )	r   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	         < R P                  V P                  R4       Uu. uF  pS\        V4      ,          NK  	  up4      # u upi r%  zlatin-1r  decoderz  r  charbyte_encoders   & r#   token_bytes_to_stringPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s;    77@ST@SLT33@STUUT   ANc                 D   < SV ^ ,          ,          SV ^,          ,          3# r:   r;   )r<   r  s   &r#   r>   CTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    1Q4)AaD/0Rr%   FrG   c                     V ^,          # rC   r;   rE   s   &r#   r>   r        Ar%   )tiktoken.loadr  r  
ValueErrorr  rK   rD   rL   rM   rN   rO   )rg   r  r  r  rQ   r=   r   rankrT   rU   rV   rW   rF   r  r  s   &&           @@r#   extract_vocab_merges_from_model1TikTokenConverter.extract_vocab_merges_from_model~  sC   	7 &l3	')	V $??,KE26'./5zQEq#e*-#(%=%-i'Gy,@gFW\eEeLL'D!9: . 5&R\abEMM%  - $6F\bc\bUX(Q02GA2OP\bc}5  	k 	2 ds   D +&D0D-c                    V P                  V P                  4      w  r\        \        WR R7      4      p\	        VP
                  R4      '       d   RVP
                  n        V# F)r  ignore_mergesTr  r  r   r   r   r\   r  rg   rP   rQ   r   s   &   r#   r   TikTokenConverter.tokenizer  M    #CCDOOTc,GH	9??O44,0IOO)r%   c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   r    s      9 r%   c                   V P                  4       p\        P                  ! \        P                  ! \	        V P
                  4      R RR7      \        P                  ! V P                  RR7      .4      Vn        \        P                  ! 4       Vn
        V P                  e8   TP                  V P                   Uu. uF  p\        VRRR7      NK  	  up4       \        P                  ! RR7      Vn        V# u upi )rE  FrF  rI  Try   r*  )r   r
   rL  rM  r   r4  r,  r(   r   r   r   r  r  r   r   r   )rg   r   r   s   &  r#   r   TikTokenConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$0((PTPiPijPiuEeTBPij $.#7#7U#K	  ks   9C5)r(   r  r4  r  Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)r   r   r   r   r   rk   r  r   r   r   r   r   s   @r#   r  r  j  s(     
 > r%   r  c                   T   a  ] tR tRt o R
R ltV 3R lR ltR tV 3R lR ltR	tV t	R# )MistralConverteri  Nc                    Wn         W n        W0n        \        V\        4      '       d   VP                  4       V n        R # TV n        R # r   )r  r4  r(   rt  rJ   r  r|   )rg   r  r4  r(   r|   r   s   &&&&&,r#   rk   MistralConverter.__init__  sG     % 0 3T:: &**, 	& + 	&r%   c                    < V ^8  d   QhRS[ /# r  r]   )r    r^   s   "r#   r,   MistralConverter.__annotate__  s     % %C %r%   c                  aa ^ RI p^ RIp\        V P                  RRR7      ;_uu_ 4       pVP	                  V4      pRRR4       XR,          R,          V n        VR,           Uu. uF  p\        VR,          VR	,          R
7      NK   	  upV n        VR,          p\        4       o\        V3R l4       p. p	/ p
\        V P                  4       F  w  rWVP                  &   K  	  V Uu. uF  qbP                  VR,          4      NK  	  pp\        V4      p\        V4       UUu/ uF  w  rWbK	  	  uppo\        \        VRR7      4       F  w  rWV! V4      &   \        V4      ^8X  d   K!  . p\!        ^\        V4      4       FC  pVRV VVR ppVV9   g   K  VV9   g   K  VV,           V9   g   K/  VP#                  VVV34       KE  	  \%        VV3R lRR7      pV	P'                  V4       K  	  \%        V	R RR7      p	V	 Uu. uF!  pV! V^ ,          4      V! V^,          4      3NK#  	  p	pW3#   + '       g   i     EL; iu upi u upi u uppi u upi )r   Nrzutf-8)encodingconfigr4  r   	token_str
is_control)r{   r=   c           	         < R P                  V P                  R4       Uu. uF  pS\        V4      ,          NK  	  up4      # u upi r  r  r  s   & r#   r  OMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s;    77@ST@SLT33@STUUTr  token_bytesz(Converting tekken.json to tokenizer.json)descc                 D   < SV ^ ,          ,          SV ^,          ,          3# r:   r;   )r<   token_to_ranks   &r#   r>   BMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    qt1DmTUVWTXFY0Zr%   FrG   c                     V ^,          # rC   r;   rE   s   &r#   r>   r    r  r%   )base64jsonrc   r  loadr4  r   r|   r  r   r   r,  	b64decoderI   r   rD   rL   rM   rN   rO   )rg   r  r  r  rj   untypedkr  r  rQ   r=   idxr   rank_setr  rT   rU   rV   rW   rF   r  r  s   &&                  @@r#   r  0MistralConverter.extract_vocab_merges_from_model  s0   $//399QiilG :x(3IPQaIb*
IbAJq~q?Ib*
& G$	')		V 
	V #D$B$BCJC#&%--  DAJKA%%a&67	Ky>8A)8LM8L8LM$T):d%efKD26'./5zQEq#e*-#(%=%-h&7h+>GgDUZbCbLL'7D!9: . 5&ZdijEMM%  g $6F\bc\bUX(Q02GA2OP\bc}C :99*
 LM ds#   H.%$I' I"I'I.H?	c                    V P                  V P                  4      w  r\        \        WR R7      4      p\	        VP
                  R4      '       d   RVP
                  n        V# r  r  r  s   &   r#   r   MistralConverter.tokenizer  r  r%   c                    < V ^8  d   QhRS[ /# rn   r   )r    r^   s   "r#   r,   r    s      9 r%   c                   V P                  4       p\        P                  ! \        P                  ! \	        V P
                  4      R RR7      \        P                  ! V P                  RR7      .4      Vn        \        P                  ! 4       Vn
        VP                  V P                  4       \        P                  ! RR7      Vn        V# )rE  FrF  rI  r*  )r   r
   rL  rM  r   r4  r,  r(   r   r   r   r  r|   r   r   )rg   r   s   & r#   r   MistralConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	T;;<#-#7#7U#K	 r%   )r(   r|   r4  r  r  )
r   r   r   r   rk   r  r   r   r   r   r   s   @r#   r  r    s$     
"% %N r%   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerReformerTokenizerRemBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizerSplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                $    V ^8  d   QhR\         /# rn   r   )r    s   "r#   r,   r,   C  s     $ $) $r%   c                R   V P                   P                  pV\        9   d,   V'       g$   \        V,          pV! V 4      P                  4       # V P                  P                  R4      '       d?   W n        \        P                  R4       \        V P                  4      P                  4       #  \        P                  R4       \        V P                  V P                  R7      P                  4       #   \         d+    \        R\        \        P                  4       4       24      hi ; i)a\  
Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

Args:
    transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
        Instance of a slow tokenizer to convert in the backend tokenizer for
        [`~tokenization_utils_base.PreTrainedTokenizerFast`].
   from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
        Defaults to False.

Return:
    A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
    [`~tokenization_utils_base.PreTrainedTokenizerFast`]
ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)r  r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r  r   SLOW_TO_FAST_CONVERTERSr   r  endswithr3   loggerinfor  r  r  r  r  rr   r  )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs   &&  r#   convert_slow_tokenizerr  C  s      1::CC66}12FG45??AA		)	)	2	2=	A	A3H09: 5 @ @AKKMM	KK23$0;;%:%O%O ik  	>>BCZC_C_Ca>b=ce 	s   -AC1 15D&)r  r  r  r  r  r  r  r  r  r  r!  r#  r%  r'  r)  r+  r-  r/  r1  r3  r5  r7  r9  r;  r=  )rU  rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  rf  rg  rh  ri  rj  rk  rl  rm  rn  ro  )r%  r   )F)Ur   r  collections.abcr   	functoolsr   	packagingr   
tokenizersr   r   r   r   r	   r
   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerr   r  rO  rx  r$   r5   rX   rZ   r   r   r   r   r   r   r  r  r  r3  r@  rP  r\  rf  rp  r  r  r  r  r  rS  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r%  r>  rD  rR  rk  r  r  r  r  r  r;   r%   r#   <module>r     s    &   f f f 5 5  ` ` 5 
		H	%8 $ '  >G"02 2j"8 "I$ $$I $N/	 /d$i $N$Y $N 6#I #Ly >)Y )Xy :$	 $Ny >x9 xv"
l "
J
| 
 - -`
 
BG\ GT-| -`,L ,^
< 
2,, ,^"
\ "
J	 	
| 
@	l 	;x| ;x|, 8
L 
!y !H	
| 	
(I (V$) $N) :
L 
61
\ 1
h+\ +\#	 #L&m\ &mRV
l V
r- -`0K K\M M`88%8 (8 ]	8
 (8 .8 ,8 ]8 8 8 (8 ,8 =8 -8 "=8  !-!8" #8$ _%8& '8( ])8* (+8, -8. =/80 +182 -384 +586 $788 }98: *;8< n=8> (?8@ nA8B =C8D $E8F ]G8H ,I8J (K8L nM8N *O8P (Q8R (S8T *U8V 0W8X MY8Z ;[8\ ]]8^ (_8` .a8b nc8d *e8f ]g8h n.n^o8 v$ $r%   