
    hY                     V   S SK r S SKrS SKrS SKJr  S SKrS SKJrJ	r	J
r
  S SKJr  S\	S\S\S\S	\R                  4
S
 jrS\	S\S\S	\R                  4S jrS\S\4S jr S,S\	S\S\S\S\S\S	\R                  4S jjrS\S\\\R                  4   4S jrS\S\S\S\S\S\S\	S\	S\	S\S\S\4S jr S-S\S\S\S\S\S\	S\	S\	S\S\S\S\4S jjrS r   S.S\S\S-  S \S-  S!\S-  S	\\R                  S-  \R                  S-  \R                  S-  4   4
S" jjr   S.S#\S\S-  S \S-  S!\S-  S	\\R                  S-  \R                  S-  \R                  S-  4   4
S$ jjrS% rS&\S'\S\S\S\S\S\S\S-  S \S-  S!\S-  S(\S\S\S\4S) jrS* r\ S+:X  a  \" 5         gg)/    N)Path)
ModelProtoTensorProtonumpy_helper)	OnnxModel	input_ids
batch_sizesequence_lengthdictionary_sizereturnc                 ,   U R                   R                  R                  [        R                  [        R
                  [        R                  4;   d   e[        R                  R                  X1U4[        R                  S9nU R                   R                  R                  [        R                  :X  a  [        R                  " U5      nU$ U R                   R                  R                  [        R                  :X  a  [        R                  " U5      nU$ )a@  Create input tensor based on the graph input of input_ids

Args:
    input_ids (TensorProto): graph input of the input_ids input tensor
    batch_size (int): batch size
    sequence_length (int): sequence length
    dictionary_size (int): vocabulary size of dictionary

Returns:
    np.ndarray: the input tensor created
)sizedtype)typetensor_type	elem_typer   FLOATINT32INT64nprandomrandintint32float32int64)r   r	   r
   r   datas        a/var/www/fran/franai/venv/lib/python3.13/site-packages/onnxruntime/transformers/bert_test_data.pyfake_input_ids_datar      s     >>%%//4    99_3PXZX`X`aD~~!!++{/@/@@zz$ K 
	#	#	-	-1B1B	Bxx~K    segment_idsc                    U R                   R                  R                  [        R                  [        R
                  [        R                  4;   d   e[        R                  " X4[        R                  S9nU R                   R                  R                  [        R                  :X  a  [        R                  " U5      nU$ U R                   R                  R                  [        R                  :X  a  [        R                  " U5      nU$ )a  Create input tensor based on the graph input of segment_ids

Args:
    segment_ids (TensorProto): graph input of the token_type_ids input tensor
    batch_size (int): batch size
    sequence_length (int): sequence length

Returns:
    np.ndarray: the input tensor created
r   )r   r   r   r   r   r   r   r   zerosr   r   r   )r    r	   r
   r   s       r   fake_segment_ids_datar$   1   s     ''116    88Z1BD##--1B1BBzz$ K 
			%	%	/	/;3D3D	Dxx~Kr   max_sequence_lengthaverage_sequence_lengthc                     US:  a  X::  d   eSU-  U :  a  [         R                  " SU-  U -
  U 5      $ [         R                  " SSU-  S-
  5      $ )N      )r   r   )r%   r&   s     r   get_random_lengthr*   L   sa    "a',C,ZZZ 	""%88~~a"99<OOQdee~~a%<!<q!@AAr   
input_maskrandom_sequence_length	mask_typec                    U R                   R                  R                  [        R                  [        R
                  [        R                  4;   d   eUS:X  ac  [        R                  " U[        R                  S9nU(       a!  [        U5       H  n[        X#5      Xg'   M     GO[        U5       H  nX6U'   M	     GOUS:X  a  [        R                  " X4[        R                  S9nU(       a7  [        U5       H&  n[        X#5      n[        U5       H	  n	SXgU	4'   M     M(     GO?[        R                  " X4[        R                  S9n
XSU
R                  S   2SU
R                  S   24'   OUS:X  d   e[        R                  " US-  S-   [        R                  S9nU(       az  [        U5       H  n[        X#5      Xg'   M     [        US-   5       HH  nUS:  a  XaU-   S-
     XgS-
     -   OSXaU-   '   US:  a  XaU-   S-
     XgS-
     -   OSUSU-  S-   U-   '   MJ     OD[        U5       H  nX6U'   M	     [        US-   5       H  nXs-  XaU-   '   Xs-  USU-  S-   U-   '   M     U R                   R                  R                  [        R                  :X  a  [        R                  " U5      nU$ U R                   R                  R                  [        R                  :X  a  [        R                  " U5      nU$ )a  Create input tensor based on the graph input of segment_ids.

Args:
    input_mask (TensorProto): graph input of the attention mask input tensor
    batch_size (int): batch size
    sequence_length (int): sequence length
    average_sequence_length (int): average sequence length excluding paddings
    random_sequence_length (bool): whether use uniform random number for sequence length
    mask_type (int): mask type - 1: mask index (sequence length excluding paddings). Shape is (batch_size).
                                 2: 2D attention mask. Shape is (batch_size, sequence_length).
                                 3: key len, cumulated lengths of query and key. Shape is (3 * batch_size + 2).

Returns:
    np.ndarray: the input tensor created
r(   r"   r)   Nr      )r   r   r   r   r   r   r   r   onesr   ranger*   r#   shaper   r   )r+   r	   r
   r&   r,   r-   r   iactual_seq_lenjtemps              r   fake_input_mask_datar7   V   s   0 ??&&005    A~ww
2884!:&+OU ' :&1Q '	axx5RXXF!:&!2?!\~.A!"DAJ / '
 77J@QD594::a=/DJJqM/12A~~xxa!+BHH=!:&+OU ' :>*QRUVQVtNQ,>'?$1u+'M\]!^$YZ]^Y^tNQ4F/G$STu+/UdeQ^a'!+, + :&1Q ':>*'('B!^$/0/JQ^a'!+, + "",,0A0AAzz$ K 
	$	$	.	.+2C2C	Cxx~Kr   	directoryinputsc           	      <   [         R                  R                  U 5      (       d'   [         R                  " U 5        [	        SU  S35        O[	        SU  S35        [        UR                  5       5       Ht  u  nu  p4[        R                  " XC5      n[        [         R                  R                  U SU S35      S	5       nUR                  UR                  5       5        S
S
S
5        Mv     g
! [
         a    [	        SU  S35         Nf = f! , (       d  f       M  = f)zOutput input tensors of test data to a directory

Args:
    directory (str): path of a directory
    inputs (Dict[str, np.ndarray]): map from input name to value
z#Successfully created the directory  zCreation of the directory z failedzWarning: directory z$ existed. Files will be overwritten.input_.pbwbN)ospathexistsmkdirprintOSError	enumerateitemsr   
from_arrayopenjoinwriteSerializeToString)r8   r9   indexnamer   tensorfiles          r   output_test_datarP      s     77>>)$$	FHHY 7	{!DE#I;.RST(8|((4"'',,yF5'*=>EJJv//12 FE  9  	C.ykAB	C FEs   C-   D-D	D	
D	
test_casesverboserandom_seedc           	         Uc   e[         R                  R                  U5        [        R                  " U5        / n[        U5       H  n[	        X`X5      nUR
                  U0nU(       a  [        XpU5      XR
                  '   U(       a  [        XXX5      XR
                  '   U(       a  [        U5      S:X  a  [        SU5        UR                  U5        M     U$ )a  Create given number of input data for testing

Args:
    batch_size (int): batch size
    sequence_length (int): sequence length
    test_cases (int): number of test cases
    dictionary_size (int): vocabulary size of dictionary for input_ids
    verbose (bool): print more information or not
    random_seed (int): random seed
    input_ids (TensorProto): graph input of input IDs
    segment_ids (TensorProto): graph input of token type IDs
    input_mask (TensorProto): graph input of attention mask
    average_sequence_length (int): average sequence length excluding paddings
    random_sequence_length (bool): whether use uniform random number for sequence length
    mask_type (int): mask type 1 is mask index; 2 is 2D mask; 3 is key len, cumulated lengths of query and key

Returns:
    List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary
                                   with input name as key and a tensor as value
r   zExample inputs)r   r   seedr1   r   rM   r$   r7   lenrC   append)r	   r
   rQ   r   rR   rS   r   r    r+   r&   r,   r-   
all_inputs
_test_caseinput_1r9   s                   r   fake_test_datar[      s    D    IINN;
KKJJ'
%i_^..'*'<[Ve'fF##$&:Rh'F??# s:!+"F+&! ( r   rU   c                 f    [        U UUUUUUUUUU	U
5      n[        U5      U:w  a  [        S5        U$ )au  Create given number of input data for testing

Args:
    batch_size (int): batch size
    sequence_length (int): sequence length
    test_cases (int): number of test cases
    seed (int): random seed
    verbose (bool): print more information or not
    input_ids (TensorProto): graph input of input IDs
    segment_ids (TensorProto): graph input of token type IDs
    input_mask (TensorProto): graph input of attention mask
    average_sequence_length (int): average sequence length excluding paddings
    random_sequence_length (bool): whether use uniform random number for sequence length
    mask_type (int): mask type 1 is mask index; 2 is 2D mask; 3 is key len, cumulated lengths of query and key

Returns:
    List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary
                                   with input name as key and a tensor as value
z$Failed to create test data for test.)r[   rV   rC   )r	   r
   rQ   rU   rR   r   r    r+   r&   r,   r-   r   rX   s                r   generate_test_datar]      sP    B  J :*$45r   c                    U[        UR                  5      :  a  g UR                  U   nU R                  U5      nUcB  U R                  X5      nUb.  UR                  S:X  a  U R                  UR                  S   5      nU$ )NCastr   )rV   inputfind_graph_input
get_parentop_type)
onnx_model
embed_nodeinput_indexr`   graph_inputparent_nodes         r   get_graph_input_from_embed_noderi   $  s    c***++[)E--e4K ++JD"{':':f'D$55k6G6G6JKKr   rd   input_ids_namesegment_ids_nameinput_mask_namec                    U R                  5       nUb  U R                  U5      nUc  [        SU 35      eSnU(       a"  U R                  U5      nUc  [        SU 35      eSnU(       a"  U R                  U5      nUc  [        SU 35      eSU(       a  SOS-   U(       a  SOS-   n[        U5      U:w  a  [        SU S[        U5       35      eXVU4$ [        U5      S:w  a  [        S[        U5       35      eU R	                  S	5      n	[        U	5      S:X  ak  U	S   n
[        X
S5      n[        X
S5      n[        X
S
5      nUc-  U H'  nUR                  R                  5       nSU;   d  M%  UnM)     Uc  [        S5      eXVU4$ SnSnSnU H9  nUR                  R                  5       nSU;   a  UnM'  SU;   d  SU;   a  UnM7  UnM;     U(       a  U(       a  U(       a  XVU4$ [        S5      e)a  Find graph inputs for BERT model.
First, we will deduce inputs from EmbedLayerNormalization node.
If not found, we will guess the meaning of graph inputs based on naming.

Args:
    onnx_model (OnnxModel): onnx model object
    input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
    segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
    input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

Raises:
    ValueError: Graph does not have input named of input_ids_name or segment_ids_name or input_mask_name
    ValueError: Expected graph input number does not match with specified input_ids_name, segment_ids_name
                and input_mask_name

Returns:
    Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: input tensors of input_ids,
                                                                             segment_ids and input_mask
Nz Graph does not have input named r(   r   zExpect the graph to have z inputs. Got r/   z'Expect the graph to have 3 inputs. Got EmbedLayerNormalization   maskz#Failed to find attention mask inputtokensegmentz?Fail to assign 3 inputs. You might try rename the graph inputs.)'get_graph_inputs_excluding_initializersra   
ValueErrorrV   get_nodes_by_op_typeri   rM   lower)rd   rj   rk   rl   graph_inputsr   r    r+   expected_inputsembed_nodesre   r`   input_name_lowers                r   find_bert_inputsr{   1  s2   4 EEGL!//?	??OPQQ$556FGK" #CDTCU!VWW
#44_EJ! #COCT!UVVKqQ7
1PQR|/88IWZ[gWhVijkkz11
<AB3|CTBUVWW112KLK
;1 ^
3JAN	5jaP4ZQO
%#(::#3#3#5 --!&J & BCCz11 IKJ ::++-%%J''98H+HKI  [Zz11
V
WWr   	onnx_filec                     [        5       n[        U S5       nUR                  UR                  5       5        SSS5        [	        U5      n[        XaX#5      $ ! , (       d  f       N%= f)a  Find graph inputs for BERT model.
First, we will deduce inputs from EmbedLayerNormalization node.
If not found, we will guess the meaning of graph inputs based on naming.

Args:
    onnx_file (str): onnx model path
    input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
    segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
    input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

Returns:
    Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: input tensors of input_ids,
                                                                             segment_ids and input_mask
rbN)r   rH   ParseFromStringreadr   r{   )r|   rj   rk   rl   modelrO   rd   s          r   get_bert_inputsr     sS    ( LE	i	$diik* 
 5!JJ8HZZ	 
	s    A
A$c                     [         R                  " 5       n U R                  SS[        SS9  U R                  SS[        S SS9  U R                  S	S[        S
SS9  U R                  SS[        SSS9  U R                  SS[        S SS9  U R                  SS[        S SS9  U R                  SS[        S SS9  U R                  SS[        S
SS9  U R                  SS[        SSS9  U R                  SSSSS9  U R                  SS9  U R                  SSSS S9  U R                  SS!9  U R                  S"S#S$[        S%S&9  U R                  S'S(SSS)S9  U R                  SS*9  U R                  S+S[        S,S-S9  U R                  5       nU$ ).Nz--modelTzbert onnx model path.)requiredr   helpz--output_dirFz4output test data path. Default is current directory.)r   r   defaultr   z--batch_sizer(   zbatch size of inputz--sequence_length   z maximum sequence length of inputz--input_ids_namezinput name for input idsz--segment_ids_namezinput name for segment idsz--input_mask_namezinput name for attention maskz	--samplesz$number of test cases to be generatedz--seedr/   zrandom seedz	--verbose
store_truezprint verbose information)r   actionr   )rR   z--only_input_tensorsz-only save input tensors and no output tensors)only_input_tensorsz-az--average_sequence_lengthz)average sequence length excluding padding)r   r   r   z-rz--random_sequence_lengthz3use uniform random instead of fixed sequence length)r,   z--mask_typer)   z^mask type: (1: mask index, 2: raw 2D mask, 3: key lengths, cumulated lengths of query and key))argparseArgumentParseradd_argumentstrintset_defaults
parse_args)parserargss     r   parse_argumentsr     s'   $$&F
	DsAXY
C   S!Rgh
/   '   )   ,   3   5sAMZ
(	   &
<	   51
#8   "B   u5
m   DKr   r   
output_dirr   c                    [        XX5      u  pn[        UUUUUUUUUUU5      n[        U5       H>  u  nn[        R                  R                  US[        U5      -   5      n[        UU5        M@     U
(       a  gSSKnSUR                  5       ;   a  SS/OS/nUR                  U US9nUR                  5        Vs/ s H  nUR                  PM     nn[        U5       H  u  nn[        R                  R                  US[        U5      -   5      nUR                  UU5      n[        U5       H  u  nn[        R                  " [         R"                  " UU   5      U5      n[%        [        R                  R                  USU S35      S	5       nUR'                  UR)                  5       5        SSS5        M     M     gs  snf ! , (       d  f       M  = f)
a	  Create test data for a model, and save test data to a directory.

Args:
    model (str): path of ONNX bert model
    output_dir (str): output directory
    batch_size (int): batch size
    sequence_length (int): sequence length
    test_cases (int): number of test cases
    seed (int): random seed
    verbose (bool): whether print more information
    input_ids_name (str): graph input name of input_ids
    segment_ids_name (str): graph input name of segment_ids
    input_mask_name (str): graph input name of input_mask
    only_input_tensors (bool): only save input tensors,
    average_sequence_length (int): average sequence length excluding paddings
    random_sequence_length (bool): whether use uniform random number for sequence length
    mask_type(int): mask type
test_data_set_Nr   CUDAExecutionProviderCPUExecutionProvider)	providersoutput_r=   r>   )r   r]   rE   r?   r@   rI   r   rP   onnxruntimeget_available_providersInferenceSessionget_outputsrM   runr   rG   r   asarrayrH   rJ   rK   )r   r   r	   r
   rQ   rU   rR   rj   rk   rl   r   r&   r,   r-   r   r    r+   rX   r3   r9   r8   r   r   sessionoutputoutput_namesresultoutput_nametensor_resultrO   s                                 r   create_and_save_test_datar     s   D *9P`)r&IJ#J z*	6GGLL-=A-FG	F+ +  #k&I&I&KK 
!"89$% 
 **5I*FG.5.A.A.CD.CFFKK.CLDz*	6GGLL-=A-FG	\62'5NA{(33BJJvay4I;WMbggll9s#.>?F$

=::<= GF 6 + E GFs   ;G  G
Gc                     [        5       n U R                  S::  a  U R                  U l        U R                  nUcY  [	        U R
                  5      n[        R                  R                  UR                  SU R                   SU R                   35      nUb  [	        U5      nUR                  SSS9  O[        S5        [        U R
                  UU R                  U R                  U R                  U R                  U R                   U R"                  U R$                  U R&                  U R(                  U R                  U R*                  U R,                  5        [        SU5        g )Nr   batch__seq_T)parentsexist_okz7Directory existed. test data files will be overwritten.z Test data is saved to directory:)r   r&   r
   r   r   r   r?   r@   rI   parentr	   rB   rC   r   samplesrU   rR   rj   rk   rl   r   r,   r-   )r   r   pr@   s       r   mainr   Y  s   D##q('+';';$JWW\\!((fT__4EU4K_K_J`,ab
J

4$
/GH

		$$##" 

,j9r   __main__)r)   )i'  )NNN)!r   r?   r   pathlibr   numpyr   onnxr   r   r   rd   r   r   ndarrayr   r$   r*   boolr7   r   dictrP   r[   r]   ri   tupler{   r   r   r   r   __name__ r   r   <module>r      s    	    6 6  (+>ATWZZ<{  VY ^`^h^h 6B3 B B  FFF F !	F
 !F F ZZFR3 3T#rzz/-B 3.777 7 	7
 7 7 7 7 7 !7 !7 7L !111 1 	1
 1 1 1 1 !1 !1 1 1h
 "&#'"&	YXYX$JYX DjYX 4Z	YX
 2::bjj4/d1BBCYX| "&#'"&	[[$J[ Dj[ 4Z	[
 2::bjj4/d1BBC[8aHI>I>I> I> 	I>
 I> I> I> $JI> DjI> 4ZI> I> !I> !I> I>X$:N zF r   