U
    h}                     @   sV  d Z ddlZddlZddlZddlZddlmZ ddlZddlZddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	l m!Z! e"d
Z#ej$ddZ$dej%kre&e$ej%d< ddl'Z'ddl(m)Z)m*Z*m+Z+ dd Z,dd Z-e.e.dddZ/dd Z0dd Z1dd Z2e3dkrRe2  dS )a   Benchmarking the inference of pretrained transformer models.
    PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
    One difference is that random input_ids is generated in this benchmark.

    For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

    Example commands:
        Export all models to ONNX, optimize and validate them:
            python benchmark.py -b 0 -o -v -i 1 2 3
        Run OnnxRuntime on GPU for all models:
            python benchmark.py -g
        Run OnnxRuntime on GPU for all models with fp32 optimization:
            python benchmark.py -g -o
        Run OnnxRuntime on GPU with fp16 optimization:
            python benchmark.py -g -o -p "fp16"
        Run TorchScript on GPU for all models:
            python benchmark.py -e torchscript -g
        Run TorchScript on GPU for all models with fp16:
            python benchmark.py -e torchscript -g -p "fp16"
        Run ONNXRuntime and TorchScript on CPU for all models with quantization:
            python benchmark.py -e torchscript onnxruntime -p "int8" -o
        Run OnnxRuntime with the ROCM provider and graph optimization script:
            python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm

    It is recommended to use run_benchmark.sh to launch benchmark.
    N)datetime)ConfigModifierOptimizerInfo	Precisioncreate_onnxruntime_sessionget_latency_resultinference_ortinference_ort_with_io_bindingoutput_detailsoutput_fusion_statisticsoutput_summarysetup_logger)FusionOptions)MODEL_CLASSESMODELS)create_onnxruntime_inputexport_onnx_model_from_ptexport_onnx_model_from_tfload_pretrained_model)version)QuantizeHelper F)logicalZOMP_NUM_THREADS)
AutoConfigAutoTokenizerLxmertConfigc           3      C   s  dd l }g }| rBd| krBd| krBd| krBtd |S d}|dkrrtj}d}d| krrtd	 |S |tjkrtd
| d |D ]6}t| d }|
D ]}|t|kr q|d | }t| d |_	t
|}d|krHt N t|t| d t| d t| d |||||| |||||||\}}} }!W 5 Q R X d|krt|t| d t| d t| d |||||| |||||||\}}} }!|sqt|| |d||d}"|"d krqdd |" D }#g }$| rdnd}%tj||d}&tt|t|t| |&jg}'tt||&jg}(|D ]})|)dkr@q,|D ]|}*|!d k	rb|*|!krbqDd|krrtjntj}+t| |)|*||&|+},d|j||%||| ||||)|*| tt d}-|&j	dkrt d| d|)d|&j!|&j!g  nt d| d|)|*g  |r t"|"|,|-|	|)|}.n|"#|#|,}/|'g}0t$t|/D ]8}1|1dkrjt| d dkrj|0%|( n
|0%|' q>d|krtj&ntj'}2t(|"|,|-|	|#|/|$|0|)|%|2|}.t |. |%|. qDq,qq|S )Nr   ZCUDAExecutionProviderZROCMExecutionProviderZDmlExecutionProviderzPlease install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance.Ztensorrt   ZTensorrtExecutionProviderzhPlease install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance.zOptimizerInfo is set to zA, graph optimizations specified in FusionOptions are not applied.   pt      tfT)Zenable_all_optimizationnum_threadsverbosec                 S   s   g | ]
}|j qS  )name).0Znode_argr$   r$   F/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/transformers/benchmark.py
<listcomp>   s     z#run_onnxruntime.<locals>.<listcomp>cudacpu	cache_dironnxruntimeZenginer   Z	providersdeviceZ	optimizer	precisionZ
io_binding
model_nameinputsthreads
batch_sizesequence_lengthZcustom_layer_numr   vitZswinzRun onnxruntime on  with input shape Zgpt))r-   Zget_available_providersloggererrorr   ZNOOPTwarningr   len
model_typer   parsetorchZno_gradr   r   r   get_outputsr   from_pretrainednumpyprodmaxZhidden_sizeZint64int32r   __version__get_layer_numstrr   nowinfo
image_sizer   runrangeappendZlonglongZintcr	   )3use_gpuprovidermodel_namesmodel_classconfig_modifierr0   r"   batch_sizessequence_lengthsrepeat_timesinput_countsoptimizer_infovalidate_onnxr,   onnx_dirr#   	overwritedisable_ort_io_bindinguse_raw_attention_maskmodel_fusion_statisticsmodel_sourceargsr-   resultsZwarm_up_repeatr1   Zall_input_namesZ
num_inputsZinput_namesfusion_optionsZonnx_model_fileZis_valid_onnx_model
vocab_sizeZmax_sequence_lengthZort_sessionZort_output_namesZoutput_buffersr/   configZmax_last_state_sizeZmax_pooler_sizer4   r5   Zinput_value_typeZ
ort_inputsZresult_templateresultZort_outputsZoutput_buffer_max_sizesiZ	data_typer$   r$   r'   run_onnxruntimeW   sd   






















	


rg   c                    s  g }| r t j s td |S t d |D ]}tj||	|d}|| t	||||d}|j
dkrt|d g}n&tj||d}||jkr|j| nd}td	|  td
|   |tjkr|  t | rdnd}|| |tjkrt|}|D ]}|dkrq|D ]}|j
dkrtd| d|d|j|jg  t j|d|j|jf|tjkrvt jnt j|dnR|d k	r||krqtd| d||g  t jd|jd ||ft j|dz|	rt j |n|
rt !|n|   t"j# fdd|dd}|	r,dn|
r6dndt j$d| rHdndd|d|d||||% t&t'( d}|)t*|| t| |+| W n8 t,k
r } zt-| t j.  W 5 d }~X Y nX qqq.|S )NzYPlease install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.F)torchscriptr,   )rd   r,   custom_model_classr6   r   r+      zModel zNumber of parameters zcuda:0r*   zRun PyTorch on r8   r   )sizedtyper/   r   )lowhighrk   rl   r/   c                      s    S Nr$   r$   	inference	input_idsr$   r'   <lambda>      zrun_pytorch.<locals>.<lambda>repeatnumberrh   torch2r?   NAr)   r   r.   )/r?   r)   Zis_availabler9   r:   Zset_grad_enabledr   rA   modifyr   r=   r   max_model_input_sizesdebugZnum_parametersr   FLOAT16Zhalfr/   toINT8r   Zquantize_torch_modelrJ   rK   ZrandnZfloat16Zfloat32randintrc   longZjittracecompiletimeitrv   rF   rG   rH   r   rI   updater   rN   RuntimeError	exceptionZempty_cache)rO   rQ   rR   rS   r0   r"   rT   rU   rV   rh   rx   r,   r#   ra   r1   rd   model	tokenizermax_input_sizer/   r4   r5   runtimesre   er$   rp   r'   run_pytorch5  s    










&


&r   do_eager_modeuse_xlac                    s*   ddl m dd l fdd}|S )Nr   )wrapsc                    sX     fdd} j d fdd}dkrPdksLtd|S |S d S )	Nc                     s
    | |S ro   r$   r`   kwargsfuncr$   r'   run_in_eager_mode  s    zFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_eager_mode)Zexperimental_compilec                     s
    | |S ro   r$   r   r   r$   r'   run_in_graph_mode  s    zFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_graph_modeTFzcCannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`.)functionAssertionError)r   r   r   r   r!   r   r   r   r'   run_func  s    
z+run_with_tf_optimizations.<locals>.run_func)	functoolsr   
tensorflow)r   r   r   r$   r   r'   run_with_tf_optimizations  s    r   c                    s  g }dd l jj| | s,jg d | rHj sHtd |S | rj	d}z8j|d d jj
|d d jjdd W n, tk
r } zt| W 5 d }~X Y nX |tjks|tjkrtd|D ]4}tj||	d |  t| |	|dd	tj||	d}||jkr2|j| nd
}|D ]}|dkrNq:|D ]}|d k	rp||krpqRtd| d||g  dd l}|  fddt|| D }j|||fjdzt dddfdd}t dddfdd}t ddd fdd}| j!r2|nt" t#rB|  t$j%fdd|dd}dj&d| rrdndd |d |d||||' t(t)* d!}|+t,|| t| |-| W nJ tk
r
 } z*t| dd"l.m/} |0 }|1  W 5 d }~X Y nX qRq:q|S )#Nr   ZGPUzVPlease install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.Tz/gpu:0)r/   z+Mixed precision is currently not supported.r+   )rd   r,   ri   Zis_tf_modelrj   zRun Tensorflow on r8   c                    s   g | ]} d  jd qS )r   r   )r   rc   )r&   rf   )rd   rngr$   r'   r(     s     z"run_tensorflow.<locals>.<listcomp>)shaperl   Fr   c                      s    ddS )NF)trainingr$   r$   rr   r   r$   r'   encoder_forward  s    z'run_tensorflow.<locals>.encoder_forwardc                      s     ddS )NF)Zdecoder_input_idsr   r$   r$   r   r$   r'   encoder_decoder_forward  s    z/run_tensorflow.<locals>.encoder_decoder_forwardc                     s8   j dd jg} j dd jg}| |ddS )Nr   F)Zvisual_featsZ
visual_posr   )randomnormalZvisual_feat_dimZvisual_pos_dim)Zfeatspos)rd   rr   r   r!   r$   r'   lxmert_forward  s    z&run_tensorflow.<locals>.lxmert_forwardc                      s     S ro   r$   r$   )rq   r$   r'   rs   &  rt   z run_tensorflow.<locals>.<lambda>r   ru   r   ry   r)   r*   r   r.   )r)   )2r   rd   	threadingZ set_intra_op_parallelism_threadsZset_visible_devicestestZis_built_with_cudar9   r:   Zlist_physical_devicesZexperimentalZset_memory_growthZ
distributeZOneDeviceStrategyr   r   r   r}   r   NotImplementedErrorr   rA   rz   r   r   r{   rJ   r   RandomrM   ZconstantrE   r   Zis_encoder_decoder
isinstancer   r   rv   rF   rG   rH   r   rI   r   r   rN   Znumbar)   Zget_current_devicereset)rO   rQ   rR   rS   r0   r"   rT   rU   rV   r,   r#   ra   Zphysical_devicesr   r1   r   r   r4   r5   r   valuesr   r   r   r   re   r)   r/   r$   )rd   rq   rr   r   r   r!   r'   run_tensorflow  s    












$r   c                  C   s  t  } | jddddtdddgtt dd	t  d
 | jdddtdddgdd
 | jddtd ttdd	t d | jddddtdgdddddgdd
 | jdddtt	j
dddd  | jd!dtt	j
dd"d#d  | jd$d%dd&d'd( | jd)dtd d*d  | jd+d,ttjttd-d. | jd/dd&d0d( | jd1dd&d2d( | jd3d4ttjttd5d. | jd6d7dd&d8d( | jd9d:dd d;d< | jd=d>dd d?d< | jd@dAdd dBd< | jdCdDdddgtddEdFgdGdH | jdIdJddKtdLdM | jdNdOdtdgdP | jdQdRdtdSdTdUdVdWdXdYgdP | jdZdd&d[d( | jdd\ | jd]d^ddtd_gd`da | jdbdtd dcd  t|  |  }|S )dNz-mz--modelsF+zbert-base-casedzroberta-baseZgpt2z Pre-trained models in the list: z, )requirednargstypedefaultchoiceshelpz--model_sourcer   r   r!   zExport onnx from pt or tfz--model_classz!Model type selected in the list: )r   r   r   r   r   z-ez	--enginesr-   r?   rx   rh   r   zEngines to benchmarkz-cz--cache_dir.Zcache_modelsz%Directory to cache pre-trained models)r   r   r   r   z
--onnx_dirZonnx_modelszDirectory to store onnx modelsz-gz	--use_gpu
store_truezRun on gpu device)r   actionr   z
--providerzExecution provider to usez-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r   r   r   r   z	--verbosezPrint more informationz--overwritezOverwrite existing modelsz-oz--optimizer_infozjOptimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_optz-vz--validate_onnxzValidate ONNX modelz-fz--fusion_csvz:CSV file for saving summary results of graph optimization.)r   r   r   z-dz--detail_csvz#CSV file for saving detail results.z-rz--result_csvz$CSV file for saving summary results.z-iz--input_countsr    r   zXNumber of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.)r   r   r   r   r   r   z-tz--test_timesd   z8Number of repeat times to get average inference latency.)r   r   r   r   z-bz--batch_sizes)r   r   r   z-sz--sequence_lengths             @         z--disable_ort_io_bindingz=Disable running ONNX Runtime with binded inputs and outputs. )r\   z-nz--num_threadsr   zThreads to use)r   r   r   r   r   z--force_num_layersz%Manually set the model's layer number)argparseArgumentParseradd_argumentrH   listr   keysjoinr   ospathr   ZFLOAT32r   ZBYSCRIPTintset_defaultsr   add_arguments
parse_args)parserr`   r$   r$   r'   parse_argumentsE  sH   

					

r   c                  C   s  t  } t| j | jtjkr0| js0td d S | jtj	krP| jrPtd d S t
| jdkr|t| jd  d dkr|dg| _tdd	 | jD | _td
|   tj| jszt| j W n$ tk
r   td| j  Y nX d| jk}d| jk}d| jk}d| jk}d| jk}|rLttjtdk rLtdtj  d S t| j}g }| jD ]}t| ttj !  |s|s|r^| j"dgkrt#d |r|t$| j| j| j%|| j|| j&| j| j'dd| j| j7 }|r"|t$| j| j| j%|| j|| j&| j| j'dd| j| j7 }|r^|t$| j| j| j%|| j|| j&| j| j'dd| j| j7 }|r|t(| j| j| j%|| j|| j&| j| j'| j| j7 }i }	|r`zd| j) }
|t*| j| j+| j| j%|| j|| j&| j| j'| j"| j,| j-| j| j.| j| j/| j0|
|	| j1| 7 }W n$ t2k
r(   tjddd Y nX q`t34 5d}|	r`| j6pTd| d}t7|	| t
|dkr| j&dgkrt#d d S | j8pd| d}t9|| | j:pd| d}t;|||  d S )Nzfp16 is for GPU onlyzint8 is for CPU onlyr   r   r   )r7   Zswimr   c                 S   s   h | ]}|d krt n|qS )r   )	cpu_count)r&   xr$   r$   r'   	<setcomp>  s     zmain.<locals>.<setcomp>zArguments: z#Creation of the directory %s failedr?   rx   rh   r-   r   z2.0.0z2PyTorch version must be >=2.0.0 and you are using zB--input_counts is not implemented for torch or torchscript engine.TF	Exception)exc_infoz%Y%m%d-%H%M%SZbenchmark_fusion_z.csvzNo any result avaiable.Zbenchmark_detail_Zbenchmark_summary_)<r   r   r#   r0   r   r}   rO   r9   r:   r   r<   modelsr   rU   sortedr"   rJ   r   r   existsr,   mkdirOSErrorZenginesr   r>   r?   rF   r   Zforce_num_layersZset_num_threadsr|   Z
__config__Zparallel_inforW   r;   r   rR   rT   Z
test_timesr   Zuse_mask_indexrg   rP   rX   rY   rZ   r[   r\   r_   r   r   rI   strftimeZ
fusion_csvr   Z
detail_csvr
   Z
result_csvr   )r`   Zenable_torchZenable_torch2Zenable_torchscriptZenable_onnxruntimeZenable_tensorflowrS   ra   r"   r^   r]   Z
time_stampZcsv_filenamer$   r$   r'   main  s   


$











r   __main__)4__doc__r   loggingr   r   r   rB   ZpsutilZbenchmark_helperr   r   r   r   r   r   r	   r
   r   r   r   rb   r   Zhuggingface_modelsr   r   Zonnx_exporterr   r   r   r   	packagingr   Zquantize_helperr   	getLoggerr9   r   environrH   r?   Ztransformersr   r   r   rg   r   boolr   r   r   r   __name__r$   r$   r$   r'   <module>   s>   4

 _r  A "
