U
    h                      @   s0  d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlZd dl	m
Z
 d dlmZmZ d dlmZmZmZmZmZ d dlmZ d dlmZmZ d dlZedZe jd	d
dZe jedddZe jeee dddZ!ee" dddZ#g fee" dddZ$e%dkr,dZ&ej'&e& e(e& e$  dS )    N)List)setup_logger)get_rankget_size)add_io_bindingsconvert_inputs_for_ort%get_merged_sample_with_past_kv_inputsget_sample_inputsget_sample_with_past_kv_inputs)setup_torch_model)
AutoConfigAutoModelForCausalLM )argsc                 C   sR   | j r
dnd\}}| j dddd}d|kr8dnd|krDd	nd
}|||fS )N)      )r   r   -r   _Z	codellamai @  Zllama2i   i   )use_past_kv
model_namelowerreplace)r   past_sequence_lengthZcurr_sequence_lengthZ	temp_namemax_sequence_length r   V/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/transformers/models/llama/llama_parity.pyget_sequence_lengths   s    r   )r   configc                 C   s|   t  }d}t| \}}}| jrBt|| j||||| j| jd|d
}n6| jrdt|| j||| jd|d}nt	|| j||dd}|S )N   T)Zseq_lenpast_seq_lenmax_seq_lenuse_fp16use_gqareturn_dict
world_size)r!   r#   r$   )r#   )
r   r   mergedr   devicer!   r"   r   r
   r	   )r   r   r$   Z
batch_sizer   Zsequence_lengthr   inputsr   r   r   
get_inputs!   s8    
r(   )r   r   pt_modelkv_cache_ortvaluesc                 C   s  t | |}| jdkrtj  t }|f |j  	 }| jdkrRtj  t }t
d||  d ~t| \}}	}
t|| j||
| jt| jd}| j  d}|dkr|d| jif}tj| jt |gd}| jdkrBt||| jt| j| j|\}}|  t }|| |  t }| d	 }~n$t }|d |}t }|d	 }t
d
||  d d| jksd| jkrdnd}tj||||d}t
d|  |st
dt||   |S )NcpuzPyTorch took z s)r"   r   r    r&   	device_idZExecutionProviderZCUDAExecutionProviderr,   )Zsess_optionsZ	providersr   zONNX Runtime took int4int8g      4@g      ?)ZrtolZatolz,Are PyTorch and ONNX Runtime results close? z
Max diff: ) r(   execution_providertorchcudaZsynchronizetimeZlogitsdetachr+   numpyloggerinfor   r   r"   intrankupperortZInferenceSessionZonnx_model_pathZSessionOptionsr   Zsynchronize_inputsZrun_with_iobindingZsynchronize_outputsZcopy_outputs_to_cpurunnpZallclosewarningmax)r   r   r)   r*   r'   
start_timeZ
pt_outputsZend_timer   r   r   epZ	ort_modelZ
io_bindingZort_outputsZtolZparityr   r   r   verify_parityD   sl    




		
 rA   )argvc                 C   sb  t  }|jddddd |jdddtjd	d
d |jdddtjd	dd |jdddddddgdd |jddddd |jdd |jddddd |jdd |jd d!dd"d |jdd# |jd$dd%d |jdd& |jd'd(dd)d*d+d,gd-d. |jd/dtd0d1d2 | g kr&| n|| }|j	d3ksT|j	d)krX|j
dkrXd,nd+|_	|S )4Nz-mz--model_nameTzModel name in Hugging Face)requiredhelpz-tz--torch_model_directoryF.zMPath to folder containing PyTorch model and associated files if saved on disk)rC   defaultrD   z-oz--onnx_model_pathzSPath to ONNX model (with external data files saved in the same folder as the model)z-epz--execution_providerr+   r1   Zrocmz(Execution provider to verify parity with)rC   rF   choicesrD   z-vz	--verbose
store_truezPrint verbose logs)actionrD   )verbosez-pz--use_past_kvzfUse past key and past value as inputs to the model. Necessary for decoder_with_past_model.onnx models.)r   z-gz	--use_gqaz$Use if model has GroupQueryAttention)r"   z--mergedz2Use merged model (i.e. decoder_merged_model.onnx).)r%   z-fpz--precisionr-   r.   fp16fp32zPrecision of model)rC   rG   rD   z--cache_dirz./model_cachezQmodel cache dir to override default HF cache dir to avoid overflood the /home dir)rC   typerF   rD   >   rL   r.   )argparseArgumentParseradd_argumentospathjoinset_defaultsstr
parse_args	precisionr/   )rB   parserr   r   r   r   get_args   s    

	
rY   c                 C   s  t | }t|j td|  t }t|d|jdk ||_t|d|j	dkrTdnd|  t|dt
|j |jtjdk}|r|jn|j}t||||jrt
jnt
j|jd	\}}i }|jst|||| n(d
|_t||||}d|_t|||| d S )NzArguments: r!   rK   device_namer+   zcuda:r&   rE   )Ztorch_dtyper&   FT)rY   r   rJ   r5   r6   r   setattrrW   r8   r/   r0   r&   rZ   Ztorch_model_directoryrQ   rR   rS   r   r   r!   Zfloat16Zfloat32r%   rA   r   )rB   r   r8   Zuse_auth_tokenlocationr   Zllamar*   r   r   r   main   s0    
 
r]   __main__r   ))rN   loggingrQ   r2   typingr   r4   r<   r0   Zbenchmark_helperr   Zdist_settingsr   r   Zllama_inputsr   r   r   r	   r
   Zllama_torchr   Ztransformersr   r   Zonnxruntimer:   	getLoggerr5   	Namespacer   r(   dictrA   rU   rY   r]   __name__seedrandomZmanual_seedr   r   r   r   <module>   s6   
$   J]#

