U
    hh                     @   sn  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ d dlmZmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$ d dl%Z&e'e(Z)d	d
 Z*e j+e,dddZ-e j+dddZ.dd Z/dd Z0dd Z1dd Z2dd Z3dd Z4d"ddZ5dd  Z6e(d!krje6  dS )#    N)measure_memorysetup_logger)get_rankget_size)add_io_bindings%get_merged_sample_with_past_kv_inputsget_msft_sample_inputsget_sample_inputsget_sample_with_past_kv_inputs)ORTModelForCausalLM)ProfilerActivityprofilerecord_function)trange)
AutoConfigAutoModelForCausalLMAutoTokenizerc                 C   sR   | j dkrdS | j dkrFzt|jW S  tk
rD   t|jj Y S X t| S )N   hf-pt-compilehf-pt-eagerr   hf-ort)benchmark_typelenZinputs_names	ExceptiondecoderZinput_names
get_inputsargsmodel r   S/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/transformers/models/llama/benchmark.pyget_ort_model_inputs_len"   s    

r!   )r   ort_model_inputs_lenc                 C   s  d\}}| j  dddd}| jdkr0dnd|kr<dnd	|krHd
nd}| jdkrt| j| j| j| jdd}t	| j| j| j| j| j
dd}nh| jdkr2|dkrt| j| j| j| jdd}t	| j| j| j| j| j
dd}nPt| j| j| j| jd|| j
| jddd
}t| j| j| jd| j|| j
| jddd
}n| jdkrt| j| j| j| jd|| j
| jdd| jd}t| j| j| jd| j|| j
| jdd| jd}nb| jdkr|dk}t| j| jd| j|| j
| j|d}t| j| j| jd|| j
| j|d}ntd||fS )NNN- _ort-msfti   Z	codellamai @  Zllama2i   r   T)return_dict)use_fp16r(   r      r   pt)seq_lenpast_seq_lenmax_seq_lenr)   use_gqaenginer(      ort-convert-to-onnxort)r,   r-   r.   r)   r/   r0   r(   
world_size   )r-   r,   r.   r)   r/   split_kvz/Unable to auto-detect inputs for provided model)
model_namelowerreplacer   r	   configtarget_device
batch_sizesequence_lengthr
   r)   r   r/   r4   r   r   )r   r"   init_inputsiter_inputsZ	temp_namer.   r6   r   r   r    r   /   s    


	

r   r   c                 C   s(  d\}}d\}}| j dkr|| jr&| jn| j}t }tj|| jrFtjntj	| j
dd| j}t }| j dkrt|}n>| j dkrt }| j|_| jrd|_d|_ntd| j  | j d	krt| jtkr| jd
 n| j}t| jtkr| jd nd }d }d }	t| jD ]`}
d|
ksd|
ksd|
kr:qd|
ksN|
dkrR|
}d|
kr`|
}	d|
kr|
}|
}	qt }tj| j||	| j
| jdk|dkrdnd |||d	}t }| j dkrtd| j !| j"  t }tj#| j !| j"|| jgd}t }td||  d |S )Nr#   r   T)Ztorch_dtypeuse_auth_tokenZ	use_cacher   >   r   r2   r'   r1   Cannot recognize r   r   z.onnxz
.onnx_dataz
.onnx.dataZdecoder_modelz
model.onnxZdecoder_with_past_modelZdecoder_merged_modelcpu)decoder_file_namedecoder_with_past_file_namerA   Zuse_io_bindingZ
use_mergedproviderprovider_optionsZsession_options   r2   r'   zLoading model from )Z	providerszLoaded model in  s)$r   Zhf_pt_dir_pathr7   timer   from_pretrainedr)   torchZfloat16Zfloat32authtor;   compiler3   ZSessionOptionsr   Zenable_profilingverboseZlog_verbosity_levelZlog_severity_levelr   typeexecution_providertupleoslistdirhf_ort_dir_pathr   deviceloggerinfoort_model_pathformatrankZInferenceSession)r   r   Zsess_options
start_timeend_timesourcerF   rG   rD   rE   filenamer   r   r    	get_model   s|    	




ra   c                    sX   j dkrt jnt jtjdd} jr>||}t|  fdd} fdd}|D ]}|  || |  qZd} j dkrt j	nt j	tjdd}	|	D ]4}|  t

 }
|| |  t

 }|||
 7 }q j dkrtd	 | j	 } j| } jdkrTtd
 j  td j  td| d td| d d S )NrH   zWarm up)filedescc                     s*    j dkr jdkr j S  fddS )NrC   rH   c                     s&    j dkrtj rtj S dd S )NrC   c                  W   s   d S Nr   kwargsr   r   r    <lambda>      =time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>rW   rL   cudais_availableZsynchronizere   r@   r   r    rg     s
    +time_fn.<locals>.<lambda>.<locals>.<lambda>)rW   r   
io_bindingZsynchronize_inputsre   r@   r   r    rg     s
    ztime_fn.<locals>.<lambda>c                     s*    j dkr jdkr j S  fddS )NrC   rH   c                     s&    j dkrtj rtj S dd S )NrC   c                  W   s   d S rd   r   re   r   r   r    rg   &  rh   ri   rj   re   r@   r   r    rg   $  s
    rm   )rW   r   rn   Zsynchronize_outputsre   r@   r   r    rg   "  s
    r   Z	Benchmarkr%   zBatch Size: zSequence Length: z	Latency: rI   zThroughput: z tps)r   rangeZwarmup_runsr   sysstdoutrP   rX   rY   Znum_runsrJ   r<   r\   r=   )r   fninputsZwarmup_rangeoutputsZ
input_syncZoutput_syncr&   Z
total_timeZbench_ranger]   r^   ZlatencyZ
throughputr   r@   r    time_fn  sJ    

	




ru   c           	      C   s  d| j  d| j d| j  d| j d| j d|jdd d| dtj	 d}d }| jdkrt
tjtjgddd"}td	 || W 5 Q R X W 5 Q R X |jd
dj| j| jd}tj| j| d}t|d}|| W 5 Q R X n|| | d}|S )NbZ_sr&   r$   z%Y-%m-%d_%H:%M:%Sr   T)Z
activitiesZrecord_shapesZprofile_memoryZmodel_inferencer5   )Zgroup_by_stack_n)Zsort_byZ	row_limitz.logwz.json)r<   r=   r   r8   	precisionrW   __name__r9   datetimenowr   r   ZCPUZCUDAr   Zkey_averagestableZpt_filter_byZpt_num_rowsrT   pathjoin
log_folderopenwrite)	r   rr   rs   Zinputs_typeprefixr`   ZprofZ	prof_datafr   r   r    
profile_fnO  s$    T

  

r   c                    s   t  }t|}|jdd   | jdkrVtd|jd dtjdd  d t	
  tj  t| jdk fd	d
d tj  d S )Ng?)intervalr   zCPU usage: F)logical%rC   c                      s    S rd   r   r   rr   rs   r   r    rg   y  rh   zmeasure_fn.<locals>.<lambda>)Zis_gpufunc)rT   getpidpsutilProcessZcpu_percentr\   rX   rY   	cpu_countgcZcollectrL   rk   Zempty_cacher   rW   rp   rq   flush)r   rr   rs   pidprocessr   r   r    
measure_fnl  s    

&
r   c                    s    fdd}|}| j dkr*|| || | jrt| ||d}| j dkr jj }td| d|  t	|tj
| j| t| ||d}| j dkrԈ jj }td| d|  t	|tj
| j| d S td	 t| || t| || td
 t| || t| || d S )Nc                    s    f | }|S rd   r   rs   rt   r   r   r    
get_logits  s    
z$run_hf_inference.<locals>.get_logitsr   promptr   	Renaming  to token7
Evaluating `model(inputs)` step to get past_key_values5
Evaluating `model(inputs)` step with past_key_values)r   r   r   r   sessionend_profilingrX   warningrT   renamer}   r~   r   Zdecoder_with_pastrY   ru   r   )r   r>   r?   r   r   generate_fnnew_lognameold_lognamer   r   r    run_hf_inference  s.    




r   c                    sV   fdd}fdd}fdd} j dkr4|n|}i } jr|||\}	}t ||	d}
 }td	| d
|
  t|tj	 j
|
 t |||\}}t ||d}
 }td	| d
|
  t|tj	 j
|
 d S td |||\}	}t ||	 t ||	 td |||\}}t || t || d S )Nc                    s   t tdd  }t |  }|| }t|rJtd|  td|| }t|r||D ]}td| d | |= q^ j	dkrt
|  j	t j j|\}}t d| ||fS | |fS )	Nc                 S   s   | j S rd   )name)Zmodel_inputr   r   r    rg     rh   z?run_ort_inference.<locals>.prepare_ort_inputs.<locals>.<lambda>z(The following model inputs are missing: zEThere are missing inputs to the model. Please add them and try again.zRemoving unnecessary input 'z' from user provided inputsrC   rn   )setmapr   keysr   rX   errorr   rY   rW   r   intr\   r/   setattr)rs   kv_cache_ortvaluesZmodel_inputsZuser_inputsZmissing_inputsZunnecessary_inputsZunnecessary_inputrn   r   r   r    prepare_ort_inputs  s.    
     z-run_ort_inference.<locals>.prepare_ort_inputsc                    s     |  d S rd   )Zrun_with_iobinding)rn   r   r   r    with_io_binding  s    z*run_ort_inference.<locals>.with_io_bindingc                    s     d | }|S rd   )runr   r   r   r    without_io_binding  s    z-run_ort_inference.<locals>.without_io_bindingrC   r   r   r   r   r   r   )rW   r   r   r   rX   r   rT   r   r}   r~   r   ra   rY   ru   r   )r   r>   r?   r   r   r   r   r   r   Zort_init_inputsr   r   Zort_iter_inputsr   r   r    run_ort_inference  s4    

r   c                 C   sH   | j dkrt| ||| n*| j dkr4t| ||| ntd| j  d S )N>   r   r   r   rH   rB   )r   r   r   r   )r   r>   r?   r   r   r   r    run_inference  s
    

r   c              
   C   s  t  }|jddtddddddgd	 |jd
dtddd |jdddddd |jdddtdddddgdd |jdtddd |jd tdd!d |jd"tdd#d |jd$d%d&d' |jd(d)d*d' |jd+d,ttj rd-nd.d.d-d/gd0 |jd1d2td3d4 |jd5d6td7d4 |jd8d9td:d4 |jd;td<d4 |jd=td>d4 |jd?td@d4 |jdAdddB |jdCtdDdEd |jdFtdGdHd |jdIdddB |jdJttj	
dKdLd | }tj|j t|j dM|jkr6t|dN|j  dO |jdPkr|jdQ| if|_n"|jdRkr6|jdQ| if|_d-|_|jdkrR|jsRtdS|jdTkrn|jsntdU|jdV|_|jdV|_|jdWks|jdkr|jd.krdnd|_|jrt|jd@krt|jd@kstdX|S )YNz-btz--benchmark-typeTr   r   r   r'   r2   )rQ   requiredchoicesz-mz--model-namez<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rQ   r   helpz-az--authF
store_truez5Use Hugging Face authentication token to access model)defaultactionr   z-pz--precisionfp32Zint4int8fp16zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r   rQ   r   r   r   z--hf-pt-dir-pathr%   zNPath to directory containing all PyTorch files (e.g. tokenizer, PyTorch model))rQ   r   r   z--hf-ort-dir-pathzhPath to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)z--ort-model-pathzPath to ONNX modelz-bz--batch-sizesz1 2)r   z-sz--sequence-lengthsz32 64 128 256 512z-dz--devicerk   rC   Zrocm)rQ   r   r   z-idz--device-idr   )rQ   r   z-wz--warmup-runsr5   z-nz
--num-runs
   z--seed   z--max-length    z--num-return-sequencesr1   z	--profile)r   r   z--pt-filter-byZself_cpu_time_totalz"What to filter PyTorch profiler byz--pt-num-rowsi  z.Number of rows for PyTorch profiler to displayz	--verbosez--log-folder.zFolder to cache log filesr3   rR   ZExecutionProviderZCUDAExecutionProviderZ	device_idZROCMExecutionProviderz,Please specify a path to `--hf-ort-dir-path`rH   z+Please specify a path to `--ort-model-path` >   r   r   zOPlease provide only one (batch_size, sequence_length) combination for profiling)argparseArgumentParseradd_argumentstrrL   rk   rl   r   rT   r}   r~   
parse_argsnprandomseedZmanual_seedr   r   rW   upperrR   rV   AssertionErrorrZ   batch_sizessplitsequence_lengthsrx   r   r   )r\   parserr   r   r   r    get_args  s        
	   *r   c                  C   s  t  } t }t| }t|j t|j dtj	j
_| |_||_t|j}t|j}|jdkrnd|j n|j}|jdk}t|d| t|d| t|d| t|d| t|}t||}|jd	kr&tj|j|jd
d}	ttdd |	jj}
|ot |
dko|jdk}t|d| nt|dd
 t!"|j#|j$D ]j\}}|jdkrntd| d| d t|dt%| t|dt%| t&||\}}t'|||| qBd S )NTrC   zcuda:r   	tokenizerr:   r;   r)   rH   F)Zload_external_datac                 S   s
   | j dkS )NZGroupQueryAttention)Zop_type)noder   r   r    rg     rh   zmain.<locals>.<lambda>r   r/   z
Batch size = z and sequence length = z...r<   r=   )(r   r   r   r   rP   rX   rY   __dict__rL   backendsZcudnnZ	benchmarkr\   r4   r   rK   r7   r   rW   rx   r   ra   r!   r   onnxZ
load_modelrZ   r[   listfiltergraphr   r   	itertoolsproductr   r   r   r   r   )r\   r4   r   r   r:   r;   r)   r   r"   Z
onnx_modelZ	gqa_nodesZuse_buffer_sharer<   r=   r>   r?   r   r   r    main  s>    



r   __main__)r   )7r   rz   r   r   loggingrT   rp   rJ   Znumpyr   r   r   rL   Zbenchmark_helperr   r   Zdist_settingsr   r   Zllama_inputsr   r   r   r	   r
   Zoptimum.onnxruntimer   Ztorch.profilerr   r   r   Ztqdmr   Ztransformersr   r   r   Zonnxruntimer3   	getLoggerry   rX   r!   	Namespacer   r   ra   ru   r   r   r   r   r   r   r   r   r   r   r    <module>   sD   
 RB>G	
{.
