U
    h^                     @   s0  d dl Z d dlZd dlZd dlZd dlZd dlmZ dddgZd'ddZdd	 Z	d
d Z
d(ddZd)ddZdd Zdd Zdd Zdd Zd*ddZdd Zdd Zdd  Zd!d" Zed#kr,e Zed$e d d%lmZ eej ejsej std&eeZnejZeeeZ e D ]Z!ee! qdS )+    N)TensorProtoZScanZLoopZIfc                 C   s  t  }|jdddtdd |jdddtdd |jd	d
dtddd |jdddtddd |jddtddd |jddtddd |jddtddd |jddtddd |jddtddd |jddtd d d |jd!dtd d"d |jd#dtd d$d |jd%dd&d'd(d)d&gd*d+ |jd,d-dd.d/d0 |jdd1 |jd2dtd3d4d |jd5dd.d6d0 |jdd7 |jd8dd.d9d0 |jdd: |jd;d<dd.d= |jdd> || S )?Nz-iz--inputFz2Set the input file for reading the profile results)requiredtypehelpz-mz--modelzIonnx model path to run profiling. Required when --input is not specified.z-bz--batch_size   zbatch size of input)r   r   defaultr   z-sz--sequence_length    zsequence length of inputz--past_sequence_lengthzpast sequence length for gpt2z--global_lengthz&number of global tokens for longformerz	--samplesi  z\number of samples to test. Set it large enough to reduce the variance of performance result.z--thresholdg{Gz?zfThreshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.z--thread_numznumber of threads to usez--input_ids_namez"input name for input IDs, for bertz--segment_ids_namez$input name for segment IDs, for bertz--input_mask_namez'input name for attention mask, for bertz--dummy_inputsr   bertgpt2
longformerzEType of model inputs. The default will create dummy inputs with ones.)r   r   choicesr   z-gz	--use_gpu
store_truezuse GPU)r   actionr   )use_gpuz
--providerZcudazExecution provider to usez--basic_optimizationz_Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime)basic_optimizationz--kernel_time_onlyz.Only include the kernel time and no fence time)kernel_time_onlyz-vz	--verbose)r   r   )verbose)argparseArgumentParseradd_argumentstrintfloatset_defaults
parse_args)argvparser r   E/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/transformers/profiler.pyparse_arguments   s    		
r    c                 C   sD   ddl m} || ||| |dd}|D ]}|d |}	q&| }
|
S )Nr   )create_onnxruntime_sessionT)Zenable_all_optimizationnum_threadsZenable_profiling)benchmark_helperr!   runZend_profiling)Zonnx_model_pathr   providerr   
thread_num
all_inputsr!   sessioninputs_profile_filer   r   r   run_profile   s    	r,   c              	   C   s@   t d|  d t| }t|}W 5 Q R X t|ts<t|S )Nzloading profile output z ...)printopenjsonload
isinstancelistAssertionError)r+   Zopened_file	sess_timer   r   r   load_profile_json   s
    
r5   c                 C   sV  i }i }i }d}d}| D ]}|d dkr8|d dkr8d}|s>q|d dkrd	|krd
|krd|d
 kr|d }|d
 d }	|	t krq|	sd| d}	||kr||  |d	 7  < ||  d7  < n|d	 ||< d||< |	||< ||d	 7 }q|sdgS g }
|
d|d dd |
d |
d t| dd ddD ]f\}}|| }||k rZq<|| }|t| }|
|dd|d dd|dd|dd| 	 q<i }| D ]6\}}	|| }|	|kr||	  |7  < n|||	< q|
d |
d |
d  t| d!d ddD ]4\}	}|| }|
|dd|d dd|	  q|
S )"  Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    r   FcatSessionnameZsession_initializationTZKerneldurargsop_name()r   zNo kernel record found!z%
Top expensive kernels with Time% >= d   .2f:@----------------------------------------------------------------u&   Total(μs)	Time%	Calls	Avg(μs)	Kernelc                 S   s   | d S Nr   r   xr   r   r   <lambda>       z&parse_kernel_results.<locals>.<lambda>keyreverse10d	      Y@5.2f5d8.1fz
Group kernel time by operator:u   Total(μs)	Time%	Operatorc                 S   s   | d S rC   r   rD   r   r   r   rF     rG   )NODES_TYPE_CONTAINING_SUBGRAPHappendsorteditemsr   )r4   	thresholdZkernel_name_to_op_namekernel_timeZkernel_freqtotalZsession_inititemZkernel_namer<   linesdurationratiocallsavg_timeZop_timer   r   r   parse_kernel_results   sd    (


6



&r^   Fc                 C   s  g }i }i }i }d}| D ]2}|d dkrd|krd|krd|d kr|d  dd	 d
d	 dd	}	d|d kr|d d dkrd}
n*|d d dkrd}
n|d d dkrd}
|	|kr|
||	< q||	 |
kstn|rq|d d }|tkrq|	|kr"||	  |d 7  < ||	  d7  < n|d ||	< d||	< ||	 ||d 7 }qdddg}d}|D ]}	||	 }||	 }|t| }|| d }|	|kr||	 nd	}||7 }||dd|dd|dd|dd|dd|dd|	  q`|d|d  d!d" |d |d# t| d$d% d&d'D ]\}	}|| }||k rLq.||	 }|t| }|| d }|	|kr~||	 nd	}||dd|dd|dd|dd|dd|	  q.|S )(r6   r   r7   Noder:   r;   r<   r9   Z_kernel_time Z_fence_beforeZ_fence_afterr%   ZCPUExecutionProviderZCPUZCUDAExecutionProviderZCUDAZDmlExecutionProviderZDMLr   z
Nodes in the original order:rB   u3   Total(μs)	Time%	Acc %	Avg(μs)	Calls	Provider	Nodeg        rM   rK   rL   rN   rP   rO   8sz#
Top expensive nodes with Time% >= r?   r@   rA   u-   Total(μs)	Time%	Avg(μs)	Calls	Provider	Nodec                 S   s   | d S rC   r   rD   r   r   r   rF   V  rG   z$parse_node_results.<locals>.<lambda>TrH   )replacer3   rQ   rR   r   rS   rT   )r4   r   rU   Znode_name_listZ	node_timeZ	node_freqZnode_providerrW   rX   Z	node_nameZdevicer<   rY   Zbefore_percentagerZ   r\   r]   
percentager%   r[   r   r   r   parse_node_results  sv    
(


6


:rd   c                 C   sZ  i }i }d}i }i }i }i }	d}
i }| D ]}|d dkr(d|kr(d|kr(d|d kr(|d d }|t krlq(d|d krd|d	 kr(||	kr|	|  |d 7  < n|d |	|< |
|d 7 }
q(d|d kr|d d nd
}||kr||  d7  < nd||< | d| }||kr8||  |d 7  < ||  d7  < n|d ||< d||< ||krl||  |d 7  < n|d ||< ||kr||  |d 7  < ||  d7  < n|d ||< d||< ||d 7 }q(d
dg}|d |d t| dd ddD ]\}}||	kr|	| nd}|| }|| }|||
  }|| }|| }||dd|d dd|dd|d dd|dd|dd|dd|  q|d
dg7 }|d |d t| dd ddD ]\}}|d}|d }|d }|dd
}|| }|| }|||  }||dd|d dd|dd|dd|d d|  q|S )!a  Group results by operator name.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool): Only include items for kernel time.
        use_gpu (bool): GPU is used in profiling or not.

    Returns:
        List[str]: lines of string for output.
    r   r7   r_   r:   r;   r<   r%   Zfencer9   r`   r   rA   zGrouped by operatorrB   uM   Total(μs)	Time%	Kernel(μs)	Kernel%	Calls	AvgKernel(μs)	Fence(μs)	Operatorc                 S   s   | d S rC   r   rD   r   r   r   rF     rG   z$group_node_results.<locals>.<lambda>TrH   rK   rL   rM   rN   Z11drO   z14.1fzGrouped by provider + operatoru<   Kernel(μs)	Provider%	Calls	AvgKernel(μs)	Provider	Operatorc                 S   s   | d S rC   r   rD   r   r   r   rF     rG   ZExecutionProviderz9.2fra   )rQ   rR   rS   rT   splitrb   )r4   r   r   Zop_kernel_timeZop_kernel_recordsZtotal_kernel_timeZprovider_op_kernel_timeZprovider_op_kernel_recordsZprovider_kernel_timeZop_fence_timeZtotal_fence_timeZprovider_counterrX   r<   r%   rI   rY   rV   Z
fence_timeZkernel_time_ratioZ
total_timeZ
time_ratioZkernel_callsZavg_kernel_timepartsZshort_epr\   Zprovider_time_ratior   r   r   group_node_resultsd  s    
(




F


2rg   c                 C   s&   t | dtkr"t| | dS d S )Nvalue)r   Z
WhichOneofr   getattr)dimr   r   r   get_dim_from_type_proto  s    rk   c                 C   s   dd | j jjD S )Nc                 S   s   g | ]}t |qS r   )rk   ).0dr   r   r   
<listcomp>  s     z-get_shape_from_type_proto.<locals>.<listcomp>)tensor_typeshaperj   )Z
type_protor   r   r   get_shape_from_type_proto  s    rq   c                    s  i  |   D ]}t|j}g }t|D ]\}}t|tr&|| q&t|dkrV dS t|dkrn|||d < t|dkr|||d < |jjj	}	|	t
jt
jt
jfkst|	t
jkrtjn|	t
jkrtjntj}
tj||
d}| |j< q fddt|D }|S )a  Create dummy inputs for ONNX model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples

    Returns:
        List[Dict]: list of inputs
       Nr   r   Zdtypec                    s   g | ]} qS r   r   rl   r*   dummy_inputsr   r   rn     s     z'create_dummy_inputs.<locals>.<listcomp>)'get_graph_inputs_excluding_initializersrq   r   	enumerater1   r   rR   lenro   	elem_typer   FLOATINT32INT64r3   numpyfloat32int64int32onesr9   range)
onnx_model
batch_sizesequence_lengthsamplesgraph_inputrp   Zsymbol_dimsirj   rz   	data_typedatar'   r   ru   r   create_dummy_inputs  s.    


r   c                 C   sB   ddl m}m} || |||\}	}
}||||dd|	|
|dd	}|S )a-  Create dummy inputs for BERT model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        List[Dict]: list of inputs
    r   )find_bert_inputsgenerate_test_data{   F)Z
test_casesseedr   	input_idssegment_ids
input_maskZrandom_mask_length)Zbert_test_datar   r   )r   r   r   r   input_ids_namesegment_ids_nameinput_mask_namer   r   r   r   r   r'   r   r   r   create_bert_inputs  s    r   c                    s   ||||| d}i  |   D ]}t|j}t|D ]6\}}	t|	tr4|	|kr^td|	 q4||	 ||< q4|jjj}
|
t	j
t	jt	jfkst|
t	j
krtjn|
t	jkrtjntj}tj||d}| |j< q fddt|D }|S )a  Create dummy inputs for GPT-2 model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        past_sequence_length (int): past sequence length
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    )r   Zseq_lenZpast_seq_lenZtotal_seq_lensymbol is not supported: rs   c                    s   g | ]} qS r   r   rt   ru   r   r   rn   O  s     z&create_gpt2_inputs.<locals>.<listcomp>)rw   rq   r   rx   r1   r   RuntimeErrorro   rz   r   r{   r|   r}   r3   r~   r   r   r   r   r9   r   )r   r   r   past_sequence_lengthr   symbolsr   rp   r   rj   rz   r   r   r'   r   ru   r   create_gpt2_inputs#  s.    


r   c                    s  ||d}i  |   D ]}t|j}t|D ]6\}}	t|	tr,|	|krVtd|	 q,||	 ||< q,|jjj}
|
t	j
t	jt	jfkst|
t	j
krtjn|
t	jkrtjntj}d|jkrtj||d}d|ddd|f< ntj||d}| |j< q fddt|D }|S )	a  Create dummy inputs for Longformer model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        global_length (int): number of global tokens
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_longformer_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    )r   r   r   globalrs   r   Nc                    s   g | ]} qS r   r   rt   ru   r   r   rn   ~  s     z,create_longformer_inputs.<locals>.<listcomp>)rw   rq   r   rx   r1   r   r   ro   rz   r   r{   r|   r}   r3   r~   r   r   r   r9   zerosr   r   )r   r   r   global_lengthr   r   r   rp   r   rj   rz   r   r   r'   r   ru   r   create_longformer_inputsS  s,    




r   c                 C   s@   t | }t||j}|t||j|j7 }|t||j|j7 }|S )N)r5   r^   rU   rd   r   rg   r   )r+   r;   Zprofile_recordsrY   r   r   r   process_results  s
    r   c                 C   s  | j dkr| j n
tjdd}dtjkr4t|tjd< ddlm} ddlm	} ||| j
}d }| jdkrt|| j| j| j| j| j| j}n\| jdkrt|| j| j| j| j}n8| jd	krt|| j| j| j| j}nt|| j| j| j}t| j
| j| j| j| j |}|S )
Nr   F)logicalZOMP_NUM_THREADS)r0   )	OnnxModelr
   r   r   )r&   psutil	cpu_countosenvironr   onnxr0   r   r   modelrv   r   r   r   r   r   r   r   r   r   r   r   r   r,   r   r%   r   )r;   r"   r0   r   r   r'   r+   r   r   r   r$     sV    

	

	r$   __main__	Arguments)setup_loggerzMrequires either --model to run profiling or --input to read profiling results)N)r   )Fr   )NNN)"r   r/   r   r~   r   r   r   rQ   r    r,   r5   r^   rd   rg   rk   rq   r   r   r   r   r   r$   __name__	argumentsr-   r#   r   r   inputr   r3   r+   resultsliner   r   r   r   <module>   sH   


 

P
Wd/   
)0/6




