U
    hI                     @   s   d dl mZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ eeZG d	d
 d
ZG dd de
ZdS )    )	getLogger)ListOptionalTupleUnionN)Fusion)AttentionMaskFormat)FusionUtilsNumpyHelper)	NodeProtoTensorProtohelpernumpy_helper)	OnnxModelc                   @   sL   e Zd ZdZedddZedddZdd	 Zd
d Z	e
e
dddZdS )AttentionMask:
    Fuse Attention subgraph into one Attention node.
    )modelc                 C   s2   || _ i | _i | _t|| _tj| _| | _	d S N)
r   mask_indicemask_castedr	   utilsr   ZMaskIndexEndmask_formatZget_opset_versionopset_version)selfr    r   M/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/transformers/fusion_attention.py__init__   s    
zAttentionMask.__init__r   c                 C   s
   || _ d S r   r   )r   r   r   r   r   set_mask_format!   s    zAttentionMask.set_mask_formatc                 C   s*   || j kr|| j | kst|| j |< d S r   )r   AssertionError)r   mask
mask_indexr   r   r   set_mask_indice$   s    
zAttentionMask.set_mask_indicec                 C   s    t | jdksttt| jS Nr   )lenr   r   nextiter)r   r   r   r   get_first_mask)   s    zAttentionMask.get_first_mask)inputreturnc              	   C   sb  | j tjkrd S || jkr$| j| S | j|rB| j|\}}n| j|\}}d}|rd|| j	|< | j tj
kr~|| j|< |S | jd}| jdk rtjd|g|g| jddd}|jtddgtd	d
g npd}| j|d kr| jtj|tjdgdgdd tjd||g|g| jddd}|jtd	d
g | j| || j|< |S )NTr!      Z	ReduceSumZMaskReduceSuminputsoutputsnameaxes   Zkeepdimsr   Zort_const_1_reduce_sum_axesFr.   	data_typedimsvalsraw)r   r   ZNoMaskr   r   Zfind_graph_inputr   Zcast_graph_input_to_int32Zcast_input_to_int32r   r   create_node_namer   r   	make_node	attributeextendmake_attributeget_initializeradd_initializerZmake_tensorr   INT64add_node)r   r(   ZcastedZ
input_nameZ	cast_nodeZoutput_nameZmask_index_nodeZ	axes_namer   r   r   process_mask-   sV    




$	
zAttentionMask.process_maskN)__name__
__module____qualname____doc__r   r   r   r   r"   r'   strr?   r   r   r   r   r      s   
r   c                       s  e Zd ZdZdddddgfeeeee eee	e
 d fddZeeeef d	d
dZeeeef dddZedddZe
dddZe
e
e
dddZe
e
e
e
fdddZe
e
e
dddZe
e
dddZeeedf eedf e
eedf dd d!Zeeeeeedf eedf eeedf d"d#d$Zd.eeee
df eee
df eeedf eedf eee
e
e
e
e
e
e
eeedf d&d'd(Zd/e
eeeeeeeee
e
e
e
e
e
e
ee eeedf d)d*d+Zd,d- Z  ZS )0FusionAttentionr   NFSkipLayerNormalizationLayerNormalization)r   hidden_size	num_headsattention_maskuse_multi_head_attention!disable_multi_head_attention_biassearch_op_typesc           	         s\   |rdnd}t  ||| || _|| _|r0|nt|| _|| _|| _d | _d| _	d| _
d S )NMultiHeadAttention	AttentionT)superr   rH   rI   r   rJ   rK   rL   mask_filter_valuenum_heads_warninghidden_size_warning)	r   r   rH   rI   rJ   rK   rL   rM   Zattention_op_name	__class__r   r   r   m   s    
zFusionAttention.__init__)concatr)   c                 C   s   t |jdkrv| j|jd }| j|jd }t|tjrv|jdkrvt|tjrv|jdkrv|d |d |d  fS | j| j	fS )aU  
        Detect num_heads and hidden_size from Concat node in the following subgraph:

        SkipLayerNormalization or EmbedLayerNormalization
                        /        |
                     MatMul    Shape
                        |        |
                       Add     Gather(indices=0)
                        |        |
                        |      Unsqueeze
                        |        |
                        |     Concat (*, -1, 12, 64)
                        |     /
                       Reshape
                          |
                       Transpose
                 r0   r   )
r$   r(   r   Zget_constant_value
isinstancenpZndarraysizerI   rH   )r   rV   rI   	head_sizer   r   r   )get_num_heads_and_hidden_size_from_concat   s    

z9FusionAttention.get_num_heads_and_hidden_size_from_concat)	reshape_qr)   c                 C   sR  | j |jd }|dkrf| j |d}|dk	rD|jdkrD| |S t|jd  d | j| j	fS t
|}t|dks|d dks|d dkrtd	| d
 | j| j	fS |d }|d }|| }| jdkr
|| jkr
| jr
td| j d| d d| _| j	dkrJ|| j	krJ| jrJtd| j	 d| d d| _||fS )zDetect num_heads and hidden_size from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q

        Returns:
            Tuple[int, int]: num_heads and hidden_size
        r0   NConcatz is not initializer.rW   rX   r   rY   zq_shape_value=z7. Expected value are like [0, 0, num_heads, head_size].z--num_heads is z. Detected value is z. Using detected value.Fz--hidden_size is )r   r;   r(   Z
get_parentop_typer^   loggerdebugrI   rH   r
   to_arrayr$   rR   warningrS   )r   r_   Zq_shaperV   Zq_shape_valuerI   r]   rH   r   r   r   get_num_heads_and_hidden_size   s2    


$z-FusionAttention.get_num_heads_and_hidden_sizeadd_qkc                 C   s   | j jdd}|d krd S ||jd }||jd }|d ksJ|d kr`td| d d S ||kr~td| d d S |jd S )	NT)updater   r0   zone of the inputs of z is Nonezthe shape of two inputs of z is not same)r   Zinfer_runtime_shapeZget_edge_shaper(   rb   rc   )r   rh   Zshape_inferZinput_0_shapeZinput_1_shaper   r   r   get_add_qk_str   s    zFusionAttention.get_add_qk_strc                    s    d t tfdd| j}t|dkr0S t|dks@t| jd}tjd fddt	| j
D g|dd	}| j| | j| j|< S )
NZ_maskc                    s   | j d  kS r#   )output)node)mask_output_namer   r   <lambda>       z0FusionAttention.reshape_add_qk.<locals>.<lambda>r0   r   r`   c                    s   g | ]} qS r   r   ).0_rg   r   r   
<listcomp>   s     z2FusionAttention.reshape_add_qk.<locals>.<listcomp>r,   r-   r.   axis)listfilternodes_to_addr$   r   r   r6   r   r7   rangerI   appendthis_graph_namenode_name_to_graph_name)r   rh   Zconcat_nodeconcat_node_nameZconcat_add_qk_fp32r   )rh   rm   r   reshape_add_qk   s     zFusionAttention.reshape_add_qk)past_kpast_vr)   c                 C   s   | j d}| j d}|d dd}|d dd}tjd|g|g|dgd}tjd|g|g|dgd}| j| | j| | j| j|< | j| j|< | j d}	|dd	ddd
d}
tjd||g|
g|	dd}| j| | j| j|	< |
S )zConcatenate past_k and past_v inputs to create past_kv input.

        Args:
            past_k (str): name of past K value
            past_v (str): name of past V value

        Returns:
            kv_output_name (str): name of past KV value
        	UnsqueezeZ_5d.rq   r   )r,   r-   r.   r/   r`   z.valuez.kv_valueZ_kvrs   )	r   r6   replacer   r7   rw   ry   rz   r{   )r   r~   r   Zunsqueeze_k_nameZunsqueeze_v_nameZ	k_5d_nameZ	v_5d_nameZk_5dZv_5dr|   Zkv_output_name	concat_kvr   r   r   r      sD    		zFusionAttention.concat_kvc                 C   s   d}| j |}|dkrJtjtjdd| j jgdd|d}| j || j | j 	d}| j 	d}|d	 
d
d}|d	 
d
d}tjd||g|g|d}	tjd||g|g|d}
| j|	 | j|
 | j| j|< | j| j|< ||fS )ah  Reshape past_k and past_v from 4D to 3D to use as inputs for multihead attention node.

        Args:
            past_k (str): name of past K value of shape 4D
            past_v (str): name of past V value of shape 4D

        Returns:
            k_3d (str): name of past K value of shape 3D
            v_3d (str): name of past V value of shape 3D
        Zkv_4d_to_3dNr   int64Zdtyper.   ReshapeZ_3dr   rq   r+   )r   r;   r   
from_arrayr[   arrayrH   r<   rz   r6   r   r   r7   rw   ry   r{   )r   r~   r   Znew_dims_nameZnew_dimsZreshape_k_nameZreshape_v_nameZ	k_3d_nameZ	v_3d_nameZk_3dZv_3dr   r   r   
reshape_kv.  s:     zFusionAttention.reshape_kv)present_k_namepresent_v_namekv_nodec                 C   s   d\}}| j |}| j |}|dkrPtjtjddd|d}| j || j |dkrtjtjddd|d}| j || j | j d}| j d}	t	j
d||g|g|dd	}
t	j
d||g|g|	dd	}| j|
 | j| | j| j|< | j| j|	< dS )
a?  Split kv_node containing present KV values into separate present K and present V values.

        Args:
            present_k_name (str): name of output to store present K value in
            present_v_name (str): name of output to store present V value in
            kv_node (str): name of present KV values
        )Zindex_0Zindex_1Nr   r   r   r   r0   ZGatherrs   )r   r;   r   r   r[   r   r<   rz   r6   r   r7   rw   ry   r{   )r   r   r   r   Zk_indexZv_indexZk_dimZv_dimZgather_k_nameZgather_v_name	present_k	present_vr   r   r   split_kv_  s:    	zFusionAttention.split_kv)r~   r   c           	   	   C   s   |d  dd}|d  dd}| jd}| jd}tjd|g|g|ddddgd	}tjd|g|g|ddddgd	}| j| | j| | j| j|< | j| j|< ||fS )
a}  Transpose past_k and past_v from (B,N,P,H) to (B,P,N,H)

        Args:
            past_k (str): name of past K value of shape (B,N,P,H)
            past_v (str): name of past V value of shape (B,N,P,H)

        Returns:
            past_k_transpose (str): name of past K value of shape (B,P,N,H)
            past_v_transpose (str): name of past V value of shape (B,P,N,H)
        Z_transposedr   rq   	Transposer   rX   r0   rY   )r,   r-   r.   perm)	r   r   r6   r   r7   rw   ry   rz   r{   )	r   r~   r   Zpast_k_transposeZpast_v_transposeZtranspose_k_nameZtranspose_v_nameZtranspose_kZtranspose_vr   r   r   transpose_kv  s.    

	zFusionAttention.transpose_kv)q_addk_addv_addname_prefixr)   c                 C   s   | j |jd p"| j |jd }t|}t|}t|}|d k	rx| j |jd pl| j |jd }	t|	}|d k	r| j |jd p| j |jd }
t|
}tj|||fdd}dt|j	 }|d }| j
||j|g|d |S )Nr0   r   rt   rY   	_qkv_biasr.   r2   r3   r4   )r   r;   r(   r
   rd   r[   Z
zeros_likestackprodshaper<   r2   )r   r   r   r   r   q_biasqbkbvbk_biasv_biasqkv_biasqkv_bias_dim	bias_namer   r   r   create_combined_qkv_bias  s(    $


$
$
z(FusionAttention.create_combined_qkv_bias)q_matmulk_matmulv_matmulr   r   r   rI   r)   c           $      C   s"  | j d}|jd |jd kr4|jd |jd ks8t| j |jd }	| j |jd }
| j |jd }t|	}t|
}t|}|j|jkr|j|jkst|jd }tj	|||fdd
|d| f}|d }| j||	j|jd |jd g|d |d }tjd|jd |g|g|d	}| j| j|< |g}|d
 }| j|tjdgdgdd |d }| j|tjdg|gdd |d }| j|tjdgd| gdd |d }| j|tjdgd| gdd |d }| j|tjdgdgdd |d }tjd||||g|g| j dd	}| j| j|j< |d }tjd||||g|g| j dd	}| j| j|j< |d }tjd||||g|g| j dd	}| j| j|j< |} |}!|}"||||g | jr|dk	r,| j |jd rdnd}#tt| j |j|# r,||jd|# < |} || | j| j|j< |dk	r| j |jd rNdnd}#tt| j |j|# r||jd|# < |}!|| | j| j|j< |dk	r| j |jd rdnd}#tt| j |j|# r||jd|# < |}"|| | j| j|j< | j| | |!|"fS )a  Create packed QKV MatMul node before MultiHeadAttention node.
           This is for the scenario where an Attention node should be created but cannot be created
           because past_key and past_value are separate inputs and not one concatenated input.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of heads

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        MatMulr   r0   r   rY   _qkv_weightr   Z_qkv_outr+   Z_q_start_indexFr1   Z_k_start_indexZ_v_start_indexrX   Z_end_of_qkv_indexZ_qkv_last_axisr   Z_q_outZSliceZ_k_outZ_v_outN)r   r6   r(   r   r;   r
   rd   r   r[   r   Zreshaper<   r2   r   r7   rz   r{   r   r=   r.   r9   rL   anyry   rw   )$r   r   r   r   r   r   r   rI   Zmatmul_node_nameq_weightk_weightv_weightqwkwvwd
qkv_weightZqkv_weight_nameZqkv_matmul_outputZ
qkv_matmul	qkv_nodesZq_slice_nameZk_slice_nameZv_slice_nameZend_of_qkv_nameZqkv_last_axis_nameZq_slice_outputq_sliceZk_slice_outputk_sliceZv_slice_outputv_sliceZq_outputZk_outputZv_outputZinitializer_inputr   r   r   create_packed_qkv_matmul_node  s    ,



"






 

 

 
z-FusionAttention.create_packed_qkv_matmul_node )r   r   r   r   r   r   rI   rH   rk   key_padding_maskrh   r~   r   r   r   
packed_qkvr)   c              	   C   s`  |dkst |dkr:|| dkr:td| d|  dS tdd | j jD }tdd | j jD }| jd}g }|r| 	|||||||\}}}|
|jd |jd |jd g nt|tkr.t|tkr.| jr
|
|jd |jd |jd g n"|
|jd |jd |jd g nlt|tkrt|tkr||kr||kr| jr~|
|jd ||g n|
|jd ||g ndS | js| ||||}|| n
|d	 |r|r||kr||kr|
|
|||g |	g}|r.|r.||kr.||kr.|
||g tjd
|||d}d|_|j
td|g |S )a[  Create a MultiHeadAttention node.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            output (str): output name of MHA
            key_padding_mask (str): name of key padding mask
            add_qk (str): name of add after Q x K'
            past_k (str): name of past K value - (batch_size, num_heads, past_sequence_length, head_size)
            past_v (str): name of past V value - (batch_size, num_heads, past_sequence_length, head_size)
            present_k (str): name of present K value - (batch_size, num_heads, sequence_length, head_size)
            present_v (str): name of present V value - (batch_size, num_heads, sequence_length, head_size)
            packed_qkv (bool): whether to combine MatMuls from Q, K, V paths
                               Note: This is for the scenario where an Attention node should be created but cannot be created
                               because past_key and past_value are separate inputs and not one concatenated input.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        r   input hidden size # is not a multiple of num of heads Nc                 S   s   g | ]
}|j qS r   r   rp   rl   r   r   r   rr     s     zCFusionAttention.create_multihead_attention_node.<locals>.<listcomp>c                 S   s   g | ]
}|j qS r   r   r   r   r   r   rr     s     rO   r   rN   r+   com.microsoftrI   )r   rb   rc   setr   graphr(   rk   r6   r   r9   typer   rL   rD   r   ry   r   r7   domainr8   r:   )r   r   r   r   r   r   r   rI   rH   rk   r   rh   r~   r   r   r   r   Zgraph_input_namesZgraph_output_namesZmha_node_nameZ
mha_inputsr   r   r   r   Zmha_outputsZmha_noder   r   r   create_multihead_attention_node`  sj    -      
$$$


  z/FusionAttention.create_multihead_attention_node)r!   r   r   r   r   r   r   rI   rH   r(   rk   
add_qk_strr~   r   r   r   scalecausalr)   c           6      C   s|  |dkst |	dkr:|	| dkr:td|	 d|  dS d}|dkrZ|dkrZ|dkrZd}| j|jd }| j|jd }| j|jd }d\}}}|r*| j|jd p| j|jd }| j|jd p| j|jd }| j|jd p| j|jd }|r&|r&|r&|s*dS |dkrLt|jd  d	 dS t|}t|}t|}|j	|j	ks|t |j	d }|j	d }|j	d }||  kr|ksn t |	dkr|	|krt
d
|	 d| d d} |j	|j	krd} t|j	dd }!t|j	dd }"t|j	dd }#d}$| rbtj|||fdd}%|!|" |# }$ntj|||fdd}%d|! }$|r6t|}&t|}'t|}(t|&j	})t|'j	}*t|(j	}+|)|*  kr|!ksn t |+|#kst | rtj|&|'|(fdd},|)|* |+ }-ntj|&|'|(fdd},d|) }-| jd}.| jsf| j|.d |j||$g|%d |r| j|.d |j|-g|,d | jr|rtd dS |jd |jd |jd |.d g}/|dk	r|/| tjd|/|g|.d}0n|
|.d |r|.d ndg}/|dk	r |/| n
|/d |o2|}1|1rP| ||}2|/|2 |dk	r~| |}3|1st|/d |/|3 |g}4|r|r|dddddd}5|4|5 | |||5 tjd|/|4|.d}0d|0_|0jtd|g |r|0jtddg |dk	r.|0jtd|g | rP|0jtd|!|"|#gg | jdk	rx|0jtd t | jg |0S )!a+  Create an Attention node.

        Args:
            mask_index (str): mask input
            q_matmul (NodeProto): MatMul node in fully connection for Q
            k_matmul (NodeProto): MatMul node in fully connection for K
            v_matmul (NodeProto): MatMul node in fully connection for V
            q_add (NodeProto): Add bias node in fully connection for Q
            k_add (NodeProto): Add bias node in fully connection for K
            v_add (NodeProto): Add bias node in fully connection for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            input (str): input name
            output (str): output name
            add_qk_str (str): name of Add node after Q x K'
            past_k (str): name of input for past K value
            past_v (str): name of input for past V value
            present_k (str): name of output to store present K value
            present_v (str): name of output to store present V value
            scale: scale before softmax
            causal: whether it is uni-directional mask.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        r   r   r   NTFr0   )NNNzl is not an initializer. Please set do_constant_folding=True in torch.onnx.export to unblock attention fusionzInput hidden size (z3) is not same as weight matrix dimension of q,k,v (z:). Please provide a correct input hidden size or pass in 0r   rY   rO   r   r   r   zVMultiHeadAttention does not support relative_position_bias: cannot fuse the attention.rN   r+   r   z.key_keyr   rq   r   rI   Zunidirectionalr   Zqkv_hidden_sizesrQ   )!r   rb   rc   r   r;   r(   printr
   rd   r   re   r[   r   Zconcatenater   r6   rK   r<   r2   rk   ry   r   r7   r   r}   r   r   r   r8   r9   r:   rQ   float)6r   r!   r   r   r   r   r   r   rI   rH   r(   rk   r   r~   r   r   r   r   r   Zhas_biasr   r   r   r   r   r   r   r   r   Z
qw_in_sizeZ
kw_in_sizeZ
vw_in_sizeZis_qkv_diff_dimsZqw_out_sizeZkw_out_sizeZvw_out_sizeZqkv_weight_dimr   r   r   r   Zq_bias_shapeZk_bias_shapeZv_bias_shaper   r   Zattention_node_nameZattention_inputsZattention_nodeZpast_existsZpast_kvrm   Zattention_outputsZ
present_kvr   r   r   create_attention_node  s   .
$$&






















z%FusionAttention.create_attention_nodec           6      C   s&  |}|j dkr0| j|dd}|d k	r,|}nd S | j|dddddgd d dddg}d }|d k	rp|\}}}	}
}n:| j|ddddgdd ddg}|d k	r|\}}}
}nd S g }t|jD ]0\}}||krq||d jd krq|| qt|dkrd S |d }| j|d	d}|d k	r||jd  }|d k	rht|d
krh|d }|j dkrb|jd }nd S n(|d k	rt|dkr|jd }nd S n6|j dkr|| }|D ]}|j dkr|jd }q|| }|j dkrt|jdkr|jd }|| }dd |D }|	ddkr d S | j|ddddgdddd g}|d krXt
d d S |\}}}}d}d}d}ddddgddd dgfddd	dgddd dgfddddgddd
dgfddddgdddd
gfdddgdddgfd}d }| D ]`\}} | j|| d | d }|d krq|dkr(d}|dkr6d}|dkrDd} qNq|d krft
d d S d }!d }"d }#|r|\}}#}"}n2|r|\}}!}#}"n|r|\}}}"n|\}}!}}"| j|"ddddgdddd g}$|$d kr| j|"dddddgddddd g}$|$d krt
d d S |$d }%|$d }&|$d }'| j|"ddddgdddd g}(|(d kr| j|"dddddgddddd g}(|(d krt
d  d S |(d })|(d }*d }+d },|r| j|#d!dd"gdddgfd"d#d#gdddgfd$d!dd"gddddgfg|\}}+}n|r| j|#d$d"d#d#gddddgfd"d#d#gdddgfg|\}}+}|!d k	r| |!},|,d krt
d%|!  d S nN|rnF| j|!d	d&d$d#d#gd ddddgfd	d&d#d#gd dddgfg|\}}+}|s|+d krt
d' d S |s8t|+dkr8|+d j d	kr8| j|+d \}}-|-d(kr8|-| _|jd |kr"|'jd |kr"|*jd |kr"|s| j|+d jd nd }.|d kr|	n|
}/| |%\}0}1|0dks|1dkrt
d) d S | |.|'|*||&|)||0|1||/jd |,}2|2d krd S | j|2 | j| j|2j< |d k	r|jd }3d*|3 }4| jd+|3 tjdgtdd|0t|1|0 gdd,}5| j t!"d|/jd |5jg|4gd-|3 | j |4|jd< | j#$|/|
|g | j#$| | j#$| j%s|$n
|$d d  | j#$| j%s|(n
|(d d  | j#$| j%s|n
|d d  d| _&d S ).NrG   ZAddr   r   r   r   ZEinsumr0   ZMulrX      rF   rW   c                 S   s   g | ]
}|j qS r   )ra   )rp   childr   r   r   rr     s     z(FusionAttention.fuse.<locals>.<listcomp>rY   z&fuse_attention: failed to match v pathFZSoftmaxZDivZWhere)path1path2path3path4path5r   Tr   r   z'fuse_attention: failed to match qk pathz&fuse_attention: failed to match q pathr   z&fuse_attention: failed to match k pathZExpandZEqualr   ZCastz4fuse_attention: failed to verify shape inference of ZSubz)fuse_attention: failed to match mask pathizmFailed to detect num_heads and hidden_size for Attention fusion. Please specify those parameters in argument.Zedge_modified_Zshape_modified_tensorr1   Zreshape_modified_)'ra   r   Zmatch_parentZmatch_parent_path	enumerater(   rk   ry   r$   countrb   rc   itemsZmatch_parent_pathsrj   Zget_constant_inputrQ   rJ   r?   rf   re   r   rw   rz   r{   r.   r<   r   r=   r[   r   intr>   r   r7   Znodes_to_remover9   rK   Zprune_graph)6r   Znormalize_nodeZinput_name_to_nodesZoutput_name_to_nodeZ
start_nodeZadd_before_layernormr   Zeinsum_noderq   Zreshape_qkvZtranspose_qkvZ
matmul_qkvZother_inputsZ_ir(   Z
root_inputZmul_before_layernormZmul_childrenZlayernorm_nodechildrenr   Zparent_nodeZchildren_typesZv_nodesZadd_vZmatmul_vZ
is_distillZis_distill_addZis_no_mask_attentionZqk_pathsZqk_nodeskvrh   Z	matmul_qkZwhere_qkZq_nodesr_   Zadd_qZmatmul_qZk_nodesZadd_kZmatmul_kZ
mask_nodesr   Zmul_valr!   Zattention_last_nodeZq_num_headsZq_hidden_sizeZnew_nodeZunique_indexZnew_edgeZshape_tensorr   r   r   fuse  s   
 
 
	

 








 


 


	





$
0 


	
   zFusionAttention.fuse)r   r   r   r   r   r   F)r   r   r   r   r   NF)r@   rA   rB   rC   r   r   r   r   boolr   rD   r   r   r   r^   rf   rj   r}   r   r   r   r   r   r   r   r   r   r   r   __classcell__r   r   rT   r   rE   h   s   	)71-)


 


        


w       
 frE   )loggingr   typingr   r   r   r   Znumpyr[   Zfusion_baser   Zfusion_optionsr   Zfusion_utilsr	   r
   Zonnxr   r   r   r   Z
onnx_modelr   r@   rb   r   rE   r   r   r   r   <module>   s   V