U
    h                     @   s   d dl mZ d dlmZmZmZmZmZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZ eeZG dd de	ZG d	d
 d
eZdS )    )	getLogger)DictListOptionalTupleUnion)Fusion)FusionUtils)	NodeProtoTensorProtohelper)	OnnxModelc                       s
  e Zd ZdZd*eed fddZeede	eef f ddd	Z
eeeee f eed
ddZdd Zdd Zdd Zdd Zdd Zee	eedef f dddZd+eeeeedef ee dddZdd Zd d! Zd,d"d#Zd$d% Zd&d' Zd(d) Z  ZS )-FusionEmbedLayerNoMaskz
    Fuse embedding layer into one node (EmbedLayerNormalization).
    It supports the following model types: BERT, DistilBert, ALBert.
    no mask)modeldescriptionc                    sB   t  |dddg| t|| _| jji dd| _d | _d | _d S )NEmbedLayerNormalizationLayerNormalizationSkipLayerNormalizationT)update)	super__init__r	   utilsr   Zinfer_runtime_shapeshape_infer_helper	attention
embed_node)selfr   r   	__class__ N/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/transformers/fusion_embedlayer.pyr      s    
zFusionEmbedLayerNoMask.__init__N)addreturnc                 C   sP   | j |dgdg}|d kr d S | j |dgdg}|d kr@d S |d |d fS )NGatherr      )r   match_parent_path)r   r!   gather_0_pathgather_1_pathr   r   r    match_two_gather$   s    z'FusionEmbedLayerNoMask.match_two_gather)	layernorminput_name_to_nodesis_distil_bertr"   c           
   	   C   s  | j j|d|dd| _| jdk	r$dS |jd |kr6dS ||jd  }tdd |D }|d	d	d	d
gkr|D ]^}|jd
krj| j |dd	dd	gddddg}|dk	rj|d jd |jd krj|d | _ dS qjt	|dkr|d jd	kr|d jd |kr||d jd  }t	|dkr|d jdkr|d jd |kr||d jd  }	|	D ]}|jdkrV|| _ dS qVtdd |	D }|r|d	d	d	dd
gkr|dd	d	d	ddgkr|dd	d	d	dgkrt
d dS n2|dd	d	d	gkr|d	d	d	d
gkrt
d dS dS )a  Check that LayerNormalization has a child of Attention node or subgraph like Attention.

        Args:
            layernorm (NodeProto): LayerNormalization node
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            is_distil_bert (bool): whether it is DistilBert or not

        Returns:
            bool: whether there is Attention node or subgraph like Attention
        	AttentionF)	recursiveNTr   c                 S   s   g | ]
}|j qS r   op_type.0childr   r   r    
<listcomp>I   s     zCFusionEmbedLayerNoMask.check_attention_subgraph.<locals>.<listcomp>ZMatMulr   AddMultiHeadAttention   r$   c                 S   s   g | ]
}|j qS r   r.   r0   r   r   r    r3   f   s     Shapez<No Attention like subgraph in children of LayerNormalization)r   Zfind_first_child_by_typer   outputsortedr/   r%   inputcross_attentionlenloggerdebug)
r   r)   r*   r+   childrenZchildren_typesnodepath1ZgrandchildrenZnodesr   r   r    check_attention_subgraph/   s|       



 
2



z/FusionEmbedLayerNoMask.check_attention_subgraphc              	   C   s,  | j |ddgddg}|dkrL| j |ddddgddddg}|dkrLd	S |d |d
  }}|jd |krpd	S | j |dddddgdddddgfddddgddddgfg|\}}}|dkrd	S |d }	| j|	ddr| j|	ddsd	S |d }
| j|
ddsd	S |d
 }|jd |kr(d	S dS )az    Match position embedding path from input_ids to Gather for DistilBert.

        Pattern is like the following:
                 (input_ids)
                      |
                     Shape
                       |                          |    Gather (indices=1)
                       |       |
                       |      Cast (optional)
                       |       |
                       |      Range (start=0, end=*, delta=1)
                       |       |
                       |    Unsqueeze
                       |    /
                      Expand
                        |
                      Gather
        ZExpandr8   r$   NZWhereZReshaper7   r   Fr6   	UnsqueezeZRangeCastr#   T)r   r%   r;   Zmatch_parent_pathsr   check_node_input_value)r   position_embedding_gather	input_idsoutput_name_to_noderB   expandshape_path2Z
range_nodeZgather_nodeZ
shape_noder   r   r    #match_position_embedding_distilbert   sD    


z:FusionEmbedLayerNoMask.match_position_embedding_distilbertc                 C   s   dS )aY  Match position embedding path from input_ids to Gather for Roberta.

        Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
          (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
                                                |                              ^
                                                V                              |
                                                +------------------------------+

        Roberta new pattern from transformers v4.9:
           (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
                                                |                                           ^
                                                V                                           |
                                                +-------------------------------------------+

        start_node = position_embedding_gather
        start_index = 1

        # match optional Cast node.
        parent = self.model.get_parent(start_node, start_index, output_name_to_node)
        if parent is None:
            return
        if parent.op_type == "Cast":
            if OnnxModel.get_node_attribute(parent, "to") != 7:
                return
            start_node = parent
            start_index = 0

        i, path, return_indices = self.model.match_parent_paths(
            start_node,
            [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]),
              (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])],
            output_name_to_node)

        if path is not None:
            # constant input of Add shall be 1.
            i, value = self.model.get_constant_input(path[0])
            if value != 1:
                return False

            _, self.padding_word_id = self.model.get_constant_input(path[-1])

            return input_ids == path[-1].input[0]
        Fr   r   rH   rI   rJ   r   r   r     match_position_embedding_roberta   s    -z7FusionEmbedLayerNoMask.match_position_embedding_robertac                 C   s  | j |ddgddg|}|dkr&dS |\}}| j |jd }|dk	rt|jdkr|jd dkr| j|ddgr| j|ddgrt|jd	ks| j|d	dgsdS | j  }|d
k rt	
|ddgsdS n| j|ddgsdS | j |d|}	|	dkrdS |	jdkr<| j|	dds*dS | j |	d|}
n|	}
|
dksV|
jdkrZdS | j|
ddspdS | j |
d|}|dks|jdkrdS ||jd kS )a	    Match position embedding path from input_ids to Gather for BERT.

        BERT Embedding Layer Pattern:
                                    (input_ids)
                                   /                                          /          Shape
                                /              |
                              /              Gather (indices=1)
                             /                  |
                            /                  Add (optional, B=0)
                           /                    |
                        Gather (segment_ids) Unsqueeze (axes=0)
                           \        |           |
                            \     Gather      Slice (data[1,512], starts=0, ends=*, axes=1, steps=1)
                              \    /            |
                                Add          Gather
                                   \       /
                                      Add
                                       |
                                LayerNormalization
        ZSlicerD   r$   r7   NFr            Zaxesr4   r#   r8   )r   r%   get_constant_valuer;   r=   rL   r   rG   Zget_opset_versionr	   Zcheck_node_attributeZ
get_parentr/   )r   rH   rI   rJ   pathsliceZ	unsqueezeZslice_weightZopset_versionrA   ZgatherrL   r   r   r    match_position_embedding_bert   s^    

z4FusionEmbedLayerNoMask.match_position_embedding_bertc                 C   s(   |  |||rdS | |||r$dS dS )NTF)rX   rO   rP   r   r   r    match_position_embedding8  s
    z/FusionEmbedLayerNoMask.match_position_embeddingc                 C   s  |j d }|r|j d nd}|j d }| jdk	r| j|}| j|}|rP|sTtt|dkr|t|dkr||d |d kstd|| dS |r| j||std|| j| dS | j	
|j d }	|	dkst|	jdkrtd dS | j	
|j d }
|
dks:t|
jdks:|	jd |
jd krHtd	 dS |r| j	
|j d }|dkst|jdks|	jd |jd krtd
 dS |	jd |
jd krtd|j d  d|	jd  d|j d  d|
jd   |r|	jd |jd krDtd|j d  d|	jd  d|j d  d|jd   |
jd |jd krtd|j d  d|
jd  d|j d  d|jd   dS )zXSanity check of embedding weights, and match hidden_size of weights and shape of inputs.r$   Nr7   zfCannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}FzaCannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}r   zICannot fuse EmbedLayerNormalization: word embedding table is not expectedzMCannot fuse EmbedLayerNormalization: position embedding table is not expectedzLCannot fuse EmbedLayerNormalization: segment embedding table is not expectedzword_embedding_table (z) size z <= position_embedding_table (z <= segment_embedding_table (zposition_embedding_table (T)r;   r   Zget_edge_shapeAssertionErrorr=   r>   infoformatZcompare_shaper   rU   rL   warning)r   word_embedding_gathersegment_embedding_gatherrH   rI   segment_idsposition_idsZinput_ids_shapeZposition_ids_shapeZword_embedding_tableZposition_embedding_tableZsegment_embedding_tabler   r   r    check_embeddingF  s    




 



222z&FusionEmbedLayerNoMask.check_embedding)
input_namer"   c                 C   sX   d}| j |}|dk	r@|jjjtjkr:| j|\}}qP|}n| j|\}}||fS )a  Cast a graph input or node input to int32.

        Args:
            input_name (str): name of graph input or node input

        Returns:
            A tuple of casted input name and the cast node.
            int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node.
            input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32.
        N)	r   find_graph_inputtypeZtensor_typeZ	elem_typer   ZINT32r   Zcast_input_to_int32)r   rc   Zinput_cast_nodeZgraph_inputZint32_outputr   r   r    cast_to_int32  s    z$FusionEmbedLayerNoMask.cast_to_int32F)rI   r)   r^   rH   r_   ra   c	                 C   s  g }	|  |\}}
| jd}|jdkr>|jd }|jd }n|jd }|jd }d}|dk	r|  |jd \}}
|||jd |jd |jd ||g}n|d|jd |jd d||g}|dk	r|d |  |\}}
|| |d	 |d
 g}|r|dk	r|n|d }|| tjd|||d}d|_|j	D ] }|j
dkr4|j	|g q4t|j	dkr||j	tddg |	| |	D ]}| j| j|j
< q| j|	 || _|S )ag  Create an EmbedLayerNormalization node. Note that segment embedding is optional.

        Args:
            input_ids (str): input_ids for word embeddings
            layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node.
            word_embedding_gather (NodeProto): the Gather node for word embedding
            position_embedding_gather (NodeProto): the Gather node for position embedding
            segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None.

        Returns:
            NodeProto: the EmbedLayerNormalization node created.
        r   r   r$   r7   rR   Nr    _outputZ_dummy_mask_indexZ_embedding_sum)outputsnamezcom.microsoftepsilong-q=)rf   r   Zcreate_node_namer/   r;   appendr   Z	make_nodedomain	attributerj   extendr=   Zmake_attributeZthis_graph_nameZnode_name_to_graph_namenodes_to_addr   )r   rI   r)   r^   rH   r_   ra   embedding_sum_outputembedding_sum_namerp   rM   Z	node_namegammabetaZembed_node_inputsr`   Zembed_node_outputsrj   r   ZattrA   r   r   r    create_fused_node  sj    









z(FusionEmbedLayerNoMask.create_fused_nodec                 C   s$   | j |jd |jd  d| _d S )Nr   T)r   replace_input_of_all_nodesr9   Zprune_graph)r   r)   r   r   r   r    finish_fusion
  s    z$FusionEmbedLayerNoMask.finish_fusionc                 C   s*   |j dko(t|jdko(t|jd dkS )Nr   rR   r   )r/   r=   r9   )r   rA   r   r   r    "is_skip_layer_norm_with_sum_output  s    z9FusionEmbedLayerNoMask.is_skip_layer_norm_with_sum_outputc              
   C   s  |  |}|d krdS |\}}|jd }	|jd }
| j||ddsFdS | |d |sXdS |jdkr| |}d}|}|r|jd nd }|d k	o| j|d k	}n|}|jdkrdnd}t	|j|kr|j| nd }|d k	o| j|d k	}|o||kot	|| dk}|d k	o(|jdkp(|p(|}| j
|	|||||
||rF|nd d}|rxd	|j|< |sx| j||jd
  | || dS )NFr$   r+   r   rR   r4   r   )rq   rr   Z_no_use__to_be_removed_r7   T)r(   r;   rC   rb   r/   rx   r9   r   Zfind_graph_outputr=   ru   rv   rw   )r   r)   add_before_layernormr*   rJ   optional_segment_gather
two_gatherr^   rH   rI   ra   Zneed_embedding_sum_outputZsum_output_indexZnode_with_sum_outputZ
sum_outputZis_sum_graph_outputZis_sum_used_by_multiple_nodesr   r   r   r    	fuse_gpt2  sX    






z FusionEmbedLayerNoMask.fuse_gpt2c           
      C   s   |  |}|dkrdS |\}}|jd }| j||dds<dS | |||sNdS | |d|s`dS | ||||d}	| ||	 dS )a  Fuse embedding layer for DistilBert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        NFr$   Try   )r(   r;   rC   rY   rb   ru   rw   )
r   r)   rz   r*   rJ   r|   r^   rH   rI   r   r   r   r    fuse_distilbertc  s(    

    z&FusionEmbedLayerNoMask.fuse_distilbertc                 C   s   | j |dgdg}|dkr dS | |d }|dkr:dS |\}}|jd }	| j||dds`dS | j |dgdg}
|
dkrdS |
d }| ||	|s| ||	|sdS |}|}|}| |||sdS | |	||||}| || dS )	a  Fuse embedding layer for Bert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        r4   r   NFr$   ry   r#   T)	r   r%   r(   r;   rC   rY   rb   ru   rw   )r   r)   rz   r*   rJ   Zadd_2_gatherr|   r^   r_   rI   Zposition_embedding_pathrH   tempr   r   r   r    	fuse_bert  s>    	
z FusionEmbedLayerNoMask.fuse_bertc           	      C   s  | j |dgdg}|jdkr8|d kr*d S |d }d }n| j |dgdg}| j |dgdg}|d kr|d k	r|d kr|d S |d }|d }nJ|d k	r|d kr| j |dgdg}|d krd S |d }|d }n|}d }| |||||rd S | ||||rd S | ||||rd S d S )Nr4   r   r   r#   r$   )r   r%   r/   r}   r~   r   )	r   rA   r*   rJ   Zfirst_add_pathrz   r{   r&   r'   r   r   r    fuse  sB    


    zFusionEmbedLayerNoMask.fuse)r   )NFN)N)__name__
__module____qualname____doc__r   strr   r
   r   r   r(   r   r   boolrC   rO   rQ   rX   rY   rb   rf   r   ru   rw   rx   r}   r~   r   r   __classcell__r   r   r   r    r      s@    T>/HK    
b 
Q)2r   c                       s8   e Zd Zd	ed fddZdd Z fddZ  ZS )
FusionEmbedLayerNormalizationF)r   c                    s   t  |d || _d S )Nz	with mask)r   r   use_mask_index)r   r   r   r   r   r    r     s    z&FusionEmbedLayerNormalization.__init__c                 C   s   | j }t|jdkr0|j| td|j nDt|jdkrb|jd sb||jd< td|j ntd|j d S |D ]H}td|j |jdkr|jd |jd< qx|jd	krx|jd |jd
< qxd S )N   zappend mask to %szreplace mask in %szskip mask in %szupdate mask_index in %sr,   r$   rR   r5   rS   )	r   r=   r;   rl   r>   r?   rj   r/   r9   )r   
mask_int32attention_nodesr   Zattention_noder   r   r    replace_mask  s    


z*FusionEmbedLayerNormalization.replace_maskc                    sh  d | _ d | _d | _t ||| | jd kr0d S | jsNtd | d d S | j d krz| jd krztd | d d S | j r| j j	d }n| jj	d }|| }| j
|rdd |D }| || | d d S ||krtd	| | d d S || }|jd
krddd |D }|jdkrN|j	d }t|t|krN| j| | || | d d S )NzG--use_mask_index is not set: EmbedLayerNormalization will not have maskz EmbedLayerNormalization(no mask)zLEmbedLayerNormalization will not have mask since attention node is not foundrR   rS   c                 S   s   g | ]}|j d kr|qS )r,   r5   r.   r1   rA   r   r   r    r3     s     
 z6FusionEmbedLayerNormalization.fuse.<locals>.<listcomp>z"EmbedLayerNormalization(with mask)zHEmbedLayerNormalization will not have mask since %s is not a node output)	ReduceSumrE   c                 S   s   g | ]}|j d kr|qS r   r.   r   r   r   r    r3   $  s     
 r   r   )r   r<   r   r   r   r   r>   r?   Zincrease_counterr;   r   rd   r   r/   r=   Znodes_to_removerl   )r   rA   r*   rJ   r   Zchildren_nodesr   r   r   r    r     sH    







z"FusionEmbedLayerNormalization.fuse)F)r   r   r   r   r   r   r   r   r   r   r   r    r     s   r   N)loggingr   typingr   r   r   r   r   Zfusion_baser   Zfusion_utilsr	   Zonnxr
   r   r   Z
onnx_modelr   r   r>   r   r   r   r   r   r    <module>   s        W