U
    qh                     @  sP   d dl mZ d dlZd dlZd dlZd dlZd dlm	Z	m
Z
 G dd de	ZdS )    )annotationsN)OpRunRuntimeTypeErrorc                   @  s@   e Zd ZdZdddZedddZedd Zed	d
 ZdS )StringNormalizerzThe operator is not really threadsafe as python cannot
    play with two locales at the same time. stop words
    should not be implemented here as the tokenization
    usually happens after this steps.
    Nc              
   C  s  |}|d krt  }t  }n@t |}|dkr:dd |D }n |dkrRdd |D }nt |}tj|j|jd}	t|jdkrt|jd D ]4}
| j|d d |
f |	d d |
f |||||d	 qn0t|jdkr| j||	|||||d	 ntd
t|	jdkrN|	jd dkrNt	dd |	
 d D g}	|	jd dkrt	dgg}	nBt|	jdkrt	dd |	
 D }	t|	dkrt	dg}	|	fS )NLOWERc                 S  s   h | ]}|  qS  )lower.0wr   r   K/tmp/pip-unpacked-wheel-xnis5xre/onnx/reference/ops/op_string_normalizer.py	<setcomp>%   s     z(StringNormalizer._run.<locals>.<setcomp>UPPERc                 S  s   h | ]}|  qS r   )upperr	   r   r   r   r   '   s     )dtype      )slocalestops	raw_stopsis_case_sensitivecase_change_actionzx must be a matrix or a vector.r   c                 S  s   g | ]}t |d kr|qS r   lenr	   r   r   r   
<listcomp>C   s      z)StringNormalizer._run.<locals>.<listcomp> c                 S  s   g | ]}t |d kr|qS r   r   r	   r   r   r   r   G   s      )setnpemptyshaper   r   range_run_columnr   arraytolist)selfxr   r   localeZ	stopwordsr   r   r   resir   r   r   _run   sT    
	
 zStringNormalizer._runc           	      C  s  t  |krjzt t j| W nJ t jk
rh } z*tjd|dt  d|ddd W 5 d }~X Y nX | d d  |d d < t| jd D ].}t	|| t
rd||< qt|| ||< q|rt|dkrt| jd D ]}t|| |||< q|d	kr&t| jd D ]}||  ||< qnL|d
krXt| jd D ]}||  ||< q>n|dkrrtd|d|st|dkrt| jd D ]}t|| |||< q|S )NzUnknown local setting z (current: z) - .r   )
stacklevelr   r   r   r   NONEz'Unknown option for case_change_action: )pylocale	getlocale	setlocaleLC_ALLErrorwarningswarnr!   r    
isinstancefloatr   strip_accents_unicoder   _remove_stopwordsr   r   RuntimeError)	ZcinZcoutr   r   r   r   r   er)   r   r   r   r"   L   s<    





zStringNormalizer._run_columnc                   s"   |  d}dt fdd|S )N c                   s   |  kS )Nr   )sr   r   r   <lambda>       z4StringNormalizer._remove_stopwords.<locals>.<lambda>)splitjoinfilter)textr   Zsplr   r=   r   r8   |   s    
z"StringNormalizer._remove_stopwordsc                 C  sR   z| j ddd | W S  tk
rL   td| }ddd |D } |  Y S X dS )	aa  Transforms accentuated unicode symbols into their simple counterpart.
        Source: `sklearn/feature_extraction/text.py
        <https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/
        feature_extraction/text.py#L115>`_.

        Args:
            s: string The string to strip

        Returns:
            the cleaned string
        ASCIIstrict)errorsZNFKDr   c                 S  s   g | ]}t |s|qS r   )unicodedata	combining)r
   cr   r   r   r      s     
 z:StringNormalizer.strip_accents_unicode.<locals>.<listcomp>N)encodeUnicodeEncodeErrorrG   	normalizerA   )r<   
normalizedr   r   r   r7      s    z&StringNormalizer.strip_accents_unicode)NNNN)NNNNN)	__name__
__module____qualname____doc__r*   staticmethodr"   r8   r7   r   r   r   r   r      s"   	    
6     /
r   )
__future__r   r'   r.   rG   r3   Znumpyr   Zonnx.reference.op_runr   r   r   r   r   r   r   <module>   s   