U
    |h<                     @   s   d dl m Z  d dlmZmZ d dlmZ d dlmZ d dlZ	d dl
mZ ddlmZmZmZ ddlmZ dd	lmZ G d
d deeZdS )    )array)IterableMapping)Number)
itemgetterN   )BaseEstimatorTransformerMixin_fit_context)check_array)check_is_fittedc                   @   s   e Zd ZU dZdegdgdgdZeed< ej	dddddd	Z
dd
dddddZeddd ddZdd Zeddd!ddZefddZdd Zd"ddZd#ddZdd ZdS )$DictVectorizera  Transforms lists of feature-value mappings to vectors.

    This transformer turns lists of mappings (dict-like objects) of feature
    names to feature values into Numpy arrays or scipy.sparse matrices for use
    with scikit-learn estimators.

    When feature values are strings, this transformer will do a binary one-hot
    (aka one-of-K) coding: one boolean-valued feature is constructed for each
    of the possible string values that the feature can take on. For instance,
    a feature "f" that can take on the values "ham" and "spam" will become two
    features in the output, one signifying "f=ham", the other "f=spam".

    If a feature value is a sequence or set of strings, this transformer
    will iterate over the values and will count the occurrences of each string
    value.

    However, note that this transformer will only do a binary one-hot encoding
    when feature values are of type string. If categorical features are
    represented as numeric values such as int or iterables of strings, the
    DictVectorizer can be followed by
    :class:`~sklearn.preprocessing.OneHotEncoder` to complete
    binary one-hot encoding.

    Features that do not occur in a sample (mapping) will have a zero value
    in the resulting array/matrix.

    For an efficiency comparision of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        The type of feature values. Passed to Numpy array/scipy.sparse matrix
        constructors as the dtype argument.
    separator : str, default="="
        Separator string used when constructing new features for one-hot
        coding.
    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.
    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be
        sorted when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g., "f=ham"
        and "f=spam").

    See Also
    --------
    FeatureHasher : Performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical
        features encoded as columns of arbitrary data types.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
    ...                            {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])
    Zno_validationbooleandtype	separatorsparsesort_parameter_constraints=Tc                C   s   || _ || _|| _|| _d S Nr   )selfr   r   r   r    r   O/tmp/pip-unpacked-wheel-ig1s1lm8/sklearn/feature_extraction/_dict_vectorizer.py__init__f   s    zDictVectorizer.__init__FNfittingtransformingindicesvaluesc                C   s   |D ]}	t |	tr(d|| j|	f }
d}	ntdt|	 d|r^|
|kr^t|||
< ||
 |r|
|kr|||
  || |	 qdS )z)Add feature names for iterable of strings%s%s%s   zUnsupported type z; in iterable value. Only iterables of string are supported.N)
isinstancestrr   	TypeErrortypelenappendr   )r   fvfeature_namesvocabr   r   r   r   vvfeature_namer   r   r   _add_iterable_elementl   s    

z$DictVectorizer._add_iterable_element)Zprefer_skip_nested_validationc           	   
   C   s   g }i }|D ]}|  D ]\}}t|tr<d|| j|f }nbt|tsN|dkrT|}nJt|trtdt| d| d| dnt|trd}| 	|||| |dk	r||krt
|||< || qq| jr|  dd t|D }|| _|| _| S )	a)  Learn a list of feature name -> indices mappings.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        self : object
            DictVectorizer class instance.
        r    NzUnsupported value type  for : z$.
Mapping objects are not supported.c                 S   s   i | ]\}}||qS r   r   ).0ir(   r   r   r   
<dictcomp>   s      z&DictVectorizer.fit.<locals>.<dictcomp>)itemsr"   r#   r   r   r   r$   r%   r   r.   r&   r'   r   	enumeratefeature_names_vocabulary_)	r   Xyr*   r+   xr(   r)   r-   r   r   r   fit   s2    


zDictVectorizer.fitc                 C   sX  t djdkstd| j}|r*g }i }n| j}| j}d}t|trJ|gn|}t d}dg}g }	|D ]}
|
 D ]\}}t|t	rd|| j
|f }d}nvt|ts|d kr|}n^t|tst|trd }| j||||||||	d n*td	t| d
| d| dt| d	|d k	rr|r>||kr>t|||< || ||krr|||  |	| | qr|t| qdt|dkrtdtj|tjd}t|d t|f}tj|	||f||d}|r(| jr(|  tjt|tjd}t|D ]\}}|| ||< |||< q|d d |f }| jr:|  n| }|rT|| _|| _|S )Nr2      zsizeof(int) != 4 on your platform; please report this at https://github.com/scikit-learn/scikit-learn/issues and include the output from platform.platform() in your bug reportTr   r    r!   r   zUnsupported value Type r/   r0   z.
z objects are not supported.zSample sequence X is empty.r   )shaper   )r   itemsizeAssertionErrorr   r6   r7   r"   r   r4   r#   r   r   r   r.   r$   r%   r&   r'   
ValueErrornpZ
frombufferZintcspZ
csr_matrixr   emptyZint32r5   r   Zsort_indicesZtoarray)r   r8   r   r   r*   r+   r   r   Zindptrr   r:   r(   r)   r-   r>   Zresult_matrix	map_indexnew_valr   r   r   
_transform   s    

$
  
zDictVectorizer._transformc                 C   s   | j |ddS )a  Learn a list of feature name -> indices mappings and transform X.

        Like fit(X) followed by transform(X), but does not require
        materializing X in memory.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        Tr   rG   )r   r8   r9   r   r   r   fit_transform$  s    zDictVectorizer.fit_transformc           
         s   t |ddgd}|jd }| j} fddt|D }t|rpt|  D ] \}}|||f || || < qLnLt|D ]B\}}t||ddf D ]$\}}	|	dkr|||f ||| < qqx|S )aN  Transform array or sparse matrix X back to feature mappings.

        X must have been produced by this DictVectorizer's transform or
        fit_transform method; it may only have passed through transformers
        that preserve the number of features and their order.

        In the case of one-hot/one-of-K coding, the constructed feature
        names and values are returned rather than the original ones.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Sample matrix.
        dict_type : type, default=dict
            Constructor for feature mappings. Must conform to the
            collections.Mapping API.

        Returns
        -------
        D : list of dict_type objects of shape (n_samples,)
            Feature mappings for the samples in X.
        ZcsrZcsc)Zaccept_sparser   c                    s   g | ]
}  qS r   r   )r1   _	dict_typer   r   
<listcomp>Z  s     z4DictVectorizer.inverse_transform.<locals>.<listcomp>N)	r   r>   r6   rangerC   issparsezipZnonzeror5   )
r   r8   rM   Z	n_samplesnamesZdictsr2   jdr)   r   rL   r   inverse_transform>  s    

z DictVectorizer.inverse_transformc                 C   s   | j |ddS )a  Transform feature->value dicts to array or sparse matrix.

        Named features not encountered during fit or fit_transform will be
        silently ignored.

        Parameters
        ----------
        X : Mapping or iterable over Mappings of shape (n_samples,)
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        FrH   rI   )r   r8   r   r   r   	transformg  s    zDictVectorizer.transformc                 C   sD   t | d tdd | jD r0dd | jD }n| j}tj|tdS )a^  Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        r6   c                 s   s   | ]}t |t V  qd S r   )r"   r#   r1   namer   r   r   	<genexpr>  s     z7DictVectorizer.get_feature_names_out.<locals>.<genexpr>c                 S   s   g | ]}t |qS r   )r#   rW   r   r   r   rN     s     z8DictVectorizer.get_feature_names_out.<locals>.<listcomp>r=   )r   anyr6   rB   Zasarrayobject)r   Zinput_featuresr*   r   r   r   get_feature_names_outz  s
    
z$DictVectorizer.get_feature_names_outc                 C   s`   |st |d }| j}i }|D ]}t|||| < q || _dd t| tddD | _| S )a=  Restrict the features to those in support using feature selection.

        This function modifies the estimator in-place.

        Parameters
        ----------
        support : array-like
            Boolean mask or list of indices (as returned by the get_support
            member of feature selectors).
        indices : bool, default=False
            Whether support is a list of indices.

        Returns
        -------
        self : object
            DictVectorizer class instance.

        Examples
        --------
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> from sklearn.feature_selection import SelectKBest, chi2
        >>> v = DictVectorizer()
        >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
        >>> X = v.fit_transform(D)
        >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
        >>> v.get_feature_names_out()
        array(['bar', 'baz', 'foo'], ...)
        >>> v.restrict(support.get_support())
        DictVectorizer()
        >>> v.get_feature_names_out()
        array(['bar', 'foo'], ...)
        r   c                 S   s   g | ]\}}|qS r   r   )r1   r(   r2   r   r   r   rN     s    z+DictVectorizer.restrict.<locals>.<listcomp>r!   )key)rB   wherer6   r&   r7   sortedr4   r   )r   Zsupportr   rR   Z	new_vocabr2   r   r   r   restrict  s    !zDictVectorizer.restrictc                 C   s
   ddgiS )NZX_typesdictr   )r   r   r   r   
_more_tags  s    zDictVectorizer._more_tags)N)N)N)F)__name__
__module____qualname____doc__r#   r   ra   __annotations__rB   Zfloat64r   r.   r
   r;   rG   rJ   rU   rV   r\   r`   rb   r   r   r   r   r      s,   
M5c)

0r   )r   collections.abcr   r   Znumbersr   operatorr   ZnumpyrB   Zscipy.sparser   rC   baser   r	   r
   utilsr   Zutils.validationr   r   r   r   r   r   <module>   s   