U
    |±ËhÄ<  ã                   @   sŠ   d dl m Z  d dlmZmZ d dlmZ d dlmZ d dlZ	d dl
mZ ddlmZmZmZ ddlmZ dd	lmZ G d
d„ deeƒZdS )é    )Úarray)ÚIterableÚMapping)ÚNumber)Ú
itemgetterNé   )ÚBaseEstimatorÚTransformerMixinÚ_fit_context)Úcheck_array)Úcheck_is_fittedc                   @   s¸   e Zd ZU dZdegdgdgdœZeed< ej	ddddœdd	„Z
dd
dddœdd„Zeddd dd„ƒZdd„ Zeddd!dd„ƒZefdd„Zdd„ Zd"dd„Zd#dd„Zdd„ ZdS )$ÚDictVectorizera©  Transforms lists of feature-value mappings to vectors.

    This transformer turns lists of mappings (dict-like objects) of feature
    names to feature values into Numpy arrays or scipy.sparse matrices for use
    with scikit-learn estimators.

    When feature values are strings, this transformer will do a binary one-hot
    (aka one-of-K) coding: one boolean-valued feature is constructed for each
    of the possible string values that the feature can take on. For instance,
    a feature "f" that can take on the values "ham" and "spam" will become two
    features in the output, one signifying "f=ham", the other "f=spam".

    If a feature value is a sequence or set of strings, this transformer
    will iterate over the values and will count the occurrences of each string
    value.

    However, note that this transformer will only do a binary one-hot encoding
    when feature values are of type string. If categorical features are
    represented as numeric values such as int or iterables of strings, the
    DictVectorizer can be followed by
    :class:`~sklearn.preprocessing.OneHotEncoder` to complete
    binary one-hot encoding.

    Features that do not occur in a sample (mapping) will have a zero value
    in the resulting array/matrix.

    For an efficiency comparision of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        The type of feature values. Passed to Numpy array/scipy.sparse matrix
        constructors as the dtype argument.
    separator : str, default="="
        Separator string used when constructing new features for one-hot
        coding.
    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.
    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be
        sorted when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g., "f=ham"
        and "f=spam").

    See Also
    --------
    FeatureHasher : Performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical
        features encoded as columns of arbitrary data types.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
    ...                            {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])
    Zno_validationÚboolean©ÚdtypeÚ	separatorÚsparseÚsortÚ_parameter_constraintsú=Tc                C   s   || _ || _|| _|| _d S ©Nr   )Úselfr   r   r   r   © r   úO/tmp/pip-unpacked-wheel-ig1s1lm8/sklearn/feature_extraction/_dict_vectorizer.pyÚ__init__f   s    zDictVectorizer.__init__FN©ÚfittingÚtransformingÚindicesÚvaluesc                C   sŽ   |D ]„}	t |	tƒr(d|| j|	f }
d}	ntdt|	ƒ› dƒ‚|r^|
|kr^t|ƒ||
< | |
¡ |r|
|kr| ||
 ¡ | |  |	¡¡ qdS )z)Add feature names for iterable of stringsú%s%s%sé   zUnsupported type z; in iterable value. Only iterables of string are supported.N)Ú
isinstanceÚstrr   Ú	TypeErrorÚtypeÚlenÚappendr   )r   ÚfÚvÚfeature_namesÚvocabr   r   r   r   ÚvvÚfeature_namer   r   r   Ú_add_iterable_elementl   s    
ÿ
z$DictVectorizer._add_iterable_element)Zprefer_skip_nested_validationc           	   
   C   sø   g }i }|D ]º}|  ¡ D ]¬\}}t|tƒr<d|| j|f }nbt|tƒsN|dkrT|}nJt|tƒr€tdt|ƒ› d|› d|› dƒ‚nt|tƒržd}|  	||||¡ |dk	r||krt
|ƒ||< | |¡ qq| jrè| ¡  dd„ t|ƒD ƒ}|| _|| _| S )	a)  Learn a list of feature name -> indices mappings.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        self : object
            DictVectorizer class instance.
        r    NzUnsupported value type ú for ú: z$.
Mapping objects are not supported.c                 S   s   i | ]\}}||“qS r   r   )Ú.0Úir(   r   r   r   Ú
<dictcomp>º   s      z&DictVectorizer.fit.<locals>.<dictcomp>)Úitemsr"   r#   r   r   r   r$   r%   r   r.   r&   r'   r   Ú	enumerateÚfeature_names_Úvocabulary_)	r   ÚXÚyr*   r+   Úxr(   r)   r-   r   r   r   Úfit‹   s2    

ÿ
zDictVectorizer.fitc                 C   sX  t dƒjdkstdƒ‚| j}|r*g }i }n| j}| j}d}t|tƒrJ|gn|}t dƒ}dg}g }	|D ]}
|
 ¡ D ]ò\}}t|t	ƒršd|| j
|f }d}nvt|tƒs¬|d kr²|}n^t|tƒsæt|tƒræd }| j||||||||	d n*td	t|ƒ› d
|› d|› dt|ƒ› d	ƒ‚|d k	rr|r>||kr>t|ƒ||< | |¡ ||krr| || ¡ |	 |  |¡¡ qr| t|ƒ¡ qdt|ƒdkrŒtdƒ‚tj|tjd}t|ƒd t|ƒf}tj|	||f||d}|r(| jr(| ¡  tjt|ƒtjd}t|ƒD ]\}}|| ||< |||< qø|d d …|f }| jr:| ¡  n| ¡ }|rT|| _|| _|S )Nr2   é   z¯sizeof(int) != 4 on your platform; please report this at https://github.com/scikit-learn/scikit-learn/issues and include the output from platform.platform() in your bug reportTr   r    r!   r   zUnsupported value Type r/   r0   z.
z objects are not supported.zSample sequence X is empty.©r   )Úshaper   )r   ÚitemsizeÚAssertionErrorr   r6   r7   r"   r   r4   r#   r   r   r   r.   r$   r%   r&   r'   Ú
ValueErrorÚnpZ
frombufferZintcÚspZ
csr_matrixr   ÚemptyZint32r5   r   Zsort_indicesZtoarray)r   r8   r   r   r*   r+   r   r   Zindptrr   r:   r(   r)   r-   r>   Zresult_matrixÚ	map_indexÚnew_valr   r   r   Ú
_transformÁ   s†    ÿ

ø$ÿ
  ÿ
zDictVectorizer._transformc                 C   s   | j |ddS )a¬  Learn a list of feature name -> indices mappings and transform X.

        Like fit(X) followed by transform(X), but does not require
        materializing X in memory.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        T©r   ©rG   )r   r8   r9   r   r   r   Úfit_transform$  s    zDictVectorizer.fit_transformc           
         sÀ   t |ddgd}|jd }| j}‡ fdd„t|ƒD ƒ}t |¡rpt| ¡ Ž D ] \}}|||f || || < qLnLt|ƒD ]B\}}t||dd…f ƒD ]$\}}	|	dkr”|||f ||| < q”qx|S )aN  Transform array or sparse matrix X back to feature mappings.

        X must have been produced by this DictVectorizer's transform or
        fit_transform method; it may only have passed through transformers
        that preserve the number of features and their order.

        In the case of one-hot/one-of-K coding, the constructed feature
        names and values are returned rather than the original ones.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Sample matrix.
        dict_type : type, default=dict
            Constructor for feature mappings. Must conform to the
            collections.Mapping API.

        Returns
        -------
        D : list of dict_type objects of shape (n_samples,)
            Feature mappings for the samples in X.
        ZcsrZcsc)Zaccept_sparser   c                    s   g | ]
}ˆ ƒ ‘qS r   r   )r1   Ú_©Ú	dict_typer   r   Ú
<listcomp>Z  s     z4DictVectorizer.inverse_transform.<locals>.<listcomp>N)	r   r>   r6   ÚrangerC   ÚissparseÚzipZnonzeror5   )
r   r8   rM   Z	n_samplesÚnamesZdictsr2   ÚjÚdr)   r   rL   r   Úinverse_transform>  s    

z DictVectorizer.inverse_transformc                 C   s   | j |ddS )a  Transform feature->value dicts to array or sparse matrix.

        Named features not encountered during fit or fit_transform will be
        silently ignored.

        Parameters
        ----------
        X : Mapping or iterable over Mappings of shape (n_samples,)
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        FrH   rI   )r   r8   r   r   r   Ú	transformg  s    zDictVectorizer.transformc                 C   sD   t | dƒ tdd„ | jD ƒƒr0dd„ | jD ƒ}n| j}tj|tdS )a^  Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        r6   c                 s   s   | ]}t |tƒ V  qd S r   )r"   r#   ©r1   Únamer   r   r   Ú	<genexpr>ˆ  s     z7DictVectorizer.get_feature_names_out.<locals>.<genexpr>c                 S   s   g | ]}t |ƒ‘qS r   )r#   rW   r   r   r   rN   ‰  s     z8DictVectorizer.get_feature_names_out.<locals>.<listcomp>r=   )r   Úanyr6   rB   ZasarrayÚobject)r   Zinput_featuresr*   r   r   r   Úget_feature_names_outz  s
    
z$DictVectorizer.get_feature_names_outc                 C   s`   |st  |¡d }| j}i }|D ]}t|ƒ||| < q || _dd„ t| ¡ tdƒdD ƒ| _| S )a=  Restrict the features to those in support using feature selection.

        This function modifies the estimator in-place.

        Parameters
        ----------
        support : array-like
            Boolean mask or list of indices (as returned by the get_support
            member of feature selectors).
        indices : bool, default=False
            Whether support is a list of indices.

        Returns
        -------
        self : object
            DictVectorizer class instance.

        Examples
        --------
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> from sklearn.feature_selection import SelectKBest, chi2
        >>> v = DictVectorizer()
        >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
        >>> X = v.fit_transform(D)
        >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
        >>> v.get_feature_names_out()
        array(['bar', 'baz', 'foo'], ...)
        >>> v.restrict(support.get_support())
        DictVectorizer()
        >>> v.get_feature_names_out()
        array(['bar', 'foo'], ...)
        r   c                 S   s   g | ]\}}|‘qS r   r   )r1   r(   r2   r   r   r   rN   ¸  s    z+DictVectorizer.restrict.<locals>.<listcomp>r!   )Úkey)rB   Úwherer6   r&   r7   Úsortedr4   r   )r   Zsupportr   rR   Z	new_vocabr2   r   r   r   ÚrestrictŽ  s    !ÿzDictVectorizer.restrictc                 C   s
   ddgiS )NZX_typesÚdictr   )r   r   r   r   Ú
_more_tags¾  s    zDictVectorizer._more_tags)N)N)N)F)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r#   r   ra   Ú__annotations__rB   Zfloat64r   r.   r
   r;   rG   rJ   rU   rV   r\   r`   rb   r   r   r   r   r      s,   
Müö5c)

0r   )r   Úcollections.abcr   r   Znumbersr   Úoperatorr   ZnumpyrB   Zscipy.sparser   rC   Úbaser   r	   r
   Úutilsr   Zutils.validationr   r   r   r   r   r   Ú<module>   s   