"""Stochastic optimization methods for MLP"""

import numpy as np


class BaseOptimizer:
    """Base (Stochastic) gradient descent optimizer

    Parameters
    ----------
    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights

    Attributes
    ----------
    learning_rate : float
        the current learning rate
    """

    def __init__(self, learning_rate_init=0.1):
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)

    def update_params(self, params, grads):
        """Update parameters with given gradients

        Parameters
        ----------
        params : list of length = len(coefs_) + len(intercepts_)
            The concatenated list containing coefs_ and intercepts_ in MLP
            model. Used for initializing velocities and updating params

        grads : list of length = len(params)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params
        """
        updates = self._get_updates(grads)
        for param, update in zip((p for p in params), updates):
            param += update
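        # The in-place "+=" mutates each ndarray in ``params`` directly, so
        # the coefs_ and intercepts_ arrays held by the MLP model are updated
        # without being rebound to new objects.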
zBaseOptimizer.update_paramsc                 C   s   dS )zhPerform update to learning rate and potentially other states at the
        end of an iteration
        Nr	   r   Z	time_stepr	   r	   r
   iteration_ends.   s    zBaseOptimizer.iteration_endsc                 C   s   |rt |d  dS )aH  Decides whether it is time to stop training

        Parameters
        ----------
        msg : str
            Message passed in for verbose output

        verbose : bool
            Print message to stdin if True

        Returns
        -------
        is_stopping : bool
            True if training needs to stop
        """
        if verbose:
            print(msg + " Stopping.")
        return True


class SGDOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with momentum

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing velocities and updating params

    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights

    lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant'
        Learning rate schedule for weight updates.

        - 'constant' is a constant learning rate given by
          'learning_rate_init'.

        - 'invscaling' gradually decreases the learning rate 'learning_rate_'
          at each time step 't' using an inverse scaling exponent of 'power_t'.
          learning_rate_ = learning_rate_init / pow(t, power_t)

        - 'adaptive' keeps the learning rate constant at
          'learning_rate_init' as long as the training loss keeps decreasing.
          Each time 2 consecutive epochs fail to decrease the training loss by
          tol, or fail to increase validation score by tol if 'early_stopping'
          is on, the current learning rate is divided by 5.

    momentum : float, default=0.9
        Value of momentum used, must be larger than or equal to 0

    nesterov : bool, default=True
        Whether to use Nesterov's momentum or not. Nesterov's momentum is
        used if True.

    power_t : float, default=0.5
        Power of time step 't' in inverse scaling. See `lr_schedule` for
        more details.

    Attributes
    ----------
    learning_rate : float
        the current learning rate

    velocities : list, length = len(params)
        velocities that are used to update params
    """

    def __init__(self, params, learning_rate_init=0.1, lr_schedule="constant",
                 momentum=0.9, nesterov=True, power_t=0.5):
        super().__init__(learning_rate_init)

        self.lr_schedule = lr_schedule
        self.momentum = momentum
        self.nesterov = nesterov
        self.power_t = power_t
        self.velocities = [np.zeros_like(param) for param in params]

    def iteration_ends(self, time_step):
        """Perform updates to learning rate and potentially other states at the
        end of an iteration

        Parameters
        ----------
        time_step : int
            number of training samples trained on so far, used to update
            learning rate for 'invscaling'
        """
        if self.lr_schedule == "invscaling":
            # inverse scaling: learning_rate_init / (time_step + 1) ** power_t
            self.learning_rate = (
                float(self.learning_rate_init) / (time_step + 1) ** self.power_t
            )

    def trigger_stopping(self, msg, verbose):
        if self.lr_schedule != "adaptive":
            if verbose:
                print(msg + " Stopping.")
            return True

        if self.learning_rate <= 1e-6:
            if verbose:
                print(msg + " Learning rate too small. Stopping.")
            return True

        # 'adaptive' schedule: shrink the learning rate by a factor of 5 and
        # keep training
        self.learning_rate /= 5.0
        if verbose:
            print(msg + " Setting learning rate to %f" % self.learning_rate)
        return False
zSGDOptimizer.trigger_stoppingc                    sD    fddt  j|D }| _ jr@ fddt  j|D }|S )  Get the values used to update params with given gradients

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params
        """
        # velocity update: v <- momentum * v - learning_rate * grad
        updates = [
            self.momentum * velocity - self.learning_rate * grad
            for velocity, grad in zip(self.velocities, grads)
        ]
        self.velocities = updates

        if self.nesterov:
            # Nesterov momentum looks ahead by applying the momentum step to
            # the freshly updated velocities before computing the update.
            updates = [
                self.momentum * velocity - self.learning_rate * grad
                for velocity, grad in zip(self.velocities, grads)
            ]

        return updates
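

# The following sketch (added for illustration; it is not part of the original
# module, and the function name and array shapes are arbitrary placeholders)
# shows how an MLP training loop is expected to drive SGDOptimizer: build it
# from the model's parameter arrays, then alternate update_params() with
# iteration_ends().
def _example_sgd_usage():  # illustrative only
    params = [np.zeros((3, 2)), np.zeros(2)]           # e.g. coefs_ + intercepts_
    grads = [np.full((3, 2), 0.01), np.full(2, 0.01)]  # gradients of the loss
    optimizer = SGDOptimizer(params, learning_rate_init=0.1, momentum=0.9,
                             nesterov=True, lr_schedule="invscaling")
    for time_step in range(1, 6):
        optimizer.update_params(params, grads)
        optimizer.iteration_ends(time_step)
    return params, optimizer.learning_rate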
zSGDOptimizer._get_updates)r   r$   r%   Tr&   )	r   r    r!   r"   r   r   r   r   __classcell__r	   r	   r1   r
   r#   I   s   2     r#   c                       s*   e Zd ZdZd
 fdd	Zdd	 Z  ZS )AdamOptimizera  Stochastic gradient descent optimizer with Adam

    Note: All default values are from the original Adam paper

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing velocities and updating params

    learning_rate_init : float, default=0.001
        The initial learning rate used. It controls the step-size in updating
        the weights

    beta_1 : float, default=0.9
        Exponential decay rate for estimates of first moment vector, should be
        in [0, 1)

    beta_2 : float, default=0.999
        Exponential decay rate for estimates of second moment vector, should be
        in [0, 1)

    epsilon : float, default=1e-8
        Value for numerical stability

    Attributes
    ----------
    learning_rate : float
        The current learning rate

    t : int
        Timestep

    ms : list, length = len(params)
        First moment vectors

    vs : list, length = len(params)
        Second moment vectors

    References
    ----------
    :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for
        stochastic optimization." <1412.6980>
    MbP?r%   +?:0yE>c                    sH   t  | || _|| _|| _d| _dd |D | _dd |D | _d S )Nr   c                 S   s   g | ]}t |qS r	   r'   r)   r	   r	   r
   r*      s     z*AdamOptimizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}t |qS r	   r'   r)   r	   r	   r
   r*      s     )r+   r   beta_1beta_2epsilontmsvs)r   r   r   r?   r@   rA   r1   r	   r
   r      s    zAdamOptimizer.__init__c                    s     j d7  _  fddt j|D  _ fddt j|D  _ jtd j j    d j j     _	 fddt j jD }|S )r4   r3   c                    s(   g | ] \}} j | d  j  |  qS )r3   )r?   )r   mr7   r8   r	   r
   r*     s   z.AdamOptimizer._get_updates.<locals>.<listcomp>c                    s,   g | ]$\}} j | d  j  |d   qS )r3      )r@   )r   vr7   r8   r	   r
   r*     s   c                    s,   g | ]$\}} j  | t| j  qS r	   )r   r(   sqrtrA   )r   rE   rG   r8   r	   r
   r*     s   )
rB   r   rC   rD   r   r(   rH   r@   r?   r   r9   r	   r8   r
   r      s"    




zAdamOptimizer._get_updates)r<   r%   r=   r>   )r   r    r!   r"   r   r   r:   r	   r	   r1   r
   r;      s   .       r;   )r"   Znumpyr(   r   r#   r;   r	   r	   r	   r
   <module>   s   ?}
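

# Illustrative sketch (not part of the original module; the function name and
# parameter values are arbitrary placeholders): a single Adam step on one
# parameter array, checking it against the well-known property that the very
# first bias-corrected step has magnitude close to learning_rate_init.
def _example_adam_step():  # illustrative only
    params = [np.ones((2, 2))]
    grads = [np.full((2, 2), 0.5)]
    optimizer = AdamOptimizer(params, learning_rate_init=0.001)
    optimizer.update_params(params, grads)
    # For t = 1 the bias correction makes the update approximately
    # -learning_rate_init * sign(grad), independent of the gradient's scale,
    # so each entry ends up close to 1 - 0.001.
    assert np.allclose(params[0], 1 - 0.001, atol=1e-6)
    return params[0]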