
    8hE                        S SK r S SKJrJr  S SKrS SKJr  S SKJ	s  J
r  S SKJ	s  Jr  S SKJr  S SKJr  S SKJrJrJr  S SKJrJr  S SKJr  S SKJrJrJr  S S	KJ r   S S
K!J"r"  \RF                  RH                  r$S/r%\ RL                  S 5       r'S\(\"S4   S\)S\)4S jr*S\(\"S4   S\S\4S jr+S\RX                  RZ                  S\(\.S4   S\/\0\.4   S\4S jr1S r2S\RX                  RZ                  S\(\.S4   S\/\0\.4   S\.4S jr3S\RX                  RZ                  S\(\.S4   S\/\0\.4   S\.4S jr4S\S\S\\   S\\   S\)S \)S!\Rj                  S"\)S\S#\)S\(\\4   4S$ jr6S\RX                  RZ                  S\(\.S4   S\/\0\.4   S\.4S% jr7S&\S\S\S\\   S\)S \)S'\S!\Rj                  S"\)S\S#\)S\4S( jr8S\RX                  RZ                  S\(\.S4   S\/\0\.4   S\.4S) jr9\$Rd                  Rt                  \3\$Rv                  Rt                  \4\$Rx                  Rt                  \7\$Rz                  Rt                  \7\$R|                  Rt                  \9\$R~                  Rt                  \90r@S* rAS+ rBg),    N)castOptional)Tensor)
DeviceMesh)DTensor	ReplicateShard)DTensorSpec
TensorMeta)_MaskPartial)	_skip_dim	Reductionreplicate_reduction_dims)normalize_dim)	Placementloss_parallelc               #   <   #    [        5         Sv   [        5         g7f)a  
A context manager that enables loss parallelism, where efficient parallelized loss computation
can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
loss is supported.

Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
:class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
The corresponding ``backward()`` call, if any, also needs to happen under this context manager.

Args:
    input (:class:`DTensor`):
        Input logits. Assumed to be sharded on the class dimension.
    target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
        Must be ground truth class indices (class probabilities currently not supported).
        Assumed to be replicated across the ``DeviceMesh``.
    weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
        If given, assumed to be replicated across the ``DeviceMesh``.
    label_smoothing:
        Currently not supported.

Returns:
    A replicated :class:`DTensor`.

Example:
    A sharded DTensor is manually created here to showcase the usage.
    In practice, it is usually the output of a TP module.

    >>> # xdoctest: +SKIP("distributed")
    >>> from torch.distributed.tensor.parallel import loss_parallel
    >>> from torch.distributed.device_mesh import init_device_mesh
    >>> ...
    >>> device_mesh = init_device_mesh("cuda", (8,))
    >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
    >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
    >>> target = torch.randint(16, (4,), device="cuda")
    >>> with loss_parallel():
    >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
    >>>     loss.backward()
    >>> ...
N)_enable_custom_loss_ops_disable_custom_loss_ops     `/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/distributed/tensor/parallel/loss.pyr   r      s     T 	s   
placements.dimreturnc                     [        U 5      S:X  d  [        S5      eU S   R                  U5      (       d  [        SU S35      eg)N   zLCurrently loss_parallel() only supports input on one-dimensional DeviceMesh.r   zUloss_parallel() should be enabled only when the input tensor is sharded on dimension .)len
ValueErroris_shard)r   r   s     r   _find_all_reduce_mesh_dimr"   Q   sV    z?aZ
 	
 a=!!#&&cdgchhij
 	
 r   meshc                     [        U [        5      (       a.  U R                  U:X  a  U $ [        SU SU R                   S35      e[        U [        R
                  5      (       a  [        R                  " XUSS9$ [        S[        U 5       35      e)Nz	Expected z	 but got r   F)device_meshr   	run_checkzUnsupported type )	
isinstancer   r   RuntimeErrortorchr   
from_local	TypeErrortype)tensorr   r#   s      r   _cast_to_dtensorr.   ]   s     &'""
*M:,i@Q@Q?RRSTUU	FELL	)	)!!u
 	
 +DL>:;;r   op_callargskwargsc                 :   [         R                  R                  XU5      n[         R                  R                  R	                  UR
                  5      n[        U[        5      (       a  U$ [        U[        5      (       a  US   $ [        S[        U5       S35      e)Nr   zUnexpected tensor meta type: r   )r   _op_dispatcherunwrap_to_op_infosharding_propagator_propagate_tensor_metaschemar'   r   tupler(   r,   )r/   r0   r1   op_infotensor_metas        r   r6   r6   m   s    
 $$66wfMG((<<SSK +z**	K	'	'1~:4;L:MQOPPr   c                    U(       a   U R                   [        R                  :X  d   e[        R                  " U [        R
                  R                  S9u  pVU R                  U[        R                  S9n U R                  5       S:X  a  U nOR[        R                  " XSS9n[        R                  " U[        R                  R                  R                   X44S9nX-
  n[        R"                  " [        R$                  " U5      USS9n	[        R                  " U	[        R                  R&                  R                   X44S9n	[        R(                  " U	5      n
Xz-
  nU(       d  UR                  U5      nU$ )N)type_promotion_kind)dtypememory_formatr   T)keepdim)reduceOpgroup)r=   r)   halfutilselementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KINDDEFAULTtocontiguous_formatnumelamaxfuncol
all_reducec10dReduceOpMAXnamesumexpSUMlog)xr   half_to_floatr#   mesh_dimcomputation_dtyperesult_dtypeshiftedx_maxshifted_sumexpshifted_logsumexpresults               r   _log_softmaxr_      s$   ww%**$$$&+&>&>	uDDLL'# 	
$E4K4KLAwwyA~

140!!DMM--224:J
 )YYuyy13EN&&!2!2!7!7?ON 		.1(F<(Mr   c                    [        [        US   5      n[        [        US   5      n[        [        US   5      nUR                  n[        XCR                  5       5      n[        UR                  U5      n[        XU5      n[        UR                  XEUR                  U5      n	[        UR                  UR                  US9n
[        U	U
U	R                  S9$ )Nr   r      r:   requires_grad)r   r   intbool_specr   r   r"   r   r6   r_   _local_tensorr#   r
   rd   )r/   r0   r1   rU   r   rV   specrW   output_tensor_metaresres_specs              r   _log_softmax_handlerrm      s    
 	Wd1gA
sDG
CtAw'M77D
UUW
%C(#>H/vF
qDIIx
PC		&H '' r   c                     [        [        US   5      n[        [        R                  US   5      nUR	                  U5      $ )Nr      )r   r   r)   r=   rG   )r/   r0   r1   grad_outputinput_dtypes        r   _log_softmax_backward_handlerrr      s7    
 wQ(Ku{{DG,K>>+&&r   rU   targetweightlocal_weight	reductionignore_indexinput_shapechannel_dimrW   c
                 Z  ^^ U R                  5       mSmTS:  a  SmS[        S[        4UU4S jjn
Ub  U
" U5      nUc   eU
" U5      nX-  n [        R                  " X:g  US5      nUR	                  T5      n[        UTS9nUR                  XU	5      n[        R                  " U TU5      nUR                  UX5      nUR                  T5      * n[        R                  " X:g  US5      nU[        R                  R                  :X  a  TS:  a  U R                  SS	5      nUU4$ Ub}  [        U R                  5      nS
UT'   WR!                  U5      n[        R                  " UTU5      R                  T5      n[        R                  " X:g  US5      nUR#                  5       nO!X:g  R#                  5       R%                  U 5      nU[        R&                  R                  :X  a  UR#                  5       nUU4$ U[        R(                  R                  :X  a  UR#                  5       U-  nUU4$ )Nr   ra   r   rt   r   c                 n   > TS:  a+  S/T-  nU R                   S   UT'   U R                  U5      nU$ U nU$ )Nr   r   )shapeview)rt   r|   wry   n_dimss      r   _weight_view'_nll_loss_forward.<locals>._weight_view   sQ    A:E "(aE+E"A  Ar   offset_shape
offset_dimr   g        )r   r   r)   where	unsqueezer   _partition_valuegather_reduce_valuesqueezer   NONEvaluenew_fulllistr|   expandrQ   rG   rS   MEAN)rU   rs   rt   ru   rv   rw   rx   ry   r#   rW   r   r~   local_wsafe_targetsafe_target_partial_placementsafe_target_partial_result_partialresult_reducedr^   total_weight	new_shapewsumr   s          `               @r   _nll_loss_forwardr      s    UUWFKz	V 	 	 	  '''|,K++f4fa@K((5L %++V,==H \\![2FGN&44^TTN$$[11F[[/;FINN(((VaZzz"c*|##M	!#	+HHY||A{L9AA+N{{614;xxz.33588; IMM''' < 
inn**	*,<r   c                    [        [        US   5      nUS   nUS   n[        [        US   5      n[        [        US   5      nUR                  5       S:  a  SOSnUR                  n	[        U	R                  U5      n
[        [        U	R                  U/5      U5      n[        5       4U	R                  R                  -  n[        XKU	R                  5      nS nUb  [        X\U	R                  5      n[        U	R                  R                  5       Vs/ s H  oU
:X  a  [        S5      O	[        5       PM     nnUR                  U	R                  U5      R                   nUR"                  S   UR                   R"                  U   :X  d   eU[$        R&                  R(                  :X  a  UnOUn[+        U5      nXEsUS'   US'   [-        U [/        U5      U5      n[1        UR                   UR                   Ub  UR                   OS UUUUR"                  UU	R                  U
5
      u  nn[3        U	R                  UUS9n[        UUUR4                  S9U4$ s  snf )Nr   r   ra   ro      rb   rc   )r   r   re   r   rg   r"   r   r   r   r   r#   ndimr.   ranger	   redistributerh   r|   r   r   r   r   r6   r8   r   r
   rd   )r/   r0   r1   rU   rs   rt   rv   rw   ry   ri   rW   target_placementsall_replicate_placementsru   isharded_placementsoutput_placementsrj   r^   r   out_specs                        r   _nll_loss_forward_handlerr     s6   
 	Wd1gA!WF!WFS$q'"IT!W%Luuw!|!K77D(+FH " ;-@+ !*~		>fCFL!&DIIN
 AFdiinn@U
@U1XE!H9;6@U 	 
 **4996HIWW!!!$(=(=k(JJJJINN(((-4 :DDGT!W/tfM,	 & 2			FL 499&7EWXH 	 ..	

 	 =
s   $Irp   r   c                    UR                  5       S:  a  SOSnU[        R                  R                  :X  a  X-  n UR	                  U5      n[
        R                  " X%:g  US5      n[
        R                  " U5      n[        XxS9nUR                  U5      R                  5       nUR                  XU
5      nUR                  R                  c   eUR                  R                  R                  UR                  5      S-
  n[
        R                   " UR"                  S   UR$                  S9nUR                  5       S:X  a  XU'   OUR                  5       S:X  a  XUU4'   OeUR'                  US5      nUR"                  nUR)                  SUR"                  U   5      nUUUU4'   UR+                  U5      R'                  US5      nUR                  5       U R                  5       s=:  a  S:  a  O  OU R	                  U5      n Ub  [-        UR                  5       5       Vs/ s H  nSPM     nnUR"                  S   UU'   UR)                  U5      n[/        UR"                  5      nSUU'   UR1                  U5      n[
        R2                  " UX5      nU U-  n [
        R                  " X%:g  U S5      n U[
        R4                  " U5      -   U -  $ s  snf )Nra   r   r   r   g      ?)devicer   )r   r   r   r   r   r)   r   
zeros_liker   r   flattenr   mask_bufferdatarG   r=   aranger|   r   	transposereshaper}   r   r   r   r   rR   )rp   rU   rs   rt   rv   rw   r   rx   ry   r#   rW   r   
grad_inputr   masked_safe_targetgrad_update	arange_1dgrad_input_tintermidate_shapegrad_input_2d_r   r~   w_targets                           r   "_nll_loss_and_log_softmax_backwardr   Z  s    uuw{!KINN(((!0k*F++f4fa@K!!!$J %+V%%k2::<K*;;KxX((--999#//4477
8H8HICOK  #,>,E,EI
 	uuw!|)4%&	
A4?9001!++K<(..$,,R1EF7Bi!334"''(9:DD[RTU
~~+//+/a/!++K8 %aeeg/1Q	/!'a	+	* M	!#	+MM)$<<;7!H,++f4k1EK 1%44# 0s   !Kc                    [        [        US   5      n[        [        US   5      nUS   nUS   n[        [        US   5      n[        [        US   5      n[        [        US   5      n	UR	                  5       S:  a  SOSn
UR
                  n[        UR                  U
5      n[        [        UR                  U
/5      U
5      n[        5       4UR                  R                  -  n[        X]UR                  5      nUb  [        XnUR                  5      n[        U5      nXVsUS'   US'   [        XUR                  5      US'   [        U [!        U5      U5      n[#        UR$                  UR$                  UR$                  Ub  UR$                  OS UUU	UR&                  U
UR                  U5      n[)        UR                  UR                  US9n[        UUUR*                  S	9$ )
Nr   r   ra   ro   r         rb   rc   )r   r   re   r   r   rg   r"   r   r   r   r   r#   r   r.   r   r6   r8   r   rh   r|   r
   rd   )r/   r0   r1   rp   rU   rs   rt   rv   rw   r   ry   ri   rW   r   r   rj   r^   r   s                     r   _nll_loss_backward_handlerr     s   
 wQ(KWd1gA!WF!WFS$q'"IT!W%LQ(Luuw!|!K77D(+FH " ;-@+ !*~		>fCF!&DIIN :DDGT!W|tyyQDG/tfM/!!	 & 2			F 		&H ** r   c                  ^    [         R                  R                  R                  [        5        g N)r   r3   _custom_op_handlersupdatecustomized_loss_opsr   r   r   r   r     s    ..556IJr   c                  p    [          H,  n [        R                  R                  R	                  U 5        M.     g r   )r   r   r3   r   pop)	custom_ops    r   r   r     s&    (	2266yA )r   )C
contextlibtypingr   r   r)   torch._prims_common_prims_commonrC   )torch.distributed._functional_collectivesdistributed_functional_collectivesrK   "torch.distributed.distributed_c10ddistributed_c10drM   r   torch.distributed.device_meshr   torch.distributed.tensorr   r   r	   &torch.distributed.tensor._dtensor_specr
   r   ,torch.distributed.tensor._ops._embedding_opsr   'torch.distributed.tensor._ops._math_opsr   r   r   #torch.distributed.tensor._ops.utilsr   (torch.distributed.tensor.placement_typesr   opsaten__all__contextmanagerr   r8   re   r"   r.   _ops
OpOverloadobjectdictstrr6   r_   rm   rr   Sizer   r   r   r   default_log_softmax_backward_datanll_loss_forwardnll_loss2d_forwardnll_loss_backwardnll_loss2d_backwardr   r   r   r   r   r   <module>r      s    !  # : : 1 1  4 > > J E 
 > > yy~~ 
 - -d	%	3*? 	c 	c 	<in-<5?<< QZZ""Q

Q fQ 	Q&4ZZ""

 f 	@'ZZ""'

' f' 	'F F F  VF  6"	F 
 F  F  F  F  F  F  66>F RAZZ""A

A fA 	AVB5B5B5 B5 V	B5
 B5 B5 B5 B5 B5 B5 B5 B5J8ZZ""8

8 f8 	8x 	3##++-J!!#<##%>""$>$$&@ KBr   