
import functools
import math
from enum import IntEnum

import sympy

import torch
from . import ir
from .utils import get_dtype_size, sympy_product
from .virtualized import V


class NCCL_COLL(IntEnum):
    ALL_REDUCE = 0
    ALL_GATHER = 1
    REDUCE_SCATTER = 2


class NVIDIA_GPU_TYPE(IntEnum):
    VOLTA = 0
    AMPERE = 1
    HOPPER = 2


@functools.lru_cache
def get_gpu_type() -> NVIDIA_GPU_TYPE:
    gpu_info = torch.utils.collect_env.get_gpu_info(torch.utils.collect_env.run) or ""
    if "V100" in gpu_info:
        return NVIDIA_GPU_TYPE.VOLTA
    elif "A100" in gpu_info:
        return NVIDIA_GPU_TYPE.AMPERE
    elif "H100" in gpu_info:
        return NVIDIA_GPU_TYPE.HOPPER
    else:
        # For other GPU types, assume Ampere.
        return NVIDIA_GPU_TYPE.AMPERE


def get_collective_type(node: ir.IRNode) -> NCCL_COLL:
    if not isinstance(node, ir._CollectiveKernel):
        raise ValueError(f"node is not a collective kernel: {node}")

    kernel_name = node.python_kernel_name
    assert kernel_name is not None
    if "all_reduce" in kernel_name:
        return NCCL_COLL.ALL_REDUCE
    elif "all_gather" in kernel_name:
        return NCCL_COLL.ALL_GATHER
    elif "reduce_scatter" in kernel_name:
        return NCCL_COLL.REDUCE_SCATTER
    else:
        raise ValueError(f"Unsupported collective kernel: {kernel_name}")


def get_collective_input_size_bytes(node: ir.IRNode) -> int:
    sz_bytes = 0
    for inp in node.inputs:
        numel = sympy_product(inp.layout.size)
        if isinstance(numel, sympy.Integer):
            numel = int(numel)
        else:
            numel = V.graph.sizevars.size_hint(numel, fallback=0)
        sz_bytes += numel * get_dtype_size(inp.layout.dtype)
    return sz_bytes


def get_collective_group_size(node: ir.IRNode) -> int:
    if type(node) == ir._CollectiveKernel:
        from torch.distributed.distributed_c10d import _get_group_size_by_name

        return _get_group_size_by_name(node.constant_args[-1])
    else:
        raise TypeError(f"Unsupported collective type: {node}")


################################################################################
# The following constants and heuristics are adapted from
# https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc
################################################################################


class NCCL_HW(IntEnum):
    NVLINK = 0
    PCI = 1
    NET = 2


class NCCL_ALGO(IntEnum):
    TREE = 0
    RING = 1


class NCCL_PROTO(IntEnum):
    LL = 0  # Low-latency protocol; LL128 and Simple are not modeled here
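

# Note: the IntEnum members above double as indices into the latency/bandwidth
# tables below, e.g. hwLat[NCCL_HW.NET][NCCL_ALGO.RING][NCCL_PROTO.LL].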


# Latencies in microseconds (us):
# len(NCCL_ALGO) x len(NCCL_PROTO)
baseLat = [
    # Tree
    [
        6.8,  # LL
    ],
    # Ring
    [
        6.6,  # LL
    ],
]

# Latencies in microseconds (us):
# len(NCCL_HW) x len(NCCL_ALGO) x len(NCCL_PROTO)
hwLat = [
    # NVLINK
    [
        [0.6],  # Tree (LL)
        [0.6],  # Ring (LL)
    ],
    # PCI
    [
        [1.0],  # Tree (LL)
        [1.0],  # Ring (LL)
    ],
    # NET
    [
        [5.0],  # Tree (LL)
        [2.7],  # Ring (LL)
    ],
]

# Upper bounds on LL-protocol bus bandwidth (GB/s); rows follow GPU/CPU
# generation, columns follow the node-count bucket (1 node, 2 nodes, >2 nodes).
llMaxBws = [
    # Volta-N1/Intel-N2/Intel-N4
    [39.0, 39.0, 20.4],
    # Ampere-N1/AMD-N2/AMD-N4
    [87.7, 22.5, 19.0],
    # Hopper-N1/AMD-N2/AMD-N4
    [87.7, 22.5, 19.0],
]


def estimate_nccl_collective_runtime(node: ir.IRNode) -> float:
    """
    Returns estimated NCCL collective runtime in nanoseconds (ns).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    """
    tensor_storage_size_bytes = get_collective_input_size_bytes(node)
    # Convert bytes to GB
    tensor_storage_size_GB = tensor_storage_size_bytes / 1024 / 1024 / 1024

    # Assume each node has 8 gpus; see the TODO in the docstring above.
    num_gpus_per_node = 8
    group_size = get_collective_group_size(node)
    nNodes = math.ceil(group_size / num_gpus_per_node)
    nRanks = group_size  # total number of GPUs participating in this collective

    if nRanks <= 1:
        return 0

    # Assumes ring algorithm and LL protocol, per the docstring above.
    nccl_algo = NCCL_ALGO.RING
    nccl_proto = NCCL_PROTO.LL
    coll = get_collective_type(node)

    # =============== bandwidth computation ===============
    # First compute bandwidth in GB/s, then convert it to GB/ns at the end.
    bwIntra = torch._inductor.config.intra_node_bw
    bwInter = torch._inductor.config.inter_node_bw

    compCapIndex = get_gpu_type()
    index2 = nNodes - 1 if nNodes <= 2 else 2
    # LL: for a single node, index by GPU type; for multi-node, use row 0.
    index1 = compCapIndex if nNodes == 1 else 0
    llMaxBw = llMaxBws[index1][index2]

    # Each ring step is synchronized and bottlenecked by the slowest link,
    # which is the inter-node interconnect once more than one node is involved.
    bw = bwIntra if nNodes == 1 else bwInter
    nChannels = 2  # Assume # channels is 2
    busBw = nChannels * bw

    # Various model refinements
    busBw = min(
        llMaxBw,
        busBw
        * (1.0 / 4.0 if (nNodes > 1 or coll == NCCL_COLL.ALL_REDUCE) else 1.0 / 3.0),
    )

    if coll == NCCL_COLL.ALL_REDUCE:
        nsteps = 2 * (nRanks - 1)
    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
        nsteps = nRanks - 1

    # Convert bus BW to algorithm BW (tensor bytes / algoBW = actual execution time)
    ratio = (1.0 * nRanks) / nsteps
    bandwidth = busBw * ratio
    # Convert GB/s to GB/ns
    bandwidth_GB_per_ns = bandwidth / 1e9

    # =============== latency computation ===============
    intraHw = NCCL_HW.NVLINK

    if coll == NCCL_COLL.ALL_REDUCE:
        if nNodes > 1:
            nInterSteps = 2 * nNodes
        else:
            nInterSteps = 0
    elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER):
        nInterSteps = nNodes - 1

    # First compute latency in us, then convert it to ns at the end.
    latency = baseLat[nccl_algo][nccl_proto]
    intraLat = hwLat[intraHw][nccl_algo][nccl_proto]
    interLat = hwLat[NCCL_HW.NET][nccl_algo][nccl_proto]

    # Inter-node rings still have to launch nsteps * net overhead.
    netOverhead = 0.0
    if nNodes > 1:
        netOverhead = 1.0  # stand-in for NCCL's per-step network overhead
    intraLat = max(intraLat, netOverhead)
    latency += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat
    # Convert us to ns
    latency_ns = latency * 1e3

    # =============== final result ===============
    transport_ns = tensor_storage_size_GB / bandwidth_GB_per_ns
    return transport_ns + latency_ns