
    ɯwg^                         d dl Z d dlZd dlmZ d dlZd dlZddlmZ ddlm	Z	m
Z
 ddlmZ  G d de      Z G d	 d
e      Ze j                  defd       Zdej"                  defdZdej"                  defdZdej"                  defdZ G d de      Z G d de      Z G d de      ZdgdggZdgdggdgdggdgdgggZg dg dg dgZdej"                  defdZy)     N)IntEnum   )ir)get_dtype_sizesympy_product)Vc                       e Zd ZdZdZdZy)	NCCL_COLLr   r      N)__name__
__module____qualname__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER     b/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/torch/_inductor/comm_analysis.pyr
   r
      s    JJNr   r
   c                       e Zd ZdZdZdZy)NVIDIA_GPU_TYPEr   r   r   N)r   r   r   VOLTAAMPEREHOPPERr   r   r   r   r      s    EFFr   r   returnc                  8   t         j                  j                  j                  t         j                  j                  j                        xs d} d| v rt
        j                  S d| v rt
        j                  S d| v rt
        j                  S t
        j                  S )N V100A100H100)	torchutilscollect_envget_gpu_inforunr   r   r   r   )gpu_infos    r   get_gpu_typer&      s|    {{&&33EKK4K4K4O4OPVTVH$$$	8	%%%	8	%%% %%%r   nodec                    t        | t        j                        st        d|        | j                  }|J d|v rt
        j                  S d|v rt
        j                  S d|v rt
        j                  S t        d|       )Nz!node is not a collective kernel: 
all_reduce
all_gatherreduce_scatterzUnsupported collective kernel: )	
isinstancer   _CollectiveKernel
ValueErrorpython_kernel_namer
   r   r   r   )r'   kernel_names     r   get_collective_typer1   (   s    dB001<TFCDD))K"""{"###		$###	[	(''':;-HIIr   c                 V   d}| j                   D ]  }t        |j                  j                        }t	        |t
        j                        rt        |      }n+t        j                  j                  j                  |d      }||t        |j                  j                        z  z  } |S )Nr   )fallback)inputsr   layoutsizer,   sympyIntegerintr   graphsizevars	size_hintr   dtype)r'   sz_bytesinpnumels       r   get_collective_input_size_bytesrA   8   s    H{{ =cjjoo.eU]]+JEGG$$..uq.AEEN3::+;+;<<<= Or   c                     t        |       t        j                  k(  rddlm}  || j
                  d         S t        d|        )Nr   )_get_group_size_by_namezUnsupported collective type: )typer   r-   "torch.distributed.distributed_c10drC   constant_args	TypeError)r'   rC   s     r   get_collective_group_sizerI   E   s@    DzR)))N&t'9'9"'=>>7v>??r   c                       e Zd ZdZdZdZy)NCCL_HWr   r   r   N)r   r   r   NVLINKPCINETr   r   r   rK   rK   S   s    F
C
Cr   rK   c                       e Zd ZdZdZy)	NCCL_ALGOr   r   N)r   r   r   TREERINGr   r   r   rP   rP   Y   s    DDr   rP   c                       e Zd ZdZy)
NCCL_PROTOr   N)r   r   r   LLr   r   r   rT   rT   ^   s	     
Br   rT   g333333@gffffff@g333333?      ?g      @g@)     C@rW   gffffff4@)gU@g     6@g      3@c                 j   t        |       }|dz  dz  dz  }d}t        |       }t        j                  ||z        }|}|dk  ryt        j
                  }t        j                  }t        |       }	t        j                  j                  j                  }
t        j                  j                  j                  }t               }|dk  r|dz
  nd}|dk(  r|nd}t        |   |   }|dk(  r|
n|}d}||z  }t!        |||dkD  s|	t"        j$                  k(  rdndz        }|	t"        j$                  k(  r	d|dz
  z  }n'|	t"        j&                  t"        j(                  fv r|dz
  }d|z  z  }||z  }|d	z  }t*        j,                  }|	t"        j$                  k(  r|dkD  rd|z  }n*d}n'|	t"        j&                  t"        j(                  fv r|dz
  }t.        |   |   }t0        |   |   |   }t0        t*        j2                     |   |   }d
}|dkD  rd}t5        ||      }||z
  |z  ||z  z   z  }|dz  }||z  }||z   S )a9  
    Returns estimated NCCL collective runtime in nanoseconds (ns).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    i      r   r   r   g      ?gUUUUUU?rV   g    eAg        g     @@)rA   rI   mathceilrP   rR   rT   rU   r1   r    	_inductorconfigintra_node_bwinter_node_bwr&   llMaxBwsminr
   r   r   r   rK   rL   baseLathwLatrN   max)r'   tensor_storage_size_bytestensor_storage_size_GBnum_gpus_per_node
group_sizenNodesnRanks	nccl_algo
nccl_protocollbwIntrabwIntercompCapIndexindex2index1llMaxBwbw	nChannelsbusBwnstepsratio	bandwidthbandwidth_GB_per_nsintraHwnInterStepslatencyintraLatinterLatnetOverhead
latency_nstransport_nss                                  r    estimate_nccl_collective_runtimer      sr    !@ E6=DtK *40JYYz$556FF{ IJt$D
 oo$$22Goo$$22G>L!Q;VaZAF#q[\aFvv&G aKWBINE !ty/C/C'C9)	UE y###fqj!	)**I,@,@A	A! 6\V#EI#c/ nnGy###A:f*KK	)**I,@,@A	Aqj i ,GW~i(4HW[[!),Z8H Kz8[)H$0;3IIIG3J *,??L*$$r   )	functoolsrZ   enumr   r7   r    r   r   r!   r   r   virtualizedr   r
   r   	lru_cacher&   IRNoder1   r9   rA   rI   rK   rP   rT   rb   rc   r`   floatr   r   r   r   <module>r      sJ         0  g  
&o 
& 
&Jbii JI J 
")) 
 
@BII @# @g  
  	
 		  
	 
	 
		,,b%299 b% b%r   