
    wg)                         d dl Z d dlZd dlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ  e j                         d        Zd Zd Zd	 Z	 dd
Zd Zy)    N   )cdiv)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                      	 t        dg      d   dz  S # t        $ rG dd l} | j                          | j	                  d      }| j                  || j                        dz  cY S w xY w)Nzclocks.max.smr   g     @@)r	   FileNotFoundErrorpynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMaxClockInfoNVML_CLOCK_SM)r   handles     a/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/triton/ops/matmul_perf_model.pyget_clock_rate_in_khzr      sm    To&'*S00 T2215//8L8LMPSSSTs    AA$#A$c                     |t        |d      z  }t        j                  j                  j	                  |       d   dz  }t        ||      |z  t        |t               |       z  }|S z# return compute throughput in TOPS    multiprocessor_count)minr   activeutilsget_device_propertiesr   r   devicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflopss          r   get_tensorcore_tflopsr$      sg    SA..K==&&<<VDE[\_``L{+l:=V$&>0 0FM    c                     |t        |d      z  }t        j                  j                  j	                  |       d   dz  }t        ||      |z  t        |t               |       z  }|S r   )r   r   r   r   r   r   r   r   s          r   get_simd_tflopsr'       sd    SA..K==&&<<VDE[\_``L{+l:=PQVXmXoqw=xxFMr%   c                     t         j                  j                  |       }|d   dk  r!|t         j                  k(  rt	        | |||      S t        | |||      S )Nr      )torchcudaget_device_capabilityfloat32r'   r$   )r   r   r   r    
capabilitys        r   
get_tflopsr/   (   sO    11&9J!}qUemm3vxEBB 9eDDr%   c                    t         j                  j                         }|j                  }|j	                         }t        ||      }t        ||	      }|}||z  |z  }t        ||      t        ||	      }}d|z  |z  |z  dz  }t        ||| |      }||z  }t        j                  j                  j                  |      d   }t        d||z        }t        d|dz        }t        t        d|dz
  dz        d      }t        |      |dz  |d	z  z   z  }|d
z  }||z  |z  dd|dz
  z  z   z  }||z  |z  dz  |dz
  z  }||z  |z  dd|dz
  z  z   z  } ||z  |z  dz  |dz
  z  }!|| z   dz  }"||!z   dz  }#|"|z  |#|z  z   }$|dz  }%||z  |z  |z  dz  }&|dk(  r|&|%z  }'n|%}(|&|(z  }'||z  dz  dz  |%z  })|'|)z  }'t        ||$      |'z   }*|rt        d|* d| d|$ d|' d|dz   d       |*S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r*   r+   current_devicer    element_sizer   maxr/   r   r   r   r   r   r   print)+r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r    dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_mss+                                              r   estimate_matmul_timera   /   s    ZZ&&(FGGE^^FQ IQ II9$y0H q'?C7OqA A	A!34Ifh	59DT!J ]]  66v>?UVF1h/0q(R-0s1x"}&BCQGF#';d'BEY\`E`'`aGaKEa%&.Ay1}(=$=>KA$	A6Ia%&.Ay1}(=$=>KA$	A6I+<JI%+6H7"X%55G }Hq56>G+{;L!|(*	)+a%!){+h6G
G,x7M]O+=j\ J&i'7z B  0 45Q8 	9 r%   c                    t         j                  j                         }t         j                  j                         }|d   j	                         }|d   j
                  }g }| D ]}  }|j                  }	|	d   |	d   |	d   |j                  f\  }
}}}t        j                  j                  j                  |      d   }|
|z   |z  |z  |z  }||k  sm|j                  |        |} |t         j                  t         j                  fvr"| D cg c]  }|j                  d   dk(  s| } }i }| D ]g  }|j                  }	|	d   |	d   |	d   |	d   |j                  |j                  f\  }
}}}}}|
||||f}||v r||   j                  ||f       `||fg||<   i g }|j!                         D ]  \  }}|\  }
}}}}|d   d	k\  r[|
|z  |z  d
z  }|t#        d|      z  d	z  }d}||z  t%        j&                  d|fd      }|D ]  }|j                  |d           q|d   d   }d|_        |j                  |        |S c c}w )Nr;   rA   rB   rC   max_shared_memrD   r1   r   r)   i   r   i,  r   c                 R    | d   z
  dk  rdt        | d   z
        z   S | d   z
  S )Nr1   r   
   )abs)xoptimal_num_stagess    r   <lambda>z$early_config_prune.<locals>.<lambda>   sB    aD--2 %'QqT4F-F)G$G 89!?Q8Q r%   )key)r*   r+   r6   r,   r7   r    rF   r:   r   r   r   r   appendfloat16r-   r   itemsr   heapq	nsmallest)configs
named_argsrF   r   r.   rG   r    pruned_configsconfigkwrA   rB   rC   r:   max_shared_memoryrequired_shared_memoryconfigs_maprD   r   rj   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configrh   s                               @r   early_config_pruner   p   s   ZZ&&(F113J_))+FsO!!E N *]]yM2i="Y-9J9JJ 	.': #MM//EEfMN^_")G"3w!>!Kf!T!%66!!&)* G U]]EMM22(/Qf6==3Kq3P6QQ K 	6]]yM2i="Y-IHXHXZ`ZkZkk 	B'7Iz '9=+##VZ$89!' 45K	6 N!!# 11895'7Ia=AW$w.+>DAy 11A5J N!/*!< oo1 RSG  ,%%ad+, aDGM'(M$!!-0)1* K Rs   H?$H?)F)	functoolsrn   r*    r   runtimer   testingr   r   r   r	   	lru_cacher   r$   r'   r/   ra   r    r%   r   <module>r      s[         \ \ T TE >B;r%   