
    ɯwg[                    v	   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z( d dlm)Z) d dl*Z*d dl+Z+dd	gZ, ejZ                  d      d
        Z.d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZBmCZC d dlDmEZEmFZF ddlGmHZH ddlImJZK ej                  dk(  ZL e
j                  eN      ZO e#d      ZPee*j                  e*j                  f   ZRe$e+j                  eTf   ZUdZVdZWdZXeXeXdz
  z  d k(  reXdk\  sJ d       d ZYddZZ G d d e*j                        Z\ddd!Z] ejZ                  d      dd"       Z^dd#Z_d$ Z`d% Zadd&Zb	 	 	 	 	 	 dd'ZJd( Zc	 	 	 	 dd)Zd	 	 	 	 dd*Zedd+Zf	 d	 dd,Zgd- Zhddd.Zi	 d	 	 	 	 	 	 	 dd/Zj	 d	 dd0Zkdd1Zldd2Zmdd3Znd4 Zod5 Zp e(d6      Zq e#d7d89      Zr G d: d;e eeqerf         Zsdd<Ztd= Zud> Zvd? Zw	 d	 	 	 dd@ZxdA ZyddBZzdC Z{ddDZ|dE Z}ddFZ~ddGZddHZddIZdJ ZdK ZddLZg ZdMedN<   ddOZdP Zej                  ddQ       ZddRZ ejZ                  d      dS        Z G dT dUe      Z G dV dW      Z G dX dYe      Zej                  dZ        Z G d[ d\      Z ejZ                  d      dd]       Zdd^Zdd_Zdd`ZddaZdbdbdcddZde Z ejZ                  d      df        Z ejZ                  d      dg        Zdh Zdi ZddjZdk Z G dl dm      Zdn Zdo Zdp Zdq Zdr Zds Zej                  dt        ZdduZdv Zdw Zdx Zdy Zdz Zdd{Zej                  d|        Zd} Z ejZ                  d      d~        Z ejZ                  d      d        Zd Zd Zd ZddZd ZddZd Z G d dejr                        Zd ZddZd Zd Zd Zd Zd Z	 ddZ	 ddZddZddZej                   G d d             Zej                  d        ZddZddZddZd Z	 	 ddZd ZddZddZd Zd ZddZ	 	 	 	 	 	 ddZddZ	 	 	 	 	 	 ddZ	 	 	 	 ddZd Zy)    )annotationsN)datetime)StringIO)AnyCallableDictGenericIterableList
NamedTupleOptionalProtocolSequenceSetTypeVarUnion
ValuesView)Concatenate	ParamSpec)mockcudaxpuc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )N   r   r   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      Z/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typer%   6   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesr   )config)ceildivwin32_T   @      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr   )ALIGN_BYTES)nbytess    r$   _alignr?   b   s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdr=   )vs    r$   rH   rH   g   sQ    !eii+,3{AFF+,,aK599Q#<#KKr@   c                  (    e Zd ZdZdZdZed        Zy)rJ   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)rB   intrC   Integerr?   rH   )clsvalues     r$   evalz
align.evalt   s6    ec5==12#e*%%uL r@   N)__name__
__module____qualname____doc__nargs
is_integerclassmethodrU    r@   r$   rJ   rJ   n   s!    FEJ r@   rJ   c                    |         t         j                  j                          t        j                  t	        d      t         j                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t         j                  j                  t         j                  j                  j                  g      5 }
t        |	      D ]  }|j                           |          t         j                  j                          d	d	d	       t        j!                  d
       t        j!                  
j#                         j%                  dd             t'        |
j)                         D cg c]0  }|j*                  t,        j                  k(  r|j.                  dk7  r|2 c}      }t1        |      |	z  dk7  rt3        dt1        |      |	      t1        |      |	z  }t'        t5        |      D cg c]  \  }}||z  dk7  r| c}}      }|j7                          |j#                         }t        j!                  d       t        j!                  |j%                  d             t9        d |D              dz  |	z  }t        j!                  d|       |S # 1 sw Y   xY wc c}w c c}}w )aR  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
    g    Ar   )dtypedeviceT)enable_timing   r   )
activitiesNz
raw eventsself_device_time_total)sort_by	row_limitzContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)rg   c              3  4   K   | ]  }|j                     y wrP   )device_time_total).0events     r$   	<genexpr>z+do_bench_using_profiling.<locals>.<genexpr>   s     A%e%%As   g     @@zprofiling results: %s ms)r   r   synchronizeemptyrQ   Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAlogdebugkey_averagestabler)   eventsdevice_typer(   namer   RuntimeError	enumerate_build_treesum)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpirk   filtered_eventsnum_event_per_groupactual_eventsress                    r$   do_bench_using_profilingr   |   s    D	JJKKJuyyHE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 
		NN++00
 
  
 ! 
x 	AKKMD		 	

 ! IIlIIann$$-EQS$TU 	
  JOO3

n8T 	
O ?h&!+- 	
 	
 o.9 &o6	
5&&!+ 	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_! !$	
	
s   AM05M=
N
0M:c                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   r   _C%_dispatch_has_kernel_for_dispatch_keyhasattrr   opsImportErrorr   str)r   es     r$   has_torchvision_roi_alignr      s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
r   tensorr`   rB   r   typer   r&   Workercurrent_devicer`   device_interfaces     r$   decode_devicer      s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMr@   c                r    t        j                  t        j                  | t	        j
                  d            S Nr   )	functoolsreduceoperatormulrC   rR   )its    r$   sympy_productr      s$    HLL"emmA.>??r@   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wrP   r]   )rj   abs      r$   rl   zsympy_dot.<locals>.<genexpr>   s     >daAE>s   )r   rC   expandr   zip)seq1seq2s     r$   	sympy_dotr      s8    t9D	!!!<<>c$o>>??r@   c                \    | D ci c]  }t        |      | c}j                         S c c}w rP   )idvalues)r   r!   s     r$   uniquer      s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )rB   rC   Exprr,   sympifyrQ   r   runtime_ceildiv)numerdenoms     r$   r6   r6      s     %$
5%**(Eu}}U+U]]5-ABB eS!js' 9
4;-r%4;-89  5%((r@   c                0   | yt        |       j                  d      d   }i dddddd	d
ddddddd	ddddddddddddddddd d!d"d#d$d%d&}t        |j                               D ]  }|||<   	 t	        | t               r| S d'||    S )(Nz*i8.re   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float16fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64uint8u8uint16u16u32u64)uint32uint64*)r   splitlistr   rB   )key	dtype_strtysrL   s       r$   _type_ofr     sD   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	6 	F 	6 	6 	 	 	 	  	!" 	%#$ 'C, #**, AS#&3@aI/?,@@r@   c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )rC   r   )lstr   s     r$   convert_shape_to_inductorr   '  s!     '**EMM!***s   $c                   ddl m} | D cg c]j  }t        |t              r|nUt        |t        j
                        rt        |      n0|j                  j                  j                  j                  |d      l c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    r   VN)hint)
virtualizedr   rB   rQ   rC   rR   graphsizevars	shape_envcreate_symintnode)r   r   r   s      r$   convert_shape_to_symintr  2  sy       	 a 	
 a' VWW''99!$9G		H  s   A/A=c                    t        | t        j                  j                        sJ t	        d | j
                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wrP   )
alias_inforj   r   s     r$   rl   zis_view.<locals>.<genexpr>J  s     FAq||4'Fs   )rB   r   _ops
OpOverloadany_schema	arguments)ops    r$   is_viewr  E  s9     b%**//000F1E1EFFFr@   c                   | j                   dk(  syt        | j                  t        j                  j
                        s| j                  t        j                  u sy| j                  t        j                  u st        | j                        rt        fd| j                  D              S t        j                  j                  | j                  j                  v xs duxr  | j                        S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wrP   )is_pointwise_use)rj   uis_pointwise_fns     r$   rl   z#is_pointwise_use.<locals>.<genexpr>_  s     KA#A7Ks   N)r  rB   targetr   r  r  r   getitemr  rF   usersTag	pointwisetags)user  s    `r$   r  r  M  s     66_$ 	3::uzz445xGWGW9W
zzX%%%)<KKKK99#**//1 t#C

(Cr@   c                   t         j                  j                         }g }g }t        |      D ]e  \  }}t	        |t         j
                        r5|j                  |j                  d|              |j                  |       U|j                  |       g t        d |j                         D              sJ |j                  | t        |      |      }t        | j                  j                        dk(  r2t        | j                  j                  d   j                         dk(  r|f}|j#                  |       t         j                  j%                  i |      }	|	|fS )Nargc              3  R   K   | ]  }t        |t        j                          ! y wrP   )rB   r   Tensorrj   r!   s     r$   rl   z$gen_gm_and_inputs.<locals>.<genexpr>p  s     H1:a..Hs   %'r   r   r  )r   fxGraphr   rB   r  appendplaceholderrF   r   r  tupler   r
  returnsr   r   outputGraphModule)
r  rI   kwargsgg_argsa_argsnr  nodegms
             r$   gen_gm_and_inputsr/  f  s   AFFD/ 3c5<<(MM!--#aS	23MM#MM# HHHHH??65=&9DFNN""#q(&&q)../8;wHHTN			b!	$Bv:r@   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr   )r&   r   rm   r   s     r$   rm   rm   }  s4    /7$$&$$& 'r@   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )rm   r   manual_seedtimeperf_counterrq   )modelexample_inputstimesr`   t0r   resultt1s           r$   timedr<    sr     	d				B5\ 'F 
			B7Nr@   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |S c c}w )Nz.6f)r   r   rq   r<  medianprint)	r   rI   r8  repeatbaseliner`   r   timingstooks	            r$   print_performancerD    s[     llE&MRqE"dE6:RSG<< 5(D	TH_S!#K Ss   A#c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S rP   r]   )r:  s   r$   <lambda>z#precompute_method.<locals>.<lambda>  s     r@   N)r   setattr)objmethodr:  s     @r$   precompute_methodrK    s     !WS&!#FC(r@   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)rK  )rI  methodsrJ  s      r$   precompute_methodsrN    s     '#v&'r@   c                <    t        | |kD        t        | |k        z
  S rP   )rQ   )r   r   s     r$   cmprP    s    q1u:AE
""r@   c                R    t        |       dk(  r t        |       | d   g      |z  S | S )Nr   r   )r   r   )r!   sizes     r$   pad_listlikerS    s-    
1v{tAw!v%%r@   c                B    t        |       dk(  rg S d }t        | |      S )Nr   c                F    t        | t              r| S | j                         S rP   )rB   r   get_name)elems    r$   	sort_funcztuple_sorted.<locals>.sort_func  s    dC K ==?"r@   r   )r   sorted)r!   rX  s     r$   tuple_sortedr[    s&    
1v{	# !##r@   PRVT)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y rP   r]   selfs    r$   clear_cachezCachedMethod.clear_cache  s    r@   c                     y rP   r]   rc  rI   r(  s      r$   __call__zCachedMethod.__call__  s    r@   NreturnNone)rI   zP.argsr(  zP.kwargsri  r]  )rV   rW   rX   staticmethodrd  rg  r]   r@   r$   r`  r`    s     r@   r`  c                ~     d j                    dt        j                          fd       }fd}||_        |S )N___cachec                Z    t        |       st        |  |              t        |       S rP   )r   rH  r   )rc  r   r   s    r$   wrapperzcache_on_self.<locals>.wrapper  s*    tS!D#r$x(tS!!r@   c                8    t        |       rt        |        y y rP   )r   delattr)rc  r   s    r$   rd  z"cache_on_self.<locals>.clear_cache  s    4D# r@   )rV   r   wrapsrd  )r   rp  rd  r   s   `  @r$   cache_on_selfrt    sD    r{{m6
"C__R" "
 &GNr@   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr   irr-  ) rw  rB   r   r   r   r   or_r   r-  originssetExternKernel)node_schedulerw  r-  s      r$   aggregate_originsr~    s    -&LL *4(TYY 		!!
 E
 	
 
M2??	3$$$us   5B*
c                   t        |       }|dk(  rq|D cg c]Q  }|j                  dk(  r@d|j                  v r2|j                  d   #|j                  d   j                  j                  S }}t        t        |            }n|dk(  rg }|D ]y  }|j                  dk(  sd|j                  v s"|j                  d   d   }t        |d   t              r|j                  |d          \|j                  |d   j                         { t        t        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        |}dj                  d	g|z         S c c}w c c}w )
Noriginal_atenr  r   source_fn_stackre   r   inductor_noder   fused)r~  r  r   _overloadpacketrV   rZ  r{  rB   r   r"  r   NotImplementedErrorjoin)r}  descriptive_namesall_originsoriginsources	source_fns         r$   get_fused_kernel_namer    si   #M2KO+ &
yyO+6;;.O,8	 KK(88AA
 
 W&	g	%! 	:FyyO+0AV[[0P"KK(9:2>	ilC0NN9Q<0NN9Q<#8#89	: W&	o	-&1
"VYY/5QFKK
 
 "!G88WI'((5
(
s   AE(%E-:E-c                   t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }d t        |      r|D ch c]  }|j                   }}t        |      dk(  r[|d   j                  t        d      s+i }	t        j                        D ]
  \  }
}|
|	|<    |	_
        |j                  fd       |D ]  }d|j                  v rO|j                  d   @t        |j                  d   j                        }||   j                  |j                          d|j                  v so|j                  d   d   d   }||   j                  |j                           d	nd
}|j"                   d| ddj%                  |j'                                ddj%                  |j'                                d}|j"                   dg}t)        |j+                               D ]@  \  }}|j                  |j"                   d| ddj%                  t)        |                    B S|j                  |j"                   d       |D ]0  }|j                  |j"                   d|j-                                 2 |dj%                  |      fS c c}w c c}w )Nr  r   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S rP   )r  )r,  single_graphs    r$   rG  z%get_kernel_metadata.<locals>.<lambda>-  s    lTTUVW r@   rY  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r   z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r~  r  collectionsdefaultdictr   r   r   r   r   nodesr  sortr   r   r  r"  r   commentr  keysrZ  itemsformat_node)r}  rp  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictr,  unique_graphsnode_to_idx_mapidxr-  r   sort_strmetadatadetailed_metadataoriginal_noder  r  s                     @r$   get_kernel_metadatar    s   #M2K+6W&)):VfWNW ,,T2N$006
 L
>*89Q99}")!,22L<)TU"$'(:(:; -FC),OA&-IXFW     2dii'DIIo,F,Rdii0@@ACs#**4995$))#))K(+A.C3&&tyy12 *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF 	OA $$'8AMMO;L%MN	O
 TYY0111g X :s   J?J?/Kc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)r   r{  r    r  addr"  )initial_queueskip_filterdominated_setr-  users        r$   dominated_nodesr  O  sz     'M&M
  "JJ 	+D{40=(!!$'$$T*	+  r@   c                   dd l }ddlm fd|j                         D cg c]  } |      s|j                   }}| D cg c]  } |      s|j                   }}t         |j                  g ||       S c c}w c c}w )Nr   r   rv  c                    t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr t        | j
                        S rP   )rB   	TensorBoxdata
StorageBoxIRNode	Pointwise)r,  rw  is_unrealized_nodes    r$   r  z*gather_origins.<locals>.is_unrealized_nodeg  s^    a&%aff--a'%aff--!RYY'GJq",,,GGr@   )	itertoolsrx  rw  r   rz  r{  chain)	rI   r(  r  valkwarg_originsr  arg_originsrw  r  s	          @@r$   gather_originsr  b  s}    H -3MMOWS?QRU?VS[[WMW*.J32DS2I3;;JKJy<<m<== XJs   BBBBc                6   t        | t        j                        r| j                  S t        | t        j                        r)dj                  t        t        | j                              S t        | t        j                        r)dj                  t        t        | j                              S t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    z + z * (r   ))rB   rC   Symbolr   rD   r  rG   	sympy_strrI   Mulr0   r-   r.   r/   funcrV   r   )exprs    r$   r  r  s  s     $%yy$		"zz#i344$		"zz#i344$(HhGH))$$%QtyyY		1J'K&LANNt9r@   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr   r   current_node
index_expr)
r   r   r5   compute_all_boundsr   interpreterr  r3   r4   unknown)r   r   fx_nodes      r$   get_bounds_index_exprr    sN     	!!~tDDWDNNl*5!!""$$r@   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)r2   SIZEr1   )prefixr  s     r$   sympy_index_symbol_with_prefixr    s)     TYY vsDdCCr@   c                N    | xs t         j                  xr t         j                  S rP   )r5   debug_index_assertsassert_indirect_indexing)checks    r$   generate_assertr    s    /V//TV5T5TTr@   c                F    | d   dk7  sJ t        j                  | dd      S )r  r   sTr  )rC   r  r   s    r$   sympy_index_symbolr    s)     7c>> <<d==r@   c                    d }t        j                  |       j                  |j                         D ci c]  \  }}| |||       c}}      S c c}}w )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                    t        | t        j                        sJ t        |t              r,t        j                  || j
                  | j                        S |S )Nr  )rB   rC   r   r   r  r[   is_nonnegative)replacedreplacements     r$   	to_symbolzsympy_subs.<locals>.to_symbol  sN    (EJJ///k3'<< ++$33  r@   )rC   r   xreplacer  )r  replacementsr  krL   s        r$   
sympy_subsr    sN    	 ==''(4(:(:(<=1IaO	= =s   A
c                    t        | t        j                        xs^ t        | t        j                        xrB t	        d t        j                  | j                         | j                               D              S )Nc              3  2   K   | ]  }t        |        y wrP   is_symbolicr  s     r$   rl   zis_symbolic.<locals>.<genexpr>  s     N1AN   )	rB   r   SymIntr  r	  r  r  rR  stride)r   s    r$   r  r    sS    a& 1ell# 	ON	!((*(MNNr@   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wrP   r  r  s     r$   rl   z"any_is_symbolic.<locals>.<genexpr>  s     ,!{1~,r  r	  )rI   s    r$   any_is_symbolicr    s    ,t,,,r@   c                &   ddl m} h d}t        j                         r|j	                  h d       | j
                  j                  D ]G  }t        |j                        |v r|c S |j                  j                  d      x}< ||      sE|c S  y )Nr   )free_unbacked_symbols>	   aten._assert_scalaraten._local_scalar_denseaten.multinomial.defaultfbgemm.dense_to_jagged.default%fbgemm.jagged_to_padded_dense.default,aten._fused_moving_avg_obs_fq_helper.default7aten._fused_moving_avg_obs_fq_helper_functional.defaultrun_with_rng_staterun_and_save_rng_state>   aten.scatter.srcaten.scatter_add_aten.scatter.reduceaten.index_put.defaultaten.index_put_.defaultaten.scatter_reduce.twoaten.scatter_add.defaultaten.scatter_reduce_.twoaten.scatter.value_reduceaten.scatter_reduce.two_outaten._unsafe_index_put.default0aten._unsafe_masked_index_put_accumulate.defaultr  )%torch.fx.experimental.symbolic_shapesr  r   $are_deterministic_algorithms_enabledupdater   r  r   r  r   get)r.  r  forbidden_setr-  r  s        r$   %get_first_incompatible_cudagraph_noder    s    KM 113	
   t{{},K99==''C49Ns9SK	
 r@   c                    t        |       d uS rP   )r  )r.  s    r$   has_incompatible_cudagraph_opsr    s    04D@@r@   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr&  )nextiterreversedr   r  r  )r.  	last_nodes     r$   output_noder     s6    T(288>>234I<<8###r@   z	List[Any]_registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zq
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_inductor_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r"  rI  s    r$   clear_on_fresh_inductor_cacher  
  s?    
 3&hs.Gu$GHIIc"Jr@   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r  r  r  s    r$   clear_inductor_cachesr!    s     " r@   c              #  
  K   t                t        j                  |      }	 t        j                  j                  t        j                  d|i      5  t        j                  d|       t        j                  j                  |d      }t        j                  j                  t        j                  d|i      5  d t        | t
              rt        |       dk(  sJ d       t        j                  j                  |      rtt        j                  |      }| j!                  |D ci c]D  }d	|vr>|t        j                  j#                  t        j                  j                  ||            F c}       ddd       ddd       |rt%        j&                  |       t                yc c}w # 1 sw Y   8xY w# 1 sw Y   <xY w# t(        $ r! t*        s	 t        j-                  d
|        Y Qw xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    )dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNr   z!expected empty cache_entries dictz.lockz(on error, temporary cache dir kept at %s)r!  tempfilemkdtempr   patchdictosenvironry   rz   pathr  rB   r   existslistdirr  getsizeshutilrmtree	Exception_IS_WINDOWSwarning)cache_entriesr#  deleteinductor_cache_dirtriton_cache_dirfilesfs          r$   fresh_inductor_cacher<    s     !))c2 ZZ__JJ24FG
 	 II35GH!ww||,>I.@BR-ST mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	$ MM,- 	# 	 	(   KKBDVW  	sy   !H0G A'F;;A-F/(A	F*1F/8F; G H*F//F8	4F;;G G 'G1.G4 0G11G4 4H  Hc           	         | j                   }t        t        |             }t        t	        t        ||d                  S )NT)r   reverse)__getitem__rq   r   r   r  rZ  )seqgettera_rs      r$   argsortrC  J  s1    __F
C/C>?@@r@   c                L    t        j                  d|       j                         S )Nr]   r_   )r   rn   element_sizerE  s    r$   get_dtype_sizerG  Q  s    ;;r'4466r@   c                      e Zd ZU ded<   y)LineContextr   contextN)rV   rW   rX   __annotations__r]   r@   r$   rI  rI  V  s    Lr@   rI  c                      e Zd ZdZddZddZddZddZd Zd Z	d Z
d	 Zd
 Zd ZddZddZddZddZddZd Zd Zy)IndentedBuffer   c                     g | _         || _        y rP   )_lines_indent)rc  initial_indents     r$   __init__zIndentedBuffer.__init__]  s    %r@   c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|/t        |t              r|j                  ||j                  f       Kt        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } |j                         |fS )Nr   r  )r   rP  rB   DeferredLineBaserI  r"  rJ  r   writecountgetvalue)rc  bufr   linemaplines        r$   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemapa  s    jKK 	&D$ 01v<D+.4<<01dC(((IIdOIIdOTZZ%%%A	& ||~w&&r@   c                ,    | j                         \  }}|S rP   )r\  )rc  rL   r   s      r$   rX  zIndentedBuffer.getvalues  s    '')1r@   c                b   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.t        |t
              sJ |j                  d      r|j                  |d d        f|j                  |       |j                  d        |j                         S )N\re   r  )	r   rP  rB   rU  rI  r   endswithrV  rX  )rc  rY  r[  s      r$   getrawvaluezIndentedBuffer.getrawvaluew  s    jKK 	 D$ 01v<D+.dC(((}}T"		$s)$		$		$	  ||~r@   c                8    | j                   j                          y rP   )rP  clearrb  s    r$   rc  zIndentedBuffer.clear  s    r@   c                ,    t        | j                        S rP   )r   rP  rb  s    r$   __bool__zIndentedBuffer.__bool__  s    DKK  r@   c                :    d| j                   | j                  z  z  S )Nr  )rQ  tabwidthrb  s    r$   r  zIndentedBuffer.prefix  s    dllT]]233r@   c                &    | j                  d       y )Nr  	writelinerb  s    r$   newlinezIndentedBuffer.newline  s    tr@   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nrx  )rB   rI  rP  r"  rU  with_prefixr  striprc  r[  s     r$   rj  zIndentedBuffer.writeline  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"r@   c                4    |D ]  }| j                  |        y rP   ri  )rc  linesr[  s      r$   
writelineszIndentedBuffer.writelines  s     	!DNN4 	!r@   c                F     t         j                   fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwrP   rQ  )offsetrc  s   r$   ctxz"IndentedBuffer.indent.<locals>.ctx  s9     LLF"L'&&s   A4 AAA)
contextlibcontextmanager)rc  rw  rx  s   `` r$   indentzIndentedBuffer.indent  s$    		"	"	' 
#	' ur@   c                .    | xj                   |z  c_         y rP   rv  rc  rw  s     r$   	do_indentzIndentedBuffer.do_indent      r@   c                .    | xj                   |z  c_         y rP   rv  r}  s     r$   do_unindentzIndentedBuffer.do_unindent  r  r@   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   r  )rB   rM  floatrP  rI  minr   lstripmathisinfr"  rj  rQ   textwrapdedentrstripr   )rc  
other_codero  r  r[  s        r$   splicezIndentedBuffer.splice  s   j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J"((. %t$%r@   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)rR  )rM  rQ  rP  )rc  r  r   r[  s       r$   rG   zIndentedBuffer.map  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )Nr  r  )r   rX  rb  s    r$   __repr__zIndentedBuffer.__repr__  s     t*Qt}}/q11r@   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S r  )rQ  rM  rs  rP  )rc  otherr   s      r$   __add__zIndentedBuffer.__add__  sK    ||u}},,,DLL9t{{#u||$
r@   N)r   )ri  z)tuple[str, list[tuple[int, LineContext]]])ri  r   rN   F)r  zCallable[[Any], Any]ri  rM  )rV   rW   rX   rg  rS  r\  rX  ra  rc  re  r  rk  rj  rs  r{  r~  r  r  rG   r  r  r]   r@   r$   rM  rM  Z  s\    H&'$$!4#!	%.
2r@   rM  c                  &     e Zd Zd fdZd Z xZS )FakeIndentedBufferc                "    t         |           y rP   )superrS  )rc  	__class__s    r$   rS  zFakeIndentedBuffer.__init__  s    r@   c                V    |dk(  rt         j                  | |      S t        d| d      )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )rc  r   s     r$   r  z#FakeIndentedBuffer.__getattribute__  s;    ;**466!$ (= =
 	
r@   rh  )rV   rW   rX   rS  r  __classcell__)r  s   @r$   r  r    s    
r@   r  c              #  x   K   	 d  | t         _        |t         _        y # | t         _        |t         _        w xY wwrP   )sysstdoutstderr)initial_stdoutinitial_stderrs     r$   restore_stdout_stderrr    s-     $#
#
 $
#
s   : :7:c                  D    e Zd ZdZd ZddZddZd Zd Zd Z	d Z
d	 Zy
)rU  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y rm  )ro  r[  rp  s     r$   rS  zDeferredLineBase.__init__  s    zz|D	r@   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  rb  s    r$   rg  zDeferredLineBase.__call__      !!r@   c                    t         )z3Returns a new deferred line with the same conditionr  rp  s     r$   	_new_linezDeferredLineBase._new_line  r  r@   c                @    | j                  | | j                         S rP   r  r[  )rc  r  s     r$   rn  zDeferredLineBase.with_prefix  s    ~~455r@   c                T    | j                  | j                  j                               S rP   )r  r[  r  rb  s    r$   r  zDeferredLineBase.lstrip  s    ~~dii..011r@   c                >    | j                  | j                  |         S rP   r  )rc  r   s     r$   r?  zDeferredLineBase.__getitem__	  s    ~~dii.//r@   c                ,    t        | j                        S rP   )r   r[  rb  s    r$   re  zDeferredLineBase.__bool__  s    DIIr@   c                ,    t        | j                        S rP   )r   r[  rb  s    r$   __len__zDeferredLineBase.__len__  s    499~r@   N)ri  zOptional[str])r[  r   ri  rU  )rV   rW   rX   rY   rS  rg  r  rn  r  r?  re  r  r]   r@   r$   rU  rU    s-    8
""620r@   rU  c                    d}t         j                  j                  |       j                  }||k  rt        j                  d||d       yy)ND   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extraFT)r   r   get_device_propertiesmulti_processor_country   r5  )r   r  r  s      r$   
is_big_gpur    sM    G

007MMI7:%I> 	 	
 r@   c                 F    t         j                  xs t         j                  S rP   )r5   max_autotunemax_autotune_gemmr]   r@   r$   use_max_autotuner     s    :&":"::r@   c                    t               xrN | j                  j                  dk(  xr3 | j                  |v xr# t	        | j                  j
                  xs d      S )Nr   r   )r  r`   r   r_   r  r   )layoutallowed_layout_dtypess     r$   _use_template_for_cudar  $  sT     	1MM&(	1LL11	1 v}}**/a0	r@   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w N,)upperr5   max_autotune_gemm_backendsr   ro  backendr!   s     r$   _use_autotune_backendr  -  M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w r  )r  r5   max_autotune_conv_backendsr   ro  r  s     r$   _use_conv_autotune_backendr  3  r  r  F)enable_int32enable_float8c                  ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | |      xr* t        d      xr  || j                  |j                        S )Nr   )BackendFeaturehas_backend_featureTRITON)codegen.commonr  r  r   r   r   r   r   extendr   r   r  r  r`   TRITON_TEMPLATES)r  r  r  r  r  layout_dtypess         r$   use_triton_templater  9  s    C]]ENNEMMBMu{{Se1153D3DEFv}5 	P!(+	P~/N/NOr@   c                   ddl m} |j                  j                  j	                  ||z  |z  d      }|dk  s|t
        j                  j                  k  ryddlm	} t        j                  j                  ryt        j                  t        j                  t        j                  t        j                   g}t#        | |      xr t%        d      }|r |       st&        j)                  d	       y|S )
Nr   r   re   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r   r   r   r   	size_hintr5   r   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  r   versionhipr   r   r   r   r  r  ry   r5  )	r  mr,  r  r   	gemm_sizer  r  r   s	            r$   use_cutlass_templater  H  s      **1q519r*BIA~V[[%N%NN> }}]]ENNEMM5;;OM
 
7 <Q=C !#KK4
 Jr@   c                T    t         j                  j                  |       j                  S rP   )r   r   r  gcnArchNamer`   s    r$   _rocm_native_device_arch_namer  d  s    ::++F3???r@   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r d }d } G d d      }d }Y #w xY w)Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S rP   r]   r]   r@   r$   r  z*try_import_ck_lib.<locals>.gen_ops_libraryx      Ir@   c                     g S rP   r]   r]   r@   r$   r  z.try_import_ck_lib.<locals>.gen_ops_preselected{  r  r@   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)rV   rW   rX   r]   r@   r$   r  r  ~  s    r@   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r+  r-  dirname__file__r   )r  r  r  r  package_dirnames        r$   try_import_ck_libr  i  sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A! A!c                   t               syt        d      syt        j                  j                  sy| j
                  j                  dk(  syt        | j
                        }t        j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t        j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                   fvryddlm} |j&                  j(                  j+                  ||z  z  d	      }|dk  ryt-               \  }	}
}
}
|	st.        j1                  d
       yt        j                  j2                  st.        j1                  d       y|	t        j                  j2                  k7  rt.        j1                  d       yyc c}w c c}w )NFCKr   :r   r   r   re   r  z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r  r  r   r  r  r`   r   r  r5   rocmarchr   r  ck_supported_archr_   r   r   r   r   r   r   r  r  ry   r5  ck_dir)r  r  r,  r  native_archrequested_archsrequested_supported_archsr   r  ck_package_dirnamer   s              r$   use_ck_templater    s    &====' 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>::   **1q519r*BIA~ #4"51aBC;;BCV[[///01G E!s   >G$"G)c                L    t               xr | j                  j                  dk(  S r1  )r  r`   r   )r  s    r$   _use_template_for_cpur
    s    =&--"4"4"==r@   c                .   ddl m} ddlm} ddlm} ddlm} t        |       rt        d      syt        j                  j                  sy|j                         t        j                  k(  }t        j                   t        j"                  t        j$                  t        j                  g}	 ||||r| j&                  nd |      \  }
}}} }}t)        ||f      ryt+        ||j,                        r|j/                         } ||j                               \  }} |d	|
|||j                         |j                         |t1               
      }d }| j&                  |	v xr8 |d uxr2  ||      xr( t+        ||j2                        xr |j5                         S )Nr   rv  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposed
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsc                N    | j                          | j                         d   dk(  S )Nre   r   )freeze_layout
get_strider!   s    r$   is_last_dim_stride1z9use_cpp_packed_gemm_template.<locals>.is_last_dim_stride1  s"    	||~b!Q&&r@   )rx  rw  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r
  r  r5   cppweight_prepack	get_dtyper   r   r   r   halfr_   has_free_symbolsrB   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r  mat1mat2r  rw  r  r  r  	int8_gemmr  r  r,  r  r  r   r  r  s                    r$   use_cpp_packed_gemm_templater+    s^   9M) (0Ee0L::$$ EKK/I]]ENNEJJLM")"+&,,'	#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*	J'
 	% 	$d"	$%	$ tR]]+	$ !!#r@   c                 2    t                xs t        d      S )NATEN)r  r  r]   r@   r$   use_aten_gemm_kernelsr.    s    !!B%:6%BBr@   c                  P    e Zd ZU  ej                  d      Zded<   ddZd Zd Z	y)	DebugDirManagerr   r   prev_debug_namec                @    t        t        j                        | _        y rP   )r  r0  counterr   rb  s    r$   rS  zDebugDirManager.__init__  s    ../r@   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)r   _dynamor5   debug_dir_rootr1  r   new_namerb  s    r$   	__enter__zDebugDirManager.__enter__  sM    $}}33BB//0dggY?.2mm+r@   c                    t        j                  | j                         | j                  t        j
                  j                  _        y rP   )r1  r2  r8  r1  r   r6  r5   r7  )rc  rI   s     r$   __exit__zDebugDirManager.__exit__  s*    dmm$.2.B.B+r@   Nrh  )
rV   rW   rX   r  rW  r3  rK  rS  r9  r;  r]   r@   r$   r0  r0    s(    iooa G0<
Cr@   r0  c                    ddl m} g dfd}t        j                  j	                  |d|      5  t
        j                  j                           | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   GraphLoweringc                (    j                  |        y rP   r"  codesource_codess    r$   save_output_codez*run_and_get_code.<locals>.save_output_code      D!r@   rD  rB  r   r   r>  r   r)  r  r   r6  reset)r   rI   r(  r>  rD  r:  rC  s         @r$   run_and_get_coderI    ss    $ L" 
		=*<>N	O %T$V$% <% <s   'A$$A0c                $      fd}t        |      S )Nc                 R            } | j                         j                          | S rP   )r   backward)r:  r   s    r$   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backward  s!    

r@   )rI  )r   rM  s   ` r$   run_fw_bw_and_get_coderN    s    
 -..r@   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r   r=  c                (    j                  |        y rP   r@  rA  s    r$   rD  z"get_code.<locals>.save_output_code%  rE  r@   c                     G d d      }| j                   r| j                         n| j                         \  }} |        |       S )Nc                      e Zd ZdZddZd Zy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y rP   r]   rb  s    r$   rS  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__,  s    r@   c                     y rP   r]   rf  s      r$   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call/  s    r@   Nrh  )rV   rW   rX   rY   rS  rV  r]   r@   r$   DummyModulerS  )  s    Fr@   rW  )cpp_wrappercodegen_with_cpp_wrappercodegen)rc  rW  rB  r   rD  s       r$   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_module(  sF    	 	 04/?/?D))+T\\^ 	a
 	}r@   compile_to_modulerD  NrF  )rc  r>  rG  )r   rI   r(  r>  r[  r   rD  rC  s         @@r$   get_coder]    s    $ L"( 
		*,E
  zz(:<LM  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr      z%expected one or two code outputs got r   )r]  r   )r   rI   r(  rC  s       r$   get_triton_codera  F  sS    B000L 	
S#!#C	.s</@.ABC#?r@   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r_  )rI  r   )r   rI   r(  r   rC  s        r$   run_and_get_triton_coderc  O  sW    &r;D;F;OA| 	
S#!#C	.s</@.ABC#?r@   c                    ddl m} ddlm |j                  g fd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   )CompiledFxGraphr=  c                 ^     | i | | d   }t        |      sJ j                  |       y )Nr`  )rB   r"  )rI   r(  r   r>  graph_lowerings	real_inits      r$   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init_  s7    4"6"Q%///u%r@   rS  )torch._inductor.codecachere  torch._inductor.graphr>  rS  r   r)  r  )	r   rI   r(  re  ri  r:  r>  rg  rh  s	         @@@r$   run_and_get_graph_loweringrl  X  so    93((IO& 
		?J		B %T$V$% ?""% ?""s   	AA'c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorrn  	loweringsr   partial)aten_opoverride_fnrn  orig_fns       r$   override_loweringru  k  s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                   fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S rP   r]   )	schedulerr  outrt  post_fnpre_fns      r$   rp  z(add_scheduler_init_hook.<locals>.wrapper  s+    y% i'Iu%
r@   rS  )torch._inductor.schedulerrw  rS  unittestr   r)  r  )r|  r{  rw  rp  rt  s   ``  @r$   add_scheduler_init_hookr  {  s9    
 4  G ==%%iWEEr@   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r5   developer_warningsry   r5  info)msgs    r$   developer_warningr    s$       Cr@   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr   r   -z--only=N)r  argvr   r   
ValueError
startswith)r  r  s     r$   get_benchmark_namer    s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()  s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr   Nr]   r  s     r$   rl   zis_ones.<locals>.<genexpr>       %!qAv%   rF   r  s    r$   is_onesr        %u%%%r@   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr]   r  s     r$   rl   zis_zeros.<locals>.<genexpr>  r  r  r  r  s    r$   is_zerosr    r  r@   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r   N)rB   r   r  r`   )rj   items     r$   rl   z is_cpu_device.<locals>.<genexpr>  s8      dELL) 	u||E**s   AAr  )inputss    r$   is_cpu_devicer    s       r@   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rB   rC   r   r[   r   r   r   )r  s    r$   get_sympy_Expr_dtyper    sD    UZZ BAB  ~~{{}}r@   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwrP   )r   ru   rv   )should_profilerI   r(  r   s       r$   maybe_profiler    sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S r   )r5   r  threadsr   get_num_threads)r  s    r$   r&  r&    s+    jj  G{'')Nr@   c                   ddl m}m} | t        j                  t        j
                  t        j                  fv sJ t        j                  |      j                  j                  d      rddlm}  |       }| t        j                  t        j
                  fv r	 || |      S t        j                  j                  j                  j                   r |t        j                  |      S  |t        j                  |      S | t        j                  t        j
                  fv r ||       S t        j                  j                  j                  j                   r |t        j                        S  |t        j                        S )Nr   )get_max_simd_tflopsget_max_tensorcore_tflops
clock_rate)max_clock_rate)triton.testingr  r  r   r   r   r   inspect	signature
parametersr  torch._utils_internalr  backendsr   matmul
allow_tf32)r_   r  r  r  sm_clocks        r$   get_device_tflopsr    s   MU]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55r@   c                     ddl m}   |        S )Nr   get_dram_gbps)r  r  r  s    r$   get_gpu_dram_gbpsr    s    ,?r@   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   drivermax_shared_mem)triton.runtimer  activeutilsr  r  r  s    r$   get_gpu_shared_memoryr    s.    %==44Q7;;<LaPPr@   c                $    | j                  d      S )Nwelford)r  reduction_types    r$   is_welford_reductionr    s    $$Y//r@   c                     t        |       rdS dS )N   r   )r  r  s    r$   reduction_num_outputsr    s    $^41;!;r@   c                 0    t        j                         dk(  S )NLinux)platformsystemr]   r@   r$   is_linuxr    s    ??''r@   c                 (    t         j                  dk(  S )Nr7   )r  r  r]   r@   r$   
is_windowsr    s    <<7""r@   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wrP   )rB   rC   r   	is_numberr  s     r$   rl   z#has_free_symbols.<locals>.<genexpr>  s)     Jz!UZZ(<_<Js   35r  )itrs    r$   r#  r#    s    JcJJJr@   c                 `   ddl m} | D ]"  }t        ||j                        r`t	        |j
                  j                               s;t        |j
                  d      sTt	        |j
                  j                               sx yt        ||j                  |j                  |j                  f      rOt        |d      rt        |d      sJ t	        |j                               st	        |j                               s yt        ||j                        st        dt        |              y)Nr   rv  r  Tget_sizezunexpected type for is_dynamic F)rx  rw  rB   r  r#  r  r  r   r  r  r$  ComputedBufferr  	TypeErrorr   )rI   rw  ts      r$   
is_dynamicr     s     Ia& 12-2B166CTCTCV2WBMM2;;8I8IJK1j)ga.FFF

-1A!,,.1QAryy)=d1gYGHHI r@   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)rV   rW   rX   r  r  r]   r@   r$   r  r  6  s      K *r@   r  c                   ddl m} t        j                  ddd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |	       t        |j                  |	       t        j                         }t        ||t        j                  j                         5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j#                          |j%                          t        d
|j                   |	       t        |j                  |	       |j'                         |j'                         k(  }
t(        j+                  d||j,                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr   )stable_topological_sortwzutf-8F)modeencodingr7  )r.  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r'  NamedTemporaryFileior   r+   r'   	propagater?  r   r   nowr*   r5   tracelog_url_for_graph_xformlint	recompilerX  ry   r  r   )r  r.  inpr  r  r;  	before_ioafter_io
start_timetime_elapsedr  s              r$   pass_execution_and_saver  @  sb   8		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#BV\\-Q-QR 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
-
 
	 	
 
s%   B6GG(CGG
	GGc                j    ddl m} t        |       |j                  k(  xr |d u xs | j                  |u S Nr   rv  )rx  rw  r   _CollectiveKernelop_overloadr-  r  rw  s      r$   is_collectiver  b  s4    :---X2:3WAQAQUWAWXr@   c                >    ddl m} t        |       |j                  k(  S r  )rx  rw  r   _WaitKernel)r-  rw  s     r$   is_waitr  h  s    :''r@   c                    ddl m}m} t        | |      sJ t        | |      rt	        d | j
                  D              S t        | j                        S )Nr   BaseSchedulerNodeGroupedSchedulerNodec              3  2   K   | ]  }t        |        y wrP   )contains_collectiver  s     r$   rl   z&contains_collective.<locals>.<genexpr>s  s     @a&q)@r  )r}  r  r  rB   r	  snodesr  r-  snoder  r  s      r$   r   r   n  sE    Qe.///%-.@5<<@@@UZZ((r@   c                    ddl m}m} t        | |      sJ t        | |      rt	        d | j
                  D              S t        | j                        S )Nr   r  c              3  2   K   | ]  }t        |        y wrP   )contains_waitr  s     r$   rl   z contains_wait.<locals>.<genexpr>}  s     :=#:r  )r}  r  r  rB   r	  r  r  r-  r  s      r$   r  r  x  sE    Qe.///%-.:U\\:::uzz""r@   c                    ddl m} t        |t        j                  j
                        r|h}t        | |j                        xr | j                  |v S r  )rx  rw  rB   r   r  r  FallbackKernelr  r  s      r$   is_fallback_opr	    sC    "ejj++,TdB--.I43C3Cr3IIr@   c                B    |||    j                   j                            S rP   )defining_oprV  )buf_namename_to_bufname_to_fused_nodes      r$   buf_name_to_fused_snoder    s!    k(3??HHJKKr@   c                    |r	 ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)r  unmet_dependenciesr  r   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_deps          r$   r  r    sp     {5)5!'' 
5HHk#5
 "44##	

r@   c           	     ~   |r	 ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTr  )r  get_outputsr  r-  rV  find_recursive_users_of_node)r  r  r  r  r  or  user_ops           r$   r  r    s     {5)5!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	r@   c                    t         j                  j                  j                  rdnd}t         j                  j                  j
                  r)t         j                  j                  j                         sy|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r`  r   )r   
_functorchr5   functionalize_rng_opsr6  inline_inbuilt_nn_modulesr  is_parameter_freezing)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      r$   num_fw_fixed_argumentsr&    sb     $$::  	66##99;"669SSSr@   c                    d }d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ^    d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetr  r  s    r$   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor  s5    aff$ 0!&&(0!/	
r@   r   r#  r   )r   r  r  r"  r   rq   r   )fx_gr,  	arg_countstatic_arg_idxsr,  s        r$   count_tangentsr0    s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????r@   c                  .    e Zd ZU ded<   d Zed        Zy)	BoxedBoolr   rT   c                    | j                   S rP   )rT   rb  s    r$   re  zBoxedBool.__bool__  s    zzr@   c                6    t        | t              r	d| _        | S yNF)rB   r2  rT   r  s    r$   disablezBoxedBool.disable  s    c9%CIJr@   N)rV   rW   rX   rK  re  rk  r6  r]   r@   r$   r2  r2    s     K  r@   r2  c              #      K   ddl m} |j                   fd}t        j                  j
                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr   )WrapperCodeGenc                D    j                  |        | |||g|i |S rP   r@  )rp  r   kernel_coder  rI   r(  kernel_listorig_define_kernels         r$   new_define_kernelz2collect_defined_kernels.<locals>.new_define_kernel  s-    ;'!'4hXXQWXXr@   define_kernel)codegen.wrapperr8  r>  r~  r   r)  r  )r;  r8  r=  r<  s   `  @r$   collect_defined_kernelsr@    sR     /'55Y
 
			#	#NOEV	W   s   AA"A	A"AA"c                    | dz   S )N__original__r]   r  s    r$    get_cloned_parameter_buffer_namerC    s    .  r@   c                <    t        | t              s	| J |        | dv S )N)r   r   )rB   r   r  s    r$   is_gpurE    s&    fc"fn<f<4_$$r@   c                <    t        | t              sJ t        |       S rP   )rB   r   rE  r  s    r$   device_need_guardrG    s    fc"""&>r@   c                d    | t         j                  t         j                  t         j                  hv S rP   )r   r   r   r   rE  s    r$   ,needs_fallback_due_to_atomic_add_limitationsrI    s     U[[%**enn===r@   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |hvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  hv xs t        j                          S )NFr  r   r   r   )overloadpacketr   r   atenscatter_reduce_scatter_reducescatter_rE  rI  r5   r  fallback_scatter_reduce_sumdynamic_threadsr&  r   r   r  )r  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          r$   use_scatter_fallbackrW    sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!r@   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdrY  rZ  r}  r[  r?  r   r   rB   is_reductionr-  r  reduction_hintread_writesreadswritesr   r   )r}  rY  rZ  r[  r  r-  is_redr  s           r$   dump_node_schedulerf  =  s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG'Hr@   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )statically_known_true)r  rh  storage_offsetrG  r_   GPU_ALIGN_BYTES)r   rh  s     r$   tensor_is_alignedrk  \  s:     L 				 >&,,#?	??RVWW r@   c                |    t        | j                  j                        syt        j                  xs t        |       S r5  )rE  r`   r   r5   assume_aligned_inputsrk  )example_inputs    r$   should_assume_input_alignedro  j  s2     -&&++,''K+<]+KKr@   c                     t         j                  j                  j                         } | st	        j
                         S | j                  j                  }|st	        j
                         S |j                         S rP   )	r   _guardsTracingContexttry_getry  nullcontextr  r   suppress_guards)tracing_contextr   s     r$   #maybe_get_suppress_shape_guards_ctxrw  s  sb    
 mm22::<O%%''  ))33I%%''$$&&r@   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nrz   Tr   )output_code_log)r~  r   r)  r  r5   r   r6  rH  r  loggingr   StreamHandlerrj  ry  
addHandlerlevelsetLevelDEBUGrX  removeHandler)r   rI   r(  r  rz  log_capture_stringchry  
prev_levelr:  r  s              r$   run_and_get_cpp_coder    s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                    d }t        |       }||j                  S | D ]4  }t        |t        j                        s|j
                  j                  c S  y rP   )r'   r   rB   r   r  r-  )r  r   r  inputs       r$   shape_env_from_inputsr    sY    I (I """  (eU\\*::'''(
 r@   c                8     t              dk(  r S d fd}|S )Nr   c                ,    t        |         |       S rP   )copy_misaligned_inputs)
new_inputsinputs_to_checkr6  s    r$   runz)align_inputs_from_check_idxs.<locals>.run  s    z?;Z  r@   )r  List[InputType])r   )r6  r  r  s   `` r$   align_inputs_from_check_idxsr    s#     ?q ! Jr@   c                *   t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nc              3  2   K   | ]  \  }}|d z
  |z    ywr  r]   )rj   shaper  s      r$   rl   z)clone_preserve_strides.<locals>.<genexpr>  s     P]UFUQY& Pr  r   rN   )r   r   rR  r  r   
as_stridedclone)r!   needed_sizebuffers      r$   clone_preserve_stridesr    sp    Pc!&&(AHHJ6OPPSTT  a+6<<>FFAFFHahhj99r@   c                    |D ]I  }| |   }t        |t        j                        sJ |j                         t        z  s<t        |      | |<   K y rP   )rB   r   r  data_ptr	ALIGNMENTr  )r  check_inputs_idxsr   _inps       r$   r  r    sL      9!}$---==?Y&248JqM	9r@   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rB   r   r  r  r  r"  r   )r  static_input_idxsaligned_static_input_idxsr  r  s        r$   remove_unaligned_input_idxsr    st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((r@   c                x   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  D ]M  }||j                  j                  d        !|j                  j                  t        fd|D                     O y y y )Nr   c              3  Z   K   | ]"  }j                  |      n
t        |       $ y wrP   )evaluate_symexprrQ   )rj   r   r   s     r$   rl   z5set_tracing_context_output_strides.<locals>.<genexpr>  s:         )4 &66q9!$Q(s   (+)	r   rq  rr  rs  output_stridesr   r  r"  r$  )r7  compiled_graphrJ  exprsr   s       @r$   "set_tracing_context_output_stridesr    s    mm**224Gw55A7))*a///).9	#22 	E}&&--d3&&--  "' 			  Br@   )rL   
sympy.Expr)   d   )r   zCallable[[], Any]ri  r  )ri  r   )r`   z"Union[Optional[torch.device], str]ri  ztorch.device)r   zIterable[_T]ri  zValuesView[_T])r   Union[int, sympy.Expr]r   r  ri  r  )r   z"Iterable[Union[int, torch.SymInt]]ri  zList[sympy.Expr])r   z Iterable[Union[int, sympy.Expr]]ri  zList[Union[int, torch.SymInt]])r  torch._ops.OpOverloadrP   )r  z1Optional[Callable[[torch._ops.OpOverload], bool]])r   )r`   r   )r   r   )r6  zCallable[..., Any]r8  rQ   r`   r   ri  r  )r]   
   r  g      ?r   )rI  r   rJ  r   )rI  r   rM  z	List[str])ri  rQ   )r   z!Callable[Concatenate[Any, P], RV]ri  zCachedMethod[P, RV])r  zIterable[torch.fx.Node]ri  zSet[torch.fx.Node])r  r  ri  r   )r  r2   r  rQ   ri  sympy.Symbol)r   r   ri  r  )r  r  r  zDict[sympy.Expr, Any]ri  r  )r   r   ri  r   )rI   r   ri  r   )r.  torch.fx.GraphModule)rI  r   )NNT)ri  z	List[int])r  zList[torch.dtype]ri  r   )r  r   ri  r   r  )r  r  ri  ztorch.dtype)r  zIterable[Any])r#  rQ   r$  rQ   )r-  r  )r   r   )r  r  )r   torch.Tensor)rn  r  )r  zList[torch.Tensor])r6   Callable[[List[InputType]], Any]r  Sequence[int]ri  r  )r!   r  )r  r  r  r  ri  rj  )r  r  r  r  )
__future__r   r  ry  dataclassesenumr   r  r  r  rz  r  r   r+  r  r1  r  r'  r  r4  r~  r   r   typingr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   rC   r   r   	lru_cacher%   torch._dynamo.device_interfacer&   torch._dynamo.utilsr'   torch.autogradr(   torch.autograd.profiler_utilr)   (torch.fx.passes.graph_transform_observerr*   torch.fx.passes.shape_propr+   torch.utils._sympy.functionsr,   r-   r.   r/   r0   torch.utils._sympy.symbolr1   r2   torch.utils._sympy.value_rangesr3   r4   rx  r5   runtime.runtime_utilsr6   r   r4  	getLoggerrV   ry   r8   r   	VarRangesr  rQ   	InputTyperj  r  r=   r?   rH   FunctionrJ   r   r   r   r   r   r   r   r   r  r  r  r/  rm   r<  rD  rK  rN  rP  rS  r[  r\  r]  r`  rt  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rK  r  r!  rz  r<  rC  rG  rI  rM  r  r  rU  r  r  r  r  r  r  r  r  r  r  r
  r+  r.  r0  rI  rN  r]  ra  rc  rl  ru  r  r  r  r  r  r  r  r  r&  r  r  r  r  r  r  r  r#  r  Enumr  r  r  r  r   r  r	  r  r  r  r&  r0  	dataclassr2  r@  rC  rE  rG  rI  rW  rf  rk  ro  rw  r  r  r  r  r  r  r  r]   r@   r$   <module>r     sr   "       	     	   
           5    UO	
 T  D 0 % 2 K 0  8 D  = llg%g!T]UZZ'(	%,,#$	 	{Q'A-+2B XDX XB5
LENN Od T @@
+)!)*@)) AF+	+++	)#&G OSK2.' NT69GJ
  CI<?)'#$ cNTT"8WQU^ "&)B52r 9=*&>"$%	DU	>.-(VA !# I "	 (  ( VA Q7 7* ~ ~B
 
 $ $ @ T	 	; 16U 8 T@ @ TR R63l>1hCC C" /$N#& . .F&	)>&&   T6 66 T Q0<(#K,*$)) *
DY()#JL
 MQ
, MQ2T" 0     !%

>
$&$NH>L'".*+" &:994A9	9$$r@   