
    ɯwg                   
   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dl m!Z! d dlm"Z" d dl#m$Z$ d d	l%m%Z%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;Z;d dl<m=Z> d dl;m?Z?m@Z@ d dlAmBZBmCZCmDZD d dlEmFZFmGZGmHZH d dlImJZJ d dlKmLZLmMZM d dlNmOZO ddlPmQZQ  e7d      ZRe6rd dlSmTZT ddlUmVZVmWZW 	 d dlXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZa d dlbmcZc d dldmeZemfZfmgZg d dlhmiZimjZjmkZk d dllmmZmmnZn d dlompZpmqZqmrZrmsZsmtZtmuZumvZv d dlwmxZx d dlymzZzm{Z{m|Z| d d l}m~Z~mZmZ e6rd d!lmZ d d"lmZ d d#lmZ d d$lmZmZ ej                  j                  e      Zej                  j                  ej                  j                  e            Zej                  j!                  ed%      Zej$                  d&k(  Z eFj(                         rd d'lmZ d d(lmZ d d)lmZmZmZmZ ndd*Zdd+Zdd,Zdd-Ze;j<                  j?                  ed.      Zd/Zej$                  d&k(  Z e
jF                  e      Zdd0Zdd1Z G d2 d3      Z G d4 d5e      Z G d6 d7e      Zdd8Zdd9Zddd:Z	 d	 	 	 	 	 	 	 dd;Z	 d	 	 	 	 	 	 	 dd<Z	 	 	 d	 	 	 	 	 	 	 	 	 	 	 dd=Zdd>Z	 	 d	 	 	 	 	 	 	 	 	 dd?Zejd                   G d@ dA             ZddBZ	 	 	 	 	 	 ddCZ	 	 	 	 	 	 ddDZ	 	 	 	 	 	 ddEZddFZddGZ G dH dIejt                        Z	 	 	 	 	 	 	 	 ddJZ ejz                  d      ddK       ZddLZejd                   G dM dN             Z G dO dPe      Z G dQ dR      Z	 	 	 	 	 	 	 	 	 	 ddSZ	 	 	 	 	 	 	 	 ddUZ	 	 	 	 	 	 	 	 ddVZddWZ G dX dY      ZeZdZed[<   ejd                   G d\ dT             Zdd]Z ejz                  d      dd^       Zes G d_ d`             Z G da db      Zesejz                  ddc              ZdddZ	 	 	 	 	 	 	 	 ddeZ	 	 	 	 	 	 	 	 ddfZdadgedh<   ddiZes G dj dk             Z	 	 	 	 	 	 	 	 	 	 ddlZes G dm dne׫             Zes G do dpe٫             Zes G dq dre٫             ZddsZddtZes G du dv             Z G dw dx      ZddyZddzZdd{Zdd|Zdd}Z	 d	 	 	 	 	 	 	 	 	 dd~Z G d d      Zes G d d             Zes G d d             Z G d d      Z G d de      Z G d de      Zy)    )annotationsN)bisect_right)copy)c_void_pCDLLcdll)	timedelta)partial)Path)timetime_ns)
ModuleType)AnyCallablecastCounterDict	GeneratorListNoReturnOptionalSequenceSetTupleTYPE_CHECKINGTypeVarUnion)	TypeAlias)SymIntTensor)countersdynamo_timedget_chromium_event_logger)configexcmetrics)cuda_env)rocm_compile_commandrocm_compiler)log_cache_bypass   )_alignT)KeysView)
JsonDataTyRemoteCache)	_set_gpu_runtime_env_transform_cuda_paths
CppBuilder
CppOptionsCppTorchCudaOptionsget_compiler_version_infoget_cpp_compiler&get_name_and_dir_from_output_file_pathnormalize_path_separator)pick_vec_isa)BoxedDeviceIndexCudagraphCachedInfo#log_cudagraph_skip_and_bump_counter)_module_to_triton_kernel_reload_python_module _reload_python_module_in_subproc)	cache_dirdefault_cache_dir)ALIGN_BYTESalign_inputs_from_check_idxs	BoxedBoolclear_on_fresh_inductor_cacheis_linux
is_windows"set_tracing_context_output_strides)trace_structured)extract_tensor_metadata
FakeTensorTensorMetadata)has_hinthint_intShapeEnv)FutureGraphLowering)ChoiceCaller)HalideInputSpec
HalideMetaz_inductor/script.ldwin32)build_paths)_run_build_command)log_global_cache_errorslog_global_cache_statslog_global_cache_valsuse_global_cachec                      y N argskwargss     ^/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/torch/_inductor/codecache.pyrZ   rZ              c                      y r_   r`   ra   s     rd   r[   r[      re   rf   c                      y r_   r`   ra   s     rd   r\   r\      re   rf   c                      yNFr`   r`   rf   rd   r]   r]      s    rf   output_codeiX  c                   t         j                  j                  dn,dt         j                  j                  j                  dd       }dt        j
                  j                   t        j
                  j                   }| d| }t        j                  j                  t               |      }t        j                  j                  ||       }t        j                  |d       |S )	Ncpucu. py_Texist_ok)torchversioncudareplacesysversion_infomajorminorospathjoinrA   makedirs)namecu_strpython_versionbuild_foldercpp_wrapper_dircpp_wrapper_build_directorys         rd   cpp_wrapper_cache_dirr      s     ==% 	%--$$,,S"567 
 #**001#2B2B2H2H1IJN$%Qvh/Lggll9;=O"$'',,"EKK+d;&&rf   c                 >    t         j                  j                  dS dS )N
cubin_path
hsaco_path)ru   rv   hipr`   rf   rd   get_cpp_wrapper_cubin_path_namer      s     ==,,4<F,Frf   c                      e Zd Ze ej
                  d      dd              Zee ej
                  d      d	d                     Ze ej
                  d      d
d              Z	ddZ
ddZddZy)	CacheBaseNc                    	 ddl m}   |        }	 dd id|id}t        j                  j                  t        j                  j                               }t        j                  j                  3|j                  |d   d<   t        j                  j                  |d   d<   n2|j                  |d   d<   t        j                  j                  |d   d	<   t        j                  t        j                   |d
      j#                  d            j%                         |d<   |S # t        $ r d }Y w xY w# t        t        f$ r i }Y qw xY w)Nr   )
triton_keyr   triton)devicerv   r   rv   rw   r   T)	sort_keysutf-8hash)triton.compiler.compilerr   ModuleNotFoundErrorru   rw   get_device_propertiescurrent_devicerv   r   gcnArchNamer   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)r   triton_versionsystemdevice_propertiess       rd   
get_systemzCacheBase.get_system   s3   	"; (\N	!4.n&F !&

 @ @

))+! }}!!-+<+A+Ax (,1MM,>,>y!&)+<+H+Hx (+0==+<+<y!%(
 !JJv.55g>

)+ 	v 7 # 	"!N	"& - 	F	s#   D CD1 D.-D.1EEc                     t        t        j                  j                  t	               dt
        j                         d               S )Ncacher   )r   r}   r~   r   rA   r   r   r`   rf   rd   get_local_cache_pathzCacheBase.get_local_cache_path   s0     BGGLLgy7K7K7Mf7UVWWrf   c                     t         j                  Lt        t        j                  j                  t         j                  t        j                         d               S d S )Nr   )r$   global_cache_dirr   r}   r~   r   r   r   r`   rf   rd   get_global_cache_pathzCacheBase.get_global_cache_path   sL    
 &&2 f55y7K7K7Mf7UVW	
 	
rf   c                6    t         j                         | _        y r_   )r   r   r   selfs    rd   __init__zCacheBase.__init__   s    **,rf   c                    | j                         }|j                         si S t        |      5 }t        j                  |      }d d d        |d   S # 1 sw Y   d   S xY wNr   )r   is_fileopenr   load)r   local_cache_pathlocal_cache_fplocal_caches       rd   get_local_cachezCacheBase.get_local_cache   sa    446'')I"# 	4~))N3K	47##	47##s   AAc                    | j                         }t        t        |      t        j                  | j
                  |dd      d       y )N)r   r      )indentT	make_dirs)r   write_atomicstrr   r   r   )r   r   r   s      rd   update_local_cachezCacheBase.update_local_cache   s<    446 !JJ$++DQO	
rf   returnDict[str, Any])r   r   )r   zOptional[Path]r   None)r   r   r   r   )__name__
__module____qualname__staticmethod	functools	lru_cacher   rF   r   r   r   r   r   r`   rf   rd   r   r      s    Y"  "H "YX  # X Y
  
-$
rf   r   c                      e Zd ZddZddZy)
LocalCachec                N    | j                         }|}|D ]  }||v r||   } y  |S r_   )r   )r   keysr   	sub_cachekeys        rd   lookupzLocalCache.lookup	  s?    $$&	 	Ce|!#J			 rf   c                   | j                         }|}|dd D ]  }|j                  |i        ||   } |||d   <   | j                  |       y )Nr   )r   
setdefaultr   )r   valuer   r   r   r   s         rd   	set_valuezLocalCache.set_value  sa    $$&	": 	'C  b)!#I	' $	$r(&rf   N)r   r   r   Optional[Dict[str, Any]])r   r   r   r   r   r   )r   r   r   r   r   r`   rf   rd   r   r     s    
	'rf   r   c                  Z    e Zd Z ej                  d      dd       Z	 	 	 	 	 	 	 	 	 	 ddZy)PersistentCacheNc                    | j                         }||j                         si S t        |      5 }t        j                  |      }d d d        |d   S # 1 sw Y   d   S xY wr   )r   r   r   r   r   )r   global_cache_pathglobal_cache_fpglobal_caches       rd   get_global_cachez PersistentCache.get_global_cache"  sg     668$,=,E,E,GI#$ 	699_5L	6G$$	6G$$s   AA c                   t        j                         t        t        | j                        }t        t
        | j                        }t        t        | j                        }i ddfd}t        j                  st        j                  rt        j                  r| j                         ni }	 ||	      st               r || j                         |      s|	  |      t        fdD              sJ |	j                  i        |	   j                  i       j                  i        j!                         D ]!  \  }
}||	         |
j#                         <   # 	 | j'                  |	       D 
ci c]  }
|
j#                         |
    }}
 ||       S t               r || j                         |       S # t$        $ r} ||       |d}~ww xY wc c}
w )aG  
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:

            1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
            2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
            3. If benchmark is not None:
                a. `max_autotune_gemm=True`: benchmark the choice, update
                    local_cache[op][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        Nc                    d}D ][  }|j                         }|| j                  i       j                  i       j                  i       v r|          |   	|<   Yd} n |r	 ||       |S )z2Check if `cache` contains data for all the choicesTF)cached)hash_keyget)
r   callbackhitchoicechoice_hashchoicesinputsop	precisiontimingss
        rd   check_cachez+PersistentCache.lookup.<locals>.check_cacheF  s    C! $oo/%))B"3"7"7"C"G"G	SU"VV&+Bi&7	&B;&OGFO  C $Jrf   )r   c              3  &   K   | ]  }|v  
 y wr_   r`   ).0r   r   s     rd   	<genexpr>z)PersistentCache.lookup.<locals>.<genexpr>d  s     GVv0Gs   r_   )r   r   r   r   r   bool)ru   get_float32_matmul_precisionr
   r[   r   r\   rZ   r$   max_autotunemax_autotune_gemmautotune_local_cacher   r]   r   allr   itemsr   r   r   )r   r   r   r   	benchmark	log_statslog_vals
log_errorsr   r   r   timingetimings_to_logr   r   s    ```          @@rd   r   zPersistentCache.lookup+  s   $ 668	2DKKVYW	0$++r69U#T[["fi

 	 	  &":":4:4O4O$..0UWK  ,$&#D$9$9$;iP)'0GGwGGGG**2r2O..vr:EEiQST*1--/ WPVB/	:6??;LMW ''4 FM";AFOO%wv6" " (  --/)D ! $ qMG"s   .B	G G3	G0!
G++G0r   )
r   zList[ChoiceCaller]r   r   r   r   r   z4Optional[Callable[[Any], Dict[ChoiceCaller, float]]]r   zDict[ChoiceCaller, float])r   r   r   r   r   r   r   r`   rf   rd   r   r   !  s]    Y% %N#N N 	N
 HN 
#Nrf   r   c                     t         j                  j                  t               d      } t         j                  j	                  |       st        j
                  | d       | S )NlocksTrs   )r}   r~   r   rA   existsr   )lock_dirs    rd   get_lock_dirr  |  s;    ww||IK1H77>>(#
Ht,Orf   c                    t        j                  t        j                  |       j	                               d d j                  d      j                         S )N3   r   )base64	b32encoder   r   digestdecodelower)datas    rd   sha256_hashr    s@    GNN40779:3B?FFwOUUWWrf   c                    t        | t              r| n| j                  d      }|dk7  r|dz   |j                  d      z   }dt        |      z   S )Nr   rp   s   ||c)
isinstancebytesr   r  )codeextrahashing_strs      rd   	code_hashr    sJ    $T51$t{{77KK{!E)ELL,AA[)))rf   c                F   |rKt         j                  j                  |      r|}nTt         j                  j                  t	               |      }n+t         j                  j                  t	               | dd       }t         j                  j                  ||  d|       }| ||fS )Nr+      ro   )r}   r~   isabsr   rA   )basename	extensionspecified_dirsubdirr~   s        rd   get_pathr    sz     77=='"FWW\\)+}=Fik8Aa=977<<8*Ai[ 9:DVT!!rf   c                p    |dk(  rt        | |      S |dv rt        t        |             S t        d|       )Nr  )cubinhsacospvzUnknown hash type )r  reprr   )contentr  	hash_types      rd   get_hashr&    sD     F%((--g''
-i[9
::rf   c                    t        | j                         ||      }t        |||      \  }}}|dk(  }	t        j                  j                  |      st        || d       ||fS )Nr  Tr   )r&  stripr  r}   r~   r  r   )
r$  r  r  r%  r  r   r  r  r~   encode_utf_8s
             rd   writer*    s]     	:C%c9mDHfd"f,L77>>$T7d3T>rf   c                     t        | d      d   S )zT
    Write the `text` to a file and return the path computed based on the hash.
    txtr+   r*  )texts    rd   
write_textr/    s     ua  rf   c                   t        |t        t        f      sJ d       t        |       }|r|j                  j                  dd       |j                  dt        j                          dt        j                          dz  }t        |t              rdnd}|j                  ||rdnd 	      5 }|j                  |       d d d        |j                  |       y # 1 sw Y   xY w)
Nz6Only strings and byte arrays can be saved in the cacheTparentsrt   ro   z.tmpwwbr   )encoding)r  r   r  r   parentmkdirr}   getpid	threading	get_identr   r*  rename)path_r$  r   r)  r~   tmp_path
write_modefs           rd   r   r     s     #u @?@  ;D$6{{qQy/B/B/D.ETJJH"7C0dJ	z|G	N RS	OOD s   -CC"c                  &    e Zd ZU dZded<   ded<   y)TensorMetadataAndValueszk
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    rM   tensor_metadata	List[Any]valuesNr   r   r   __doc____annotations__r`   rf   rd   rA  rA    s    
 $#rf   rA  c                    | S r_   r`   xs    rd   _identrK    s    Hrf   c                    t        |      }t        |d      st        j                  |dd      }|j                  | vr|j                  | |j                  <   t        j                  || |j                           }|S )zs
    Extracts the tensor metadata and removes fields of the TensorMetadata
    that are not needed for caching
    _is_inductor_staticr   N)storage_offsetstorage_bytes)r   )rK   hasattrdataclassesrx   r   )
device_maptmetas      rd   %extract_tensor_metadata_for_cache_keyrU    sm     #1%D1+,""4N {{*$"&++
4;;tJt{{,CDDKrf   c                ,    t        | |      }t        |ffS )zH
    See FxGraphCachePickler. Custom reducer to pickle FakeTensors.
    )rU  rK  )rR  rS  metadatas      rd   _reduce_fake_tensorrX    s     5ZCHXK  rf   c                    |j                   rt        d      t               }|j                         }t               |z
  }|dkD  rt	        j
                  d|dd       t        | |      }t        t        ||      ffS )a4  
    See FxGraphCachePickler. Custom reducer to pickle Tensors.
    If we see tensors, we know they're constants stored as attributes on
    the GraphModule. Include the values in the key calculation. Small
    tensors will be inlined, so we can't serve the same cache entry for
    different values anyway. Large constants are treated as parameters,
    so we could conceivably reuse a cache entry. To do that, however,
    PyCodeCache would need more complexity to create a new module from its
    cache, but with the right constants attached as attributes.
    zmkldnn tensors unpickleable.g      ?z1FX graph cache handling of a large constant took z.1zs. Please file an issue.)		is_mkldnnBypassFxGraphCacher   tolistwarningswarnrU  rK  rA  )rR  rS  startrD  elapsedrW  s         rd   _reduce_tensorra    s~     	{{ !!?@@ FEXXZFfunG}?|Kcd	
 5ZCH,Xv>@AArf   c                &    t         t        |       ffS )zD
    See FxGraphCachePickler. Custom reducer to pickle SymInts.
    )rK  r   ss    rd   _reduce_symintre  (  s     SVIrf   c                    t        d      )z
    See FxGraphCachePickler. Custom reducer to handle any objects that we don't
    support and therefore raise to bypass caching.
    zReduce unsupported.)r[  rc  s    rd   _reduce_unsupportedrg  2  s    
 2
33rf   c                  n   e Zd ZU dZi Zded<   ej                  j                         Z e	j                  ee      ee<    e	j                  ee      eej                  <   eeej"                  <   eeej&                  j(                  j*                  j,                  <   edd       Zed	d       Zed
d       Zy)FxGraphCachePicklera:  
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
     Dict[torch.device, torch.device]_device_mapc                ,   t        j                         5 } | |      }d|_        	 |j                  |       |j                         cddd       S # t        t
        f$ r(}t        j                  dd       t        d      |d}~ww xY w# 1 sw Y   yxY w)zA
        Pickle an object using the FxGraphCachePickler.
        TzCan't pickleexc_infoz#Config options may be unpickleable.N)
ioBytesIOfastdump	TypeErrorAttributeErrorlogwarningr[  getvalue)clsobjstreampicklerr   s        rd   r   zFxGraphCachePickler.dumpsN  s    
 ZZ\ 	%V&kGGLWS! ??$	% 	% ~. W NT:()NOUVV	W	% 	%s-   B
AB
B#BBB

Bc                :    | j                  |      }t        |      S )zt
        Serialize an object using the FxGraphCachePickler and return a hash
        of the pickled object.
        )r   r  )rx  ry  serialized_datas      rd   r&  zFxGraphCachePickler.get_hash`  s     ))C.?++rf   c                >    d fd}g }t        |      j                         D ]  \  }}t        |t              rTt	        t        |            D ]<  } j                  ||         }|j                  d| d| d| d |||                 > jt        |t              rM|j                         D ]9  \  }}	 j                  |	      }|j                  d| d| d| d ||	              ; ǉ j                  |      }|j                  d| d| d ||               |S )z
        Get a printable string describing in more detail all the attributes
        comprising an object. Useful for debugging when one graph hashes
        to a different value than another.
        c                0   t        | t        j                        rt        t	        j
                  |             S t        | t              ryt        |       j                  v r*t         j                  t        |          |       d         S t        |       S )Nz<bytes>r+   )	r  ru   r    r   rU  rk  r  typedispatch_table)ry  rx  s    rd   get_strz0FxGraphCachePickler.debug_lines.<locals>.get_strq  sx    #u||,@RUVWWC' cc00083--d3i8=a@AA3xrf   [z] z]: z: ry  r   r   r   )	varsr   r  listrangelenr&  appenddict)
rx  inpr  linesattrry  iihkvs
   `         rd   debug_lineszFxGraphCachePickler.debug_linesi  s(   		  c* 	>ID##t$C/ LBSW-ALL1QCr$qCB8H7I!JKL C&IIK EDAqQALL1QCr$q3wqzl!CDE LL%q2dV2gcl^<=	> rf   N)ry  r   r   r  r  )r  FxGraphHashDetailsr   	List[str])r   r   r   rF  rk  rG  copyregr  r   r   r
   rX  rL   ra  ru   r    re  r   rg  fxexperimental_backward_stateBackwardStateclassmethodr   r&  r  r`   rf   rd   ri  ri  :  s     57K16++002N!2!2!23F!TN:#49#4#4^[#QN5<< #1N5<<  	 --;; % %" , ,  rf   ri  c                   t        t        j                  | |      d       D ]  }|j                  j	                  |j
                  d       }|J |j                  }|J t        |d      5 }|j                  |j
                  j                  d             |j                  |j                                d d d        |j                  st        |j                  |j
                   d|        y # 1 sw Y   =xY w)Nc                    | j                   S r_   )r   rI  s    rd   <lambda>z!build_code_hash.<locals>.<lambda>  s
     rf   r   rbr   ro   )sortedpkgutiliter_modulesmodule_finder	find_specr   originr   updater   readispkgbuild_code_hashsubmodule_search_locations)rootsprefixhasherlibspecmoduler?  s          rd   r  r    s     g**5&9?OP 
V  **388T:!!!&$ 	$1MM$))**734MM!&&(#	$ 99D;;		{!_fU
V
	$ 	$s   ,A
C22C;	c                     t        j                         sdd}  | t              S ddlm} |j                  d      j                         j                  d      S )zS
    Compute a key that contains relevant information about torch source files
    c                >   d}t         j                  j                  t              }|D cg c]"  }t         j                  j	                  ||      $ }}t        j                         }|j                  t        j                  j                  d             t        | gd|       |D ]V  }t         j                  j                  |      s#t        |d      5 }|j                  |j                                d d d        X |j                         S c c}w # 1 sw Y   wxY w)N)z"codegen/aoti_runtime/interface.cppz'codegen/aoti_runtime/implementation.cppcodegen/cpp_prefix.h	script.ldr   rp   r  )r}   r~   dirname__file__r   r   r   r  ru   __version__r   r  r  r   r  r
  )rootextra_filesinductor_rootrJ  r  r~   r?  s          rd   get_code_hashz torch_key.<locals>.get_code_hash  s    K GGOOH5MCNOa277<<q9OKO^^%FMM%++227;<TFB/# 077>>$'dD) 0Qaffh/0 00 ==?" P0 0s   'D DD	r   parutilztorch/src_hash.txtascii)r  r   r   r  )r$   	is_fbcode_TORCH_PATHlibfb.pyr  get_file_contentsrstripr   )r  r  s     rd   	torch_keyr    sK    
 	#* [)) $$%9:AACJJ7SSrf   c                 H    t         j                  j                  t              S r_   )r}   r~   r  r  r`   rf   rd   get_inductor_rootr    s    77??8$$rf   c                      e Zd ZU dZded<   y)OrderedSetHolderzb
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    rC  r   NrE  r`   rf   rd   r  r    s    
 rf   r  c                      e Zd ZdZy)r[  zI
    Exception to indicate that the FxGraphCache should be bypassed.
    N)r   r   r   rF  r`   rf   rd   r[  r[    s    rf   r[  c                  :    e Zd ZdZdgZ	 	 	 	 	 	 	 	 	 	 ddZddZy)r  zz
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    graph_idc                N   || _         || _        i | _        t        |      D ]\  }|| j                  vst        ||         t        u r%t        t        ||               | j                  |<   K||   | j                  |<   ^ || _        t        j                         t        j                         t        j                  j                  j                  f| _        t        j                   j"                  j$                  j&                  t        j                   j"                  j$                  j(                  t        j                   j"                  j$                  j*                  f| _        t/               | _        t2        j5                         | _        t9        j:                         | _        y r_   )gmexample_inputs	fx_kwargsr  EXCLUDED_KWARGSr  setr  inputs_to_checkru   $are_deterministic_algorithms_enabled-is_deterministic_algorithms_warn_only_enabledutilsdeterministicfill_uninitialized_memory!deterministic_algorithms_settingsbackendsrw   matmul
allow_tf32&allow_fp16_reduced_precision_reduction&allow_bf16_reduced_precision_reductioncuda_matmul_settingsr  torch_versionr   r   system_infor$   save_config_portableinductor_config)r   r  r  r  r  r  s         rd   r   zFxGraphHashDetails.__init__  sE    , 	" 	5A,,,	!%, )9	!9M(NDNN1%(1!DNN1%	5  / 668??AKK%%??2
. NN&&11NN&&MMNN&&MM%
! '[$//1%::<rf   c                ,    t         j                  |       S )z
        Get a printable string describing in more detail all the attributes
        comprising this object. Useful for debugging when one graph hashes
        to a different value than another.
        )ri  r  r   s    rd   r  zFxGraphHashDetails.debug_lines  s     #..t44rf   N)
r  torch.fx.GraphModuler  List[torch.Tensor]r  r   r  Sequence[int]r   r   r   r  )r   r   r   rF  r  r   r  r`   rf   rd   r  r    sK     "lO)= )= +)= "	)=
 ')= 
)=V5rf   r  c                    t        | |||      }dt        j                  |      z   }|j                         }dj	                  |      }t
        j                  d| d|        ||fS )z=
    Generate a unique hash of the FX graph for caching.
    r?  
z$FX graph cache hash details for key z:
)r  ri  r&  r  r   ru  debug)r  r  r  r  detailsr   r  	debug_strs           rd   compiled_fx_graph_hashr    sm     !^YPG #,,W5
5C%%'K		+&III4SEYKHIrf   CompiledFxGraphc                   |j                   J |j                  J |j                  }|j                  }|j                  }|j                  }|j
                  d   }|j
                  d   }|s |j
                  }	|	d   }
|j                  }|j                  }t        j                  j                  s-| D ](  }t        |t        j                        st        |       * |1|s/|s-|j                  t!        t#        |j$                                     ddlm} |j                   }|J  |||
t!        t#        |j$                              |||t+        |j,                  j/                               |t+        |j0                        	      |_         yt3        j4                  |       |rt        j                  j                  re|J |j6                  J |j                   t        j8                  j                  j;                  |j6                  d	      J dfd
}||_         d|j<                  v r1|j>                  rtA        |j>                         ytA        d|        yy)z
    Checks for any reasons not to run cudagraphs and then
    runs it on compiled_graph.
    Mutates the `compiled_graph.current_callable` and `cudagraphs`
    Nis_inferenceis_backwardstatic_input_idxsr+   )cudagraphify)r  device_indexstack_tracesr  r  	constantsplaceholdersmutated_input_idxsF)create_if_none_existsc                4    j                           |       S r_   )set_to_running_backward)
new_inputscompiled_graph_callablemanagers    rd   compiled_artifactz1cudagraph_post_compile.<locals>.compiled_artifactj  s    //1.z::rf   rw   skipping cudagraphs due to )r  rC  r   Callable[..., Any])!current_callablecudagraph_infocudagraph_fail_reasonsr  boxed_forward_device_indexr  r  r  r$   r   cudagraph_treesr  ru   r   intr  nextiterdevice_idxs
compile_fxr  tupler  rD  r  rE   disabler   	_inductorget_managerdevice_typesdisabled_cudagraphs_reasonr=   )r  compiled_graph
cudagraphscached_infor  r  r  r  r  r  r  r  r  rS  r  r  r  r  r  s                    @@rd   cudagraph_post_compiler  #  sO    **666((444 //K(??$44O!/!J!J!++N;L **=9K!",,	%&9:"//"//}},,# a.F
 '2 &**4^5O5O0P+QR,)::+++*6/d>#=#=>?%#%N44;;=>%$^%F%FG
+
' 	*%
 6==88-999-33???&4&E&E#oo55AA*00 B G &&&; /@N+^000 883"== 412H1IJ 1rf   c                ~    | s;|j                   J t        |j                   |      }||j                   ur||_         yyy)z
    Realigns input strides from inputs_to_check if
    we didn't end up running cudagraphs. Mutates
    `compiled_graph.current_callable` if cudagraphs
    was run. Otherwise, does nothing.
    N)r  rD   )ran_cudagraphsr  r  new_callables       rd   maybe_realign_inputsr  }  sP     ..:::3++_
 ~>>>.:N+ ? rf   c                   t         j                  j                         rt         j                  j                         syt	        | dz        }t        j                         rJt         j                  j                  d      }t        j                  d||       |t	        ||z  dz        z  }t        j                  d|       t        j                  j                  t        |             |S )z}
    Ephemerally increases the NCCL timeout when compiling for a distributed job
    Returns amount of seconds increased
    r   g    eAz>pytorch/remote_cache:ephemeral_timeout_fudge_factor_percentagezNEphemeral NCCL timeout increase fudge factor %d and original increase value %dd   zIncreasing NCCL timeout by %d)seconds)ru   distributedis_availableis_initializedr  r$   r  _utils_internaljustknobs_getval_intru  infodistdistributed_c10d"_add_ephemeral_timeout_for_all_pgsr	   )time_saved_nsincreased_timeout_secfudge_factors      rd   .add_ephemeral_timeout_increase_for_distributedr)    s    
 ))+53D3D3S3S3U 45,,AAL
 	\!	

 	%:\%IC%O!PPHH,.CD<</0 ! rf   c                     e Zd ZdZedd       Zedd       Zedd       Zedd       Ze	 	 	 	 	 	 	 	 	 	 dd       Z	e	 	 	 	 	 	 	 	 dd       Z
e	 	 	 	 	 	 	 	 	 	 	 	 dd       Zedd	       Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
       Zedd       Zy)FxGraphCachea7  
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphCacheDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metatdata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    c                 R    t         j                  j                  t               d      S )zS
        Get the toplevel temporary directory for storing compiled graphs.
        fxgraph)r}   r~   r   rA   r`   rf   rd   _get_tmp_dirzFxGraphCache._get_tmp_dir  s    
 ww||IK33rf   c                n    t         j                  j                  t        j	                         | dd |       S )zA
        Return the disk location for a given cache key.
        r+   r  )r}   r~   r   r+  r.  r  s    rd   _get_tmp_dir_for_keyz!FxGraphCache._get_tmp_dir_for_key  s*    
 ww||L557Qq3GGrf   c                z    | D cg c]+  }t        |t        j                        st        |      s*|- c}S c c}w )z
        Get the backed SymInt objects from the input list. Note that we can never
        have guards that depend on unbacked symint.
        )r  ru   r   rN   )r   rd  s     rd   _filter_backed_symintsz#FxGraphCache._filter_backed_symints  s+     "QaZ5<<%@Xa[QQQs   888c                     t         j                  j                  j                         } | sy| j                  j
                  S )zG
        Helper to get the shape env from the tracing context.
        N)ru   _guardsTracingContexttry_get	fake_mode	shape_env)ctxs    rd   _get_shape_envzFxGraphCache._get_shape_env  s2    
 mm**224}}&&&rf   c                j    t         j                         }|J t         j                  |      }|D cg c]  }t        |       }}d fd}d}	 |       D ]`  }
|
j                  s|
}	 nPt        |j                  |
j                  |            }t        j                  d |
j                  ||       |s^|
}	 n |	yt        |	j                  d      d   |	j                  t        j                  j                        st        d   dxx   dz  cc<   t!        t        j                  j#                              j%                  d	d	
       t'               }t        j                  j)                  |      v rC|v rn>dt        j                  j)                  |       d}t+        j,                  |d| d      t/        d	       	 t0        j3                  |	j                  |	j4                  |	j6                        j8                  |	_        |	j                  rLt        |j                  |	j                  |            }|d	u sJ t        j                  d |j@                         tB        jD                  jG                  |	jH                         t        dxx   |	jJ                  z  cc<   ddl&m'}  |jP                         tR        j                  d       tR        j                  d       tU        dfdfd       |	S c c}w # t<        $ r t        j?                  d       Y yw xY w)z
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        Nc               3    K   rt         j                        } t        j                  j	                  |       rnt        t        j                  |             D ]M  }	 t        t        j                  j                  | |      d      5 }t        j                  |       d d d        O rq	 j                        x}\t        |t               sJ |d   }t        |t"        t$        f      sJ t'        j(                  |      }t        j*                  |       y y y # 1 sw Y   xY w# t        $ r t        j                  dd       Y w xY w# t        $ r t        j                  dd       Y y w xY ww)Nr  z,fx graph cache unable to load compiled graphTrm  r  )r+  r0  r}   r~   r  r  listdirr   r   pickler   	Exceptionru  rv  r   r  r  r   r  r  	b64decodeloads)	r  r~   r?  
cache_datar  r$  r   localremote_caches	         rd   iterate_over_candidatesz;FxGraphCache._lookup_graph.<locals>.iterate_over_candidates  sN    %::3?77>>&) &rzz&'9 : !%bggll64&@$!G 51&,kk!n 45 
&2&6&6s&;;
H)*d;;;)&1)$e==="("2"24"8$ll733 I 5 5( KK N)- (  ! KKFQU   sg   AE4*D(DD(&E4+A.E E4D%	!D(( EE4
EE4 E1.E40E11E4zEfx graph cache key %s evaluating guards [%s] with values %s => hit=%srq      inductorfxgraph_lookup_write_filer+   Tr1  z#include\s*"[^"]+"
#include "r   z"Failed to load cached artifact: %sz*fx graph cache key %s post-load guards: %srR   Output code written to: %szOutput code: 
%sinductor_output_codec                     d iS )Nfilenamer`   )artifact_paths   rd   r  z,FxGraphCache._lookup_graph.<locals>.<lambda>n  s    Z/ rf   c                      S r_   r`   )r  s   rd   r  z,FxGraphCache._lookup_graph.<locals>.<lambda>o  s    t rf   
payload_fn)r   z&Generator[CompiledFxGraph, None, None])+r+  r:  r2  rO   guards_exprr   evaluate_guards_expressionru  r  r  	cache_keysource_coder}   r~   r  r!   r   r  r7  cpp_prefix_pathr  resubr   PyCodeCacheload_by_key_pathcache_linemapr  callr  OSErrorerrorguardsr&   CachedMetricsHelperapply_deltasmetrics_deltascounter_deltasgraphrS   save_output_codeoutput_code_logrJ   )r   r  rC  rD  r8  symintsrd  hintsrE  re  	candidater   cpp_pppatterncheckrS   rO  r  s   ` ``            @@rd   _lookup_graphzFxGraphCache._lookup_graph  s    !//1	$$$55nE&-.!..	: 02 	I((! 44Y5J5JERC IIW%% !-	0 = !$7:  ww~~m,Z !<=B=/066td6S$&Fww'4/T> "32773C3CF3K2LANG66'Zxq+A4HD=	%0%A%A##	&
 d " 44U5F5FPE D= =II<c9CSCS 	##001E1EF 4 44(&&&t,:MJ148"/#	

 o /l  	 II:MJ		s   LAL L21L2c                (   t        ||        |rj| j                  rQd| j                  v rt        d| j                          nt        d   dxx   dz  cc<   t        j                  |       nt        || |       | j                  }t        || |       | S )a  
        Run a set of post processing steps after loading from the cache. These involve:
         - Setting the tracing context output strides
         - Running cudagraphs if enabled
         - Realigning inputs

        This runs whether or not we have a cache hit, and always runs directly after we get a CompiledFxGraph.
        The results of this function are *not* saved in the cache itself.
        rw   r   rG  cudagraph_skipsr+   )
rI   r  r  r=   r!   rE   r  r  r  r  )r  r  r  r  s       rd   post_compilezFxGraphCache.post_compiles  s     	+>>J 88^88875n6_6_5`a Z():;q@;!!*-&""
 )88 		
 rf   c                   t        |      }d|_        t        j                         }|J t        j	                  |      }|j                  |      }|j                  ||      |_        	 t        j                  |      }		 |rt        j                  |       }
t        j                   j#                  |
      st        j$                  |
d	       t        j                   j'                  |
t)        |	            }t+        ||	d
       |rVt-        |j.                  xs ddz        }t1        j2                  |	      j5                  d      |d}|j7                  | |       yy# t        $ r. t        j                  dd       t        d   dxx   dz  cc<   Y yw xY w# t        $ r. t        j                  dd       t        d   dxx   dz  cc<   Y yw xY w)z=
        Store a serialized CompiledFxGraph on disk.
        N)r  r`  z1fx graph cache unable to serialize compiled graphTrm  rG  fxgraph_cache_pickle_errorr+   rs   r   r   g    .Ar  )r  time_taken_msz!fx graph unable to write to cachefxgraph_cache_write_error)r   r  r+  r:  r2  get_pruned_guardsproduce_guards_expressionrS  r>  r   r?  ru  rv  r!   r0  r}   r~   r  r   r   r  r   r  _time_taken_nsr  	b64encoder  put)r   r  r  rC  rD  disk_compiled_graphr8  rh  r`  r$  r  r~   rt  rB  s                 rd   _save_graphzFxGraphCache._save_graph  s    #>2
 04, !//1	$$$55nE,,W5*3*M*M  +N +
'	ll#67G	C%::3?ww~~f-KK6
 ww||FK,@AT7d; #%8%G%G%L1QT$T U",,W5<<WE%2*
   j1 '  	KKCd   Z !=>!C>	4  	CKK;dKKZ !<=B=	Cs%   *E  CF 4FF4GGc                $   t         j                  st         j                  j                  rt	        d      t
        j                          t        j                  d       t	        d      | j                  j                  D ]  }t        |j                  t        j                  j                        rt	        d      |j                   dk(  sLt        t#        | |j                        t        j$                  j&                        st	        d       y)z
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        z@Freezing may introduce constants that aren't static across runs.Nzfx graph cache no shape envzNo shape env.z!Can't cache HigherOrderOperators.getattrzCan't cache torchbind objects.)r$   freezingaot_inductoruse_runtime_constant_foldingr[  r+  r:  ru  r  re  nodesr  targetru   _opsHigherOrderOperatorr   r~  _CScriptObject)r  nodes     rd   _check_can_cachezFxGraphCache._check_can_cache  s     ??f11NN$R  &&(0II34$_55
 HHNN 	KD$++uzz'E'EF()LMMww)#
DKK(%((*?*?) ))IJJ	Krf   c                r   |s	|sJ d       d}d}d}	i 	 t         j                  |       t        ||||      \  }
}|
d<   |d<   d}|r4d}	 t        j                         rddlm}  ||      }nddlm}  ||      }t         j                  |
|||      }|~t        j                  d|
       t        d   dxx   dz  cc<   d}t!               }|}	 | ||||      }t!               |z
  |_        |j"                  d<   t         j%                  |
||||       n^t        j                  d|
       t        d   dxx   dz  cc<   d}t!               }	|j"                  x}|d<   t'        |      x}dk7  r|d<   |
|_        |s | ||||      }|J |d<   t3               }|j5                  d| |	       t6        j8                  j;                  d d! fd"#       t         j=                  |||d$          |S # t        $ r#}d}t        j                  d	|       Y d}~d}~wt        $ r d}t        j                  d
d       Y w xY w# t*        $ rf}t        d   dxx   dz  cc<   d}t        j-                  d|       t/        |      d<   |rt1        dt/        |             t!               }	Y d}~6d}~ww xY w)%z
        Load a compiled graph from the cache. If a cached entry does not exist,
        compile the graph and save it to the cache.
        z(at least one of them needs to be enabledNr   
componentszfx-graph-v1r   )FbRemoteFxGraphCache)RemoteFxGraphCachez#Unable to create a remote cache: %szUnable to create a remote cacheTrm  zfx graph cache miss for key %srG  fxgraph_cache_missr+   misstime_taken_nszfx graph cache hit for key %sfxgraph_cache_hitr   r&  ephemeral_timeout_increasefxgraph_cache_bypassbypassz%Bypassing FX Graph Cache because '%s'cache_bypass_reasonbypass_fx_graphcache_statefx_graph_cache_)rW  artifactc                     dddS )Nfx_graph_cache_hashr   )r   r5  r`   r`   rf   rd   r  z#FxGraphCache.load.<locals>.<lambda>e  s    -"! rf   c                 .    t        j                         S r_   )r   r   )
cache_infos   rd   r  z#FxGraphCache.load.<locals>.<lambda>i  s    tzz*5 rf   )metadata_fnrR  r  )r+  r  r  r$   r  torch._inductor.fb.remote_cacher  torch._inductor.remote_cacher  r   ru  rv  r?  rn  r  r!   r   rx  r|  r)  _fx_graph_cache_keyr[  r"  r   r*   r#   log_instant_eventru   _loggingrJ   rq  )compile_fx_fnr  r  r  r  rC  remoter  r  cache_event_timer   r  rD  cache_idr  r  r   
start_timer&  ephemeral_increasechromium_logr  s                        @rd   r   zFxGraphCache.load  s'    J JJ%'
G	)))"-5NI C !$Ju'2J|$>BL(R'')X';H'ES'9('C *77^ULN %		:C@$%9:a?:$$Y
#- !." 18	J0F-.<.K.K
?+((""  		93?$%89Q>9##*9 %3%B%BBMO2?J/.\)/ * 	
 DV
#?@14N. *NOYN )))$/
=!02&&k]+-=
 	' 	
 	'' 6 	( 	
 	!!NIl,C	
 W + J#'LKK EqII  R#'LKK ADKQRP " 	)Z !78A=8"KHH<a@03AJ,- !2CF;&y	)sO   6I 1G4 >C=I 4	I=HI %I I II 	J6AJ11J6c                 r    	 t        j                  t        j                                y# t        $ r Y yw xY w)z.
        Clear out the on-disk cache.
        N)shutilrmtreer+  r.  FileNotFoundErrorr`   rf   rd   clearzFxGraphCache.clearq  s.    
	MM,3356  		s   '* 	66Nr   r   )r   r   r   r   )r   rC  r   zList[torch.SymInt])r   zOptional[ShapeEnv])
r   r   r  r  rC  r   rD  !Optional[RemoteCache[JsonDataTy]]r   zOptional[CompiledFxGraph])r  r  r  r  r  rE   r   r  )r   r   r  r  r  r  rC  r   rD  r  r   r   )r  r  r   r   )r  r  r  r  r  r  r  r   r  r  rC  r   r  r   r   )r   r   r   rF  r   r.  r0  r2  r:  rn  rq  r|  r  r   r  r`   rf   rd   r+  r+    s   : 4 4 H H R R ' ' EE*E E 8	E
 
#E EN +'+*+ + 
	+ +Z <C<C'<C +<C 	<C
 8<C 
<C <C| K K8 q)q q +q "	q
 'q q q qf  rf   r+  r   _StrideExprStrc                  L   e Zd ZU dZded<   ded<    ej                  d      Zded<   d	ed
<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   d ed!<   d"ed#<   d$ed%<   d&Zd'ed(<   d&Z	d)ed*<   d&Z
ded+<   	 	 	 	 	 	 	 	 	 	 	 	 	 	 d.d,Zd/d-Zy&)0r  zr
    Class holding a compiled FX graph. This is the object serialized on disk
    to support FxGraph caching.
    Optional[Callable[..., Any]]r  r   rU  F)r#  rV  Optional[List[Tuple[int, str]]]r\  zSet[str]r  zSet[int]r
  mutated_inputsr  zDict[str, torch.Tensor]r  z Dict[str, torch._C.ScriptObject]torchbind_constantsz4Optional[List[Optional[Tuple[_StrideExprStr, ...]]]]output_stridesOptional[str]r  metrics.CachedMetricsDeltasrc  Counter[str]rd  rS  zOptional[CudagraphCachedInfo]r  r   r  r  r  zOptional[BoxedDeviceIndex]r  NzOptional[int]rx  zOptional[bool]_boxed_callr  c                ~   || _         |j                  | _        |j                  r3t        |j                        5 }|j	                         | _        d d d        |j                  | _        t        |j                        | _        t        |j                        | _	        t        |j                        | _
        t        |j                        | _        |j                  | _        |j                  | _        || _        || _        || _        || _        d | _        d | _        i | _        d| _        d | _        y # 1 sw Y   xY w)Nr`   )r  rU  
cache_pathr   r  rV  r\  r  r  r
  r  r  r  r  r  r  rc  rd  rS  r  r  r  r  )r   r  re  r  r  rc  rd  r?  s           rd   r   zCompiledFxGraph.__init__  s    !1e&&' ,1#$668 ,"00 2 23u001!%"6"67"%e&>&>"?#(#<#< ,*D',,"!*.'%, ,s   D33D<c                @    | j                   J | j                  |      S r_   )r  )r   r   s     rd   __call__zCompiledFxGraph.__call__  s%    $$000$$V,,rf   )r  r  re  rS   r  z*List[Optional[Tuple[_StrideExprStr, ...]]]r  r  rc  r  rd  r  r   r   )r   rC  r   r   )r   r   r   rF  rG  rQ  fieldrV  rx  r  r  r   r  r`   rf   rd   r  r    s    
 32N({((e4K422  &&99HH --//   11"" ::$(NM("&K&)--/6/ / C	/
 %2/ 4/ %/ 
/@-rf   c                    t        j                  |       }	 t        j                  |       y # t        j                  $ r&}t        j                  ||j                        |d }~ww xY wr_   )shlexsplit
subprocess
check_callCalledProcessErrorr%   CppCompileErroroutput)cmd_cmdr   s      rd   run_command_and_checkr    sR    
++d
C8c"(( 8!!#qxx0a78s   - A& !A!!A&c                    | j                  d      rt        j                  j                  |       S | j                  d      rt        j                  j                  |       S | dfS )zDReturns the path where the AOT Inductor compiled kernels are stored..soz.pt2rp   )endswithr}   r~   r  )r~   s    rd   split_aot_inductor_output_pathr    sK     }}Uww}}T""	v	ww}}T""Rxrf   c                  v    e Zd ZU i Zded<    eej                        Zedd       Z	edd       Z
ed	d       Zy)
CudaKernelParamCachezDict[str, Dict[str, str]]r   c                    t        |||t        t        j                  j                        d         \  }}||t               <   || j                  |<   y )Nr   )r%  r  )r*  r  r$   r  output_pathr   r   )rx  r   paramsr   bin_typerr   r~   s          rd   r  zCudaKernelParamCache.set  sU    8##//	
4 59.01		#rf   c                :    | j                   j                  |d       S r_   )r   r   )rx  r   s     rd   r   zCudaKernelParamCache.get  s    yy}}S$''rf   c                6    | j                   j                         S r_   )r   r   )rx  s    rd   get_keyszCudaKernelParamCache.get_keys  s    yy~~rf   N)
r   r   r  zDict[str, str]r   r   r  r   r   r   )r   r   r   zOptional[Dict[str, str]])r   zKeysView[str])r   r   r   r   rG  r   r  cache_clearr  r  r   r  r`   rf   rd   r  r    sU    ')E$)u{{+K    ( (    rf   r  c                  2    e Zd Ze	 	 	 	 	 	 	 	 	 	 dd       Zy)AotCodeCompilerc           
       01234567 t         j                  dk(  rt        d      t                t	               }t        ddt        ||j                              }t        |j                               }d3d}t        j                         rPt        j                         6|s%j                  rt        j                         7d3d}nt        j                         7nd	6d
7t!        t        j"                  j$                        \  }	}
t'        d||	      \  }5t(        j+                  d5       t-        d5fdfd       t.        j0                  j3                  t.        j0                  j5                  5      d   |      2d;2367fd}d;2fd}ddlm} t;               } |t.        j0                  j3                  ||dz         t<              }|5  |rKt.        j0                  j?                  5      d   dz   }tA        |d      5 }|j'                  |       d d d        |
rt        j"                  j$                  n$t.        j0                  j?                  5      d   dz   }t.        j0                  j?                  5      d   dz   }tC        fdjD                  jG                         D              1d<d4tI        14fdjD                  jK                         D              }t        j                          xr |dkD  }t        j"                  jL                  rd}t        j"                  jN                  rtQ        5      \  }}t        ||j                  d||       }t        |5||!      }|j                         }|jS                         }t.        j0                  j?                  5      d   d"z   }|jU                  |       ntQ        5      \  }}t        ||j                  d||       }t        |5||!      }|j                         }|jS                         }tV        jY                  d#|       3rWt.        j0                  j?                  5      d   dz   }t[        5||j5                                t/        j\                  |d$       nt_        |       d=d%0d&j3                  01fd'jD                  jG                         D              }|s|}d}nxta        tb        te        jf                  dte        jh                  td        jj                        jl                  d(      jo                               }tq        jr                  d)|d*z   |      } ||d+t         j                     |      } t        j"                  jN                  r9tQ        |      \  }!}"t        ||j                  |,      }#t        |!|| g|"|#!      }$|$j                         }%|$jS                         }t.        j0                  j?                  5      d   d-z   }&|#jU                  |&       dd.l:m;}' |rpt.        j0                  j?                  5      d   d/z   }(tA        |(d0      5 })|)j'                  |       |)j'                  tq        jr                  d1|             d d d         |'t.        j0                  j5                  5      d         }*|*cd d d        S tQ        |      \  }!}"t        ||j                  |,      }#t        |!|| g|"|#!      }$|$j                         }%|$jS                         }tV        jY                  d2|%       3ru|
rt        j"                  j$                  n$t.        j0                  j?                  5      d   dz   }t[        || g||%j5                                t/        j\                  |d3       nt_        |%       |rdd l<}+|+j{                         },tm        d4|,      }-tA        |d5      5 }.|.j}                         }/|.j'                  d6|-|/|-z  z
  z         |.j'                  |       |.j'                  tq        jr                  d1|             d d d        tA        5d7      5 }|j'                  d8       |j'                  d9| d8       |j'                  d:|% d8       d d d        d d d        |S # 1 sw Y   xY w# 1 sw Y   +xY w# 1 sw Y   }xY w# 1 sw Y   9xY w# 1 sw Y   S xY w)>NrW   z.AotCodeCompiler not yet supported for inductoroi)vec_isarw   aot_moder   sourcesBuildOptionFTldobjcopycpp)r  r  rK  
graph_dumpc                     dd dS )Ninductor_aot_coder  )r   r  rN  r`   )
input_paths   rd   r  z)AotCodeCompiler.compile.<locals>.<lambda>3  s    +& rf   c                      S r_   r`   )rV  s   rd   r  z)AotCodeCompiler.compile.<locals>.<lambda>8  s    { rf   rQ  r   c           	        t        | d      \  }}t        j                  j                  |      d   dz   }	rv dt        j                  j	                  |       dt        j                  j	                  |       }t        |||j                                t        j                  |d       n d| d| }t        |       t        j                  d|       
j                  t        
j                  j                               z  rt        |       d	kD  rt!        d
      d}nd}t"        t"        dz
  z  dk(  r	t"        dk\  sJ d        d| dt"         d| d| 	}t        j                  d|       t        |       d| }t        j                  d|       t        |       	r5t%        j&                  ddt        j                  j	                  |            }nt%        j&                  dd|      }g }|j)                   d| d|        |j)                   d| d|        |j)                   d| d|        t        j                  ddj+                  |             |D ]  }t        |        |S )Nbinr  r   .oz -r -b binary -o    zaot constant binary command: %s 5wzPModels with buffer mutation included doesn't support constants greater than 2GB!z .data=.ldataz1 .data=.lrodata,alloc,load,readonly,data,contentsr+   @   zmust be power of 2 and >= 64z --rename-sectionz --set-section-alignment .data=z'aot constant rename section command: %szrm z$aot constant bin removal command: %sz[\W]rr   z --redefine-sym _binary_z#_start=_binary_constants_bin_start z!_size=_binary_constants_bin_size z_end=_binary_constants_bin_end z'aot constant binary redefine symbol: %s)r*  r}   r~   splitextr  compile_filer  chmodr  ru  r  mutated_buffersr  r  r   r  
ValueErrorrC   rX  rY  r  r   )constsrr   consts_pathconsts_or  rename_databodysymbol_listconsts_specified_dirfbcode_aot_cpu_rere  
ld_commandobjcopy_commands           rd   _compile_consts_linuxz6AotCodeCompiler.compile.<locals>._compile_consts_linux@  s   "2NA{ ww''4Q7$>H #$5bgg6F6Fx6P5QQRSUSZSZScScdoSpRqr[(CIIK@5)#$5hZqN%c*II7=$$s5??+?+?+A'BB v;.$j  .
 R {Q/"b(I*HI ) ###4-1+H:Qxj*  II?E!#&}%CII<cB!#& vvgsBGG,<,<[,IJvvgsK8K"##;D6Ademdno "##;D6Abckblm "##;D6A`ai`jk II?+AVW" +%c*+Orf   c                2   t         j                  j                  r't        | d      \  }}t        j                  d|       t        |       dkD  }d}|dz  }|dz  }|s| D ]  }|d| d	z  } | s |d
z  }n|dz  }|dt        |       dz
   d	z  }|dz  }|dz  }t        |d      \  }}t        j                  j                  |      d   dz   }t                d| d| }t        |       |rt        |d      5 }	|	j                  d       |	j                  d      }
|
j                  d      }|dk7  sJ |	j                  |       d}|t        |       k  r(|	j                  | |d        }||z  }|t        |       k  r(d d d        |S |S # 1 sw Y   |S xY w)Nr  r  zbinary constants path: %si   z	.section	__DATA,__data
z%	.globl	__binary_constants_bin_start
z__binary_constants_bin_start:
z	.byte r  z
	.space 1
z	.quad 0x1234567899abcdef
z	.space    z".globl	__binary_constants_bin_end
z__binary_constants_bin_end:
Sr   r   -c -o r  zr+bs   ͫxV4r   )r$   r  debug_dump_consts_binr*  ru  r  r  r}   r~   r  r7   r  r   seekr  find)r  rr   _binary_constants_pathis_large_consts
consts_asmr  r  r  r  r?  hdr	start_idxposrcr   s                 rd   _compile_consts_darwinz7AotCodeCompiler.compile.<locals>._compile_consts_darwin  s   ""88,1"6-))
 		57MN!&kD0O6JDDJ;;J" 3AHQCr"22J3 .0J<<
	#f+/):"==
@@J99J"2NA{
 ww''4Q7$>H%'(z;-HC!#&(E* 
"aFF1I&&,C #)L MI$?*?FF9%CF+WWVCD\2r	 F+
" O8O
" Os   <BFFFileLock.locktimeoutz.jsonr3  r  r  c              3  p   K   | ]-  }|j                   vrj                  |      j                   / y wr_   )folded_constantsget_original_value_of_constantis_cuda)r   r   re  s     rd   r   z*AotCodeCompiler.compile.<locals>.<genexpr>  s8      u555 44T:BBs   36c                    | j                   r)t        j                  j                  j	                  |       n| j                         j                         }|r|S t        |      S r_   )rZ  ru   opsmkldnn_nbytesuntyped_storagenbytesr,   )tensorall_cudan_bytess      rd   get_nbytes_of_tensorz5AotCodeCompiler.compile.<locals>.get_nbytes_of_tensor  sT     '' II$$,,V4//188: 
 #+w?w?rf   c              3  R   K   | ]  \  }}|j                   vr |         y wr_   )r  )r   r   r#  r$  r&  re  s      rd   r   z*AotCodeCompiler.compile.<locals>.<genexpr>  s2      "T6u555 %VX6s   $'r  )r  rw   r  compile_onlyuse_absolute_pathuse_mmap_weightsr   r  
output_dirr  z_compile_flags.jsonzaot compilation command: %sr  c                   dd}dd l }| j                         dk(  ry| j                  rSt        j                  j
                  j                  |       }t        j                  j
                  j                  |       }n>| j                         j                         }|j                         }|j                         }|j                  ||j                  |j                  |z              }t        |j                        }|r|S  ||      S )Nc                l    | j                  t        |       t        z   dz
  t        z  t        z  d      }|S )Nr+       )ljustr  rC   )	raw_bytespadded_bytess     rd   _pad_to_alignmentzEAotCodeCompiler.compile.<locals>._to_bytes.<locals>._pad_to_alignment  s6    #,??Y+59kIKW$L ('rf   r   rf   )r1  r  r   r  )ctypesnumelrZ  ru   r  r  data_ptrr   r!  rm   r"  r   POINTERc_ubyter  contents)	rS  r$  r3  r4  r6  r"  t_cpu	raw_arrayr1  s	            rd   	_to_bytesz*AotCodeCompiler.compile.<locals>._to_bytes  s    ( 779>;;$yy//88;H"YY--55a8F--/335E$~~/H"\\^F"KKNN6>>F#:;	 ")"4"45	$,yN2CI2NNrf   rf   c              3  j   K   | ]*  }|j                   vr j                  |             , y wr_   )r  r  )r   r   r<  r$  re  s     rd   r   z*AotCodeCompiler.compile.<locals>.<genexpr>4  s9      *u555 %>>tDhO*s   03)r+   qqr  )linuxdarwin)r  rw   r  r)  z_linker_flags.json)package_aotiz_serialized_weights.binr4  qzaot linkage command: %si  i @  za+b    ar  z// Compile cmd
// z// Link cmd
// )r  r  r   r   )r#  torch.Tensorr$  r   r   r  )rS  rE  r$  r   r   r  )?ry   platformr   r1   r:   r3   r5   r  r#  get_command_liner$   r  rX   r  objcopy_fallbackr  r  r  r  r*  rg  r"  rJ   r}   r~   r   r  filelockr  r  LOCK_TIMEOUTr  r   r   r  r   sumr   force_mmap_weightspackager8   get_target_file_pathsave_flags_to_fileru  r  r  r  r  r   r  ru   randintiinfoint64maxitemstructpacktorch._inductor.packagerA  resourcegetpagesizetell)8rx  re  rV  serialized_extern_kernel_nodesrw   picked_vec_isavec_isa_cmd_gencpp_commandr)  specified_output_pathspecified_so_namer   r  r  r  r  lockoutput_jsonr?  	output_sooutput_oconsts_sizer*  object_output_nameobject_output_dirobject_build_optionsobject_buildercompile_cmdcompile_flagsserialized_weightsaot_constantsmagic_numberr  output_namer,  so_build_options
so_builderlink_cmdlinker_flagsrA  weight_file	f_weightsarchive_pathrX  
page_size_	page_sizef_soso_sizer<  r$  r   r  r&  r  r  r  s8    ``                                             @@@@@@@@rd   compilezAotCodeCompiler.compile  s(    <<7"OPP%$+&
 ?;;=>!!$)JENN"-">">"@$(!$(!"-"5"5"7J'O
 +6+>+>+J+JK	
!/	
Z 	9:F
 +	
  "ww||BGGMM*,Ea,H#NB	 B	H-	^ 	&>XsW}=|T b	= . gg..z:1=G+s+ <qGG:;<
 % ##//WW%%j1!4u<  ww''
3A6=H !OO002 H@  &+oo&;&;&= K $*#3#3#55U+:U""55#' ""** ;:F&%':*"^^!%&7%5($ ",+&0 4	" -==?)>>@ " 0 0 <Q ?BW W$77F ;:F&%':*"^^!%&7%5($ ",+&0 4	" -==?)>>@		7E$!ww//
;A>EH X{7H7H7JKHHXu-)+6O< "% *!OO002* "
 $ 2 #q%++ekk*B*F*FMRRT  !'D+/< P.0 ll *+H
 ""***P+'Z $7*"^^&7	$  ($%x0) 0	
 &668&;;=	!ww//
;A>AUU 33LA@#((4Q7:SS   k40 HI!(:;!C(FGH  ,BGGMM*,Ea,HI#cb	= b	=f +Q+'Z $7*"^^&7	$  ($%x0) 0	
 &668&;;=			3X>$ - ++77WW--j9!<uD 
 !(H!5y(..BRSHHY.)(3##!)!5!5!7J #E: 6Ii/ C4"&))+

49w7J+J#KL

#56

6;;sL#ABC *c* =aGGDMGG1+bABGG.xj;<=b	=H < <NH HVC C= =b	=H su   4cb&P4c7b3
1cDc#A!c c<cc&b0	+c3b=	8c c		cc	cc"N)
re  rS   rV  r   r[  r  rw   r   r   r   )r   r   r   r  r{  r`   rf   rd   r  r    sK    `` ` )6	`
 ` 
` `rf   r  c                     t        t              j                  dz  } | j                         5 }|j	                         }t        |d      \  }}d d d        t        |      S # 1 sw Y   t              S xY w)Nr  r  )r   r  r6  r   r  r*  r9   )r~   r?  r$  rr   rN  s        rd   rW  rW    sm     >  #99D	 
&&(
8
 $H--
 $H--s    A  A3c                     t               } t        j                         r#dt        j                  j                  |        dS d|  dS )NrJ  rI  )rW  r$   r  r}   r~   r  rN  s    rd   
cpp_prefixr    sF     H BGG,,X67q99H:Q''rf   c                ^    t        d      5  t        | ||      cd d d        S # 1 sw Y   y xY w)Nr  )r"   _compile_file)r  r  r  s      rd   r  r    s-     
n	% ;Zc:; ; ;s   #,c           	        t        | t              r| gn| }|D cg c]7  }t        j                         rt        j
                  j                  |      n|9 }}	 t        j                         rt               }t        j
                  j                  |      }t        j
                  j                  |      }t        j
                  j                  t        d      }	t        j                         5 }
t        j                  |t        j
                  j                  |
|             t        j                  t        t        j
                  j                  |
d             t        ||      D ]9  \  }}t        j                  |t        j
                  j                  |
|             ; t        j
                  j                  |
d      }t        j                   |	|       t#        ||
|      }t        j
                  j%                  |      rt	        j&                  |       t        j                  ||       d d d        y t)        j*                  |t(        j,                         y c c}w # 1 sw Y   y xY w# t(        j.                  $ r]}|j0                  j3                  d      }d|v xs d|v }|rt4        j6                  dk(  rd}||z  }t9        j:                  ||      |d }~ww xY w)	Nincluder  )stderrr   z'omp.h' file not foundlibompr@  a  

OpenMP support not found. Please try one of the following solutions:
(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ that has builtin OpenMP support;
(2) install OpenMP via conda: `conda install llvm-openmp`;
(3) install libomp via brew: `brew install libomp`;
(4) manually setup OpenMP and set the `OMP_PREFIX` environment variable to point to a path with `include/omp.h` under it.)r  r   r$   r  r}   r~   r  rW  r   r  tempfileTemporaryDirectoryr  r   _LINKER_SCRIPTzipcopytreerY   r  remover  check_outputSTDOUTr  r  r  ry   rF  r%   r  )r  r  r  input_pathsipinput_filesheader_pathheader_namero  torch_includes_pathtmp_dirpr?  dest_include_pathoutput_file_pathr   r  openmp_probleminstructions                      rd   r  r    s2    #-Z"=:,:KEP?A 0 0 2:K (6)+K''**;7K''**;7K #%'',,{I"F,,. ;'Kg{)KLNBGGLL+,NO[9 =DAqKK277<<#;<=$&GGLL)$D! 35FG#5c7K#P 77>>+.IIk*,k:; ; ##C
0A0AB9; ;  (( 6)1V;Qx6?Qcllh62  k!F!!#v.A56sD   <IBI0 .EI$0I0 9%I0 $I-)I0 -I0 0K AKK zOptional[CDLL]_libgompc                  	 d		fd	|D cg c]
  } 	|       }}| j                  d      s
J | dz          d }t        | j                  d            D ]+  \  }}|dk(  rt        j                  |      }t        ||      }- t        |      s
J | dz           || }t        |t        t        f      rT|D ]&  }t        |t        j                        rJ | dz           t        j                  j                  j                  |      S t        |t        j                        s
J | dz          t        j                  j                  j                  |      S c c}w )
Nc                    t        t        |             dk(  r)t        j                  j                  j                  |       S t        | t        t        f      r t        |       fd| D              S | S )Nz<class 'PyCapsule'>c              3  .   K   | ]  } |        y wr_   r`   )r   rD  convert_args     rd   r   z9custom_op_wrapper.<locals>.convert_arg.<locals>.<genexpr>  s     9[^9s   )	r   r  ru   r  _aoti&alloc_tensor_by_stealing_from_void_ptrr  r  r  )argr  s    rd   r  z&custom_op_wrapper.<locals>.convert_arg  sW    tCy>2288>>HHMMdE]+499S999Jrf   z
torch.ops.z, can not be called through custom_op_wrapperro   r   z, can not be loaded through custom_op_wrapperz returns a list of non-tensorsz returns a non-tensor)r  r   r   r   )
startswith	enumerater  	importlibimport_moduler~  callabler  r  r  ru   r    r  r  #unsafe_alloc_void_ptrs_from_tensors!unsafe_alloc_void_ptr_from_tensor)
r   rb   r  converted_argsfuncr  rd  resultrr  s
            @rd   custom_op_wrapperr    sE    3773k#&7N7==& 
;;& D"((3-(  16**1-DtQ 
 D>N2 NNN>>"F&4-( 	VAa.U5U0UU.	Vxx~~AA&II&%,,/M6M1MM/xx~~??GG' 8s   Ec                      e Zd ZU i Zded<    eej                        Zi Zded<   ed
d       Z	e
d
d       Ze
	 	 	 d	 	 	 	 	 	 	 	 	 dd       Ze
ddd	       Zy)CppCodeCache0Dict[str, Callable[[], Union[CDLL, ModuleType]]]r   r   cpp_compile_command_flagsc                ,    t        j                  |       S r_   )r   LoadLibrary)r~   r   s     rd   _load_library_innerz CppCodeCache._load_library_inner#  s    %%rf   c           	        	 | j                  ||      }||_        |S # t        t        f$ r}dt	        |      v rTt
        j                  j                  d      r5t        j                  d      a
| j                  ||      }||_        |cY d }~S dt	        |      v r9t        | dt        j                          dt        j                          d      | d }~ww xY w)Ngompz/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)r  r   ImportErrorr^  r   r}   r~   r  r   r  r  r  
gettempdir)rx  r~   r   r  r   s        rd   _load_libraryzCppCodeCache._load_library'  s    	,,T37FFJMW% 	QBGGNN3L$M  ++,EF00s; 
9SVCcLXM`M`MbLc d33;3F3F3H2I J]]
  	s"    CACCACCNc           	     H    i  j                   |t               |d}t                t        ddt	        di |      }t        |j                               }t        |d|      \  } j                  vrddl	m
}	 t        j                  j                  t               d	z         }
t        |      \  }}	 |d d
 dz   }d d t	        di |}t        ||||      }t!        j"                  t$        |
|||      t'        t)        j*                         r|n|j-                               d fd}|> |	|
t.              5  t        j                  j1                        s |      d d d        | j                  <    j                     S # 1 sw Y   'xY w)N)rw   r  extra_flagsr  r  r  r  r  r   r  r  sor+  c                 r    3j                                  } | J j                        J S r_   )r  r  )r  binary_pathrx  futurer   r  	worker_fns    rd   load_fnz(CppCodeCache.load_async.<locals>.load_fn}  sI    ;)&[F!>)>++K=C?*?
rf   r  r`   r   r   )r  r:   r1   r3   r5   r#  rG  r*  r   rI  r  r}   r~   r   r  r8   r   r
   _worker_compile_cppr9   r$   r  rN  rJ  r  )rx  rV  rw   	submit_fnr  compile_commandcommand_genvec_isa_cmdr  r  	lock_pathro  r,  fb_output_pathcpp_build_optioncpp_builderr  r  r  r   r  r  s   `                @@@@@rd   
load_asynczCppCodeCache.load_async>  s   
++
#~&	
 	 c/B/U_/U
 ;779:U+FZcii)\^S7]CI&LZ&X#K (_t3N,0FC2E_E$ "%,	K "))#I 3##%  557K	 	 $i> 677>>+6!*9!56 %CIIcNyy~6 6s   
(FF!c                0     | j                  ||             S r_   )r  )rx  rV  rw   s      rd   r   zCppCodeCache.load  s    0s~~k4022rf   )r~   r   r   r   r   zUnion[CDLL, ModuleType])FNr`   )
rV  r   rw   r   r  r   r  Sequence[str]r   r   )F)rV  r   rw   r   r   r   )r   r   r   r   rG  r   r  r  r  r  r  r  r  r   r`   rf   rd   r  r    s    >@E;@u{{+K02~2& &  ,  %'PP P 	P
 #P 
P Pd 3 3rf   r  c           	        ddl m}  || t              5  t        j                         r|n|j                         }t        j                  j                  |      sSt        j                         r/t        ||t        j                  |j                                      n|j                          d d d        y # 1 sw Y   y xY w)Nr   r  r  )rI  r  rJ  r$   r  rN  r}   r~   r  r  r  r  rG  build)r  r  fb_input_pathr  r  r  s         rd   r  r    s     "	)\	2 $$..0Nk6V6V6X 	 ww~~k*!!"KK < < >? !!#$ $ $s   BB77C c                      e Zd ZU i Zded<    eej                        ZdddZdZ	dZ
dZ ej                  d	      Zedd
       Ze	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zedd       Zy)CppPythonBindingsCodeCacher  r   FTinclude_pytorchsharedkernelzkernel(%s);Py_RETURN_NONE;rp   a  
        // Python bindings to call %s():
        #define PY_SSIZE_T_CLEAN
        #include <Python.h>
        #include <sstream>
        #include <cstdlib>

        #ifndef _MSC_VER
        #if __cplusplus < 202002L
        // C++20 (earlier) code
        // https://en.cppreference.com/w/cpp/language/attributes/likely
        #define likely(x)       __builtin_expect(!!(x), 1)
        #define unlikely(x)     __builtin_expect(!!(x), 0)
        #endif
        #else
        #define likely(x) (x)
        #define unlikely(x) (x)
        #endif

        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
        // We manually link it below to workaround issues with fbcode build.
        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);

        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {
            static_assert(std::is_pointer<T>::value, "arg type must be pointer or long");
            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
        }
        template <> inline int64_t parse_arg<int64_t>(PyObject* args, size_t n) {
            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1 && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return result;
        }
        template <> inline uintptr_t parse_arg<uintptr_t>(PyObject* args, size_t n) {
            auto result = PyLong_AsVoidPtr(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == reinterpret_cast<void*>(-1) && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return reinterpret_cast<uintptr_t>(result);
        }

        %s

        static PyObject* %s_py(PyObject* self, PyObject* args) {
            try {
                if(unlikely(!PyTuple_CheckExact(args)))
                    throw std::runtime_error("tuple args required");
                if(unlikely(PyTuple_GET_SIZE(args) != %s))
                    throw std::runtime_error("requires %s args");
                %s
            } catch(std::exception const& e) {
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            } catch(...) {
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }
        }

        static PyMethodDef py_methods[] = {
            {"%s", %s_py, METH_VARARGS, ""},
            {NULL, NULL, 0, NULL}};

        static struct PyModuleDef py_module =
            {PyModuleDef_HEAD_INIT, "%s", NULL, -1, py_methods};

        PyMODINIT_FUNC PyInit_%s(void) {
            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
            if(!str_addr) {
                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
                return nullptr;
            }
            std::istringstream iss(str_addr);
            uintptr_t addr = 0;
            iss >> addr;
            _torchinductor_pyobject_tensor_data_ptr =
                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
            return PyModule_Create(&py_module);
        }
        c                   t        t        j                  j                  j                  j
                        t        j                  d<   | d| j                   }	 t        j                  |   S # t        $ r Y nw xY wt        j                  j                  ||      }|J t        j                  j                  |      }|t        j                  |<   |j                   j#                  |       |S )N'_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTRro   )r   ru   r  _dynamor`  '_torchinductor_pyobject_tensor_data_ptrr}   environentry_functionry   modulesKeyErrorr  utilspec_from_file_locationmodule_from_specloaderexec_module)rx  r~   r   module_namer  r  s         rd   r  z.CppPythonBindingsCodeCache._load_library_inner	  s    @CHH##KKA


<= Qs1123	;;{++ 		~~55k4H006#)K 's   A. .	A:9A:Nc                   
 dj                  d t        |      D              } j                   j                   j                  r j                  |z  nd j                  t        |      t        |       j                  |z   j                   j                   j                   j                  f
z  } j                  ||z   |||      
dd 
fd}	|	S )a5  
        Wrap a C++ function in fast Python bindings.

        Args:
            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
            source_code: C++ source code containing a ENTRY_FUNCTION() function

        Returns:
            A python version of ENTRY_FUNCTION()
        , c              3  T   K   | ]   \  }}d |j                  dd       d| d " yw)z
parse_arg<zconst rp   z>(args, )N)rx   )r   nargtypes      rd   r   zBCppPythonBindingsCodeCache.load_pybinding_async.<locals>.<genexpr>1	  s7      
7 267xs!D
s   &(rp   )r  r  Nc                 f            t        t              sJ t         j                        S r_   )r  r   r~  r  )rx  
get_resultr  s   rd   r  z?CppPythonBindingsCodeCache.load_pybinding_async.<locals>.futureF	  s2    ~#!&*55563#5#566rf   r  )r   r  suffix_templater  extra_parse_argr  call_entry_functionr  )rx  argtypesrV  rw   num_outputsr  r  	parseargssuffixr  r  r  s   `         @@rd   load_pybinding_asyncz/CppPythonBindingsCodeCache.load_pybinding_async	  s    ( II 
'1
 
	 $$141D1DC+-"MM##i/(
 
 ^^& $) $ 

 	7 rf   c                0      | j                   |i |       S r_   )r  rx  rb   rc   s      rd   load_pybindingz)CppPythonBindingsCodeCache.load_pybindingO	  s     8's''88::rf   )r~   r   r   r   r   r   )Fr   Nr`   )r  r  rV  r   rw   r   r  r  r  r   r  r  r   r   )rb   r   rc   r   r   r   )r   r   r   r   rG  r   r  r  r  r  r  r  textwrapdedentr  r  r  r  r  r`   rf   rd   r  r    s    >@E;@u{{+K !!
 N6O%hooN	POd    
 %'// / 	/
 / / #/ 
/ /b ; ;rf   r  c                  v    e Zd ZU i Zded<    eej                        ZdddZdZ	dZ
 ej                  d      Zy)	CppWrapperCodeCacher  r   Tr  inductor_entry_cppzreturn inductor_entry_cpp(%s);a  
        #include <torch/csrc/inductor/aoti_torch/c/shim.h>

        static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {
            std::vector<AtenTensorHandle> result;
            size_t result_len = PyList_GET_SIZE(pyvec);
            result.reserve(result_len);
            for (size_t i = 0; i < result_len; i++) {
                // AtenTensorHandle is essentially a pointer
                void* elem = PyCapsule_GetPointer(PyList_GET_ITEM(pyvec, i), NULL);
                result.push_back(reinterpret_cast<AtenTensorHandle>(elem));
            }
            return result;
        }

        static inline PyObject* pack_tensor_handle_list(const std::vector<AtenTensorHandle>& cppvec) {
            size_t result_len = cppvec.size();
            PyObject* result = PyList_New(static_cast<Py_ssize_t>(result_len));
            for (size_t i = 0; i < result_len; i++) {
                PyObject *elem =
                    cppvec[i] == nullptr
                        ? Py_None
                        // Store AtenTensorHandle as PyCapsulate
                        : PyCapsule_New(reinterpret_cast<void*>(cppvec[i]), NULL, NULL);
                PyList_SET_ITEM(result, i, elem);
            }
            return result;
        }

        template <> inline std::vector<AtenTensorHandle> parse_arg<std::vector<AtenTensorHandle>>(PyObject* args, size_t n) {
            return unpack_tensor_handle_list(PyTuple_GET_ITEM(args, n));
        }

        PyObject* inductor_entry_cpp(std::vector<AtenTensorHandle>&& input_handles) {
            // For outputs, we only allocate a vector to hold returned tensor handles,
            // not allocating the actual output tensor storage here
            std::vector<AtenTensorHandle> output_handles(%s);
            try {
                inductor_entry_impl(input_handles.data(), output_handles.data());
                return pack_tensor_handle_list(output_handles);
            } catch(std::exception const& e) {
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return {};
            } catch(...) {
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return {};
            }
        }
        N)r   r   r   r   rG  r   r  r  r  r  r  r  r  r  r`   rf   rd   r  r  T	  sK    >@E;@u{{+K! *N:%hoo0	2Orf   r  c                     e Zd ZU i Zded<    eej                        ZdZded<    e	j                  d      Ze e	j                  d      z   Ze e	j                  d      z   Z e	j                  d	      Zedd
       Zedd       Ze ej&                  d      dd              Zedd       Ze ej&                  d      dd              Ze ej&                  d      dd              Ze	 d	 	 	 	 	 	 	 dd       Zedd       Zedd       Zy)HalideCodeCachez0Dict[str, Callable[[], Union[ModuleType, CDLL]]]r   Nr  _standalone_runtime_patha  
        #include "{halideruntime_h}"
        #include "{headerfile}"
        #include <stdexcept>
        #include <cmath>

        namespace c10 {{
            inline long div_floor_integer(long a, long b) {{
                if ((a<0) != (b<0)) {{
                    const auto quot = a / b;
                    const auto rem = a % b;
                    return rem ? quot - 1 : quot;
                }}
                return a / b;
            }}
        }}
        z
        void kernel({argdefs}) {{
            {buffers}
            int err = halide_kernel({buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a{  
        #include <cuda.h>
        static const halide_device_interface_t* cuda_interface = halide_cuda_device_interface();

        void kernel({argdefs}, uintptr_t stream) {{
            {buffers}
            int err = halide_kernel(reinterpret_cast<void*>(stream), {buffer_names});
            if(err != 0) throw std::runtime_error("halide_kernel failed");
        }}
        a  
        #include "{}"
        #include <cuda.h>

        static int acquire_context(void* user_context,
                                   void** cuda_context_out,
                                   bool create) {{
            return cuCtxGetCurrent(reinterpret_cast<CUcontext*>(cuda_context_out));
        }}

        static int release_context(void* user_context) {{
            return 0;
        }}

        static int get_stream(void* user_context,
                              void* cuda_context,
                              void** stream_out) {{
            *stream_out = user_context;
            return 0;
        }}

        static int register_halide_hooks() {{
            halide_set_cuda_acquire_context(&acquire_context);
            halide_set_cuda_release_context(&release_context);
            halide_set_cuda_get_stream(&get_stream);
            return 0;
        }}

        int inductor_register_halide_hooks_result = register_halide_hooks();
        c                   |j                   J |j                  +t        |j                         t        |j                        k(  sJ |j                  J |j                  xs |j
                   d|j                   }|rd| d}d}d}d}nd}d}d| d}d	}g }	t        |j                   |j                        D ]  \  }
}|	j                  d
|
 d| d        d| dd| ddj                  |	       d| d| d| d| d| d| d| d| d| d|j                          d| dt        |	       d| d| d| dg
S )Nz + zreinterpret_cast<uint64_t>(r  cuda_interfacenullptrhalide_buffer_flag_device_dirty0zreinterpret_cast<uint8_t*>(halide_buffer_flag_host_dirtyzhalide_dimension_t(0, r  zhalide_buffer_t ;zhalide_dimension_t z_dims[] = {z};z
.device = z.device_interface = z.host = z	.flags = z.type = z.dimensions = z.dim = z_dims;z.padding = nullptr;)
shapestrider  offsetalias_ofr   r  r  r   halide_type)rx  r   r  rw   r6  r   device_interfacehostflagsdimssizer  s               rd   _codegen_bufferzHalideCodeCache._codegen_buffer	  s   yy$$$zz%#cii.C

O*KKKzz%%%ll.chh/s3::,?28*A>F/D5EF(0
!<D3E		3::6 	DLD&KK0bBC	D tfA&!$|DIIdO3DCHfJvha(f()9(:!<fHTF!$fIeWA&fHS__./q1fN3t9+Q/fGD6(f'(
 	
rf   c           	        |j                         }|d|j                  v u sJ d|j                  v sJ g }g }t        |j                        D ]z  \  }}|j	                         r:|j                  d|        |j                  | j                  d| ||             Pd|j                  vsJ |j                  |j                         | dj                  |D cg c]  }d| 	 c}      j                         }|r| j                  n| j                  }	|	j                  | j                  |rdnd	      |d
j                  d |j                  D              |d
j                  |            }
|
S c c}w )Nuser_context
no_runtimez&hl_buf_hl_buf_*r      HalideRuntimeCuda.hzHalideRuntime.hr  c              3  r   K   | ]/  }|j                   !|j                          d|j                    1 y w)Nr  )r  bindings_typer   )r   rD  s     rd   r   z0HalideCodeCache._codegen_glue.<locals>.<genexpr>
  s9      ::% ??$%Qqvvh/s   57)halideruntime_h
headerfileargdefsbuffersbuffer_names)r  r  r  r  	is_bufferr  extendr  ctyper   r   lstripglue_template_cudaglue_template_cppformatfind_header)rx  rT  r  r  r!  r"  r  r  lineglue_template	glue_codes              rd   _codegen_gluezHalideCodeCache._codegen_glue
  sh   ,,.>T[[8999t{{***. 	.FAs}}##hqcN3s22WQC=#wOP#))+++##CHH-	. ))w?ttD6]?@GGI29..s?T?T!((OO)0%6G "II  
 <0 ) 
	 ! @s   E-c                    t        ddt                     }|j                         }t        dj	                  | j
                  | j                  | j                  |g      j                  d            S )NOIr  r  r   )	r3   r4   rG  r  r   r(  r'  standalone_runtime_cuda_initr   )rx  r  command_lines      rd   config_hashzHalideCodeCache.config_hash#
  sp     !"

 #335II))**44 	 fWo	
 		
rf   c                X   t         j                  j                  j                  d      }||j                  st        d      	 |j                  d   }t        j                  |      D ]  }|j                  d      s	 t        j                  dt        j                  j                  ||      g      }t        j                  d|j!                  d            }|sst        j                  j                  t        j                  j#                  |j%                  d            |       }t        j                  j'                  |      st        j                  j#                  |      c S  	 t        |      # t        j                  $ r Y #w xY w# t(        $ r}t        |      |d }~ww xY w)	Nhalidez$halide python bindings not installedr   r  lddz(/.*)/libHalide.sor   r+   )r  	machinery
PathFinderr  r  r   r}   r=  r  r  r  r~   r   SubprocessErrorrX  searchr  abspathgroupr  r?  )	r  errmsgr  r;  fileoutmr~   r   s	            rd   _search_for_filez HalideCodeCache._search_for_file7
  sL   ""--77A<t>>EFF	.44Q7F

6* 9=='!(55"BGGLL$>?
 		"7G9LMA!ww||BGGOOAGGAJ,GP77>>$/#%77??4#889 6"" &55 ! !  	.v&A-	.sO   8F >5E53'F A+F  F (F 5FF FF 	F)F$$F)c                *   d| j                          d}dt        j                  v rRt        j                  j	                  t        j                  d   |      }t        j                  j                  |      r|S d| d}t        j                  ||      S )Nlibautoschedule_r  
HALIDE_LIBCan't find z3, set env HALIDE_LIB to the directory containing it)r  r}   r  r~   r   r  r  rB  )r   sofiler~   r>  s       rd   find_libautoschedulez$HalideCodeCache.find_libautoscheduleO
  s}     $DJJL>52::%77<<

< 8&ADww~~d#&!TU 	 //??rf   c                   dt         j                  v rRt         j                  j                  t         j                  d   |       }t         j                  j	                  |      r|S dt         j                  v rrt         j                  j                  t         j                  j                  t         j                  d   d|              }t         j                  j	                  |      r|S d|  d}t        j                  d|  |      S )NHALIDE_INCLUDErE  z../include/rF  z7, set env HALIDE_INCLUDE to the directory containing it)r}   r  r~   r   r  r<  r  rB  )r   r~   r>  s      rd   r*  zHalideCodeCache.find_header\
  s     rzz)77<<

+; <dCDww~~d#2::%77??RZZ5TF7KLD ww~~d#$VW 	 //+dV0DfMMrf   c                   t        t        t        |t        | j	                         |f            d      d         }t        j                  |d       d t        |dz        }t        |dz        }t        |dz        }t        |d	z        }t        |d
z        }	t
        j                  j                  |       }
g }|
rt        ||       t        j                  |ddd| ddddg
}|j                  r,|j                  d| j                  |j                        g       |j                  |j!                                |j#                  t%        j&                  t(        j*                  |             |j,                  D cg c]  }|j.                  |j1                         ! }}|j3                         r|j#                  d       | j5                  || j7                  ||      || j9                         f|
r|j"                  nd |j3                               |
r`|j#                  t%        j&                  t:        |             t%        j&                  t<        |	|      }|r ||      j>                  n |        dfd}|S c c}w )Nr  r6  rF  Trs   zgenerate_kernel.pyzhalide_kernel.azhalide_kernel.hdonera  -gr  -oz-fhalide_kernelz-ezstatic_library,h,schedulez-p	uintptr_t)r  r  rw   c                 $    r                  S r_   r`   )bindings_futurewait_for_compiles   rd   r   z3HalideCodeCache.generate_halide_async.<locals>.load
  s     ""$$rf   )r   Callable[[], Any]) r   r  r  r#  r4  r}   r   r   r~   r  r   ry   
executable	schedulerr$  rH  rb   r  r   r
   r  r  r  r  r  r  r  r.  build_standalone_runtimetouch_worker_task_halider  )rx  rT  rV  r  dirpathgenfilelibfiler  donefilelockfileneed_compilejobsr  r  binding_typestaskr   rR  rS  s                    @@rd   generate_halide_asyncz%HalideCodeCache.generate_halide_asyncn
  sI     1489  
 	Gd+g 445g 112#445
w'(w'(77>>(33+.)+C ~~

D#":":4>>"JKLJJtyy{#KK	))**?*?EF ,0==
$'CLL<PC
 
 <<>  -22dJ/ #">">"@A%1dkkt 3 
 KK	))%:;$$%8(DID#,T?#9#9 	%
 5
s   JJc                0      | j                   |i |       S r_   )rc  r  s      rd   generate_halidezHalideCodeCache.generate_halide
  s     9(s(($9&9;;rf   c           	     l   | j                   r5t        j                  j                  | j                         r| j                   S t        j
                  j                         }d}|rdnd}| j                   r6t        j                  j                  | j                         rJ t               }n
t               }t        |      d| d| j                          z  }t        j                  |d       t        |dz        }t        |d	z        }t        |d
z        }t        |dz        }	t        ||z        }
t        j                  j                  |      s!dd l}dd l}|j                  |t               5  t        j                  j                  |      st#        |d      5 }|r9|j%                  | j&                  j)                  | j+                  d                   d d d        |j-                  |	|j/                  |             t1        |
      \  }}t3        |||	g|t5        |            }t7        j8                  t;        j<                  |j?                                      tA        |       d d d        t        j                  j                  |
      sJ |
| _         |
S # 1 sw Y   xY w# 1 sw Y   ?xY w)NzlibStandaloneHalideRuntime.soz	host-cudar  zhalide-runtime--Trs   rL  ra  z	hooks.cppzstandalone_halide_runtime.ar   r3  r  rw   r+  )!r  r}   r~   r  ru   rw   r  rB   rA   r   r4  r   r   rI  r6  r  rJ  r   r*  r2  r)  r*  compile_standalone_runtimeTargetr8   r3   r5   r  r  r  r  rG  rX  )rx  r  libnamer  baserZ  r]  r^  hookfileafilerG  rI  hlr?  r   r,  halide_cmd_gens                    rd   rW  z(HalideCodeCache.build_standalone_runtime
  s@   ''BGGNN((-
 ///**))+1 'V''ww~~c&B&BCCC
 %&D;Dt*#//:K9LMM
Gd+w'(w'(w,-G;;<Ww&'ww~~h'""8\: $ww~~h/h, "GG # @ @ G G$'OO4I$J!" 11%69JK'Mf'U$D*%/!!)5 1#-$7!(%	&N ))N$C$C$EF (O1$2 ww~~f%%%'-$3 $ $s%   1,J*<JBJ*J'	#J**J3)r   r   r  rU   rw   r   r   r  )rT  rV   r  objectr   r   r  )r  r   r>  r   r   r   r   r   r   r   r_   )rT  rV   rV  r   r  r   r   rT  )rb   r   rc   r   r   rT  )r   r   r   r   rG  r   r  r  r  r  r  r  r(  r'  r2  r  r  r.  r   r   r4  rB  rH  r*  rc  re  rW  r`   rf   rd   r  r  	  s   >@E;@u{{+K.2m2X__	F& 	"   /(//		#  $38??	$ B 
 
B  > Y
  
$ # #. Y	@  	@ YN  N  BFBB,/B<?B	B BH < < 7 7rf   r  c                B   ddl m} 	  || t              5  |D ]	  } |         	 d d d        y # 1 sw Y   y xY w# t        j                  $ rP}t
        j                  j                  d      dk(  r't        |dd      ^}}}t
        j                  j                  |      j                  d      rt        |      j                         }d}	|j                  |	      d	k(  sJ  G d
 d      }
 |
       ||j                  d      d	z   <   t!        j"                  t!        j$                  ddg|d      d      }|j'                  |	|      }t        dd      5 }|j)                  |j+                                d d d        n# 1 sw Y   nxY wt-        d|       | d }~ww xY w)Nr   r  HALIDE_REPRO1r  )rp   rp   rp   pythonz    hl.main()r+   c                      e Zd ZddZy) _worker_task_halide.<locals>.Outc                     y)Nr@  r`   r   s    rd   __repr__z)_worker_task_halide.<locals>.Out.__repr__  s    $rf   Nr  )r   r   r   rz  r`   rf   rd   Outrx    s    %rf   r{  rN  z                        import sys, tempfile
                        with tempfile.TemporaryDirectory() as out:
                            sys.argv = zrepro.pyz?
                            hl.main()
                        r  r3  zwrote repro.py: )rI  r  rJ  r  r:  r}   r  r   r~  r~   r  r  r   r  countindexr  r   r  rx   r*  r&  r   )r^  r`  r  jobr   rv  scriptr  r  mainr{  replfds                rd   rY  rY  
  s   ! h- 	 	 	 	 %% ::>>.)S0#*1e\#B FFSww'228<F|((*&zz$'1,,,% % ,/5CIIdOa'(OO( *4(:c(:'= > 
 ||D$/*c* ,bHHT[[]+, , ,"%5aS#9:A9sB   : .: 7: : FDF E=4	F=F	FFc                8    t        | d      j                          y )NrD  )r   closer~  s    rd   rX  rX    s    3rf   c                      e Zd ZU i Zded<   i Zded<    eej                        Ze	d
dd       Z
e		 	 	 d	 	 	 	 	 	 	 	 	 dd       Ze		 	 d	 	 	 	 	 	 	 	 	 dd       Ze	 ej                  d      	 	 	 	 	 	 dd	              Zy)rZ  zDict[str, ModuleType]r   z Dict[str, List[Tuple[Any, ...]]]linemapsc                    t        |d|      S Nrq   r  r-  )rx  rV  r  s      rd   r*  zPyCodeCache.write"  s    [$e44rf   Nc                L    t        |d|      \  }}| j                  ||||      S r  )r*  r[  )rx  rV  r  linemapattrsr   r~   s          rd   r   zPyCodeCache.load&  s-     +t59	T##Cw>>rf   c                f   |g }|| j                   vrt        ||      }| j                   j                  ||       t        t	        |       | j
                  |<   |%|j                         D ]  \  }}t        |||        |s"|s t        j                  t        ||      |_        | j                   |   S r_   )r   r?   r   r  r  r  r   setattrr   r
   r@   _reload_in_subproc)rx  r   r~   r  r  modr  r  s           rd   r[  zPyCodeCache.load_by_key_path1  s     ?Gcii'T2C II  c*!%c7m!4CLL !KKM 'DAqCA&' u)2):):4c4*& yy~rf   c                    || j                   vry | j                   |   \  }}t        ||      }|dk(  ry ||dz
     }|sy dd} ||      S )Nr   r+   c           	         d}t        j                  ||       }t        |      D cg c]  \  }}}|t        |      |d c}}}S c c}}}w )Nz"File "(.+)", line (\d+), in (.+)\n)rN  r+  r   )rX  findallreversedr  )stack_traceregexmatchesr?  lr  s         rd   parse_stack_tracez<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace^  sV     :Ejj4G  (0 Aq! A:  s   A)r  r   r   zList[Dict[str, Any]])r  r   )rx  r~   linenor  r  r  entryr  s           rd   stack_frames_for_codez!PyCodeCache.stack_frames_for_codeN  s`    
 s||#||D)u'6a!e	 !''rf   rp   )rV  r   r  r   r   Tuple[str, str])rp   NN)
rV  r   r  r   r  r  r  r   r   r   )NN)
r   r   r~   r   r  r  r  r   r   r   )r~   r   r  r  r   zOptional[List[Dict[str, Any]]])r   r   r   r   rG  r  r   r  r  r  r*  r   r[  r   r   r  r`   rf   rd   rZ  rZ    s   #%E %13H.3u{{+K5 5  37*.?? ? 1	?
 (? 
? ? 
 48*.  1	
 ( 
 8 Y(( #(	'(  (rf   rZ  c                      e Zd Zedd       Zy)TritonCodeCachec                @    t        t        j                  |      |      S r_   )r>   rZ  r   )rx  kernel_namerV  s      rd   r   zTritonCodeCache.loadl  s    '(8(8(E{SSrf   N)r  r   rV  r   r   r   )r   r   r   r  r   r`   rf   rd   r  r  k  s    T Trf   r  c                    t        j                  t        j                  j                        rt        j                  j                  S t        j
                         r3t        j                  j                  t        j                         dd      S t        j                  t        j                  d            rt        j                  dd      S t        j                  t        j                  d            rQt        j                  j                  t        j                  j                  t        j                  dd      d            S y)Nr  nvccCUDACXXrp   	CUDA_HOMEzbin/nvcc)r'   
nvcc_existr$   rw   cuda_cxxr  r}   r~   r   rX   getenvrealpathr`   rf   rd   _cuda_compilerr  q  s    6;;//0{{###ww||K,,.v>>299Y/0yyB''299[12wwRYY{B-G TUUrf   c            	     r   t        j                         rddlm}  | j	                  d      }nt         j
                  j                  }t        j                  j                  t        j                  j                  |d            t        j                  j                  t        j                  j                  |d            t        j                  j                  t        j                  j                  |d            t        j                  j                  t        j                  j                  |d            gS )Nr   r  zcutlass-3-headersr  ztools/library/includeztools/library/srcztools/util/include)r$   r  r  r  get_dir_pathrw   cutlass_dirr}   r~   r  r   )r  cutlass_paths     rd   _cutlass_include_pathsr  }  s    $++,?@{{.. 	lI>?
l4KLM
l4GHI
l4HIJ rf   c                 B   t                ddlm}  | j                  d      t	        j
                  d      gz   }g }t               rPt        |       |D ]  }|j                  d| dd| g        |j                  d	       |j                  d
       |S t        d      )Nr   )cpp_extensionTrh  LIBDIRz-Lz-Xlinkerz-rpath=z-lcudaz-lcudartzMUnsupported env, failed to find cuda libs! Currently only Linux is supported.)r1   torch.utilsr  library_paths	sysconfigget_config_varrG   r2   r$  r  NotImplementedError)r  lpathsextra_ldflagsr~   s       rd   _cuda_lib_optionsr    s    )((d(3  *7 F  "Mzf% 	ND   Btf+zWTF;K!LM	N 	X&Z(
  "[
 	
rf   c                 
    g dS )N)z-fPICz-fno-strict-aliasingz-fvisibility=hiddenz-Wconversionr`   r`   rf   rd   _nvcc_host_compiler_optionsr    s     rf   c            	        t        j                         } | dk(  rd} d|  d|  g}t        j                  j                  r	|d|  gz  }dddd	|  d
dj                  |       dt        j                  j                  dddg}t        j                         rB|j                  dt        j                  j                  t        j                               g       t        j                  j                  r|j                  g d       t        j                  j                  r|j                  g d       t        j                  j                   r|j                  ddg       |S )N9090asm_compute_lto_z-t=0z"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1z-wz-gencode=arch=compute_z,code=[,]z
-std=c++17z--expt-relaxed-constexprz-DNDEBUGz-ccbin)z	-lineinforM  z-DCUTLASS_DEBUG_TRACE_LEVEL=1)z--keepz,--ptxas-options=--warn-on-local-memory-usagez --ptxas-options=--warn-on-spillsz--resource-usagez--source-in-ptxz--use_fast_mathz -DCUTLASS_USE_TANH_FOR_SIGMOID=1)r'   get_cuda_archr$   rw   enable_cuda_ltor   compile_opt_levelr  r$  r}   r~   r  rX   gccenable_debug_infoenable_ptxas_infouse_fast_math)archr  optionss      rd   _nvcc_compiler_optionsr    s+   !!#Dt|$LHTF+,D{{""4v,
 gchhtn-=Q?%%"	G "''//+//2C"DEF{{$$KL{{$$	
 {{  !2	
 Nrf   c                r   |g }t               }t               }t               }t               }||z   |D cg c]  }d|v rd| nd|  c}z   |D 	cg c]  }	d|	z   	 c}	z   |z   }
dj	                  |       }d}|dk(  r%t                ddj	                  |
       d| d| }nt|d	k(  r6|
j                  d
       t                ddj	                  |
       d| d| }n9|dk(  r%t                ddj	                  |
       d| d| }nt        d| d      t        j                  d|       |S c c}w c c}	w )N=z-Xcompiler z-Xcompiler=z-Ir  rp   r  r  r  z-sharedz -o exezUnsupported output file suffix !zCUDA command: %s)
r  r  r  r  r   r  r  r  ru  r  )	src_filesdst_filedst_file_ext
extra_argsinclude_pathscuda_lib_optionsnvcc_host_compiler_optionsnvcc_compiler_optionsoptr~   r  src_fileress                rd   cuda_compile_commandr    s    
*,M(*!<!>24
	 2
 $'#:k#[3FF
	
 $1
144$;
1	2 	  xx	"H
Cs!"!CHHW$5#6ghZq
S		y!!"!CHHW$5#6d8*AhZP		!"!CHHW$5#6d8*AhZP!$CL>QR"STTII #&J'
 2s   D/D4c                  P    e Zd ZdZ	 	 	 	 d
dZddZddZddZddZddZ	ddZ
y	)
DLLWrapperz A wrapper for a dynamic library.c                b    || _         d| _        t        j                  |      | _        d| _        y )NFT)lib_pathis_openr   r  DLL)r   r  s     rd   r   zDLLWrapper.__init__  s+     !##H-rf   c                L    | j                   r| j                          d| _         y y rj   )r  _dlcloser   s    rd   r  zDLLWrapper.close  s    <<MMO DL rf   c                    d }t               r;t        d       }t        |d      st        d      }t        |d      rF|j                  }n9t	               r$dd l}|j                  dd      }|j                  }nt        d      |wt               r)t        g|_	         || j                  j                         y t	               r9dd l}ddlm} |j                  g|_	         || j                  j                         y y t        j                  d	       y )
Ndlclosezlibc.sor   kernel32T)use_last_errorz&Unsupported env, failed to do dlclose!)wintypeszKdll unloading function was not found, library may not be unloaded properly!)rG   r   rP  r  rH   r4  FreeLibraryr  r   r  r  _handler  HMODULEru  rv  )r   	f_dlclosesymsr4  r  r  s         rd   r  zDLLWrapper._dlclose  s    	::D4+ItY' LL	\{{:d{CH ,,I%&NOO z&.Z	"$((**++&.&6&6%7	"$((**+  KK]rf   c                    | j                   st        d| j                         t        | j                  |      dfd}|S )NzCannot use closed DLL library: c                 D     |  }|rt        dj                         y )NzError in function: )r   r   )rb   errmethods     rd   _wrapped_funcz-DLLWrapper.__getattr__.<locals>._wrapped_func6  s,    $-C"%88I#JKK rf   rb   r   r   r   )r  r   r  r~  r  )r   r   r  r  s      @rd   __getattr__zDLLWrapper.__getattr__0  s?    ||!@PQQ4(	L
 rf   c                    | S r_   r`   r   s    rd   	__enter__zDLLWrapper.__enter__=  s    rf   c                $    | j                          y r_   r  )r   rb   s     rd   __exit__zDLLWrapper.__exit__@      

rf   c                $    | j                          y r_   r  r   s    rd   __del__zDLLWrapper.__del__C  r  rf   N)r  r   r   r   r   )r   r   r   zCallable[..., None])r   r  r  )r   r   r   rF  r   r  r  r  r  r  r  r`   rf   rd   r  r    s;    * 
!
!Frf   r  c                      e Zd ZU ej                   G d d             Zi Zded<    eej                        Z
dZed
d       Ze	 d	 	 	 	 	 	 	 dd       Zedd	       Zy)CUDACodeCachec                  "    e Zd ZU ded<   ded<   y)CUDACodeCache.CacheEntryr   r  r  Nr   r   r   rG  r`   rf   rd   
CacheEntryr  I      rf   r  Dict[str, CacheEntry]r   rn   c                n    t        t        dgd|            }t        || j                  |      \  }}||fS z
        Writes source code into a file with dst_file_ext as the file extension.
        Returns the hash key of source code, and the path to the file.
        dummy_inputdummy_outputr  )r#  r  r*  _SOURCE_CODE_SUFFIXrx  rV  r  cuda_commandr   r  s         rd   r*  zCUDACodeCache.writeR  E      -.,O
  00
Z Jrf   Nc                   | j                  ||      \  }}|| j                  vr^ddlm} t	               } |t
        j                  j                  ||dz         t              }|5  |dt        | j                          |z   }	t
        j                  j                  |	      st        |g|	||      }
t               }t        j                  d|
       |
j!                  d      }	 t#        j$                  |t"        j&                  t
        j(                         t               }d	||z
   d
|
 }t        j3                  |       nt        j                  d|       t4        j7                  ||	      | j                  |<   ddd       | j                  |   j8                  ||fS # t"        j*                  $ r&}t-        j.                  ||j0                        |d}~ww xY w# 1 sw Y   axY w)z
        Compiles CUDA source_code into a file with dst_file_ext extension.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        r   r  r  r  NzCUDA Compilation: %sr  )r  envzCUDA Compilation took  seconds. Compile command: z8CUDA Compilation skipped: %s since output already exists)r*  r   rI  r  r  r}   r~   r   rJ  r  r  r  r  r   ru  r  r  r  r  r  r  r  r%   CUDACompileErrorr  r"  r  r  r  )rx  rV  r  r  r   r  r  r  ra  r  r  r  	cmd_partsr_  end_timelog_duration_msgs                   rd   r{  zCUDACodeCache.compilea  s    ))K>Zcii)#~HBGGLL3=A<XD S()HC0G0G,H+HILXww~~k2.#k<C "&JII4c: #		#IW"//%j.?.?RZZ
  $vH)?:@U?VVqruqv'w$HH-.IIR" "/!9!9*k!R		#/S2 		#**C<< &88 W!229ellKQVVWS Ss2   &A<G#4FA$GG2!GGGG$c                v    |dk7  rt        d| d|       | j                  ||      \  }}}t        |      ||fS z
        Compiles source code and loads the generated .so file.
        Returns a tuple of DLLWrapper, hash_key, source_code_path
        r  zCOnly support loading a .so file for now. Requested file extension: z. Source code: r   r{  r  rx  rV  r  dst_file_pathr   source_code_paths         rd   r   zCUDACodeCache.load  a     4--9N/+X  58KK5
1x!1 =)85EFFrf   rV  r   r  r   r   r  r_   rV  r   r  r   r  Optional[List[str]]r   Tuple[str, str, str]rV  r   r  r   r   zTuple[DLLWrapper, str, str])r   r   r   rQ  	dataclassr  r   rG  r   r  r  r  r  r*  r{  r   r`   rf   rd   r  r  G  s       $&E %u{{+K  TX&=&=-0&=>Q&=	&= &=P G Grf   r  c                      e Zd ZU ej                   G d d             Zi Zded<    eej                        Z
dZdZedd       Ze	 d	 	 	 	 	 	 	 dd	       Zedd
       Zy)ROCmCodeCachec                  "    e Zd ZU ded<   ded<   y)ROCmCodeCache.CacheEntryr   r  r  Nr  r`   rf   rd   r  r'    r  rf   r  r  r   r  Fc                n    t        t        dgd|            }t        || j                  |      \  }}||fS r	  )r#  r(   r*  r  r  s         rd   r*  zROCmCodeCache.write  r  rf   Nc                T   | j                   s6d| _         t        j                  t        t	        t                                  | j                  ||      \  }}|| j                  vr_ddlm	} t               } |t        j                  j                  ||dz         t              }|5  |dt        | j                           |z   }	t        j                  j#                  |	      st%        |g|	||      }
t'               }|
j)                  d      }	 t+        j,                  |t*        j.                  dt        j0                        }t        j                  d	|       t'               }d
||z
   d|
 }t        j;                  |       nt        j                  d|       t<        j?                  ||	      | j                  |<   ddd       | j                  |   j@                  ||fS # t*        j2                  $ r&}t5        j6                  ||j8                        |d}~ww xY w# 1 sw Y   axY w)z
        Compiles source_code into a file with dst_file_ext extension,
        using the compile command specific for the ROCm platform.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        Tr   r  r  r  Nr  )r  r.  r  zCompilation output: %szCompilation took r  z3Compilation skipped: %s since output already exists)!_logged_compiler_versionru  r  r6   r   r)   r*  r   rI  r  r  r}   r~   r   rJ  r  r  r  r(   r   r  r  r  r  r  r  r%   r  r  r"  r%  r  r  )rx  rV  r  r  r   r  r  r  ra  r  r  r  r  r  r_  r  r  s                    rd   r{  zROCmCodeCache.compile  s    +++/C(II/MO0DEF))K>Zcii)#~HBGGLL3=A<XD S()HC0G0G,H+HILXww~~k2.#k<C "&J #		#I	W!+!8!8%#-#4#4!% "

	" 		":FC  $vH):8j;P:QQlmplq'r$HH-.IIM" "/!9!9*k!R		#5S8 		#**C<< &88 W!229ellKQVVW!S Ss3   (A&HAG"A$H"H5!HHHH'c                v    |dk7  rt        d| d|       | j                  ||      \  }}}t        |      ||fS r  r  r  s         rd   r   zROCmCodeCache.load  r  rf   r  r_   r  r"  )r   r   r   rQ  r#  r  r   rG  r   r  r  r  r*  r  r*  r{  r   r`   rf   rd   r%  r%    s       $&E %u{{+K$  TX.=.=-0.=>Q.=	.= .=` G Grf   r%  c                      e Zd ZddZy)CodeCacheFuturec                    t         r_   )r  r   s    rd   r  zCodeCacheFuture.result  s    !!rf   Nr   )r   r   r   r  r`   rf   rd   r-  r-    s    "rf   r-  c                  4    e Zd ZU ded<   	 	 	 	 	 	 ddZddZy)TritonFuturer   r  c                     || _         || _        y r_   )r  r  )r   r  r  s      rd   r   zTritonFuture.__init__  s    
 rf   c                    | j                   ?| j                   j                         }|J d | _         | j                  j                          | j                  S r_   )r  r  r  
precompile)r   r  s     rd   r  zTritonFuture.result
  sI    ;;"[['')F>!>DKKK""${{rf   N)r  r   r  zOptional[Future[Any]]r   r   )r   r   )r   r   r   rG  r   r  r`   rf   rd   r0  r0    s.     & 
	rf   r0  c                      e Zd ZddZddZy)LambdaFuturec                    || _         y r_   	result_fn)r   r8  s     rd   r   zLambdaFuture.__init__  s	    "rf   c                "    | j                         S r_   r7  r   s    rd   r  zLambdaFuture.result  s    ~~rf   N)r8  r  r   r   )r   r  )r   r   r   r   r  r`   rf   rd   r5  r5    s    # rf   r5  )rb   r   rc   r   r   r   )r   r   rr  r  )r  r  r   r   r  )r  Union[str, bytes]r  r   r   r   )r  r   r  r   r  r   r   r!  )rp   r  )r$  r:  r  r   r%  r   r   r   )rp   r  rp   )r$  r:  r  r   r  r   r%  r   r  r   r   r  )r.  r   r   r   )FF)
r<  r   r$  r:  r   r   r)  r   r   r   )rJ  r-   r   r-   )rR  rj  rS  r    r   rM   )rR  rj  rS  r    r   z.Tuple[Callable[[T], T], Tuple[TensorMetadata]])rR  rj  rS  r    r   z7Tuple[Callable[[T], T], Tuple[TensorMetadataAndValues]])rd  r   r   z#Tuple[Callable[[T], T], Tuple[str]])rd  r   r   r   )r  zList[str] | Noner  r   r  zhashlib._Hashr   r   )r   r  )
r  r  r  r  r  r   r  r  r   zTuple[str, List[str]])r  rC  r  r  r  rE   r   r   )r  rE   r  r  r  r  r   r   )r&  r  r   r  )r  r   r   r   )r~   r   r   r  )r  zUnion[str, List[str]]r  r   r  r  r   r   )r   r   rb   r   r   zUnion[list[c_void_p], c_void_p])
r  r   r  r3   r  r   r  r   r   r   )r^  r   r`  zList[partial[Any]]r   r   )rN  r   )r   r  r  r_   )
r  r  r  r   r  r   r  r   r   r   )
__future__r   r  r  rQ  r   r   r  ro  r   loggingr}   r>  r  rX  r  r  rU  r  ry   r  r  r  r9  r]  bisectr   r   r4  r   r   r   datetimer	   r
   pathlibr   r   r   typesr   typingr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   ru   torch.distributedr  r#  r   r    torch._dynamo.utilsr!   r"   r#   torch._inductorr$   r%   r&   torch._inductor.codegen.cudar'   ,torch._inductor.codegen.rocm.compile_commandr(   r)   torch._utils_internalr*   r  r,   r-   collections.abcr.   rD  r/   r0   torch._inductor.cpp_builderr1   r2   r3   r4   r5   r6   r7   r8   r9   torch._inductor.cpu_vec_isar:   torch._inductor.cudagraph_utilsr;   r<   r=   %torch._inductor.runtime.compile_tasksr>   r?   r@   %torch._inductor.runtime.runtime_utilsrA   rB   torch._inductor.utilsrC   rD   rE   rF   rG   rH   rI   torch._loggingrJ   torch._subclasses.fake_tensorrK   rL   rM   %torch.fx.experimental.symbolic_shapesrN   rO   rP   concurrent.futuresrQ   torch._inductor.graphrS   torch._inductor.irrT   torch._inductor.runtime.hintsrU   rV   r~   r<  r  _HEREr  r  r   r  rF  _IS_WINDOWSr  	triton.fbrX   triton.fb.buildrY   torch._inductor.fb.utilsrZ   r[   r\   r]   r  getArtifactLoggerr   rg  rJ  	getLoggerru  r   r   r   r   r   r  r  r  r  r&  r*  r/  r   r#  rA  rK  rU  rX  ra  re  rg  Picklerri  r  r   r  r  r  r?  r[  r  r  r  r  r)  r+  r   r  rG  r  r  r  r  r  rW  r  r  r  r  r  r  r  r  r  r  rY  rX  rZ  r  r  r  r  r  r  r  r  r  r%  r-  r0  r5  r`   rf   rd   <module>r_     sy   "       	   	   	     
        ' '         " (      Q Q 0 0 1 3  CL (5
 
 
 5 
 
 O   , 
 O N )3/I 	!ggoobggooe45k+@Allg%6%2  ..228]Kllg% g!'GG
 G
T' '2Xi XvX
* 9;""!"25"" CI;;'*;<?;;   	
  $! 	  	
 
*   05;2!0!5;!3!B0B5;B<BB4O&.. OdVV%(V2?V	V  TT TD%    :5 :5z&  #	
 &WW#W W 
	Wt;;#; #; 
	;(!8L L^  	  F- F- F-R8 T       8b bZ 
.  .(;%;47;>G;	;/6%/647/6>G/6	/6d  . HD u3 u3 u3p$$$ $ 	$
 
$0 b; b; b;J ;4 ; ;| [0 [ [|
#L  K( K( K(\T T	 .'\ '+	""" " $	"
 	"JH HV QG QG QGh ZG ZG ZGz" "
? * ?  rf   