
    ɯwgs                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dl m!Z! d d	l"m#Z#m$Z$m%Z%m&Z&m'Z' erd d
l(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 dZ6da7 ejp                  e9      Z: G d d      Z; G d d      Z< G d de=      Z>ej~                  d0d       Z@ej                   G d d             ZBej                   G d d             ZC eC       ZDee!j                  e!j                  f   ZGej                   G d d              ZHej                   G d! d"             ZI G d# d$eI      ZJ G d% d&eI      ZK G d' d(eK      ZL G d) d*eK      ZM G d+ d,eI      ZN G d- d.eN      ZO	 	 	 	 d1d/ZPy)2    )annotationsN)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)	AnyCallableDictIterableListOptionalSequenceTYPE_CHECKINGUnion)multiprocessing)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)BaseProcess)Queue)
ModuleType)TritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICESFc                      e Zd Zy)PingN__name__
__module____qualname__     e/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/torch/_inductor/autotune_process.pyr$   r$   <       r*   r$   c                      e Zd Zy)PongNr%   r)   r*   r+   r.   r.   @   r,   r*   r.   c                      e Zd Zy)!NonzeroWorkspaceNotSupportedErrorNr%   r)   r*   r+   r0   r0   D   r,   r*   r0   c              #  p  K   | d yt         j                  j                  t              }t	        |       t         j                  t        <   	 d |t         j                  t        = y|t         j                  t        <   y# |t         j                  t        = w |t         j                  t        <   w xY ww)z
    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
    specified single device. If device is None, don't manipulate the environment.
    N)osenvirongetr"   str)devicecurrents     r+   set_cuda_visible_devicer8   H   s      ~jjnn12G'*6{BJJ#$7?

/0/6BJJ+, ?

/0/6BJJ+,s   AB6B 0B61B33B6c                      e Zd ZU dZdZded<   dZded<   dZded<   dZded	<   e		 	 	 	 	 	 dd
       Z
e	dd       ZddZddZddZddZ	 d	 ddZddZddZdddZy)TuningProcessz
    Abstraction for launching a helper process to benchmark kernels. Spawns
    the parent process and uses multiprocessing queues to send benchmark
    requests and return results.
    NOptional[int]r6   zOptional[BaseProcess]processzOptional[Queue[Any]]request_queueresponse_queuec                    t         j                  dt        j                  j	                  t
                     	 t        j                  | |       y# t        $ r}t         j                  d       Y d}~yd}~ww xY w)z4
        Entry point for the child process.
        z2Entering TuningProcess child. Visible devices = %szException in TuningProcessN)
logdebugr2   r3   r4   r"   r:   workloop	Exception	exception)r=   r>   exs      r+   process_mainzTuningProcess.process_mainj   sY     			@JJNN/0	
	8""=.A 	8MM677	8s   A 	A8A33A8c                   	 | j                         }|yt        |t              r|j                  t	                      nGt        |t
              r |j                  |j                                nt        dt        |             )z<
        Work loop for the benchmarking subprocess.
        NzInvalid request type )	r4   
isinstancer$   putr.   BenchmarkRequest	benchmarkRuntimeErrortype)r=   r>   objs      r+   rB   zTuningProcess.workloop{   sr    
 ##%C{C&""46*C!12""3==?3"%:49+#FGG r*   c                ^    | j                   duxr | j                  duxr | j                  duS )z?
        True if the sub-process has been initialized.
        Nr<   r=   r>   selfs    r+   validzTuningProcess.valid   s;    
 LL$ 0""$.0##4/	
r*   c                .    dx| _         x| _        | _        y)z2
        Reset to an uninitialized state.
        NrP   rQ   s    r+   clearzTuningProcess.clear   s     CGFFt)D,?r*   c                   | j                         ryt        j                  d      }|j                         | _        |j                         | _        |j                  | j                  | j                  | j
                  f      | _        | j                  J t        | j                        5  | j                  j                          ddd       y# 1 sw Y   yxY w)z
        Create child process, request/response queues, and do the warm up.
        Set the environment to make only the provided GPU device visible
        to the process.
        Nspawn)targetargs)rS   r   get_contextr   r=   r>   ProcessrF   r<   r8   r6   start)rR   ctxs     r+   
initializezTuningProcess.initialize   s     ::< ))'2 YY[!iik{{$$""## # 
 ||'''$T[[1 	!LL 	! 	! 	!s   ,CCc                v    | j                          | j                  J | j                  j                  |       y)z8
        Push a work item to the child process.
        N)r^   r=   rI   )rR   rN   s     r+   rI   zTuningProcess.put   s4    
 	!!---s#r*   c                   | j                   J | j                  J 	 	 |}d}|(|dk\  r#|dz  }	 | j                  j                  d      }	 || j                  j                  |      }|S # t        j                  $ r | j                   j                         s Y nw xY w|W|dk\  r^# t        j                  $ r> | j                   j                  }|| j                  ||        | j                           w xY w)a,  
        Get a response from the child process. Raises queue.Empty on timeout
        or if the process dies.

        This method is (so far) only used by TuningProcessPool, where torch._inductor.config entries are being used
        to populate the timeouts:

        Arguments:

            @param result_timeout: Timeout in seconds, defaults to 120.0 or to
                                   config.max_autotune_subproc_result_timeout_seconds when called by TuningProcessPool
            @param graceful_timeout: Timeout in seconds to allow graceful shutdown (SIGTERM is sent after this time).
                                    Defaults to 3.0 or to config.max_autotune_subproc_graceful_timeout_seconds
            @param terminate_timeout: Timeout in seconds after SIGTERM, until we send SIGKILL if the process
                                      remains alive. Defaults to 1.0 or to
                                      config.max_autotune_subproc_terminate_timeout_seconds.
        Returns:
            A response from the child process (Any type)
        N      ?g      ?timeout)graceful_timeoutterminate_timeout)	r<   r>   r4   queueEmptyis_aliveexitcodekillrU   )rR   result_timeoutrd   re   remaining_timeoutresstatuss          r+   r4   zTuningProcess.get   s)   , ||'''""...$2!'38IS8P%,%""1155c5B ;--11:K1LC
 !;; "#||446!  7" (38IS8P ;; 
..>II)9*;    JJL
s5   B+ A-  B+ -.BB+ BB+ $B+ +AC<c                    | j                         r8| j                  J | j                  J | j                  j                  d       yy)z8
        Signal the child process to terminate.
        N)rS   r<   r=   rI   rQ   s    r+   	terminatezTuningProcess.terminate   sH     ::<<<+++%%111""4( r*   c                r    | j                   +| j                   j                          | j                          yy)z5
        Wait for the child process to exit.
        N)r<   joinrU   rQ   s    r+   waitzTuningProcess.wait   s,     <<#LLJJL $r*   c                H   | j                   | j                          | j                   j                  |       | j                   j                         rt        j                  d| j                   j                         | j                   j                          | j                   j                  |       | j                   j                         rDt        j                  d| j                   j                         | j                   j                          | j                          y y )Nrb   z&Sending SIGTERM to process with PID %dz&Sending SIGKILL to process with PID %d)
r<   rp   rr   rh   r@   warningpiderrorrj   rU   )rR   rd   re   s      r+   rj   zTuningProcess.kill  s    
 <<#NNLL&67||$$&<LL$$ &&(!!*;!<<<((*II@(( LL%%'JJL! $r*   )r=   
Queue[Any]r>   rx   returnNone)ry   boolry   rz   )rN   r	   ry   rz   )g      ^@g      @ra   )ry   r	   )g      @ra   )r&   r'   r(   __doc__r6   __annotations__r<   r=   r>   staticmethodrF   rB   rS   rU   r^   rI   r4   rp   rs   rj   r)   r*   r+   r:   r:   ]   s     !FM %)G")*.M'.+/N(/8!8"8 
8 8  H H 
G!2$ MP1	1f)r*   r:   c                  ^    e Zd ZU dZdZded<   dZded<   ddZddZdd	Z	dd
Z
	 	 	 	 ddZy)TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    Nz$Optional[queue.Queue[TuningProcess]]	processeszOptional[ThreadPoolExecutor]executorc                   | j                   du | j                  du k(  sJ | j                   y| j                         }t        j	                  d|       t        j                         | _         |D ]R  }t        |      }|j                          |j                  t                      | j                   j                  |       T | j                   j
                  D ]$  }t        |j                  d      t              r$J  t        t        |            | _        t         s"daddl}|j%                  | j&                         yy)z,
        Start the child processes.
        Nz$Sub-process autotune device list: %s)r6   )rk   )max_workersTr   )r   r   get_device_listr@   rA   rf   r   r:   r^   rI   r$   rH   r4   r.   r   lenEXIT_HANDLER_REGISTEREDatexitregisterrp   )rR   devicesr6   pr   s        r+   r^   zTuningProcessPool.initialize%  s    $&DMMT,ABBB>>%&&(		8'B  	"FV,ALLNEE$&MNNq!		" %% 	@Aaee4e8$???	@ +s7|D
 '&*#OODNN+	 'r*   c                ^   t         j                  sdgS t        j                  j	                         }t
        t        j                  v rNt        j                  t
           j                  d      D cg c]  }t        |       }}t        |      |k  sJ |S t        t        |            S c c}w )zD
        Gather the list of devices to be used in the pool.
        N,)r   autotune_multi_devicetorchcudadevice_countr"   r2   r3   splitintr   listrange)rR   countdr   s       r+   r   z!TuningProcessPool.get_device_listJ  s     ++6M

'')  2::-')zz2F'G'M'Mc'RS!s1vSGSw<5(((NE%L!!	 Ts   0B*c                2   | j                   !| j                   j                          d| _         | j                  ^| j                  j                  D ]  }|j	                           | j                  j                  D ]  }|j                           d| _        yy)z:
        Signal all child processes to terminate.
        N)r   shutdownr   rf   rp   rs   )rR   r   s     r+   rp   zTuningProcessPool.terminate\  s     ==$MM""$ DM>>%^^)) ^^)) !DN &r*   c                F   |j                   J | j                  J | j                  j                         }|j                  |j                          	 |j                  t        j
                  t        j                  t        j                        | j                  j                  |       S # t        j                  $ rB t        j                  d| d       t        d      cY | j                  j                  |       S w xY w# | j                  j                  |       w xY w)z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        zFailed to benchmark choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.inf)bmreqr   r4   rI   r   +max_autotune_subproc_result_timeout_seconds-max_autotune_subproc_graceful_timeout_seconds.max_autotune_subproc_terminate_timeout_secondsrf   rg   warningswarnfloat)rR   choicer<   s      r+   rX   zTuningProcessPool.targetk  s     ||'''~~)))..$$&FLL!	(;;BBDDEE NNw' {{ 	 MM.vh 7W W
 <NNw'	  NNw's$   <B+ +7D "D ?D  D D c                    | j                   J d       | j                  J i }t        || j                  j                  | j                  |            D ]
  \  }}|||<    |S )z>
        Benchmark each choice in a separate process.
        z&Tuning process pool is not initialized)r   r   zipmaprX   )rR   choicesresultsr   results        r+   rK   zTuningProcessPool.benchmark  sp     ~~)S+SS)}}((( "'4==+<+<T[['+RS 	%NFF$GFO	% r*   r|   )ry   zSequence[Optional[int]])r   r   ry   r   r   zList[TritonTemplateCaller]ry   z!Dict[TritonTemplateCaller, float])r&   r'   r(   r}   r   r~   r   r^   r   rp   rX   rK   r)   r*   r+   r   r     sK     7;I3:-1H*1#,J"$"(6+ 
+r*   r   c                  p    e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   dZded<   e	 	 	 	 dd       ZddZy)
TensorMetaztorch.devicer6   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec           
     ,   t        |t              r4|D cg c]  }| j                  |       }}t        d |D              sJ |S |}t        |t        j
                        rt	        j                  d|      }|j                         }|J t        |j                         |t        j                  j                  j                  |j                         t        j                         t        j                  j                  j                  |j#                         t        j                         t        j                  j                  j%                  |j'                         j(                  t        j                         |j+                               S c c}w )Nc              3  <   K   | ]  }t        |t                y wN)rH   r   .0xs     r+   	<genexpr>z*TensorMeta.from_irnodes.<locals>.<genexpr>  s     AQz!Z0A   fake)fallback)r6   r   r   r   r   r   )rH   r   from_irnodesallr   LayoutBuffer	get_dtyper   
get_devicer!   graphsizevars
size_hintsget_sizer   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   r   noder   s         r+   r   zTensorMeta.from_irnodes  sD    gx(>E F!1!1!!4 FF FA&AAAAMdBII&99VT*D    ??$''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    t        | j                  | j                  | j                  | j                  | j
                        S )N)r6   r   
extra_size)r   r   r   r6   r   r   rQ   s    r+   	to_tensorzTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r*   )r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]ry   #Union[TensorMeta, List[TensorMeta]])ry   torch.Tensor)r&   r'   r(   r~   r   classmethodr   r   r)   r*   r+   r   r     sQ    ((++KD-
E
	,
 
B
r*   r   c                  x    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d	dZ	 	 	 	 	 	 d
dZddZdd	 	 	 	 	 ddZdd	 	 	 	 	 ddZy)rJ   a1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    c                    || _         t        |t              r|g}|| _        t        |t        t
        f      rt        |      dk(  sJ |d   }|| _        || _        y )Nr   r   )	kernel_namerH   r   input_tensor_metatupler   r   output_tensor_meta
extra_args)rR   r   r   r   r   s        r+   __init__zBenchmarkRequest.__init__  se     ''4!2 3!2(5$-8)*a///!3A!6"4$r*   c                   t         r   NotImplementedErrorrR   output_tensorinput_tensorss      r+   make_run_fnzBenchmarkRequest.make_run_fn  s
     "!r*   c                     y r   r)   rQ   s    r+   cleanup_run_fnzBenchmarkRequest.cleanup_run_fn  s    r*   Nr   c                   t         r   r   rR   fnr   r   s       r+   do_benchzBenchmarkRequest.do_bench  s
     "!r*   c                   t         j                  t        j                        }|rt	        j                         }|Ft        |      dk(  sJ t        d | j                  D              }| j                  j                         }|r+t	        j                         z
  }t	        j                         }	  | j                  |d|i}|r+t	        j                         z
  }t	        j                         } | j                  |g|| }|r9t	        j                         z
  }	t         j                  dt!        |       |	       | j#                          |S # t        $ r# t         j                  d       t        d      cY S w xY w)Nr   c              3  <   K   | ]  }|j                           y wr   )r   r   s     r+   r   z-BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !PA!++-!Pr   r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)r@   isEnabledForloggingDEBUGtimer   r   r   r   r   r   r0   infor   r   rA   r5   r   )
rR   r   r   rA   start_tscreate_tensor_elapser   load_elapseoutbench_elapses
             r+   rK   zBenchmarkRequest.benchmark  sM   
   /yy{H  }%***!!P9O9O!PPM 33==?M#'99;#9 yy{H	 !!!=NNB ))+0Kyy{HdmmB>>>99;1LIIHD	$ 	
+ 1 	 HHGH<	 s   0E )E=<E=)
r   r5   r   r   r   r   r   Iterable[Any]ry   rz   r   r   r   r   ry   zCallable[[], None]r|   r   r   r   zOptional[torch.Tensor]ry   r   )	r&   r'   r(   r}   r   r   r   r   rK   r)   r*   r+   rJ   rJ     s    %% ?% @	%
 "% 
%*"*";G"	"
 15	" %" .	"
 
" 15)$) .) 
	)r*   rJ   c                  2    e Zd ZdZdddZdd	 	 	 	 	 ddZy)	TestBenchmarkRequestz
    Supports unit testing. Defined in this file so that the TuningProcess
    sub-process knows how to unpickle these objects.
    Nc                    || _         y r   )value)rR   r   s     r+   r   zTestBenchmarkRequest.__init__6  s	    
r*   r   c               H    | j                   t        d      | j                   S )NzFailed to run)r   rC   r   s      r+   rK   zTestBenchmarkRequest.benchmark9  s#     ::O,,zzr*   r   )r   zOptional[float]ry   rz   r   )r&   r'   r(   r}   r   rK   r)   r*   r+   r   r   0  s0    
 UY*;Q	r*   r   c                  $    e Zd Zdd	 	 	 	 	 ddZy)GPUDeviceBenchmarkRequestNr   c               X   g ||D ch c]T  }t        |t        j                        r8|j                  r,|j                  j
                  |j                  j
                  V }}t        |      dk  s
J d|        t        |      dk(  rt        t        |            }nt        j                  j                         }t        j                  j	                  |      5  t        j                  |      }t        j                  j                          d d d        |S c c}w # 1 sw Y   S xY w)Nr   zCan not mix devices )rH   r   Tensoris_cudar6   indexr   nextiterr   current_devicer    benchmark_gpusynchronize)rR   r   r   r   tensordevice_idx_set
device_idxr   s           r+   r   z"GPUDeviceBenchmarkRequest.do_benchB  s     :M9=9
&%,,/##/	 MM
 
 >"a'P+??O)PP'~!#d>23J224JZZz* 	%++B/CJJ""$	% 
#
	% 
s   AD4DD)r   r&   r'   r(   r   r)   r*   r+   r   r   A  s*    
 15	 % .	
 
r*   r   c                  p     e Zd Z	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZd ZddZ xZS )	TritonBenchmarkRequestc                ~    t         |   ||||       || _        || _        || _        || _        |	| _        |
| _        y r   )superr   module_pathmodule_cache_keygrid
num_stages	num_warpsmatrix_instr_nonkdim)rR   r   r   r   r   r  r  r  r  r  r  	__class__s              r+   r   zTritonBenchmarkRequest.__init___  sG     	&79KZX& 0	$"$8!r*   c               0   t        j                  | j                  | j                        }t        j                  d| j                  | j                         t        || j                        j                  }t        | j                        }i }dd l}d|j                  |      j                  v rd|d<   ddlm} t         j"                  j$                  rj| j&                  dk7  r[t)        j*                  |g||| j                  d| j,                  i|d || j.                  j0                  j2                        iS t)        j*                  |g||| j                  d| j,                  i|d || j.                  j0                  j2                        iS )Nz"benchmark module key: %s, path: %sr   warmupF)_cuda_getCurrentRawStreamr  stream)r   load_by_key_pathr  r  r@   rA   getattrr   runr   r   inspect	signature
parameterstorch._Cr  r   versionhipr  	functoolspartialr  r   r6   r  )	rR   r   r   mod
run_methodr   
warmup_argr   get_raw_streams	            r+   r   z"TritonBenchmarkRequest.make_run_fnt  s    **4+@+@$BRBRS		0!!	
 S$"2"2377
$//*
 
w((4???#(Jx H==!:!:a!?$$  	
 YY  &d&=&=&D&D&J&JK  $$  	
 YY  &d&=&=&D&D&J&JK r*   c                    t        j                  | j                  | j                        }t	        || j
                        j                          y r   )r   r  r  r  r  r   
precompile)rR   r(  s     r+   r-  z!TritonBenchmarkRequest.precompile  s9    **4+@+@$BRBRST%%&113r*   c                T    d| j                   d| j                  d| j                  S )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r  r  rQ   s    r+   __str__zTritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr*   )r   )r   r5   r   r   r   r   r   r   r  r5   r  r5   r  z	List[int]r  r   r  r   r  r   ry   rz   r   ry   r5   )r&   r'   r(   r   r   r-  r0  __classcell__r  s   @r+   r  r  \  s     %&99 ?9 @	9
 "9 9 9 9 9 9 "9 
9****;G*	*X4Ur*   r  c                  p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 d	dZd
dZd Zd
dZddZ	 xZ
S )CUDABenchmarkRequestc                    t         |   ||||       || _        d| _        d | _        d | _        d| _        d| _        d| _        t        j                  | j                  d      \  | _        | _        y )Nr   F so)r  r   source_codeworkspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerR   r   r   r   r   r9  r  s         r+   r   zCUDABenchmarkRequest.__init__  sr     	&79KZX&#$15)-',$ "*7*=*=d>N>NPT*U't'r*   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )NPrecompiling %sr8  Done precompiling %s)r@   rA   r   compiler9  rQ   s    r+   r-  zCUDABenchmarkRequest.precompile  s8     			#T*d..5		($/r*   c          	     B   | j                          | j                          t        |      |gz   D cg c]  }t        |j	                                }}t
        j                  d| j                  | j                  | j                  | j                  || j                         t        t        j                  j                         j                        }t!        | j                  | j                        }t        d      }| j"                  dkD  rht        j$                  | j"                  dz   dz  t        j&                  |j(                        | _        t        | j*                  j	                               }t-        j.                  |g|| j                  d || S c c}w )Nzqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   r6   )ensure_dll_loadedupdate_workspace_sizer   r   data_ptrr@   rA   r   r?  r>  r<  r   r   r   current_streamcuda_streamr  r:  zerosfloat64r6   r;  r&  r'  )rR   r   r   r
  rY   
stream_ptrr)  workspace_ptrs           r+   r   z CUDABenchmarkRequest.make_run_fn  sw    	 ""$ }-?
 V__&'
 
 			MMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mm$++DN
 %T^^%<%<%>?M   

 __
 	

 
 
 	
3
s    Fc           
        | j                   ry | j                          t        | j                  D ch c]  }|j                   c}      }t        |dz         D cg c]  }t        d        }}t        t        j                  j                         j                        }t        | j                  | j                        }t               } |g || j                  t!        |      d |  t        j                  j#                          |j$                  | _        t(        j+                  d| j&                  | j                  | j,                  | j.                  | j                  || j                         d| _         y c c}w c c}w )Nr   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r=  rI  r   r   r   r   r   r   r   rL  rM  r  r<  r   r   r   r   r	  r   r:  r@   rA   r?  r>  )rR   metaunique_input_count_rY   rP  r)  c_workspace_sizes           r+   rJ  z*CUDABenchmarkRequest.update_workspace_size  sQ   ''  8N8N!O$))!OP(-.@1.D(EF1FFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44		 hMMHHOO		
 (,$9 "PFs   E<Fc                    | j                   4t        j                  | j                  d      \  | _         | _        | _        y y )Nr8  )r<  r   loadr9  r>  r?  rQ   s    r+   rI  z&CUDABenchmarkRequest.ensure_dll_loaded  s:    888E8J8J  $95DHdmT%5 r*   c                ^    | j                   | j                   j                          d | _        y r   )r<  closer;  rQ   s    r+   r   z#CUDABenchmarkRequest.cleanup_run_fn  s!    88HHNNr*   c                T    d| j                   d| j                  d| j                  S )Nr/  z, self.source_file=z, self.hash_key=)r   r?  r>  rQ   s    r+   r0  zCUDABenchmarkRequest.__str__  s0    #$""$$8t'7'7&99JDMM;KLLr*   r   r5   r   r   r   r   r   r   r9  r5   ry   rz   r   r|   r1  )r&   r'   r(   r   r-  r   rJ  rI  r   r0  r2  r3  s   @r+   r5  r5    s    VV ?V @	V
 "V V 
V$0%
*%
;G%
	%
N ,D
Mr*   r5  c                  $    e Zd Zdd	 	 	 	 	 ddZy)CPUDeviceBenchmarkRequestNr   c               ,    t        j                  |      S r   )r    benchmark_cpur   s       r+   r   z"CPUDeviceBenchmarkRequest.do_bench  s     ((,,r*   r   r  r)   r*   r+   r^  r^    s*    
 15	- %- .	-
 
-r*   r^  c                  b     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z	 	 	 	 	 	 ddZddZd	dZ xZS )
CppBenchmarkRequestc                f    t         |   ||||       || _        t        |      | _        d | _        y r   )r  r   r9  r   r>  r<  rA  s         r+   r   zCppBenchmarkRequest.__init__+  s5     	&79KZX& -6:r*   c                    t         j                  d|        t        j                  | j                  d       t         j                  d|        y )NrC  Fr   rD  )r@   rA   r   rX  r9  rQ   s    r+   r-  zCppBenchmarkRequest.precompile8  s8     			#T*$**7		($/r*   c               \   t        j                  | j                  d      | _        t	        |      |gz   D cg c]  }|j                          }}t        j                  d| j                  | j                  || j                         t        | j                  | j                        }t        d | j                  D              sJ t        j                  gt        |      t        t	        | j                              z   z  |_        t!        j"                  |g|| j                   S c c}w )NFre  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  P   K   | ]  }t        |t        j                           y wr   )rH   ctypesc_ulonglong)r   args     r+   r   z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>N  s     R3:c6#5#56Rs   $&)r   rX  r9  r<  r   rK  r@   rA   r   r   r  r   rh  ri  r   argtypesr&  r'  )rR   r   r   r
  rY   r)  s         r+   r   zCppBenchmarkRequest.make_run_fn?  s     $$T%5%5EB04]0C}o0UVf!VV		XHHOO	
 TXXt'7'78
R$//RRRR%112ID122


   

 __
 	
! Ws   D)c                    | j                   3	 t        | j                   d      r| j                   j                          y y y )NrZ  )r<  hasattrrZ  rQ   s    r+   r   z"CppBenchmarkRequest.cleanup_run_fnZ  s9    88 txx)  *	  r*   c                     d| j                   S )Nr/  )r   rQ   s    r+   r0  zCppBenchmarkRequest.__str__b  s    #$""$%%r*   r\  r   r|   r1  )	r&   r'   r(   r   r-  r   r   r0  r2  r3  s   @r+   rb  rb  '  so    ;; ?; @	;
 "; ; 
;0
*
;G
	
6!&r*   rb  c                ,    t         j                  |       S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )tuning_poolrK   )r   s    r+   benchmark_in_sub_processrq  f  s       ))r*   )r6   r;   r   )Q
__future__r   
contextlibrh  dataclassesr&  r   r2   rf   r   r   concurrent.futuresr   r   r   r   r   typingr	   r
   r   r   r   r   r   r   r   r   torch._inductor.async_compiler   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   multiprocessing.processr   multiprocessing.queuesr   typesr    torch._inductor.select_algorithmr   r7  r   runtime.benchmarkingr    virtualizedr!   r"   r   	getLoggerr&   r@   r$   r.   rC   r0   contextmanagerr8   	dataclassr:   r   rp  r   r   LayoutOrBufferr   rJ   r   r   r  r5  r^  rb  rq  r)   r*   r+   <module>r     s   "      	    1 2 2
 
 
  $ ! .   3, E  -  .  g!	 		 			 	 7 7( y y yx | | |~  ! ryy"))+, 1
 1
 1
h W W Wt+ " 0 6IU6 IUXrM4 rMj- 0 -<&3 <&~*'*&*r*   