from __future__ import annotations

import dataclasses
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union

import torch
from torch._dynamo.utils import counters
from torch._inductor.utils import InputType


perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints")
static_inputs_log = torch._logging.getArtifactLogger(
    __name__, "cudagraph_static_inputs"
)

OutputType = List[Optional[Union[int, torch.Tensor]]]
ModelType = Callable[[List[InputType]], OutputType]


@dataclasses.dataclass(frozen=True)
class FunctionID:
    """Unique counter of a function wrapped in cudagraphify_impl"""

    id: int
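

# Illustrative sketch (not part of the upstream module): FunctionID is a frozen,
# hashable dataclass, so it can key caches of recorded callables typed with the
# ModelType/OutputType aliases above. The cache layout and the identity model
# below are hypothetical and for demonstration only.
def _example_function_id_cache() -> Dict[FunctionID, ModelType]:
    cache: Dict[FunctionID, ModelType] = {}

    def identity_model(inputs: List[InputType]) -> OutputType:
        # Return the inputs unchanged; a stand-in for a recorded model.
        return list(inputs)

    cache[FunctionID(0)] = identity_model
    return cache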


@dataclasses.dataclass(frozen=True)
class PlaceholderInfo:
    """
    A serializable version of torch.fx.Node that contains information
    pertinent to placeholder stack traces. We use these in logging and error messages
    related to cudagraphs, and will cache these results.
    strnameOptional[str]stack_traceList[PlaceholderInfo]usersmutating_use_stack_traceNr   r   r   r   r    r       s      I  ++r   r    c                  N    e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   y)WrappedFunctionz
    Represents a function that you want to record for CUDA graph replay,
    with a little more metadata so we can identify if we have an applicable
    CUDA graph in our CUDA graph tree for it.
    zCallable[..., Any]modelSequence[int]static_input_idxsr   r   zTuple[torch.Tensor, ...]	constantsSequence[PlaceholderInfo]placeholdersmutated_input_idxsNr   r   r   r   r)   r)   ,   s,     $$N''++%%r   r)   c                   t        | j                        dk(  r8t        t        | j                              j                  j                  dd       S | j                  D ]`  }|j                  t        j                  j                  j                  j                  k(  s?|j                  j                  dd       x}s^|c S  y )N   r$   )lenr&   nextitermetagettargettorchopsatencopy_default)placeholder_nodeuser$   s      r   &get_mutating_use_stack_trace_from_noder@   <   s     !!"a'D)//0166::=$OO%% #::--555!hhll=$??{?""#
 r   c                    | j                   S N)r'   )placeholder_infos    r   get_mutating_use_stack_tracerD   K   s    444r   c                    | j                   }| j                  j                  dd       }g }d }| j                  dk(  r-| j                  D cg c]  }t        |       }}t        |       }t        ||||      S c c}w )Nr$   placeholder)r"   r6   r7   opr&   to_placeholder_infor@   r    )r>   r"   r$   r&   r'   is         r   rH   rH   O   s      D"''++M4@KE#m+1A1G1GHA$Q'HH#I$
  4e5MNN Is   
A7c                r    | j                   D cg c]  }|j                  dk(  st        |       c}S c c}w )NrF   )nodesrG   rH   )graphnodes     r   get_placeholder_inforN   ^   s4    .3kk&*TWW=UD!  s   44c                    d|  S )Nzskipping cudagraphs due to r   )reasons    r   format_default_skip_messagerQ   d   s    (11r   c                    d}|D ]  }| |   }t        |      x}s n t        dt        |       d      }|r| d| S |S )N zmutated inputs (z instances). Found from : 
 )rD   rQ   r3   )r/   mutation_indicesr$   idxrF   msgs         r   get_mutation_stack_tracerX   h   sp     "$K "3'6{CC;C
 &
3/01=C (66Jr   c                   t         j                  j                  j                  j                  r3| j
                  D cg c]  }|| j                  v s |||         s| }}n| j
                  }t        j                  d| j                         t        j                  d|       |rt        | j                  |      S d S c c}w )Nz'check mutation static input indices: %sz#check mutation mutation indices: %s)r9   	_inductorconfigtritoncudagraph_treesr0   r,   static_inputs_logdebugrX   r/   )funcinputsis_cuda_graph_recorded_tensorrV   rU   s        r   check_for_mutationrc   {   s     $$44 ..+
t---0=	 +
 +
  22143I3I ACST  	!!2!24DE !+
s   "B>c                j    | j                   D ]$  }|j                  j                  dd       x}s"|c S  y )Nr$   )r&   r6   r7   )rM   r?   r$   s      r   _get_use_stack_tracere      s:    zz ((,,}d;;;; r   c                   | j                  t        j                  d            x}r8d|j                   d}t	        |      x}rt        | d|       S t        |      S t        |       dk(  r0t        t        | j                                     j                  dk(  ry d | j                         D        }t        dd	j                  |             S )
Ncpuzcpu device ()rT   r2   cudac              3  2   K   | ]  }t        |        y wrB   )repr).0keys     r   	<genexpr>z:check_multiple_devices_or_any_cpu_nodes.<locals>.<genexpr>   s     AscAs   zmultiple devices: z, )r7   r9   devicer"   re   rQ   r3   r4   r5   keystypejoin)device_node_mappingcpu_noderW   r$   	keys_reprs        r   'check_multiple_devices_or_any_cpu_nodesrv      s     '**5<<+>??x?X]]O1-.x88;8.#6H/VWW*3// 	 A%)..012776AA&9&>&>&@AI&);DIIi<P;Q'RSSr   c                    t        |       S rB   )rv   )rs   s    r    check_lowering_disable_cudagraphrx      s     33FGGr   c                V    t         j                  |        t        d   dxx   dz  cc<   y )Ninductorcudagraph_skipsr2   )perf_hint_logwarningr   )rW   s    r   #log_cudagraph_skip_and_bump_counterr~      s&    #Z*+q0+r   c                       e Zd ZU ded<   ddZy)BoxedDeviceIndexOptional[int]valuec                :    |t        |t              sJ || _        y rB   )
isinstancer   r   )self
device_idxs     r   setzBoxedDeviceIndex.set   s    !Z
C%@@@
r   N)r   r   )r   r   r   r   r   r   r   r   r   r      s     r   r   c                p   t        d      }t        j                  j                  j                  j
                  r[t        |      }|j                  D cg c]	  }||vs| }}t        |      dk7  }|sy t        | j                        }t        ||      S t        |j                        dk7  }|sd S |S c c}w )Nzmutated inputsr   )rQ   r9   rZ   r[   r\   r]   r   r0   r3   rN   rL   rX   mutated_inputs)	gmcompiled_graphr,   default_msgunique_idxsrV   rU   has_mutationr/   s	            r   3check_for_mutation_ignore_cuda_graph_managed_tensorr      s     ..>?K $$44+, *<<
;@VC
 
 +,1+BHH5'6FGG >889Q>'t8[8
s   	B3B3c                    | j                   r| j                   S | j                  D ]  }|j                   s|j                   c S  y)zM


def get_placeholder_stack_trace(placeholder: PlaceholderInfo) -> Optional[str]:
    """
    Gets the first non-empty stack trace of a placeholder or its users.
    """
    if placeholder.stack_trace:
        return placeholder.stack_trace

    for user in placeholder.users:
        if user.stack_trace:
            return user.stack_trace

    return None


class CheckInvariantStatus(Enum):
    # Invariant check succeeded.
    SUCCESS = 1
    # Previously managed tensor data pointers are not stable.
    CudagraphManagedIdxMismatch = 2
    # Static tensor input addresses are not stable.
    StaticInputIdxMismatch = 3
    # Inputs expected to be dead before the graph are still live.
    ExpectedDeadIndicesBeforeGraphMismatch = 4

    def __str__(self) -> str:
        if self.name == "CudagraphManagedIdxMismatch":
            return "cudagraph managed tensor data pointer changed"
        elif self.name == "StaticInputIdxMismatch":
            return "static input data pointer changed"
        elif self.name == "ExpectedDeadIndicesBeforeGraphMismatch":
            return "expected dead indices before graph are live"
        else:
            return f"{self.name}: {self.value}"


def log_data_ptr_mismatch(
    placeholders: Sequence[PlaceholderInfo],
    inputs: List[InputType],
    recorded_data_ptr: Sequence[Optional[int]],
    target_idxs: Sequence[int],
    mismatch: CheckInvariantStatus,
) -> str:
    """
    Logs the mismatch between input data pointers and recorded data pointers.
    This checks only idxs in target_idxs.
    """
    assert len(inputs) == len(recorded_data_ptr) and len(inputs) == len(
        placeholders
    ), "length mismatch between inputs, recorded_data_ptr, and placeholders"

    t_tensors = [inputs[i] for i in target_idxs]
    t_data_ptrs = [recorded_data_ptr[i] for i in target_idxs]
    error_msg = f"{mismatch}.\n"
    for i, (tensor, data_ptr) in enumerate(zip(t_tensors, t_data_ptrs)):
        assert isinstance(tensor, torch.Tensor)
        index = target_idxs[i]
        if tensor.data_ptr() != data_ptr:
            placeholder = placeholders[index]
            error_msg = (
                f"{error_msg}input name: {placeholder.name}. "
                f"data pointer changed from {data_ptr} to {tensor.data_ptr()}. "
                f"input stack trace: {get_placeholder_stack_trace(placeholder)}\n"
            )
    return error_msg


def maybe_warning_due_to_dynamic_shape(
    fn_cache: Dict[Tuple[int, ...], Callable[..., Any]],
    new_int_key: Any,
) -> bool:
    num_cudagraphs = len(fn_cache.keys()) + 1

    def warn_msg() -> str:
        return (
            "CUDAGraph supports dynamic shapes by recording a new graph for each "
            "distinct input size. Recording too many CUDAGraphs may lead to "
            f"extra overhead. We have observed {num_cudagraphs} distinct sizes. "
            "Please consider the following options for better performance: "
            "a) padding inputs to a few fixed number of shapes; or b) set "
            "torch._inductor.config.triton.cudagraph_skip_dynamic_graphs=True. "
            "Set torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit=None "
            "to silence this warning."
        )

    if (
        torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit
        and num_cudagraphs
        > torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit
    ):
        perf_hint_log.warning(warn_msg())
        return True
    return False


@dataclasses.dataclass(frozen=True)
class CudagraphCachedInfo:
    """
    Info needed to realign inputs
    """

    placeholders: Sequence[PlaceholderInfo]
    stack_traces: List[Optional[str]]
    cudagraph_fail_reasons: List[str]