
    ɯwg6                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlZd dlZd dlmZ d dlmZ d dlmc mZ d dlmZ d dlmZ d dlmZ dd	lmZmZmZ dd
lm Z  ddl!m"Z"m#Z#m$Z$  ejJ                  e&      Z'd Z(ed        Z)edejT                  de
fd       Z+d dZ,d Z-edejT                  de
fd       Z. G d dej^                        Z0edejT                  de
fd       Z1ed        Z2d Z3ejh                  jj                  Z5e5jl                  e5jn                  e5jp                  e5jr                  e5jt                  e5jv                  e5jx                  e5jz                  e5j|                  e5j~                  e5j                  e5j                  e5j                  e5j                  j                  e5j                  j                  e5j                  e5j                  e5j                  e5j                  e5j                  e5j                  e5j                  hZM eeM      ZMed        ZNdee
ej                  f   fdZPd ZQd aRd ZSd ZTd!dZUy)"    N)contextmanager)partial)CallableUnion)SymInt)get_decompositions)bind_symbols   )aot_function
aot_modulemake_boxed_compiler)strip_overloads)default_partition
draw_graph#min_cut_rematerialization_partitionc                     | j                   j                  dt        j                  j                  j
                        D ]+  }t        j                  j                  j                  |_        - | j                          | S )Ncall_functionoptarget)	graph
find_nodestorchopsaten_to_copytor   	recompile)fx_gnodes     _/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/torch/_functorch/compilers.py_canonicalizer"   $   s`    

%%599>>#:#: &  ( iinn''( 	NNK    c               #      K   t         j                  j                  d      } 	 d  t         j                  j                  |        y # t         j                  j                  |        w xY ww)NF)r   _C_jit_set_autocast_mode)old_jit_autocast_flags    r!   _disable_jit_autocastr(   -   sI     !HH;;EB?''(=>''(=>s    A+A  A+!A((A+r   returnc                 D   t               5  t        |        | j                  j                  dt        j
                  j                  j                        D ]l  }t        |j                        dk(  st        |j                        dk(  s5d|j                  v sDt        j
                  j                  j                  |_        n | j                  j                  D ]X  }i }|j                  j                         D ]0  \  }}t        |t        j                         r|j"                  }|||<   2 ||_
        Z | j                  j%                          | j'                          t        j(                  j+                  |       }t        j,                  j/                  |j                         t        j(                  j1                  |j3                               }t        j(                  j5                  |      }t7        d |D              s ||  ddd       |S # 1 sw Y   S xY w)a  
    Compiles the :attr:`fx_g` with Torchscript compiler.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fx_g(fx.GraphModule): The input Fx graph module to be compiled.

    Returns:
        Torch scripted model.
    r   r   r
   dtypec              3   d   K   | ](  }t        |t        j                  j                         * y wN)
isinstancer   _subclasses
FakeTensor).0ts     r!   	<genexpr>zts_compile.<locals>.<genexpr>`   s#     M1:a!2!2!=!=>Ms   .0N)r(   r   r   r   r   r   r   r   lenargskwargsr   r   nodesitemsr.   devicetypelintr   jitscriptr%   _jit_pass_remove_mutationfreezeevaloptimize_for_inferenceany)r   inpsr    
new_kwargskvfs          r!   
ts_compilerH   6   s    
	  JJ))uyy~~'>'> * 
 	0D 499~"s4;;'71'<DKKAW#iinn//		0 JJ$$ 	%DJ))+ "1a.A !
1" %DK	% 	

IIT"**1773IIQVVX&II,,Q/MMMtH9: H;: Hs   A&H2HHE1HHc                 L    t        | j                         t        | ||       | S )N)
clear_meta)printcoder   )r   _namerJ   s       r!   _draw_graph_compilerO   e   s    	$))tTj1Kr#   c                 6    t        t        t        |             S )NrN   )r   r   rO   rQ   s    r!   draw_graph_compilerR   k   s    w':FGGr#   c                     | S )z
    Returns the :attr:`fx_g` Fx graph module as it is. This is a no-op compiler
    and can be used to check accuracy.

    .. warning::
        This API is experimental and likely to change.

     r   rM   s     r!   noprV   o   s	     Kr#   c                   (     e Zd Z fdZ fdZ xZS )DebugInterpreterc                 T    t        | j                  g| | _        t        |   |  y r-   )r	   modulesymbol_mappingsuperrun)selfr5   	__class__s     r!   r]   zDebugInterpreter.run}   s%    *4;;>>Tr#   c                    
  fdfdfd

fd}t            |      }d|j                  v rt        j                  |j                  d         \  }}t        j                  |      \  }}t        |      t        |      k(  sJ t        |       dt        |              t        t        t        |            ||      D ]/  \  }}	t        |	t        j                        s" |||	 fd       1 |S )Nc                     t        | t              s| S t        j                  | j                  j
                  j                  j                              }|j                  sJ |       t        |      S r-   )
r.   r   sympyexpandr    exprxreplacer[   	is_numberint)nirr^   s     r!   subst_symintz/DebugInterpreter.run_node.<locals>.subst_symint   sS    b&)	RWW\\2243F3FGHA;;!!;q6Mr#   c                 ,    t        fd| D              S )Nc              3   .   K   | ]  } |        y wr-   rT   )r1   rh   rj   s     r!   r3   zHDebugInterpreter.run_node.<locals>.subst_symint_tuple.<locals>.<genexpr>   s     8bb)8s   )tuple)nisrj   s    r!   subst_symint_tuplez5DebugInterpreter.run_node.<locals>.subst_symint_tuple   s    8C888r#   c                      | j                               dkD  r`t        | j                        D ]H  } | j                  |            |j                  |      k7  s- | j	                  |            dkD  sH y y)Nr   r
   FT)numelrangendimstridesize)abidxrj   s      r!   check_significant_stridesz<DebugInterpreter.run_node.<locals>.check_significant_strides   sg    AGGI&* = %C$QXXc]3qxx}D(59$% r#   c           	      "   t        |      sJ | j                  |j                  k(  s(J  |        d| j                   d|j                           | j                               |j                         k(  sGJ  |        d| j                          d | j                                d|j                                  | |      }|sGJ  |        d| j                          d | j                                d|j                                 y )Nz:  != z aka )callabler+   ru   rt   )nvrvdescsame_stridesry   ro   s       r!   checkz(DebugInterpreter.run_node.<locals>.check   s    D>!>88rxx'NDF82bhhZtBHH:)NN'"2779-:[&BGGI;e,>rwwy,I+J$rwwykZ[:4R<La&BIIK=.@.M-NdSUS\S\S^R_`ar#   valr{   c                  (    d  dj                    S )Nzoutput z where )r[   )ir^   s   r!   <lambda>z+DebugInterpreter.run_node.<locals>.<lambda>   s    s'$:M:M9N&O r#   )r\   run_nodemetapytreetree_flattenr4   ziprr   r.   r   Tensor)r^   nr   ri   n_valsn_specr_valsr_specr}   r~   ry   r   rj   ro   r_   s   `         @@@@r!   r   zDebugInterpreter.run_node   s    		9			a GQAFF?#00?NFF#003NFF v;#f+-P#f+d3v;-/PP- s6{!3VVD Q	2r!"ell3b"OPQ r#   )__name__
__module____qualname__r]   r   __classcell__)r_   s   @r!   rX   rX   |   s    / /r#   rX   c                 ,    t        |       j                  S )z
    Returns a (slow) interpreter over the FX graph module that also checks
    various debugging properties (e.g., that tracing strides matched real
    strides.)
    )rX   r]   rU   s     r!   	debug_nopr      s     D!%%%r#   c                     t        |        t        j                  j                  |       }t        j                  j	                  |j                               }|S r-   )r   r   r<   r=   r?   r@   )r   rM   rG   s      r!   simple_ts_compiler      s=    D		A		"AHr#   c                 "    t        | t              S r-   )r   r   )rG   s    r!   nnc_jitr      s    ,--r#   c                 0    t        | j                         | S r-   )rK   rL   rU   s     r!   print_compiler      s    	$))Kr#   fnc                     t         t         t        t        d}|j                  |       t	        | t
        j                  j                        rt        | fi |S t        | fi |S )a  
    Wrapper function over :func:`aot_function` and :func:`aot_module` to perform
    memory efficient fusion. It uses the
    :func:`min_cut_rematerialization_partition` partitioner to perform efficient
    recomputation. It uses NVFuser to compile the generated forward and backward
    graphs.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module``
            that takes one ore more arguments. Must return one or more Tensors.
        **kwargs: Any other overrides you want to make to the settings

    Returns:
        Returns a ``Callable``  or ``nn.Module`` that retains the eager behavior
        of the original :attr:`fn`, but whose forward and backward graphs have
        gone through recomputation optimizations, and the graphs have been
        compiled with nvfuser.

    fw_compilerbw_compilerpartition_fndecompositions)
rH   r   default_decompositionsupdater.   r   nnModuler   r   )r   r6   configs      r!   memory_efficient_fusionr      sW    6 "!;0	F MM&"ehhoo&"'''B)&))r#   c                     | j                  d       t        d|D cg c]  }|j                  |j                  f c} d       ddlm}   |       j                         |  t        | |      S c c}w )NfooaQ  
##############################################################
# To minimize FX graph, copy and paste the below and run it  #
##############################################################

import torch
import torch.fx as fx
from functorch.compile import minifier, check_nvfuser_subprocess, check_nvfuser_correctness_subprocess

inps = a?  
inps = [torch.ones(shape, dtype=dtype, device='cuda') for (shape, dtype) in inps]
from foo import FxModule
mod = FxModule().cuda()

with torch.jit.fuser("fuser2"):
  # check_nvfuser_subprocess can be replaced with check_nvfuser_correctness_subprocess
  minifier(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess)
r   )FxModule)	to_folderrK   shaper+   r   r   cudarH   )r   rC   r   r   s       r!   debug_compiler     so    NN5		 &**!''177	*+ ,	( HJOOtdD!! 	+s   A/
c                 J   g }t        | d      5 }t        j                  |      }g }|D ]  }t        |      dk(  r|} |t	        j
                               }n|\  }}}}	}
|	t        j                  t        j                  t        j                  t        j                  t        j                  t        j                  t        t        hv rt        j                  dd||	|
      }nt        j
                  ||	|
      }|j                  |        	 ddd       |S # 1 sw Y   |S xY w)zZ
    Return a random input for the given inputs meta generated from _save_fx_default.
    rbr
   r   )r+   r9   N)openpickleloadr4   randomrandr   rg   int32int64booluint8floatrandintappend)input_data_pathinputsrG   inputs_metar   r:   inputr   rt   r+   r9   s              r!   
get_inputsr   3  s     F	ot	$ !kk!n 	!D4yA~V[[]+592eVUFIIKKKKJJIIKK	 	 "MM!QU6RE!JJuE&IEMM% '	!!. M/!. Ms   C>DD"c                 t    	
 ddl m} 	fd	 	fd

fd}
fd}
fd} ||||||t              S )	aO  
    The forward, backward, and joint computation graph will be stored in
    {folder_name}/{current_name}/{current_name}_forward_{graph_index},
    {folder_name}/{current_name}/{current_name}_backward_{graph_index}, and
    {folder_name}/{current_name}/{current_name}_joint_{graph_index} respectively.
    The input shape of the graphs will be stored in the .input files.
    These files can be loaded with pickle,
    and is a list of format (type, shape, stride, dtype, device).
    In the case of type = int or float, it is just (type,).
    For joint graph input, it is a nested list [[],[]]
    where the two inner lists have the same format.
    If dump_example_input is True, example_inputs will be stored in .pt file.
    Since each function might produce multiple graphs,
    the graph_index is used to distinguish difference graphs
    r   )aot_module_simplifiedc                    g }t        |       dkD  r1t        | d   t              r| | d         z  }| | d         z  }|S | D ]  }t        |      t        k(  st        |      t
        k(  r|j                  t        |      f       C|j                  t        |      |j                  |j                         |j                  |j                  f        |S )Nr   r
   )r4   r.   rm   r:   rg   r   r   r   rt   r+   r9   )r5   
input_metaargget_input_metas      r!   r   z(_save_fx_default.<locals>.get_input_metad  s    
t9q=ZQ7.a11J.a11J 	CCyC49#5!!49,/!!#Y		3::<CJJO		 r#   c                    t        | j                  j                        dk(  r,t        j                  t        j
                  d|t               y t        j                  |       }|j                  j                  t        j                  j                  j                                |j                           |      }t        j                   d d       |j!                   d d d| dt         	       t#        j$                  |t'         d d d| dt         d d| dt         dd             r7t        j(                  | d d d| dt         d d| dt         d	       y y )
Nr   z!No nodes in graph {%s}_{%s}_{%s}./T)exist_okrM   z.inputwbz.pt)r4   r   r7   logloggingWARNINGgraph_indexcopydeepcopyset_codegenr   fxCodeGenr   osmakedirsr   r   dumpr   save)	
gm_to_saver5   	type_namegmr   current_namedump_example_inputfolder_namer   s	        r!   graph_saver_helperz,_save_fx_default.<locals>.graph_saver_helpers  s   z%%&!+GG3 ]]:&
UXX^^3356
#D)

{m1\N3dC
m1\N!L>9+Q{mT	
 	-qa~Qyk;-WXYeXffghqgrrst  tA  AG  H	
 JJ-qa~Qyk;-WXYeXffghqgrrst  tA  AD  E r#   c                      | |d       | S )NforwardrT   )r   fw_argsr   s     r!   graph_saver_forwardz-_save_fx_default.<locals>.graph_saver_forward  s    2w	2	r#   c                 .     | |d       t         dz  a | S )Nbackwardr
   )r   )r   bw_argsr   s     r!   graph_saver_backwardz._save_fx_default.<locals>.graph_saver_backward  s    2w
3q	r#   c                 0     | |d       t        | |      S )Njoint)r   )r   
joint_argsr   s     r!   graph_saver_jointz+_save_fx_default.<locals>.graph_saver_joint  s    2z73 Z00r#   r   )functorch.compiler   r   )r   r   r   r   example_inputsr   r   r   r   r   r   s   ```      @@r!   _save_fx_defaultr   R  sC      8!F1 !
'(&- r#   c                 *    da t        t        | ||      S )as  
    Dump the forward, backward, and joint computation graph.
    Example Usage:
    save_fx_func = graph_dumper_aot(current_name, folder_name, dump_example_input = False)
    optimize_ctx = torchdynamo.optimize(
        save_fx_func
    )
    with torch.enable_grad():
        with optimize_ctx:
            result = forward_and_backward_pass(model, example_inputs)
    r   )r   r   r   )r   r   r   s      r!   graph_dumper_aotr     s     K#\;@RSSr#   )T)F)Vr   r   r   r   r   
contextlibr   	functoolsr   typingr   r   rb   r   torch.fxr   torch.nnr   torch.utils._pytreeutils_pytreer   r   torch._decompr   %torch.fx.experimental.symbolic_shapesr	   aot_autogradr   r   r   compile_utilsr   partitionersr   r   r   	getLoggerr   r   r"   r(   GraphModulerH   rO   rR   rV   InterpreterrX   r   r   r   r   r   detachgelu_backwardleaky_relu_backwardsigmoid_backwardthreshold_backwardhardtanh_backwardhardsigmoid_backwardhardswish_backwardtanh_backwardsilu_backwardelu_backwardcudnn_batch_normcudnn_batch_norm_backwardmasked_fillScalarr   elu
leaky_reluhardtanh	hardswishhardsigmoidconj_physicalis_same_sizer   r   r   r   r   r   r   r   r   rT   r#   r!   <module>r     s]     	   %  "     $ $  , > G G *  g!
 ? ? +R^^ +h + +\H 	bnn 	H 	 	4r~~ 4n &BNN &( & &  . yy~~KK""HHOOMMNN- 2 ,,BC   
$*h		!"$*N": >YzTr#   