
    ɯwgW                        U d Z ddlZddlZddlZddlZddlmZmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZ ddlZddlmZ ddlmc mc mZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( dd	l)m*Z* d
dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 erddl2m3Z3 ddl4m5Z5 d
dlm6Z6 dZ7e7dz   Z8dZ9 ejt                         Z;ejt                  e<d<    G d d      Z= G d de=      Z> G d de$      Z?de!j                  dee?   fdZAde!j                  dee?   fdZB G d de      ZC G d d e      ZDd!e?fd"ZEed!e?de!j                  ded#   fd$       ZFed!e?de!j                  deGfd%       ZHd& ZId'eJdeJfd(ZKd)ej                  ddfd*ZMd)ej                  deGfd+ZNde!j                  d,edeeeJe!j                  f      fd-ZP	 dEd.ejB                  j                  d/eGdee!j                  eeJ   f   fd0ZQed!e?d1d#d2ej                  ddfd3       ZSed!e?d1d#deeeJ      fd4       ZT	 dFd5ejB                  j                  d6ed7ed8eeeJ      fd9ZUed!e?d:eeC   ddfd;       ZVd<ee!j                     dee!j                     fd=ZWd>difd?ejB                  j                  d@eee!j                        dAeeJef   deee!j                        fdBZXd)ej                  dCej                  ddfdDZZy)Gz7
This file includes private common utilities for FSDP.
    N)autoEnum)partial)AnyCallablecastDict	GeneratorIterableListno_type_checkOptionalSetTupleTypeTYPE_CHECKING)_get_module_state_State)_CHECKPOINT_PREFIX)_apply_to_tensors)no_dispatch   )FullOptimStateDictConfigFullStateDictConfigOptimStateDictConfigShardingStrategyStateDictConfigStateDictType)
DeviceMesh)FSDPExtensions)FlatParamHandle_fsdp_wrapped_module._fsdp_flattened_MODULE_TO_INP_DTYPEc                   t    e Zd ZdZd
dej
                  defdZedej
                  dd fd       Z	de
defd	Zy)_FSDPDeviceHandlez
    This is a simple abstraction for FSDP computing devices,
    which enables custom backends that implement CUDA-like
    semantics to be integrated with FSDP.
    Ndevicebackendc                     |(	 t        t        |j                        | _        || _        y || _        y # t
        $ r"}t        d| d|j                   d      |d }~ww xY w)NzDevice 'z=' does not have a corresponding backend registered as 'torch.z'.)getattrtorchtype_FSDPDeviceHandle__backend_FSDPDeviceHandle__deviceAttributeError)selfr(   r)   excs       i/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/torch/distributed/fsdp/_common_utils.py__init__z_FSDPDeviceHandle.__init__K   sl    ?!(!< & %DN " $vh&cdjdodocpprss   &3 	AAAreturnc                     |j                   dk(  rt        t        t        j                        S |j                   dk(  rt        t        t        j
                        S  | |      S )ak  
        Return a device handle corresponding to the device, and through this handle,
        operations with the same semantics as CUDA can be performed on the device.
        Just return torch.cuda if the device is cuda to make attribute-access faster.
        Custom backend must first register a module with the same name with {device.type} on torch.
        cudamtia)r-   r   r'   r,   r7   r8   )clsr(   s     r3   from_devicez_FSDPDeviceHandle.from_deviceW   sJ     ;;& )5::66[[F")5::666{    _FSDPDeviceHandle__namec           
          	 t        | j                  |      S # t        $ rC}t        d| j                  j                   d| j                  j                   d| d      |d }~ww xY w)NzCustom backend 'z' not implement 'torch.r#   ')r+   r.   r0   r/   r-   )r1   r<   r2   s      r3   __getattr__z_FSDPDeviceHandle.__getattr__e   sk    	4>>622 	 "4==#5#5"66MdmmN`N`Maabcibjjkl	s    	A$>AA$N)__name__
__module____qualname____doc__r,   r(   r   r4   classmethodr:   strr?    r;   r3   r'   r'   D   sX    
%u|| 
%c 
%  2E  # # r;   r'   c                   $    e Zd ZddZdedefdZy)_UninitializedDeviceHandler5   Nc                      y r@   rG   r1   s    r3   r4   z#_UninitializedDeviceHandle.__init__o   s    r;    _UninitializedDeviceHandle__namec                     t        d      )Nz-Trying to use an uninitialized device handle.)RuntimeError)r1   rL   s     r3   __getattribute__z+_UninitializedDeviceHandle.__getattribute__r   s    JKKr;   r5   N)rA   rB   rC   r4   rF   r   rO   rG   r;   r3   rI   rI   n   s    Ls Ls Lr;   rI   c                       e Zd ZddZy)
_FSDPStateNc                 2   t               | _        t               | _        t               | _        d | _        d| _        d| _        d | _        t        j                  | _
        d| _        t        j                  | _        i | _        t         j"                  | _        t'               | _        t+               | _        d | _        d | _        i | _        d | _        d| _        d| _        d | _        d | _        d | _        tA               | _!        g | _"        g | _#        d | _$        y )NFr   )%set_ignored_modules_ignored_params_ignored_buffer_namesprocess_grouprank
world_size_device_meshr   
FULL_SHARDsharding_strategy_use_orig_paramsTrainingStateIDLEtraining_state_unshard_params_ctxr   FULL_STATE_DICT_state_dict_typer   _state_dict_configr   _optim_state_dict_config_is_root_handle_fully_sharded_module_to_handlecompute_device_gradient_predivide_factor_gradient_postdivide_factor
_comm_hook_comm_hook_state_unshard_eventrI   _device_handle_all_fsdp_states_all_handles_fsdp_extensionrK   s    r3   r4   z_FSDPState.__init__w   s    1425%/2u":>	!26!1!<!<&++00?A /</L/L3F3H>V>X%(,BF  	, 7;/0'01(.2/359 2L1M 35CE9=r;   rP   )rA   rB   rC   r4   rG   r;   r3   rR   rR   v   s    $>r;   rR   moduler5   c                 B    t        |       }|t        |t              sy |S r@   )r   
isinstancerR   ru   states     r3   _get_module_fsdp_staterz      s"    f%E}Juj9Lr;   c                 N    t        |       }|y || k(  r|S | |j                  v r|S y r@   )rz   rj   rx   s     r3   ._get_module_fsdp_state_if_fully_sharded_moduler|      s8     #6*E}666r;   c                   :    e Zd ZdZ e       Z e       Z e       Zy)r`   zU
    An enum that indicates the state of a ``FullyShardedDataParallel` instance.
    N)rA   rB   rC   rD   r   ra   FORWARD_BACKWARDSUMMON_FULL_PARAMSrG   r;   r3   r`   r`      s      6Dvr;   r`   c                   V    e Zd ZdZ e       Z e       Z e       Z e       Z e       Z	y)HandleTrainingStatezC
    An enum that indicates the state of a ``FlatParamHandle`.
    N)
rA   rB   rC   rD   r   ra   FORWARDBACKWARD_PREBACKWARD_POSTr   rG   r;   r3   r   r      s-     6DfG6LFMr;   r   ry   c                 8    t        | t        j                         S r@   )rw   nnModule)ry   s    r3   _is_composabler      s    %+++r;   r!   c                     t        |       rA| j                  y|| j                  v sJ d| d| j                          | j                  |   S |j                  S )z
    Returns the ``FlatParamHandle`` s corresponding to ``module``. This is
    the handle that contains some parameter in ``module``.
    Nz'Expects a fully sharded module but got z	 on rank )r   ri   rj   rZ   ry   ru   s     r3   _module_handler      si     e == e;;;	S4VHIejj\R	S;44V<< ~~r;   c                     t        | |      duS )z5Returns if ``module`` has parameters managed by FSDP.N)r   r   s     r3   _has_fsdp_paramsr      s     %(44r;   c                 "    | r| j                   S dS )z6
    Returns the sharding strategy of the handle.
    N)_sharding_strategy)handles    r3   _get_sharding_strategyr      s     )/6$$8D8r;   tensor_namec                 ^    | j                  t        d      } | j                  t        d      } | S )zZ
    Cleans the parameter or buffer name by removing any module wrapper
    prefixes.
     )replaceFSDP_PREFIXr   )r   s    r3   clean_tensor_namer      s1    
 %%k26K %%&8"=Kr;   tensorc                 &    t        | t        d       y)z
    Sets an attribute on ``tensor`` to mark it as flattened by FSDP. This is to
    avoid re-flattening it during nested construction.
    TN)setattrFSDP_FLATTENEDr   s    r3   _set_fsdp_flattenedr      s    
 FND)r;   c                 $    t        | t        d      S )z;Returns if ``tensor`` has been marked as flattened by FSDP.F)r+   r   r   s    r3   _is_fsdp_flattenedr     s    6>511r;   kwargsc                     d|vsJ d       d|d<   	 t         | j                  di |      }|S # t        $ r7}|j                  d       t         | j                  di |      }Y d}~|S d}~ww xY w)zx
    This API is required as some modules overwrite `named_parameters()` but do not support
    `remove_duplicate`.
    remove_duplicatezR_named_parameters_with_duplicates cannot be used with `remove_duplicate` argument.FNrG   )listnamed_parametersAssertionErrorpop)ru   r   retes       r3   !_named_parameters_with_duplicatesr     s     	&(\[\(!&F6*6**4V45 J  6

%&*6**4V45J6s   / 	A/,A**A/modeldedup_shared_paramsc                 z    fd}d }i }t        | ||t        |       D cg c]  \  }}|	 c}}|      S c c}}w )aU  
    Constructs a mapping from parameter to a list of its "canonical" FQNs. Here,
    we use canonical to mean the fully-qualified name assigned to the parameter
    based on its position in the original nn.Module hierarchy before any wrapper
    or parallelism has been applied to it. This is in contrast to FQNs that may be
    generated after parallelisms or wrappers have been applied to the model.

    Each normal parameter maps to a singleton list containing its FQN, while each
    ``FlatParameter`` maps to a list of its original parameter FQNs, which may
    have length greater than one.  All FQNs are prefixed starting from ``model``.

    In the case where FSDP was applied with ``use_orig_params=True``, there should be no
    ``FlatParameter`` s registered to the model's modules and this mapping will only
    contain mappings from ``nn.Parameter`` s to singleton FQN lists.

    It is only in the case where FSDP was applied with ``use_orig_params=False`` where
    a ``FlatParameter`` will be registered in place of the original parameters and there
    will be mappings from each ``FlatParameter`` to lists of FQNs corresponding to the
    original parameters.

    Args:
        model (torch.nn.Module): Root module (which may or may not be a
            :class:`FullyShardedDataParallel` instance).
        dedup_shared_params (bool): For shared parameters, if ``True``, only
            includes the FQNs corresponding to the first encounter of the
            shared parameter in the module traversal; if ``False``, then
            includes the FQNs across all encounters. (Default: ``True``)
    c                 r   t        | d      D ]  \  }}t        |t        j                        r|j                  n|g}|D cg c]  }t        ||z          }}||v }	|	s|||<   Vt        |t        j                        rt        j                  d       |||<   
r||   j                  |        y c c}w )NF)recursezFlatParameter is being traversed more than once. This case should only happen when using DistributedModelParallel with FullyShardedDataParallel.)	r   rw   flat_param_fileFlatParameter_fqnsr   warningswarnextend)ru   prefix
tree_levelparam_to_fqns
param_nameparam
local_fqnsnameglobal_fqnsis_shared_paramr   s             r3   	module_fnz%_get_param_to_fqns.<locals>.module_fn<  s    !BE"
 "	=J
 e_%B%BC  \  >H59!&4-0K  $}4O"'2e$e_%B%BC MMR
 ,7M%(,!%(//<E"	=s   B4c                     | S r@   rG   )r   s    r3   	return_fnz%_get_param_to_fqns.<locals>.return_fna  s    r;   )_apply_to_modulesr   )r   r   r   r   param_to_unflat_param_nameskey_s    `     r3   _get_param_to_fqnsr     sO    B#=J HJ<UCDaD#  	Es   7r   loggerc                     | j                   rG|j                  t        j                  j                  k(  rt        | |      }|j                  d|       y y y )Nz1FSDP firing post-backward hooks for parameters %s)r_   _debug_leveldist
DebugLevelINFO_get_handle_fqns_from_rootwarning)ry   r   r   
param_fqnss       r3   _log_post_backward_hookr   n  sH     &"5"59M9M"M/v>
JJW #Nr;   c                     |y | j                   j                  }|j                  j                  }|D cg c]  }||   	 c}D cg c]  }|D ]  }|  }}}|S c c}w c c}}w r@   )_exec_order_dataparam_to_fqn
flat_param_params)ry   r   r   handle_paramspfqn_listfqnr   s           r3   r   r   z  s}     ~))66L%%--M6CD\!_DPXILJ   Es   AA root_moduler   r   filter_fqnsc                     dt         j                  j                  dt        dt        ffd | ddg|i |  ||i |S )a"  
    Performs a pre-order traversal of the modules in the hierarchy rooted at
    ``root_module``, applying ``module_fn`` at each module and finally
    returning a value using ``return_fn``. The traversal constructs the full
    module prefix name (e.g. "module.submodule." just like in model state dict)
    and makes that available to ``module_fn``.

    ``filter_fqns`` is used because some module may have its own prefix similar
    to ``FullyShardedDataParallel`` and the ``named_parameters()`` is overwritten
    to remove the prefix.
    ru   r   r   c                      | ||g|i | | j                         D ]S  \  }}|	||z   dz   }|dz   }.D ]  }	|	j                  |      s n |dk(  s|dk(  r|}n|dk(  r|} 
|||g|i | U y )Nr#   r   r"   _dmp_wrapped_moduleru   )named_children
startswith)ru   r   r   argsr   submodule_name	submodule
new_prefixnew_tree_levelr   fr   r   s             r3   r   z_apply_to_modules.<locals>.f  s    &&*>t>v>)/)>)>)@ 	F%NI .036J'!^N&& ,C~~j1, '*@@)-BB%+
'83%+
i^EdEfE/	Fr;   r   r   )r,   r   r   rF   int)r   r   r   r   r   r   r   s    ` `  @r3   r   r     sQ    (F%((// F3 FC F8 k2q*4*6*d%f%%r;   training_statesc                    | j                   |vrvd| d| j                    }| j                  dk(  rJt        | t        j                        rt        d|         t        d|        t        j                          t        |      y)z8Asserts that FSDP is in the states ``_training_states``.zexpected to be in states z but current state is r   zAsserting FSDP instance is: zERROR: N)	rb   rZ   rw   r   r   print	tracebackprint_stack
ValueError)ry   r   msgs      r3   _assert_in_training_statesr     s     ?2''88N##$& 	
 ::?%+4UG<=GC5/"!!#o 3r;   modulesc                     t               }| D ci c]  }|t        |j                                }}| D ]@  }d}|j                         D ]  \  }}||uxr ||v }|sd} n |s0|j                  |       B |S c c}w )a$  
    Returns:
        Set[nn.Module]: The subset of ``modules`` that are root modules (i.e.
        parent-less) with respect to the modules in the set itself. In other
        words, these are the modules in ``modules`` that are not the child of
        any other module in ``modules``.
    TF)rU   r   itemsadd)r   root_modulesru   module_to_submodulescandidate_moduleis_root_module
submodulesis_child_modules           r3   _get_root_modulesr     s     $'5LHOPfFC(8$99PP# 
/"6"<"<"> 	FJ .Q3Cz3Q  !&	 -.
/  Qs   !A9mixed_precisionrootmodule_classes_to_overridewrap_override_dictc           	         t        t        |            }t               }| j                         D ]  }t        ||      s|j	                  t        |             ||_        dt        j                  dt        j                  dt        j                  dt        j                  fdfd}fd}|j                  |d	       |j                  |d	        |S )
Ndtyperu   xr5   c                     t        j                  |      r|j                  | k(  r|S |j                  t        |<   |j	                  |       S r@   )r,   is_floating_pointr   r%   to)r   ru   r   s      r3   cast_fnz1_override_module_mixed_precision.<locals>.cast_fn  s?     ..q1QWW5EH/0ww$V,ttE{"r;   c                 N    t        t        t        j                  |       |      S r@   )r   r   r,   float32)ru   r   r  s     r3   forward_pre_hookz:_override_module_mixed_precision.<locals>.forward_pre_hook  s    (%--)PRVWWr;   c                 V    | t         v r t         |    }t        t        ||       |      S y r@   )r%   r   r   )ru   r   output	old_dtyper  s       r3   forward_post_hookz;_override_module_mixed_precision.<locals>.forward_post_hook  s8     11 4V <I,F;V  2r;   F)prepend)tuplerU   r   rw   r   r-   _wrap_overridesr,   r   r   r   Tensorregister_forward_pre_hookregister_forward_hook)r   r   r   overridden_module_classesmodr  r  r  s          @r3    _override_module_mixed_precisionr    s    
 "'s+E'F!G69e||~ #Hc56%))$s)4"4C#{{#,.II#:?,,##X ))*:E)J%%&7%GG#HH %$r;   streamc                 &   | j                   j                  ddt        j                  j	                         fvry t        j
                  j                  j                         ry t               5  | j                  |       d d d        y # 1 sw Y   y xY w)Nr7   r8   )
r(   r-   r,   _C_get_privateuse1_backend_namedistributed_functional_collectivesis_torchdynamo_compilingrecord_streamr   )r   r  s     r3   _no_dispatch_record_streamr    sy    }}..0" 
 	00IIK ] 	)  (	) 	) 	)s   ,BB)Tr@   )[rD   loggingr   r   weakrefenumr   r   	functoolsr   typingr   r   r   r	   r
   r   r   r   r   r   r   r   r   r,   torch.distributedr  r   "torch.distributed.fsdp._flat_paramfsdp_flat_paramr   torch.nnr   #torch.distributed._composable_stater   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.utilsr   torch.utils._mode_utilsr   apir   r   r   r   r   r   torch.distributed.device_meshr   'torch.distributed.fsdp._fsdp_extensionsr    r!   FSDP_WRAPPED_MODULEr   r   WeakKeyDictionaryr%   __annotations__r'   rI   rR   r   rz   r|   r`   r   r   r   boolr   r   rF   r   r  r   r   	Parameterr   r   Loggerr   r   r   r   r   r  Streamr  rG   r;   r3   <module>r6     s                < <  I 6 /  8F,, !C'" 3L'2K2K2M g// M' 'TL!2 L%> %>P299 *1E 
II
j
 D  	 $ 	 ,* ,
 * bii HEV<W  & 5J 5		 5d 5 5
9
3 
3 
* * *2u|| 2 2
II!$	%R\\!
"#* !%P88??PP 
",,S	
!"Pf XX0X:A..X	X X 

0
d3i
 
" (,	1&1&1& 1& $s)$	1&h -( 
 *s299~ #bii. 6 +<T)B,%
((//,% (bii 9,% S#X,% 	bii	,%^)u|| )U\\ )d )r;   