
    Ǆgy6                     d   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ e
rd dlmZ d d	fd
efdZ e j&                  ed d      Z e j&                  ed d	      Ze G d d             Ze G d d             Z G d d      Z G d d      ZddZd d dfdZd Zd Zy)    N)deque)	dataclass)DictListTYPE_CHECKINGprofile)
DeviceType)_KinetoEventc                     | j                   S N)childrenxs    ]/home/mcse/projects/flask_80/flask-venv/lib/python3.12/site-packages/torch/profiler/_utils.py<lambda>r      s
    1::     Freversec              #      K   |rt         nd }t         ||             }|r4 ||      }|  | ||            D ]  }|j                  |        |r3y y w)Nc                     | S r    r   s    r   r   z_traverse.<locals>.<lambda>   s    q r   )reversedr   append)treenext_fnchildren_fnr   order	remaining
curr_eventchild_events           r   	_traverser!      s\     H[EeDk"I
Y'
 Z!89 	*K[)	* s   AAAc                 "    | j                         S r   )popr   s    r   r   r      s    aeeg r   T)r   r   c                 "    | j                         S r   )popleftr   s    r   r   r      s     r   c                   V    e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   e	d        Z
y)EventMetricsr   duration_time_nsself_time_nsidle_time_nsqueue_depthc                 T    | j                   dk(  ry| j                  | j                   z  S )Nr   g        )r(   r*   selfs    r   fraction_idle_timezEventMetrics.fraction_idle_time(   s*      A%  4#8#888r   N)__name__
__module____qualname__r(   int__annotations__r)   r*   r+   propertyr/   r   r   r   r'   r'   !   s=    cL#L#K9 9r   r'   c                   0    e Zd ZU eed<   eed<   dZeed<   y)Intervalstartendr   r+   N)r0   r1   r2   r3   r4   r+   r   r   r   r7   r7   /   s    J	HKr   r7   c                   6    e Zd Zd Zd Zd Zd Zdee   fdZ	y)EventKeyc                     || _         y r   event)r.   r>   s     r   __init__zEventKey.__init__7   s	    
r   c                 @    t        | j                  j                        S r   )hashr>   idr-   s    r   __hash__zEventKey.__hash__:   s    DJJMM""r   c                 \    | j                   j                  |j                   j                  k(  S r   )r>   rB   )r.   others     r   __eq__zEventKey.__eq__=   s    zz}}..r   c                 0    | j                   j                   S r   )r>   namer-   s    r   __repr__zEventKey.__repr__@   s    **//"#r   	intervalsc                    d}t        |d       }|rgt        | j                  j                  |d   j                        }t        | j                  j                  |d   j                        }||k  r|||z
  z  }d\  }}|t        |      k  r||   }||   }|dz  }|j                  |j                  kD  r2|j                  |j                  kD  r|dz  }U|j                  |_        |}t        | j                  j                  |j                        }t        | j                  j                  |j                        }||k  r|||z
  z  }|t        |      k  r|S )Nr   c                     | j                   S r   r8   r   s    r   r   z,EventKey.intervals_overlap.<locals>.<lambda>E   s
    AGG r   key)r      rP   )	sortedmaxr>   start_time_nsr8   minend_time_nsr9   len)	r.   rJ   overlap_timeoverlap_startoverlap_endijprev_intervalcurr_intervals	            r   intervals_overlapzEventKey.intervals_overlapC   sJ   9*;<	

 8 8)A,:L:LMMdjj44il6F6FGK{*m ;;1#i. %aLM%aLMFA  =#6#66 $$}'8'88FA*7*;*;M'A

 8 8-:M:MNMdjj44m6G6GHK{*m ;;! #i. $ r   N)
r0   r1   r2   r?   rC   rF   rI   r   r7   r^   r   r   r   r;   r;   6   s&    #/$4> r   r;   c                   B    e Zd ZdefdZd Zd Zd Zd Zdde	de
fd	Zy
)BasicEvaluationprofc                 X   || _         i | _        | j                          t        d | j                  j	                         D        d       | _        | j
                  D cg c]  }|j                   c}| _        g | _        | j                         | _
        | j                          y c c}w )Nc              3       K   | ]  }|  y wr   r   ).0es     r   	<genexpr>z+BasicEvaluation.__init__.<locals>.<genexpr>j   s     ,1Q,s   c                 .    | j                   j                  S r   )r>   rS   r   s    r   r   z*BasicEvaluation.__init__.<locals>.<lambda>j   s    AGG<Q<Q r   rN   )r	   metricscompute_self_timerQ   keys
event_keysr>   eventscuda_eventscompute_queue_depthqueue_depth_listcompute_idle_time)r.   ra   re   s      r   r?   zBasicEvaluation.__init__e   s    57  ,))+,2Q
 )-81qww8/1 $ 8 8 :  9s   B'c                 6   | j                   j                  J t        | j                   j                  j                               }|r|j	                         }|j
                  }|j                  D ]"  }||j
                  z  }|j                  |       $ t        |      | j                  vs!J d|j                   d|j                          t        |      | j                  t        |      <   |j
                  | j                  t        |         _        |ryy)zM
        Computes event's self time(total time - time in child ops).
        NzDuplicate id: z, )r)   )r	   kineto_resultsr   experimental_event_treer#   r(   r   r   r;   rh   rB   rH   r'   )r.   stackr   	self_timer    s        r   ri   z!BasicEvaluation.compute_self_timeq   s     ||**666dll11IIKL J"33I)22 *[999	[)* $DLL8C
b0ABC81=91UDLL*-. ",!<!< LL$ r   c                    | j                   j                  J | j                   j                  j                         }d d t        fd|D        d       }t        fd|D        d       }t        ||z   d       | _        i }d	}|D ]  t        |fd
|      }||<   ||n|}  d	}d}||z   | j                  z   }	d }
g }|	j                  |
       |	D ]  }t        |d      rE|j                         dz  }|j                         |j                         z   dz  }||v r
||   ||   }t        |d      r@|j                         }|j                         |j                         z   }||v r/||   *||   }n$t        |d      r|j                  }|j                  }|t        |      k  r@||   j                         k  r*|dz  }|t        |      k  r||   j                         |k  r*||z
  dz   }t        |d	      }t        |d      st        |d      r|j!                  t#        |             ]t        |d      sk|| j$                  t'        |         _         |S )z
        Computes queue_depth at each event. This will calculate the queue depth data for
        All the events in the tree.
        This will return a list of Interval of queue depth data of cuda launch and kernels.
        c                      | j                   dk(  S )NcudaLaunchKernel)rH   re   s    r   is_cuda_launch_kernelzBBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernel   s    66///r   c                     | j                         t        j                  k(  xr d| j                  j	                         vS )Nmem)device_typer
   CUDArH   lowerry   s    r   is_cuda_kernelz;BasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernel   s+    ==?joo5U%qvv||~:UUr   c              3   4   K   | ]  } |      s|  y wr   r   )rd   re   rz   s     r   rf   z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>   s     D1+@+CQD   c                 "    | j                         S r   start_nsr   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>       !**, r   rN   c              3   4   K   | ]  } |      s|  y wr   r   )rd   re   r   s     r   rf   z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>   s     =1>!+<Q=r   c                 "    | j                         S r   r   r   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   r   r   c                 "    | j                         S r   r   r   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   s    1::< r   r   c                 F    | j                         j                         k(  S r   )linked_correlation_id)r   cuda_launch_events    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   s!    !113$::<= r   rM   c                     t        | d      r| j                         dz  S t        | d      r| j                         S t        | d      r| j                  S t	        d      )Nstart_us  r   rS   zUnknown Event Type)hasattrr   r   rS   	Exceptionr=   s    r   new_old_event_comparatorzEBasicEvaluation.compute_queue_depth.<locals>.new_old_event_comparator   sW    uj)~~'$..uj)~~''uo.***011r   r   r   r   rS   rP   )r	   rr   rl   rQ   rm   index_of_first_matchsortr   r   duration_usr   duration_nsrS   rU   rV   rR   r   r7   rh   r;   r+   )r.   cuda_event_listcuda_launch_eventscuda_kernel_eventskernel_mappinglast_mapped_kernelindexcurrent_kernel_indexspawned_kernel_index
all_eventsr   ro   r>   
start_timeend_timecurrent_queue_depthr   r   rz   s                   @@@r   rn   z#BasicEvaluation.compute_queue_depth   s    ||**666,,55<<>	0	V $DD&
 $==&

 "!339O
 35!3 	T("=(	E 16N,-*/*;AS	T  !!'*<<t{{J
	2 ,.45  	PEuj)"^^-4
!NN,u/@/@/BBdJN*~e/D/P+9%+@(uj)"^^-
 >>+e.?.?.AAN*~e/D/P+9%+@(0"00
 ,, %s+='>>'(<=FFH %)$	 %s+='>>'(<=FFH #79M"MPQ"Q"%&91"=uj)WUJ-G ''Z3FG 0<OXe_-9A 	PD  r   c                    d}d}g }| j                   r| j                  rw|t        | j                  d   j                  | j                   d   j                        t        | j                   d   j
                  | j                  d   j                        gz  }| j                   D ][  }|j                  dk(  r|s|j
                  }d}|j                  dkD  s2|s5|j                  t        ||j                               d}] | j                  j                         D cg c]  }|j                   }}|D ]7  }t        |      j                  |      | j                  t        |         _        9 yc c}w )z4
        Computes idle time of the profile.
        Fr   r   TN)ro   rl   r7   rS   r8   r9   rU   r+   r   rh   rj   r>   r;   r^   r*   )r.   idle
idle_startidle_intervals
data_pointre   
event_listr>   s           r   rp   z!BasicEvaluation.compute_idle_time   sM   
 
)+  T[[Q55t7L7LQ7O7U7UV..r266B8S8ST N
 // 	J%%*4'^^
%%)d%%hz:;K;K&LM	 (,||'8'8':;!agg;
; 	0E9A:/ LL%)6	0 <s   E.c                    ddl }t        t        | j                              }|D cg c]  }|j                   }}dd}g }d}|t        |      k  r||   kD  r|dz  }t        |dz   t        |            D ]i  }	t        |fd|	      }
t        ||	|
      }|%||   |k\  s.|j                  t        ||   j                  ||   j                               |
|
n|} n |dz  }|t        |      k  r| j                  j                         D cg c]  }|j                  |      r| }}|r|j                  |D cg c]  }| j                  |   j                    c}|j"                        }|j                  |D cg c]  }| j                  |   j$                   c}|j"                        }||j'                  |      z
  |j)                  |      z  }||j'                  |      z
  |j)                  |      z  }|d	|z  z   }t+        t-        ||      t/        j0                  d      d
      D cg c]  \  }}|	 }}}|d| }|S c c}w c c}w c c}w c c}w c c}}w )a  
        Filter and Rank the events based on some heuristics:
        1) Events that are in the falling phase of the queue depth.
        2) Events that have a high idle_time, self_time difference.

        Parameters:
            length: The number of events to return.
        r   N   rP   c                     | k  S r   r   )r   bottom_threasholds    r   r   z-BasicEvaluation.rank_events.<locals>.<lambda>  s    .?)? r   rM   )r8   r9   )dtypeg333333?T)rO   r   )torchlistr   ro   r+   rV   ranger   argmaxr   r7   r8   rh   rj   r^   tensorr)   float32r/   meanstdrQ   zipoperator
itemgetter)r.   lengthr   ro   re   	qd_valuestop_threasholddecrease_intervalrZ   r[   next_minimum_idxpeak_idxr>   r   ru   	idle_timenormalized_gainnormalized_selfheuristic_score_list_r   s                       @r   rank_eventszBasicEvaluation.rank_events  s    	)>)> ?@,<=qQ]]=	=#i. |//Q1q5#i.1  $8?q$  ")1:JK 'Ih,?>,Q%,, ,X6<<>Nq>Q>W>W
 -=,H(aA!" FA+ #i. 2 **,
&&'89 

 

 ?IJee$11Jmm % I EOPEe$77Pmm % I  )5::i+@@EIIiDXXO(5::i+@@EIIiDXXO#2S?5J#J 
 !',j9 ++A. !Au J  $GV,Js >:
 K Qs   II I I:I$r   print_enablec                    | j                  |      }|s|S |rdnd}|dj                  |D cg c]@  }d d| dt        |j                         d| j                  |   j
                  dz  d	d
d 	B c}      z  }|rt        |       |S c c}w )NzOptimizable events:
zNo events to optimize

zP--------------------------------------------------------------------------------z
Event:                z
Source code location: z
Percentage idle time: d   z.2fz%
)r   joinsource_code_locationr>   rh   r/   print)r.   r   r   r   outputr>   s         r   get_optimizable_eventsz&BasicEvaluation.get_optimizable_eventsJ  s    %%f-
,6(<U$)) (  H g +EKK89 :||E*==CCH I		
 		
 &Ms   AB	
N)rP   T)r0   r1   r2   r	   r?   ri   rn   rp   r   r3   boolr   r   r   r   r`   r`   d   s9    
!W 
!=,\ |08GRS D r   r`   c                 z    ||t        |       k\  rt        |       }t        ||      D ]  } || |         s|c S  y r   )rV   r   )seq	predicater8   r9   rZ   s        r   r   r   _  sF    
{cSXo#h5# SVH r   c                     | S r   r   r   s    r   r   r   h  s    a r   c                 h    | || } t        |       dk(  ry | j                  t        | |            |z   S )Nr   rN   )rV   r   rR   )r   rO   r8   r9   s       r   r   r   h  s6    
eC.C
3x1}99S#&'%//r   c                 ~    | ;t        j                  d| j                        }|| j                  } 1| j                  S y)Nz
\.py\(.*\)zNo source code location found)researchrH   parent)r>   matchs     r   r   r   o  s:    

		-4=LLEzz*r   c                  J    ddl m}   |        5  	 d d d        y # 1 sw Y   y xY w)Nr   r   )torch.autograd.profilerr	   r   s    r   _init_for_cuda_graphsr   }  s"    /	   s   ")r   N)	functoolsr   r   collectionsr   dataclassesr   typingr   r   r   r   r	   torch.profilerr
   torch.autogradr   r   r!   partialtraverse_dfstraverse_bfsr'   r7   r;   r`   r   r   r   r   r   r   r   <module>r      s      	  ! , , + % + *>u * * !y  4EtT y  ,e
 
9 
9 
9   + +\x xv  qd 0+r   