import torch._C


def format_time(time_us=None, time_ms=None, time_s=None):
    """Define time formatting."""
    assert sum([time_us is not None, time_ms is not None, time_s is not None]) == 1

    US_IN_SECOND = 1e6
    US_IN_MS = 1e3

    if time_us is None:
        if time_ms is not None:
            time_us = time_ms * US_IN_MS
        elif time_s is not None:
            time_us = time_s * US_IN_SECOND
        else:
            raise AssertionError("Shouldn't reach here :)")

    if time_us >= US_IN_SECOND:
        return f"{time_us / US_IN_SECOND:.3f}s"
    if time_us >= US_IN_MS:
        return f"{time_us / US_IN_MS:.3f}ms"
    return f"{time_us:.3f}us"


class ExecutionStats:
    def __init__(self, c_stats, benchmark_config):
        self._c_stats = c_stats
        self.benchmark_config = benchmark_config

    @property
    def latency_avg_ms(self):
        return self._c_stats.latency_avg_ms

    @property
    def num_iters(self):
        return self._c_stats.num_iters

    @property
    def iters_per_second(self):
        """Return total number of iterations per second across all calling threads."""
        return self.num_iters / self.total_time_seconds

    @property
    def total_time_seconds(self):
        return self.num_iters * (
            self.latency_avg_ms / 1000.0) / self.benchmark_config.num_calling_threads

    def __str__(self):
        return "\n".join([
            "Average latency per example: " + format_time(time_ms=self.latency_avg_ms),
            f"Total number of iterations: {self.num_iters}",
            f"Total number of iterations per second (across all threads): {self.iters_per_second:.2f}",
            "Total time: " + format_time(time_s=self.total_time_seconds),
        ])


class ThroughputBenchmark:
    """
    This class is a wrapper around the C++ component throughput_benchmark::ThroughputBenchmark.

    This wrapper around the throughput_benchmark::ThroughputBenchmark component is responsible
    for executing a PyTorch module (nn.Module or ScriptModule) under an inference
    server-like load. It can emulate multiple calling threads to a single provided
    module. In the future we plan to enhance this component to support inter- and
    intra-op parallelism as well as multiple models running in a single process.

    Please note that even though nn.Module is supported, it might incur an overhead
    from the need to hold the GIL every time we execute Python code or pass
    inputs around as Python objects. As soon as you have a ScriptModule version of
    your model for inference deployment, it is better to switch to using it in this
    benchmark.
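
    For instance, a model can be scripted before benchmarking (a minimal sketch; it
    assumes my_module is an nn.Module instance you already have)::

        scripted_module = torch.jit.script(my_module)
        bench = ThroughputBenchmark(scripted_module)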

    Example::

        >>> # xdoctest: +SKIP("undefined vars")
        >>> from torch.utils import ThroughputBenchmark
        >>> bench = ThroughputBenchmark(my_module)
        >>> # Pre-populate benchmark's data set with the inputs
        >>> for input in inputs:
        ...     # Both args and kwargs work, same as any PyTorch Module / ScriptModule
        ...     bench.add_input(input[0], x2=input[1])
        >>> # Inputs supplied above are randomly used during the execution
        >>> stats = bench.benchmark(
        ...     num_calling_threads=4,
        ...     num_warmup_iters=100,
        ...     num_iters=1000,
        ... )
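        >>> # (Illustrative addition) stats is an ExecutionStats object; printing it
        >>> # gives a formatted summary, and derived metrics are exposed as properties
        >>> print(stats)
        >>> print("Iterations per second: {}".format(stats.iters_per_second))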
        >>> print("Avg latency (ms): {}".format(stats.latency_avg_ms))
        >>> print("Number of iterations: {}".format(stats.num_iters))
    """

    def __init__(self, module):
        if isinstance(module, torch.jit.ScriptModule):
            self._benchmark = torch._C.ThroughputBenchmark(module._c)
        else:
            self._benchmark = torch._C.ThroughputBenchmark(module)

    def run_once(self, *args, **kwargs):
        """
        Given an input id (input_idx), run the benchmark once and return the prediction.

        This is useful for testing that the benchmark actually runs the module you
        want it to run. input_idx here is an index into the inputs array populated
        by calling the add_input() method.
        """
        return self._benchmark.run_once(*args, **kwargs)

    def add_input(self, *args, **kwargs):
        """
        Store a single input to a module into the benchmark memory and keep it there.

        During the benchmark execution every thread is going to pick up a
        random input from all the inputs ever supplied to the benchmark via
        this function.
        """
        self._benchmark.add_input(*args, **kwargs)

    def benchmark(
            self,
            num_calling_threads=1,
            num_warmup_iters=10,
            num_iters=100,
            profiler_output_path=""):
        """
        Run a benchmark on the module.

        Args:
            num_calling_threads (int): Number of parallel threads that will be issuing
                calls to the module. Each thread runs its own warmup and then takes
                part in the shared measurement iterations.

            num_warmup_iters (int): Warmup iters are used to make sure we run a module
                a few times before actually measuring things. This way we avoid cold
                caches and any other similar problems. This is the number of warmup
                iterations run separately in each of the calling threads.

            num_iters (int): Number of iterations the benchmark should run with.
                This number is separate from the warmup iterations. It is also
                shared across all the threads: once num_iters iterations have been
                reached across all threads combined, execution stops. The total
                number of iterations might end up slightly larger; the actual value
                is reported as stats.num_iters, where stats is the result of this
                function.

            profiler_output_path (str): Location to save the Autograd Profiler trace.
                If not empty, the Autograd Profiler will be enabled for the main benchmark
                execution (but not for the warmup phase), and the full trace will be saved
                to the file path provided by this argument.
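
                For example (an illustrative call; bench is the ThroughputBenchmark
                instance from the class-level example and the output path is just a
                placeholder)::

                    stats = bench.benchmark(
                        num_calling_threads=4,
                        num_warmup_iters=100,
                        num_iters=1000,
                        profiler_output_path="/tmp/throughput_benchmark_trace",  # placeholder path
                    )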


        This function returns a BenchmarkExecutionStats object, which is defined via pybind11.
        It currently has two fields:
            - num_iters - number of actual iterations the benchmark has made
            - avg_latency_ms - average time it took to infer on one input example, in milliseconds
        """
        config = torch._C.BenchmarkConfig()
        config.num_calling_threads = num_calling_threads
        config.num_warmup_iters = num_warmup_iters
        config.num_iters = num_iters
        config.profiler_output_path = profiler_output_path
        c_stats = self._benchmark.benchmark(config)
        return ExecutionStats(c_stats, config)
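

# The block below is an illustrative, self-contained usage sketch and is not part of
# the original module: it assumes a tiny nn.Module (the name _DemoModel is made up for
# this example) and walks through the scripted-module / add_input() / benchmark() flow.
if __name__ == "__main__":
    import torch
    from torch import nn

    class _DemoModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(16, 16)

        def forward(self, x):
            return self.linear(x)

    # Script the model first to avoid holding the GIL inside the benchmark threads.
    scripted = torch.jit.script(_DemoModel())
    bench = ThroughputBenchmark(scripted)

    # Pre-populate the input pool; each benchmark thread samples randomly from it.
    for _ in range(10):
        bench.add_input(torch.randn(8, 16))

    stats = bench.benchmark(
        num_calling_threads=4,
        num_warmup_iters=100,
        num_iters=1000,
    )
    print(stats)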