
import contextlib
import threading
from typing import Callable, Generator, Iterable, Optional, Union

from .custom_ops import custom_op
from .infer_schema import infer_schema


def triton_op(
    name: str,
    fn: Optional[Callable] = None,
    /,
    *,
    mutates_args: Union[str, Iterable[str]],
    schema: Optional[str] = None,
) -> Callable:
    """Create a custom operator whose implementation is backed by 1+ triton kernels.

    Use this instead of :func:`torch.library.custom_op` when the implementation
    consists of 1+ triton kernels. :func:`torch.library.custom_op` treats
    custom operators as opaque (:func:`torch.compile` and
    :func:`torch.export.export` will never trace into them), but ``triton_op``
    makes the implementation visible to these subsystems, allowing them
    to optimize the triton kernel(s).

    Note that ``fn`` must only consist of calls to PyTorch-understood
    operators and triton kernels. Any triton kernels called inside ``fn``
    must be wrapped in a call to :func:`torch._library.capture_triton`.

    Args:
        name (str): A name for the custom op that looks like "{namespace}::{name}",
            e.g. "mylib::my_linear". The name is used as the op's stable identifier
            in PyTorch subsystems (e.g. torch.export, FX graphs).
            To avoid name collisions, please use your project name as the namespace;
            e.g. all custom ops in pytorch/fbgemm use "fbgemm" as the namespace.
        mutates_args (Iterable[str] or "unknown"): The names of args that the function mutates.
            This MUST be accurate, otherwise, the behavior is undefined. If "unknown",
            it pessimistically assumes that all inputs to the operator are being mutated.
        schema (None | str): A schema string for the operator. If None
            (recommended) we'll infer a schema for the operator from its type
            annotations. We recommend letting us infer a schema unless you
            have a specific reason not to.
            Example: "(Tensor x, int y) -> (Tensor, Tensor)". A minimal
            sketch of passing an explicit schema follows this list.
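
    A minimal sketch of supplying an explicit ``schema`` instead of relying on
    inference; the operator name and signature below are illustrative, not part
    of this module::

        >>> # xdoctest: +SKIP
        >>> @triton_op(
        >>>     "mylib::my_add",
        >>>     mutates_args={},
        >>>     schema="(Tensor x, Tensor y) -> Tensor",
        >>> )
        >>> def my_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        >>>     ...  # launch the triton kernel as in the example below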

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> import torch
        >>> from torch._library import triton_op, capture_triton
        >>>
        >>> import triton
        >>> from triton import language as tl
        >>>
        >>> @triton.jit
        >>> def add_kernel(
        >>>     in_ptr0,
        >>>     in_ptr1,
        >>>     out_ptr,
        >>>     n_elements,
        >>>     BLOCK_SIZE: "tl.constexpr",
        >>> ):
        >>>     pid = tl.program_id(axis=0)
        >>>     block_start = pid * BLOCK_SIZE
        >>>     offsets = block_start + tl.arange(0, BLOCK_SIZE)
        >>>     mask = offsets < n_elements
        >>>     x = tl.load(in_ptr0 + offsets, mask=mask)
        >>>     y = tl.load(in_ptr1 + offsets, mask=mask)
        >>>     output = x + y
        >>>     tl.store(out_ptr + offsets, output, mask=mask)
        >>>
        >>> @triton_op("mylib::add", mutates_args={})
        >>> def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        >>>     output = torch.empty_like(x)
        >>>     n_elements = output.numel()
        >>>
        >>>     def grid(meta):
        >>>         return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
        >>>
        >>>     # NB: we need to wrap the triton kernel in a call to capture_triton
        >>>     capture_triton(add_kernel)[grid](x, y, output, n_elements, 16)
        >>>     return output
        >>>
        >>> @torch.compile
        >>> def f(x, y):
        >>>     return add(x, y)
        >>>
        >>> x = torch.randn(3, device="cuda")
        >>> y = torch.randn(3, device="cuda")
        >>>
        >>> z = f(x, y)
        >>> assert torch.allclose(z, x + y)

    r   r   c           	            fd}t        |t                     }ddlm} |j	                           fd}|j                  ||       |S )Nc                  V    t        d      5   | i |cd d d        S # 1 sw Y   y xY w)NF)set_capture_triton_enabled)argskwargsr   s     ]/home/mcse/projects/flask_80/flask-venv/lib/python3.12/site-packages/torch/_library/triton.py
backend_fnz*triton_op.<locals>.dec.<locals>.backend_fn`   s/     ,E2 +4*6*+ + +s   ()r   )r   r      )FunctionalTensorModec                 D    | 5   |i |cd d d        S # 1 sw Y   y xY wN )mode_typesr   r   r   s        r   functional_decompz1triton_op.<locals>.dec.<locals>.functional_decompz   s*      +4*6*+ + +s   )r	   r
   _subclasses.functional_tensorr   register_fakeregister_torch_dispatch)r   r   resultr   r    r   r   s   `    r   decztriton_op.<locals>.dec_   s[    	+ %>	
 	I 	R 	+ 	&&';=NO    )r   )r   r   r   r   r%   s   ` `  r   	triton_opr'   	   s-    l" "X "H 
z
2wr&   Tenabled)NNNc              #   v   K   	 t               }| t        _        d |t        _        y# t        _        w xY ww)aU  If triton kernels annotated with @capture_triton should dispatch via HOP
    or go straight to the triton kernel execution.

    We have this switch because eager-mode performance of HOP dispatch is slow
    enough to matter (~1ms) and we know that capture_triton isn't necessary in
    some situations (eager-mode with regular Tensors).
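
    A minimal usage sketch (``add_kernel`` is a hypothetical ``@triton.jit``
    kernel, as in the :func:`triton_op` example)::

        >>> # xdoctest: +SKIP
        >>> with set_capture_triton_enabled(False):
        >>>     # With capture disabled, capture_triton returns the kernel
        >>>     # object unchanged, so it is launched directly (no HOP dispatch).
        >>>     assert capture_triton(add_kernel) is add_kernel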
    N)is_capture_triton_enabledcapture_triton_enabledvalue)r(   prevs     r   r   r      s1     ,(*'.$'+$t$s   9) 969c                  ,    t        t        dt              S )Nr,   )getattrr+   capture_triton_enabled_defaultr   r&   r   r*   r*      s    )74RSSr&   triton_kernelc                    ddl m} ddlm} ddlm} t        | ||f      st        d      t               s| S  || dd      S )a(	  Allows capture of a triton kernel into a graph via make_fx or
    non-strict export (coming soon).

    These technologies perform Dispatcher-based tracing (via
    ``__torch_dispatch__``) and cannot see calls to raw triton kernels.
    The ``capture_triton`` API returns a new callable that can actually
    be traced into a graph.

    Examples:

        >>> # xdoctest: +SKIP
        >>> import torch
        >>> import triton
        >>> from triton import language as tl
        >>> from torch.fx.experimental.proxy_tensor import make_fx
        >>> from torch._higher_order_ops.triton_kernel_wrap import capture_triton
        >>>
        >>> @triton.jit
        >>> def add_kernel(
        >>>     in_ptr0,
        >>>     in_ptr1,
        >>>     out_ptr,
        >>>     n_elements,
        >>>     BLOCK_SIZE: "tl.constexpr",
        >>> ):
        >>>     pid = tl.program_id(axis=0)
        >>>     block_start = pid * BLOCK_SIZE
        >>>     offsets = block_start + tl.arange(0, BLOCK_SIZE)
        >>>     mask = offsets < n_elements
        >>>     x = tl.load(in_ptr0 + offsets, mask=mask)
        >>>     y = tl.load(in_ptr1 + offsets, mask=mask)
        >>>     output = x + y
        >>>     tl.store(out_ptr + offsets, output, mask=mask)
        >>>
        >>> def add(x, y):
        >>>     output = torch.empty_like(x)
        >>>     n_elements = output.numel()
        >>>
        >>>     def grid_fn(meta):
        >>>         return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
        >>>
        >>>     capture_triton(add_kernel)[grid_fn](x, y, output, n_elements, 16)
        >>>     return output
        >>>
        >>> x = torch.randn(3, device="cuda")
        >>> y = torch.randn(3, device="cuda")
        >>> gm = make_fx(add)(x, y)
        >>> print(gm.code)
        >>> # def forward(self, x_1, y_1):
        >>> #     empty_like = torch.ops.aten.empty_like.default(x_1, pin_memory = False)
        >>> #     triton_kernel_wrapper_mutation_proxy = triton_kernel_wrapper_mutation(
        >>> #         kernel_idx = 0, constant_args_idx = 0,
        >>> #         grid = [(1, 1, 1)], kwargs = {
        >>> #             'in_ptr0': x_1, 'in_ptr1': y_1, 'out_ptr': empty_like,
        >>> #             'n_elements': 3, 'BLOCK_SIZE': 16
        >>> #         })
        >>> #     return empty_like
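
    ``capture_triton`` also accepts kernels wrapped with ``triton.autotune``;
    anything else raises a ``RuntimeError``. A minimal, illustrative sketch
    (``add_kernel_autotuned`` and its configs are hypothetical, not part of
    this module)::

        >>> # xdoctest: +SKIP
        >>> @triton.autotune(
        >>>     configs=[
        >>>         triton.Config({"BLOCK_SIZE": 128}),
        >>>         triton.Config({"BLOCK_SIZE": 256}),
        >>>     ],
        >>>     key=["n_elements"],
        >>> )
        >>> @triton.jit
        >>> def add_kernel_autotuned(
        >>>     in_ptr0, in_ptr1, out_ptr, n_elements, BLOCK_SIZE: "tl.constexpr"
        >>> ):
        >>>     pid = tl.program_id(axis=0)
        >>>     offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        >>>     mask = offsets < n_elements
        >>>     x = tl.load(in_ptr0 + offsets, mask=mask)
        >>>     y = tl.load(in_ptr1 + offsets, mask=mask)
        >>>     tl.store(out_ptr + offsets, x + y, mask=mask)
        >>>
        >>> # BLOCK_SIZE is supplied by the selected config, so it is omitted
        >>> # at the call site:
        >>> # capture_triton(add_kernel_autotuned)[grid_fn](x, y, output, n_elements)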

    r   )	Autotuner)JITFunction)TraceableTritonKernelWrapperzScapture_triton only works on functions annotated with triton.jit or triton.autotuneN)	triton.runtime.autotunerr3   triton.runtime.jitr4   *torch._higher_order_ops.triton_kernel_wrapr5   
isinstanceRuntimeErrorr*   )r1   r3   r4   r5   s       r   capture_tritonr;      sJ    x 3.Wmk9%=>a
 	
 %&'tTBBr&   r   )
contextlib	threadingtypingr   r   r   r   r   
custom_opsr	   r
   strr'   localr+   r0   contextmanagerboolr   r*   r;   r   r&   r   <module>rD      s      A A ! &
 "} !}
}}
 Xc]*+} SM} }@ )* !%  , ,;K1L , , T4 TGC( GC( GCr&   