
    ɯwgB                    L   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZmZ dd	lmZmZ dd
l m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl$m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z:m;Z;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZLmMZM erd dlNmOZO ddl+mPZPmQZQ  ej                  eS      ZTd ZU G d deV      ZW G d deC      ZX eX       j                  ZZ eC       j                  Z[ej                  d ej                  d!ej                  d"ej                  d#ej                  d$ej                  d%ej                  d&ej                  d'ej                  d(ej                  d)ej                  d*ej                  d+ej                  d,iZid- Zjd. Zk G d/ d0eB      Zld<d1Zm G d2 d3e?      Znej                   G d4 d5             Zpd6 Zqd7 Zr G d8 d9eL      Zs G d: d;eM      Zty)=    )annotationsN)defaultdict)inf)	AnyCallableDictListOptionalSequenceTupleTYPE_CHECKINGUnion   )is_integer_dtype)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandlerMockHandler)HalideInputSpec
HalideMetaReductionHint)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_ops
OpsHandlerV   )BackendFeatureCSEVariableDeferredLineIndentedBufferOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)
OrderedSet)ReductionType	StoreModec                "   t        | t              rVd| cxk  rdk  sKn t        j                  t        j                        }| |j
                  k(  ry| |j                  k(  ryd| dS t        | t              rdt        |        dS t        |       S )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchiinfoint64minmaxfloatr5   repr)valinfos     c/home/mcse/projects/flask/flask-venv/lib/python3.12/site-packages/torch/_inductor/codegen/halide.pyhalide_constantrI   E   s~    #s[C%E:%E{{5;;'$((?%$((?%q!!#us+,A..9    c                        e Zd Zd fdZ xZS )Unsupportedc                *    t         |   d|        y )Nz!halide backend does not support: )super__init__)selfthing	__class__s     rH   rO   zUnsupported.__init__S   s    <UGDErJ   returnNone)__name__
__module____qualname__rO   __classcell__rR   s   @rH   rL   rL   R   s    F FrJ   rL   c                       e Zd Zed        Zed        Zd Zd Zd Zd Z	e	Z
d Zd Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Z fdZd ZeZd Zd Z xZS )HalidePrinterc                D    dt         j                  j                   d|  dS )Nhl.cast(, r<   )r)   kernelindex_dtypeexprs    rH   
cast_indexzHalidePrinter.cast_indexX   s"    !((../r$q99rJ   c                    d|  dS )Nhl.cast(hl.Float(32), r<    rb   s    rH   
cast_floatzHalidePrinter.cast_float\   s    'vQ//rJ   c                    d| dS )Nhl.f32(r<   rg   rP   rc   s     rH   _print_FloatzHalidePrinter._print_Float`   s    a  rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   rj   r   r<   lenargs_printrk   s     rH   _print_ToFloatzHalidePrinter._print_ToFloatc   s9    499~"""TYYq\23155rJ   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr*   	hl.floor(r   r<   ro   rp   rd   rq   rk   s     rH   _print_floorzHalidePrinter._print_floorg   B    499~"""4;;tyy|+D*EQGHHrJ   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr*   	hl.trunc(r   r<   ru   rk   s     rH   _print_TrunczHalidePrinter._print_Trunck   rw   rJ   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr*   hl.ceil(r   r<   ru   rk   s     rH   _print_ceilingzHalidePrinter._print_ceilingq   sB    499~"""$++diil*C)DAFGGrJ   c                J    d| j                  | j                  |             dS Nzhl.sqrt(r<   )rh   rq   rk   s     rH   _helper_sqrtzHalidePrinter._helper_sqrtu   s$    $//$++d*;<=Q??rJ   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   r*   r   
hl.select(r_   r<   )doprintrp   )rP   rc   cpqs        rH   _print_WherezHalidePrinter._print_Wherex   s_    LL1&LL1&LL1&A3b2aS**rJ   c                h   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr*   r   r   hl.min(r_   r<   )ro   rp   rq   sympyMinrP   rc   midabs        rH   
_print_MinzHalidePrinter._print_Min~   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rJ   c                h   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }| j                  t        j                  |j                  d |        }| j                  t        j                  |j                  |d         }d| d| dS )Nr*   r   r   hl.max(r_   r<   )ro   rp   rq   r   Maxr   s        rH   
_print_MaxzHalidePrinter._print_Max   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rJ   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr*   hl.abs(r   r<   ru   rk   s     rH   
_print_AbszHalidePrinter._print_Abs   sB    499~"""TYYq\)B(C1EFFrJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   zhl.cos((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_cosz&HalidePrinter._print_OpaqueUnaryFn_cos   9    499~"""$++diil34A66rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   z	hl.cosh((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_coshz'HalidePrinter._print_OpaqueUnaryFn_cosh   9    499~"""4;;tyy|45Q77rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   z	hl.acos((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_acosz'HalidePrinter._print_OpaqueUnaryFn_acos   r   rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   zhl.sin((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_sinz&HalidePrinter._print_OpaqueUnaryFn_sin   r   rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   z	hl.sinh((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_sinhz'HalidePrinter._print_OpaqueUnaryFn_sinh   r   rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   z	hl.asin((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_asinz'HalidePrinter._print_OpaqueUnaryFn_asin   r   rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   zhl.tan((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_tanz&HalidePrinter._print_OpaqueUnaryFn_tan   r   rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   z	hl.tanh((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_tanhz'HalidePrinter._print_OpaqueUnaryFn_tanh   r   rJ   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr*   z	hl.atan((r   r<   rn   rk   s     rH   _print_OpaqueUnaryFn_atanz'HalidePrinter._print_OpaqueUnaryFn_atan   r   rJ   c                D   |j                   rt        | 	  |      S |j                  \  }}| j	                  | j                  | j                  |                  }| j	                  | j                  | j                  |                  }| j                  d| d| d      S )Nrt   z / r<   )
is_integerrN   _print_FloorDivrp   rh   parenr   rd   )rP   rc   xdivrR   s       rH   r   zHalidePrinter._print_FloorDiv   s    ??7*4003OODJJt||A78oodjjc):;<1#SQ788rJ   c                    t        |j                        dk(  sJ | j                  d| j                  |j                  d          d      S )Nr*   	hl.round(r   r<   ru   rk   s     rH   _print_RoundzHalidePrinter._print_Round   rw   rJ   c                2    |j                   \  }}d| d| dS )N() / (z+hl.f32(0)))rp   )rP   rc   r   r   s       rH   _print_IntTrueDivzHalidePrinter._print_IntTrueDiv   s$    yy11#U1#[))rJ   c                ~    |j                   \  }}| j                  |      }t        |      }dd| z  d| dd|z  dS )Nrj   g      $@z)*hl.round((z	)*hl.f32()))rp   rq   r>   )rP   rc   rF   ns       rH   _print_RoundDecimalz!HalidePrinter._print_RoundDecimal   sK    Qkk#Fr\#iQzLLrJ   ) rV   rW   rX   staticmethodrd   rh   rl   rr   rv   rz   _print_TruncToIntr}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _print_RoundToIntr   r   rY   rZ   s   @rH   r\   r\   W   s    : : 0 0!6II %H@+##G7887887889I %*
MrJ   r\   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)
hl.Int(32)
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                    t         |    S N)_halide_typedtypes    rH   halide_typer      s    rJ   c                    t        |       r/| j                  r#| t        j                  k7  rt        j                  } | t        j
                  t        j                  fv rt        j                  } t        |       S r   )	r   	is_signedr?   rA   int32float16bfloat16float32r   r   s    rH   halide_acc_typer      sM    5??u7K//urJ   c                     e Zd Ze	 	 dA	 	 	 dBd       ZedCd       Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@edDd>       ZAed?        ZBed@        ZCy)EHalideOverridesNc                X    |t         j                  k(  rd|  dS dt        |       d|  dS )Nr   z != 0)r^   r_   r<   )r?   boolr   )r   r   	src_dtypeuse_compute_typess       rH   to_dtypezHalideOverrides.to_dtype   s9     EJJqc= +e,-Rs!44rJ   c                    |t         j                  t         j                  fv rdt        |       d|  d} dt        |       d|  d}|t         j                  t         j                  fv rd| d}|S )Nr^   r_   r<   zhl.reinterpret(rf   )r?   r   r   r   )r   r   r   lines       rH   to_dtype_bitcastz HalideOverrides.to_dtype_bitcast  ss    77;y12"QCq9A U!3 4Bqc;U]]ENN33+D63DrJ   c                8    | j                  t        |      |      S r   )r   rI   )clsvaluer   s      rH   constantzHalideOverrides.constant
  s    ||OE2E::rJ   c                    d|  dS )Nr   r<   rg   r   s    rH   abszHalideOverrides.abs      1~rJ   c                R    t        | d      sd|  dS d|  d| j                   d|  dS )Nnamehl.exp(r<   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr   r   s    rH   expzHalideOverrides.exp  s=    q&!QCq>!3A3fQVVHDefgehhijjrJ   c                    d|  dS )Nr   r<   rg   r   s    rH   libdevice_expzHalideOverrides.libdevice_exp  r   rJ   c                    d|  dS r   rg   r   s    rH   sqrtzHalideOverrides.sqrt      !ArJ   c                    t        | d      s	d|  d| dS d| j                   d| d}d|  d| d	|  d
|  d| d| j                   d|  d| dS )Nr   r   r_   r<   r^   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r   r   r   s     rH   minimumzHalideOverrides.minimum        q&!QCr!A&&qvvhis!,QCq<s#aS1#U166(JijkillnopnqqrssrJ   c                    t        | d      s	d|  d| dS d| j                   d| d}d|  d| d	|  d
|  d| d| j                   d|  d| dS )Nr   r   r_   r<   r^   r   r   >r   r   r   z.type().is_float() else hl.max(r   r   s     rH   maximumzHalideOverrides.maximum(  r   rJ   c                X    t        |d      rd|j                   d| d}d|  d| d| dS )Nr   r^   r   r<   r   r_   r   )r   r   r   s      rH   wherezHalideOverrides.where0  s?    1f166()A3a0AA3b2aS**rJ   c                    d|  dS )Nzhl.cos(r<   rg   r   s    rH   coszHalideOverrides.cos6  r   rJ   c                    d|  dS )Nzhl.sin(r<   rg   r   s    rH   sinzHalideOverrides.sin:  r   rJ   c                    t        d      )NlgammarL   r   s    rH   r   zHalideOverrides.lgamma>      (##rJ   c                    d|  dS )Nzhl.erf(r<   rg   r   s    rH   erfzHalideOverrides.erfB  r   rJ   c                    d|  dS )Nzhl.cosh(r<   rg   r   s    rH   coshzHalideOverrides.coshF  r   rJ   c                    d|  dS )Nzhl.sinh(r<   rg   r   s    rH   sinhzHalideOverrides.sinhJ  r   rJ   c                    d|  dS )Nzhl.acos(r<   rg   r   s    rH   acoszHalideOverrides.acosN  r   rJ   c                    d|  dS )Nz	hl.acosh(r<   rg   r   s    rH   acoshzHalideOverrides.acoshR      1#QrJ   c                    d|  dS )Nzhl.asin(r<   rg   r   s    rH   asinzHalideOverrides.asinV  r   rJ   c                    d|  dS )Nz	hl.asinh(r<   rg   r   s    rH   asinhzHalideOverrides.asinhZ  r  rJ   c                    d|  d| dS )Nz	hl.atan2(r_   r<   rg   r   ys     rH   atan2zHalideOverrides.atan2^      1#Rs!$$rJ   c                    d|  dS )Nzhl.atan(r<   rg   r   s    rH   atanzHalideOverrides.atanb  r   rJ   c                    d|  dS )Nz	hl.atanh(r<   rg   r   s    rH   atanhzHalideOverrides.atanhf  r  rJ   c                    t        d      )Ncopysignr  r  s     rH   r  zHalideOverrides.copysignj  s    *%%rJ   c                    t        d      )Nerfinvr  r   s    rH   r  zHalideOverrides.erfinvn  r  rJ   c                    d|  d| dS )Nz	hl.hypot(r_   r<   rg   r  s     rH   hypotzHalideOverrides.hypotr  r  rJ   c                    t        d      )N	nextafterr  r  s     rH   r"  zHalideOverrides.nextafterv  s    +&&rJ   c                    |  d| S Nz & rg   r   s     rH   logical_andzHalideOverrides.logical_andz      Cs|rJ   c                    |  dS )Nz == 0rg   r   s    rH   logical_notzHalideOverrides.logical_not~  s    E{rJ   c                    |  d| S Nz | rg   r   s     rH   
logical_orzHalideOverrides.logical_or  r&  rJ   c                    d|  d| dS )Nr    ^ r<   rg   r   s     rH   logical_xorzHalideOverrides.logical_xor  s    1#S1~rJ   c                    |  d| S r$  rg   r   s     rH   bitwise_andzHalideOverrides.bitwise_and  r&  rJ   c                    d|  S )N~rg   r(  s    rH   bitwise_notzHalideOverrides.bitwise_not  s    1#wrJ   c                    |  d| S r+  rg   r   s     rH   
bitwise_orzHalideOverrides.bitwise_or  r&  rJ   c                    |  d| S )Nr.  rg   r   s     rH   bitwise_xorzHalideOverrides.bitwise_xor  r&  rJ   c                    |  d| S )Nz << rg   r   s     rH   bitwise_left_shiftz"HalideOverrides.bitwise_left_shift      D}rJ   c                    |  d| S )Nz >> rg   r   s     rH   bitwise_right_shiftz#HalideOverrides.bitwise_right_shift  r;  rJ   c                    d|  d| dS )Nzhalide_helpers.rand(r_   r<   rg   seedoffsets     rH   randzHalideOverrides.rand  s    %dV2fXQ77rJ   c                    d|  d| dS )Nzhalide_helpers.randn(r_   r<   rg   r?  s     rH   randnzHalideOverrides.randn  s    &tfBvha88rJ   c           	          d|  d| d| d| d	S )Nzhalide_helpers.randint64(r_   r<   rg   )r@  rA  lowhighs       rH   	randint64zHalideOverrides.randint64  s#    *4&6("SED6KKrJ   c                    t        j                  | d       dt        j                  j                  j                  d|       S )Nr    + load_seed_offset)opsloadr)   r`   rp   seed_offset)r   rA  s     rH   	load_seedzHalideOverrides.load_seed  s7    ((4#$C(A(ABTV\(]'^__rJ   c                    d|  dS )Nz1./hl.sqrt(r<   rg   r   s    rH   rsqrtzHalideOverrides.rsqrt  s     QCq!!rJ   c                    d|  dS )Nzhl.tan(r<   rg   r   s    rH   tanzHalideOverrides.tan  r   rJ   c                    d|  dS )Nzhl.tanh(r<   rg   r   s    rH   tanhzHalideOverrides.tanh  r   rJ   c                    d|  dS )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rg   r   s    rH   signbitzHalideOverrides.signbit  s    DQC~VVrJ   c                    |  d|  d| d| S )Nz - hl.trunc(/z)*rg   r   s     rH   fmodzHalideOverrides.fmod  s!     L1QCr!--rJ   c                    d|  d| dS )Nzhl.pow(r_   r<   rg   r   s     rH   powzHalideOverrides.pow  s    2aS""rJ   c                    d|  dS )Nzhl.log(r<   rg   r   s    rH   logzHalideOverrides.log  r   rJ   c                    d|  dS )Nz hl.is_inf(hl.cast(hl.Float(32), r   rg   r   s    rH   isinfzHalideOverrides.isinf       2!B77rJ   c                    d|  dS )Nz hl.is_nan(hl.cast(hl.Float(32), r   rg   r   s    rH   isnanzHalideOverrides.isnan  ra  rJ   c                    d|  dS )Nr   r<   rg   r   s    rH   roundzHalideOverrides.round  r  rJ   c                    d|  dS )Nrt   r<   rg   r   s    rH   floorzHalideOverrides.floor  r  rJ   c                    d|  d| dS )Nr   r   z + hl.f32(0))rg   r   s     rH   int_truedivzHalideOverrides.int_truediv  s    1#U1#]++rJ   c                .    d| j                    d|  d| dS )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r<   r   r   s     rH   floordivzHalideOverrides.floordiv  s)     18J1#TRSQTTUV	
rJ   c                4   t        j                  t        j                  d|      t        j                        }t        j                  t        j                  |d      t        j                        }t        j
                  ||      }d|j                   d| dS )N0r^   r   r<   )rL  r   ltr?   int8subr   )r   r   leftrightrs  s        rH   signzHalideOverrides.sign  sg    ||CFF3NEJJ7SVVAs^UZZ8ggdE"!&&3%q11rJ   c                    d|  dS )Nry   r<   rg   r   s    rH   trunczHalideOverrides.trunc  r  rJ   c                .    d| j                    d|  d| dS )Nz"hl.trunc(hl.cast(hl.Float(max(32, rk  rl  r<   rm  r   s     rH   truncdivzHalideOverrides.truncdiv  s)    
 18J1#TRSQTTUV	
rJ   c                    d|  dS )Nr|   r<   rg   r   s    rH   ceilzHalideOverrides.ceil   r   rJ   c                    d|  dS )Nr   z, 0)rg   r   s    rH   reluzHalideOverrides.relu  s    4  rJ   c                ~   t         j                  j                  |      }t         j                  j                  t         j                  j	                  |      t         j                  j                  |      t        |            }|t        j                  t        j                  hvrt        j                  ||      S |S Nbounds)r)   r`   prepare_indexinggenfuncindex_to_strused_dims_from_indexr"   r?   r   rA   rL  r   )r   rc   r   indexvars        rH   
index_exprzHalideOverrides.index_expr  s    ))$/hhHH!!%(HH))%0(.  

 ekk22<<U++
rJ   c                    t        j                  |t        j                        }t        j                  |||      }||_        t        t        |            S r   )rL  r   r?   r   halide_clampindirect_indexing_sizer%   str)r   	index_varsizecheckwrap_negs        rH   indirect_indexingz!HalideOverrides.indirect_indexing  sC     LLEKK8	$$Ye<	+/	(!#i.11rJ   c                    t         j                  j                  t         j                  j                  |      dz
        }t	        |t
        t        j                  f      sd|j                   d| d}d| d| dS )Nr*   r^   r   r<   z	hl.clamp(z, 0, )	r)   r`   kexprrename_indexingr=   r>   r   Integerr   )r   r   r  r  ends        rH   r  zHalideOverrides.halide_clamp  sj    hhnnQXX55d;a?@$emm 45UZZL	#a8C 5'se1--rJ   c                   t         j                  j                  | |      5 } |       }d d d        j                  j                  rt        |      }t         j                  j                  d|j                   dt        |       dg t        j                  |            }t        j                  ||      S # 1 sw Y   xY w)Nr^   r   r<   r  )r)   r`   
mask_loadsr  is_boolr   r  r   rI   r   wraprL  r   )maskbodyothernew_maskresults        rH   maskedzHalideOverrides.masked%  s    XX  u- 	VF	 ==  KE   v{{m9_U-C,DAF##E* ! 
 yy6511	 	s   B88C)NT)r   torch.dtyper   zOptional[torch.dtype])r   r  r   r  )TT)DrV   rW   rX   r   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r  r  r  r
  r  r  r  r  r  r  r  r  r   r"  r%  r)  r,  r/  r1  r4  r6  r8  r:  r=  rB  rD  rH  rO  rQ  rS  rU  rW  rZ  r\  r^  r`  rc  re  rg  ri  rn  rv  rx  rz  r|  r~  r  r  r  r  rg   rJ   rH   r   r      s    ,0	55 )5 5   ; ;   k k
     t t t t + +
     $ $                   % %       & & $ $ % % ' '                     8 8 9 9 L L ` ` " "     W W . . # #   8 8 8 8         , , 
 
 2 2     
 
   ! ! 	 	 2 2 . . 2 2rJ   r   c                    | S r   rg   )hs    rH   _typecheck_HalideOverridesr  8  s    HrJ   c                  ^     e Zd Z ej                  d      Zd fdZd Zd ZddZ	d Z
 xZS )	HalideCSEVariablez\b(tmp\d+)\[\?\]c                4    t         |   ||       d | _        y r   )rN   rO   	used_dims)rP   r   r  rR   s      rH   rO   zHalideCSEVariable.__init__?  s    v&7;rJ   c                T   t        | j                  xs d      }t        j                  ||j	                               D ]D  }t        |t              s|j                  
J |||f       |j                  |j                         F t        j                  j                  |      | _        y )Nrg   )setr  	itertoolschainvaluesr=   r  updater)   r`   sort_used_dims)rP   r   rp   kwargsusedargs         rH   update_on_argsz HalideCSEVariable.update_on_argsC  s    4>>'R(??49 	+C#01}}0C4d2CC0CMM*	+ 006rJ   c                    t        |      dk(  r| j                   dS | j                   ddj                  t        t        |             dS )Nr   z[()][r_   ])ro   r   joinmapr  )rP   dimss     rH   	index_strzHalideCSEVariable.index_strK  sE    t9>ii[%%))AdiiC78::rJ   c                n    | j                   | j                   dS | j                  | j                         S )Nz[?])r  r   r  )rP   s    rH   __str__zHalideCSEVariable.__str__Q  s0    >>!ii[$$~~dnn--rJ   c           	         | j                   t        d | j                   D              sJ | j                  | j                   D cg c]  }|j                  ||       c}      S c c}w )Nc              3  P   K   | ]  }t        |t        j                           y wr   r=   r   Expr.0r   s     rH   	<genexpr>z-HalideCSEVariable.subs_str.<locals>.<genexpr>X  s       2
*+Jq%**%2
   $&)r  allr  get)rP   replacementsr   s      rH   subs_strzHalideCSEVariable.subs_strW  s^    ~~)c 2
/3~~2
 /
 	
 
 ~~t~~N!|//15NOONs   A$)r  zValueRanges[Any]rT   rU   )rT   r  )rV   rW   rX   recompileundefined_rerO   r  r  r  r  rY   rZ   s   @rH   r  r  <  s.    2::12L<7;.PrJ   r  c                  H     e Zd ZU ded<   ded<   ded<   d fdZd	dZ xZS )
DimensionInfozOptional[sympy.Expr]rc   
sympy.Exprr  stridec                    t         |           t        j                  j                  j                  |d      r| }| }|| _        || _        || _        y Nr   )	rN   rO   r)   graphsizevarsstatically_known_ltrc   r  r  )rP   rc   r  r  rR   s       rH   rO   zDimensionInfo.__init__d  sK    77//:WF5D		rJ   c                   | j                   J | j                   }|r|dk(  ry|ri |}|j                  D ]  }t        |t        j                        st        |t        j                        sJ t        j                  j                  |j                        }t        |t              sJ t        |j                  |            ||<    t        ||      }t        j                  j!                  |      S )Nr   hl.Var())rc   free_symbolsr   r   TMPr=   r   Symbolr)   r`   lookup_cse_varr   r  r%   r  r&   r  )rP   r  	zero_varsrc   symr  s         rH   r  zDimensionInfo.index_strm  s    yy$$$yy+l+L(( W!#txx0%c5<<888((11#((;C%c+<===(:3<<;U(VL%W dL1Dxx$$T**rJ   rS   NF)rV   rW   rX   __annotations__rO   r  rY   rZ   s   @rH   r  r  ^  s    

+rJ   r  c                   t         j                  j                  j                  | |      ry	 t         j                  j                  j	                  |       }t         j                  j                  j	                  |      }||k(  r*t         j                  j                  j                  | |       ||k(  S # t
        $ r Y yw xY wNTF)r)   r  r  statically_known_equals	size_hint	TypeErrorguard_equals)rt  ru  r   r   s       rH   eqr  ~  s    ww//e<GG&&t,GG&&u- 	Av	%%dE26M	  s   AB3 3	B?>B?c                   t         j                  j                  j                  | |      ry	 t         j                  j                  j	                  |       }t         j                  j                  j	                  |      }||k  r*t         j                  j                  j                  | |       ||k  S # t
        $ r% t        j                  | |      }|| k(  r| |k7  cY S Y yw xY wr  )	r)   r  r  r  r  r  r   gcdguard_lt)rt  ru  r   r   r  s        rH   rq  rq    s    ww++D%8GG&&t,GG&&u- 	1u	!!$.q5L  iie$$;5= 	s   AB3 3)C! C!c                      e Zd ZU eZeZded<   ddej                  dd	 	 	 	 	 d# fdZ
d$dZd% fdZd Zd	 Z	 	 d& fd
Zd Zd'dZd Zd Zd&dZd Zd(dZd)dZd*dZ	 d$	 	 	 	 	 	 	 	 	 d+dZ	 	 	 	 	 	 	 	 	 	 d,dZd Z	 	 	 	 	 	 	 	 d-dZ ej<                         d	 d.dZd.dZ d*dZ!d Z"d/dZ#d$dZ$e%d        Z&d$d*d Z'd! Z(	 	 	 	 	 	 	 	 d0d"Z) xZ*S )1HalideKernelzCallable[[sympy.Expr], str]r  N)	mutations	pid_cachereduction_hintoverride_persistent_reductionc          	     ~   t        |   ||||||d | j                  | _        | j                  | _        | j                  | _        t               | _        | j                  | _	        | j                  | _
        i | _        i | _        i | _        i | _        i | _        i | _        t#        t$              | _        d| _        y )N)ra   r  r  r  r  F)rN   rO   r  computeloadsstoresr.   indexing_code_dominside_reductionneeds_dom_indexinghas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rP   ra   r  r  r  r  groupsrR   s          rH   rO   zHalideKernel.__init__  s     	#)*G	
 yyYY
ii!/!1"&"7"7!22AC57;=@BCEHJ4?4E%*"rJ   c                \    | j                   j                  | d|d       t        ||      S )Nz = hl.Func(r<   )r  	writeliner  )rP   r   r  s      rH   create_cse_varzHalideKernel.create_cse_var  s.    		tfKxq9: v..rJ   c           
        | j                   s| j                  s| j                  rJ t        j                  t
        j                  j                  j                  t              t        j                  t        t         | 8  |            }t               t         j"                  j%                  | j&                  D cg c]  }|j(                  j+                          c}      D ci c]  }|j-                         | c}d }fd}fd}|D ]  }|j/                  t0              rV|j3                  t1        t5        j6                  d      t5        j6                  d      t5        j6                  d            |       |j/                  t8              rB|j3                  t9        t5        j6                  d      t5        j6                  d            |       j;                  t         | 9  |      j<                          t?        d D              | _         d	}tC        | j&                        D ]  }|j(                  j+                         D cg c]  }|j-                         v s| }	}|	jE                  fd
       |	s+|	jG                  |jI                  d|jJ                               d}
t5        jL                  d      }g }|
tO        |	      k  rtQ        |jJ                  |      s|	D cg c]+  }tQ        |jR                  |      s ||jT                        - }}|
tO        |      z  }
|sJ |	       |t        jV                  t
        j                  j                  jX                  |      z  }|j[                  |	D cg c]C  }t]        ||jR                        r+t]        |jR                  |      r ||jR                  |z        E c}       |rt        jV                  t4        j^                  |      }tQ        |d      r2 ||jJ                  |z        }tQ        |d      rJ g }tO        |	      }
d}ta        dtO        | j                               }|jb                  dk(  r.ta        dtO        | j                               | j                  |<   || j                  |<   |jG                  ||f       ||z  }|	D cg c]%  }tQ        |jR                  |      s|jT                  ' }}|
tO        |      z  }
tO        |      }|D cg c]&  }tQ        ||      st5        jd                  ||z        ( }}tO        |      |k  s|dk(  sJ |j[                  |       |r|
tO        |	      k  rtQ        |jJ                  |      s|	D ]  }	 d}d}tQ        |jR                  |      s)||   \  }}|dz  }||z  }tQ        |jR                  |      s)d}t5        jL                  d      }tQ        |jT                  |      s1||   \  }}|dz  }|||z  z  }||z  }tQ        |jT                  |      s1|| j                   |j-                         <    ! | j                  D ]-  }| jj                  jm                  | d|jn                  d       / | j                  rL| jq                  d| j                  js                         D ci c]  \  }}|| j                  |    c}}       yyc c}w c c}w c c}w c c}w c c}w c c}w c c}w # tf        $ r |sJ t5        jL                  d      }t5        jL                  d      }|D ]  \  }}|||z  z  }||z  } t
        j                  j                  ji                  t1        ||jR                  |jT                        | j                        | j                   |j-                         <   Y >w xY wc c}}w )a  
        Hook called right before codegen with every index that will be
        used in the fused kernel.

        This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
        scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
        we base indexing on a larger number of vars whose product combines to those.

        This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
        fallbackc                z    t        j                  t        j                  j                  j                  |             S r   )r   simplifyr)   r  r  remove_precomputed_replacementsrb   s    rH   r
  z0HalideKernel.finalize_indexing.<locals>.simplify  s+    >>  @@F rJ   c                   | v r|    }j                  |j                  j                  |j                  |z  t        j
                  j                  j                  |t        |j                  |                  j                                y y r   )addrootlookupdivisorr)   r  r  evaluate_minr   lengthsymbol)baser  modulusnodeall_used_symbolssym_to_nodes       rH   visit_modular_indexingz>HalideKernel.finalize_indexing.<locals>.visit_modular_indexing  sw    {""4( $$II$$w.((55#Xdkk7%C
 fh #rJ   c           	         | v r`|    }j                  |j                  j                  |j                  |z  t	        |j
                  |            j                                y y r   )r  r  r  r  r   r  r  )r  r  r  r  r  s      rH   visit_floor_divz7HalideKernel.finalize_indexing.<locals>.visit_floor_div  s]    {""4( $$II$$w. g6 fh	 #rJ   r  r  r  c              3  P   K   | ]  }t        |t        j                           y wr   )r   r   INDIRECT)r  r  s     rH   r  z1HalideKernel.finalize_indexing.<locals>.<genexpr>  s       )
36N3.)
r  Fc                (     | j                         S r   )r  )r   r  s    rH   <lambda>z0HalideKernel.finalize_indexing.<locals>.<lambda>  s    Yqyy%9 rJ   keyr*   r   Tr  rhrz
 = hl.Var(r<   rdomN):r  r  r  	functoolspartialr)   r  r  r  r   dictfromkeysr  rN   r  r  r  r  from_iterablerange_treesnodesr  r  hasr   replacer   Wildr   r  r  anyr  reversedsortappendr  numelr  ro   r  r  r  reduceevaluate_maxextendrq  r  r%   prefixr
  
IndexErrorsimplify_with_rangesindexing_coder  r   codegen_rdomitems)!rP   indicestreer   r
  r  r  r  had_fallbackr+  handled_countr  added_sym_sizesizes_to_addr  	next_sizer  	new_sizes	prior_lensr  idxr  r  rc   
full_indexr  vrvr  r  r  rR   s!                                @@@rH   finalize_indexingzHalideKernel.finalize_indexing  s    ##t'7'74;Q;Q	
 
 %%agg&6&6&@&@3O	--EG$<g FG5 __22151A1AB""$B
 HHJM
	

		  	REyy)#

6*

9-

9-
 + yy"

6*

9- $ ##EG$<U$C$P$PQ%	R( &) )
:J)
 &
" T--. S	D $

 1 1 3V1qxxzEU7UQVEVJJ9J:T[[DJJ78MmmA&GN#e*,R

G5L05 +,AIIw9OHQXX&    \!22#*U*|	 0 0GG$$11<!  ## "'gqyy1bC6H !W!45 # ) 0 0L II)Q' %-TZZ'-A$B	#%i#333')(+E
'+,qT5E5E1F0G-HIC{{c)6H T%5%5!6 787..s3 -6D$$S)"))3	*:;y(G38 SaBqyy'<R SI S!S^3M #L 1I ".$!!Y/ q9}5$L $
 |,y8INJJ ''	27 #!  #e*,R

G5L\  CG w7$23$7	Tq4 !w7 F ==+D f5$23$7	Tq,$	 !f5
 >BD++DKKM:oS	l ## 	JC((C5
388,a)HI	J!!6:6L6L6R6R6TUUQT%%a((U "c C
z W 0 !T$2 " ''<!&q!1J"]]1-F%3 '	T"fsl2
$'
 ((=='
DLL$++N(( ++( Vsd    ![**[/[4+[46[9[9	A[>
\"\+\A\A\3\_B<__c           
        | j                   rdnd}|| j                  v r| j                  |   S i }| j                  j                         D ]c  }| j                   s|| j                  v rt        j                  d|j                        }|sJ t        d| |j                  d             ||<   e | j                  | d|j                         D ci c]  \  }}|| j                  |    c}}       || j                  |<   |S c c}}w )zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r  r*   dom)r  r  r  keysr  r  matchr   r%   groupr;  r<  )rP   r7  renamesr  mrI  rJ  s          rH   setup_dom_indexingzHalideKernel.setup_dom_indexingt  s   --3T%%%##F++##((* 	HC((SD4J4J-Jchh/AH1-&!''!*.FGGCL	H 	hcN'--/RBR!1!1!!44R	
 $+ 	 Ss   Dc           	     v   |j                         D cg c]&  }d| j                  | j                  |             d( }}| j                  j	                  | ddj                  |       d       t        |j                               D ])  \  }}| j                  j	                  | d| d| d       + y c c}w )	Nhl.Range(0, r<   z = hl.RDom([r_   ]) = r  r  )r  r  r  r:  r  r  	enumeraterP  )rP   r   varsr  rsizesrM  rsyms          rH   r;  zHalideKernel.codegen_rdom  s     
 4::d&:&:4&@AB!D
 
 	$$v\$))F:K9LB%OP - 	BGAt((D6TF!A3a)@A	B
s   +B6c                    t         |   |      }t        || j                        }t        j
                  j                  j                  || j                        S r   )	rN   r  r&   r  r)   r  r  r9  r  )rP   r  rR   s     rH   r  zHalideKernel.prepare_indexing  sI     (/5$"9"9:ww44UD<L<LMMrJ   c                    t        |t        j                        r%| j                  |j                        j
                  S | j                  |   S )zThe size of an index symbol)r   r   r  r  r   r  r  )rP   r  s     rH   sym_sizezHalideKernel.sym_size  s<    #txx(&&sxx0GGG$$rJ   c           	         g t        |j                  d       D ]~  }t        |t        j                  t        j
                  f      rj                  |       ?t        |t        j                  t        j                  t        j                  f      ryJ |        t        j                  d      }D ci c]  }|t        j                  d       }}g }t        j                   j                  |            }t        |t        j                        r|j                   n|gD ]  }	|	j                  D 
cg c]	  }
|
|v s|
 }}
t#        |      dk(  r||	z  }5t#        |      dk(  r||d   xx   |	z  cc<   Tg }t%        t#        |            D ]e  }||   J ||   \  }}t'        |      t'        |      z  r*|j)                  |D 
cg c]	  }
|
|vs|
 c}
       |	|z  }	S|j                  ||f       g g |||	f}  fd}g }|D ]8  \  }}|D ]  }
||j+                  |
      z  } |j                   |||             : |j-                         D ]  \  }}|j                   |||g               |j/                  d        |s< j0                  r|j                  t3        t        j                  d      dd             nxt4        j6                  j8                  j;                  |d   j<                  d      sA|j?                  dt3        t        j                  d      rdn|d   j<                  d             |rs| j@                  v rit4        j6                  j8                  jC                  | j@                  |         r2 jE                  || j@                  |   z
          j@                  |   }n>t4        j6                  j8                  jG                  |d      r jE                  ||       d}|}tI        jJ                         D ]W  } jM                  |||      r||fc S rJ | d| }| jN                  |   vs: jN                  |   j                  |       Y yc c}w c c}
w c c}
w )	zEConvert address-based indexing into dimensions using self.halide_varsc                    | j                   S r   rm  r   s    rH   r  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s
    AFF rJ   r   r   r*   Nc                   t        j                  |       } t        |      dk(  rUt        j                  d
      }| j	                  ||d   z        }|r%t        |d   	j                  |d         ||         S rJ |        t        j                  t        | |D ci c]  }|	j                  |      dz
   c}      dz         }t        j                  d      }t        | t         j                        rt| j                  D ]e  }t        |t         j                        s||z  }t        j                  | |z        } t        j                  t        j                  ||z              }g t        | ||      S c c}w )Nr*   wild)excluder   )r   factorro   r.  rQ  r  r`  r
  r&   r  r=   Mulrp   ceiling)rc   symsstride_wildrT  r  r  r  termis_storerP   symbolss           rH   expr_to_dimensionz>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension  sH   <<%D4yA~#jjAJJ{T!W45(QtAw!7;   %%<^^4!N##t}}S'9A'="=!NORSSF ]]1%F$		* II ND!$6$$~~dTk:!&ftm0L!M	N
 !vv66 "Os   E6c                t    t         j                  j                  j                  | j                  t
              S )Nr  )r)   r  r  r  r  r   )ds    rH   r  z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s$     0 0 : :188c : R rJ   _view)(sortedr  r   r   HALIDEr  r2  UNBACKED_INTSIZEPRECOMPUTED_SIZEr   r  expandr  r=   Addrp   ro   ranger  r6  popr<  r1  r  r  r)   r  r  r  r  insertr  statically_known_geqapply_offset_to_dimensionstatically_known_gtr  countinstall_dimsr   )rP   r  r  rl  r  rA  rF  
split_exprsplit_failedpartrI  	part_varsnew_split_failedrM  
other_vars
other_partrn  r  ri  rc   orig_varrm  s   `  `                 @rH   indexing_to_dimensionsz#HalideKernel.indexing_to_dimensions  s8   %,,2BC 	CcDKK#:;s#%))		--   		 q!3:;aaq));
;DFT11%89",UEII">EJJUG 	FD$($5$5IqjIII9~"$Y1$9Q<(D0(#% s<01 JA'?666-9!_*J
:Y7!((Z)V1ICU!)VW
*(//Z0HIJ  F!1EIt3DE!	F$	7. & 	7JD$ *
q))*KK)$56	7 $))+ 	8ICKK)$67	8		R	S))M%--*:AqAB!!99$q'..!LKK=q!11d1gnnVWX d)))agg.>.>.S.S++C0/ ..tVd>Q>QRU>V5VW,,S1!!55 ..tV<" 	:A  dFH=Dy <JeA3'C$--h77##H-44S9	:] < J *Ws   8Q,7	Q1Q1	Q6Q6c                f   || j                   vr|| j                   |<   || j                  |<   y| j                  |   |k7  s$t        | j                   |         t        |      k7  ry|r| j                   |   |k(  S t        | j                   |   |      D ]  \  }}|j                  |j                  k7  r y|j
                  |j
                  k7  s|j                  |j                  k7  sTt        j                  j                  j                  |j
                  |j
                        |_        d|_         y)z>Try to set self.buffer_dimensions[var], return True on successTFN)r  r  ro   zipr  r  rc   r)   r  r  r5  )rP   r  r  rA  rl  oldnews          rH   r  zHalideKernel.install_dims  s   d,,,*.D""3''-D$s#v-""3'2
Y2 ))#.$66D2237> 	 HCzzSZZ'xx388#sxx388';77++88388L	  rJ   c                   |dk(  ry t        t        t        |                  D ]  }||   j                  dk(  s8t        j
                  j                  j                  |||   j                        sMt        |||   j                        }||||   j                  z  z  }||   xj                  |z  c_	         |dk(  sJ y )Nr   r*   )
r0  ry  ro   r  r)   r  r  r|  r   rc   )rP   r  rA  rM  r  s        rH   r}  z&HalideKernel.apply_offset_to_dimension  s    Q;%D	*+ 	%AAw~~"agg&6&6&K&KQ'  Q7$a//Q$	% {{rJ   c                x   t               }|j                  D ]  }t        |t        j                        sJ t        |t        j                        rU| j                  |j                        }t        |t              r|j                  J |j                  |j                         t        |t        j                        r|j                  |       t        |t        j                  t        j                   t        j"                  t        j$                  f      rt'        d|        | j)                  |      S )zIDetect which range trees are used to populate HalideCSEVariable.used_dimszunhandled symbol )r  r  r=   r   r  r   r   r  r  r   r  r  r  rs  r  rt  ru  rv  INDEXNotImplementedErrorr  )rP   r  r  r  cse_vars        rH   r  z!HalideKernel.used_dims_from_index(  s    E	%% 	ECc5<<000c488,--chh7w(9:))56   !2!23T[[1c"d''D4I4I4::V ),=cU*CDD#	E$ ""9--rJ   c                    t        d |D              sJ t        j                  | j                  | j                  j                               D cg c]  }||v r|
 }}t        |      t        |      k(  sJ |S c c}w )Nc              3  P   K   | ]  }t        |t        j                           y wr   r  r  s     rH   r  z.HalideKernel.sort_used_dims.<locals>.<genexpr>@  s     @:a,@r  )r  r  r  r  r  r  ro   )rP   r  r  ordereds       rH   r  zHalideKernel.sort_used_dims?  s    @i@@@@ !  $"8"8"?"?"A
 i	 
 
 7|s9~---
s   A9c                    dj                  fd|D              }t        |      dk(  rd}|S t        |      dk(  r| d}|S )Nr_   c              3  B   K   | ]  }|j                          y wr   )r  )r  rp  r  r  s     rH   r  z.HalideKernel.make_index_str.<locals>.<genexpr>L  s     Qqakk,	BQs   r   ()r*   ,)r  ro   )rP   r  r  r  r  s     `` rH   make_index_strzHalideKernel.make_index_strK  sM    IIQDQQ	t9>I  Y!^$+QIrJ   c                   | j                   j                  |      }| j                  |      }| j                  ||d      \  }}| d| j	                  |       d}t
        j                  j                  |      }|t        j                  t        j                  fv rt        j                  }d| d}| j                  rt        | j                  t              r| j                  j                  J h | j!                  |      | j                  j                  }| j#                  | j%                  |            }|j                  r| j&                  j)                  |j*                   d       | j&                  j)                  |j*                   d| j                   d       | j-                  | j.                  xs d      }	| j&                  j)                  | d	t1        |       d
|	 d       | j&                  j)                  | d| dt1        |       d
|j*                   d       |S | j&                  j)                  | d| j                   d
| dt1        |       d       |S | j3                  || j!                  |            S )z"Codegen a load from an InputBufferFr  r  rf   r<   z!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(r_   rY  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))rp   inputr  r  r  r)   r  	get_dtyper?   r   r   r   
_load_maskr=   r  r  r  newfuncr  r  r  r   r  _load_otherr   r  )
rP   r   r  r  r  r   r   r  r  r  s
             rH   rM  zHalideKernel.loadT  sC   iiood#%%e,//UEB	Ta++D12!4!!$'U]]ENN33MME+D63D??4??,=>OO--9: X$33E:WT__=V=VWI\\$"5"5i"@AF		##v{{m3T$UV		##v{{m<?PPQ$RS

4#3#3#8q9		##hk+e*<)=RwaH 		##hc${;u3E2FbU[\ M 		##hmDOO+<BtfJ{[`OaNbbgh M<<d&?&?&FGGrJ   c                ^    | j                   j                  t        j                  dd|         S )Nz\[.* )csevarname_mapr  rs  rP   r   s     rH   r  zHalideKernel.lookup_cse_vary  s$    xx##BFF7B$=>>rJ   c                (   t        |t              sJ | j                  j                  |      }| j	                  |      }| j                  ||d      \  }}| j                  |      s|| j                         }| j                  ||      }|j                  |      }	dj                  dgt        |      z        xs d}
| j                  j                  t        || d|
 d| d             n| j                  |d	      }t        |      }	t         j"                  j%                  |      }|| d| d
t'        |       d|	 d}n+|dk(  r| d| dt'        |       d|	 d}nt)        d|       | j                  j                  t        ||             y)z"Codegen a store to an OutputBufferTNr_   r  r  r  z] = hl.undef(z.type()))r  z] = hl.cast(r<   
atomic_addz] += hl.cast(zstore mode=)r=   r  rp   outputr  r  is_indirect_indexingrU  r  r  r  ro   r  r  r-   r  r)   r  r  r   r  )rP   r   r  r   moder  r  r  r  	value_str
undef_dimsr   r   s                rH   storezHalideKernel.store|  s    %!2333iit$%%e,//UDA	T$$U+t/?224L++D,?I|4I))ZL3t9$<=F$JIITcU!J<}SE#RS ++DD+AIE
I!!$'<U!I;l;u3E2FbSTUD\!U!I;mK4F3Gr)TUVD%D6&:;;		Lt45rJ   c                @   | j                   sJ | j                  rJ |||f}|| j                  j                  v r| j                  j                  |   S t	        |t
              r1|dk(  sJ  | j                  | x| j                  j                  |<   }|S t	        |t              r|j                  J h | j                  }| j                  |j                  D cg c]	  }||vs| c}      }	|h |j                  z
  r0| j                  | | j                  h |j                  |            }|j                  | j                        }
t        j                  j!                  ||      }t#        |      }|dv r|	j$                   d| }| j&                  j)                  | d| d|
 d       g }d}t+        | j                        D ]C  \  }}|j-                  | d	| d
       |dk7  r|dxx   d| z  cc<   || j.                  |   z  }E | j&                  j)                  |	 ddj1                  |              n|dk(  r| j3                  ||      }	nt5        ||      }t7        j8                  t;        t=        t?                                 5   ||	|
      }ddd       d| dtA        |       d}| j&                  j)                  |	 d|        | j&                  j)                  |	 d        |	| j                  j                  |<   |	S c c}w # 1 sw Y   {xY w)zCodegen a reduction operationwelford_combineN)argmaxargmin_z = hl.z(rdom, r<   r*   r  r  *rY  rJ  welford_reducer^   r_   )!r  r  r  reduction_cacher=   tuplewelford_combine_implr  r  r  r  r  r  r  r   	Reductiondefault_accumulatorr   r   r  r  rZ  r2  r  r  welford_reduce_fallbackr   r)   set_ops_handlerr   r   r   rI   )rP   r   r   reduction_typer   	cache_keyresult_tuplereduction_varsrI  
result_varr  defaultacc_typer  partsr  rM  r  
combine_fncombine_strdefault_strs                        rH   	reductionzHalideKernel.reduction  s%    $$$$??""6	00088++I66eU#!%6666  9t88%@ADHH$$%!238SSS24112\\C11N+BQC

 .eoo..LL'D//0S%//0SN0STE NN4#9#9:	,,22>9M"5)11!'q(89EII5'/?wykQR STEF#D$:$:; 03was!_-Q;"I1VH-I$**3//	0
 II:,c%**U2C1D EF//55eUCJ1.(KJ""??;=3Q#RS @(Y?@$XJb1I0J!LKII:,c+ ?@II:,c+ ?@.8  +E D6@ @s   	L#L
LLc                   t        |t              r|j                  J t        |t              r|j                  J t        |t              r|j                  J h |j                  |j                  |j                  xs h | j                  }|h | j                  z  }| j                  | j                  |            }|||fD cg c]  }d|j                   d }}|j                  }| j                  j                  | ddj                  |       d       | j                  j                  | d| d       | j                  j                  | d| d	       | j                  j                  | d
| d       | j                  j                  | d|j                  | j                                | j                  j                  | d|j                  | j                                | j                  j                  | d|j                  | j                                | j                  j                  | d| d| d       | j                  j                  | d| d| d       | j                  j                  | d| d| d| d       | d| d| d| d| d| d| d| d| d| dg}	| j                  j                  | ddj                  |	       d       g }
t        d       D ]S  }|
j                  | j                  |j                               | j                  j                  |
d!    d"| d#| d$       U t        |
      S c c}w )%Nr^   z.type(), 0)z = hl.Tuple([r_   rX  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - _mean_1z_new_weight = z_weight_1 + 	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * 
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * _new_weightr   r  rY  r  r  )r=   r  r  r  r  r  r  r   r  r  r  r  ry  r2  r  )rP   meanm2weightr  r  r   r  pfxr  unpackedrM  s               rH   r  z!HalideKernel.welford_combine_impl  sm   $ 12t~~7QQQ"/0R\\5MMM&"349I9I9UUUGdnnGr||Gf6F6FG 
 L
L
	 	.t--..	\\$"5"5i"@A
<@"f;MNaXaffX[1NNoo		zl-		'8J7K2NO		se:j\=>		se8J<s;<		se<
|3?@		se:dmmD<R<R.S-TUV		se8BKK8N8N,O+PQR		e<0F0F GHI	
 			se9SEC5HI		se>#l3%yQR		e*3%/H\Z]Y^^jk	
 e:cU)C5
;e8C5Yse9SEVYUZZdee;

 			zl-		&8I7J"MNq 	GAOODLL)=)=>?II8B<.J<q1 EF	G X7 Os   	Mc           
     &   | j                   sJ t        |      t        |      k(  sJ g }t               }|D ]  }t        |t              r|j
                  J t        |j
                        t        | j                        z  r|j                  |       n?|j                  | j                  | g |j
                  g | j                  d d              |j                  |j
                          | j                  | j                  |            }|j
                  r+t        |j
                        t        | j                        z  sJ t        ||      D cg c]  \  }}dt        |       d| d }	}}| j                  | j                  | j                   d   j"                              }
|j$                   d}| d}| j&                  j)                  | d|
 d	       t        | j                        dk(  sJ d
       g | j                  \  }|t+        |      i}|t+        |      dz
  i}t        |      dk(  r(d }|j-                  |      g}|j-                  |      g}nqd }t/        t        |            D cg c]  }|j-                  |      d| dz    }}t/        t        |            D cg c]  }|j-                  |      d| dz    }}| j&                  j)                  | d ||	              t1        j2                  t5        t7        t9                                 5   |||      }d d d        | j&                  j)                  |j-                  |       d |              t        |      dk(  r|fS |D cg c]"  }| j                  | j                  |            $ }}t;        |      D ])  \  }}| j&                  j)                  | d| d| d       + t=        |      S c c}}w c c}w c c}w # 1 sw Y   xY wc c}w )Nr*   r^   r_   r<   r  _rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                    | d   S r  rg   r   s    rH   maybe_tuplez&HalideKernel.scan.<locals>.maybe_tuple)  s    trJ   c                ,    ddj                  |        dS )Nz
hl.Tuple([r_   rX  )r  r   s    rH   r  z&HalideKernel.scan.<locals>.maybe_tuple0  s    #DIIaL>44rJ   r  r  rY  )r  ro   r  r=   r  r  r  r2  r  r  r  r  r  r   r  r  r*  r3  r   r  r  r%   r  ry  r)   r  r   r   r   rZ  r  )rP   dtypesr  values_origr  all_used_dimsr   r  r   initialr  scan_domscanscan_varscan_renames_curscan_renames_prir  	read_left
read_rightrM  r  r  unpack_varsrI  s                           rH   r  zHalideKernel.scan  s%    $$$$6{c+....*,  
	2Ee%67EOO<WWW5??#c$*@*@&AAe$LL '$Ueoo$U7P9O9O7PQSRS7T$U
   1
	2 \\$"5"5m"DE
##J,@,@(AC""E
 )
 	
 

 !$FF 3
u u-.bq9
 

 D001A1A"1E1K1KLM oo&e,2		xj(@LM &&'1,	43	4,/../$&8&>?$&8&>&BCv;! $,,-=>?I$--.>?@J5
 s6{+ ##$45!A3a@I  s6{+ ##$45!A3a@J 
 			zl#k'.B-CDE {}/MNO 	<$Y
;K	<		""#345S[9Q8RS	
 v;!= QWXAt||D$7$7$FGXXk* 	<DAqII1#SAaS :;	<[!!k
:	< 	< Ys$   O2>O85O=+
P'PPr  c                   | j                   j                  | j                  ||      }t        |t              sJ ||_        |S r  )r  generater  r=   r  r  )rP   r   r  r  r  s        rH   r  zHalideKernel.genfuncM  s@     hh		4?#0111!
rJ   c                l    | j                   j                         }t        |t              sJ ||_        |S r   )r  newvarr=   r  r  )rP   r  r  s      rH   r  zHalideKernel.newfuncU  s/    hhoo#0111!
rJ   c                x    t         j                  j                  |      j                         j	                         S )a  
        We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
        supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
        PyTorch's numel excludes them.
        )r)   r  
get_buffer
get_layoutstorage_sizer  s     rH   halide_buffer_numelz HalideKernel.halide_buffer_numel[  s+     ww!!$'224AACCrJ   c                   d }g }| j                   j                         \  }}}}t        t        ||      |      D ]  \  }}|j	                  ||f       t        |t              s*|j                  dk(  r|j                  J | j                  j                  |j                  d      D ]K  }|j	                  dt        ||j                  |j                  |j                  |j                        f       M  |S )zX
        Halide requires scalar inputs before outputs, so need to reorder args.
        c                n    | \  }}t        |t              ryd|j                  v ryd|j                  v sJ y)Nr*   out_ptrr   in_ptrr   )r=   r1   r   )	arg_tuplecall_strr  s      rH   	arg_orderz.HalideKernel.halide_argdefs.<locals>.arg_orderh  s<    %MHc#w'chh&388+++rJ   r   r   Nrg   )alias_of)rp   python_argdefsrr  r  r2  r=   r2   rA  r  r   r  r   bufferr   )	rP   r  r  r  r   r   r  r  aliass	            rH   halide_argdefszHalideKernel.halide_argdefsc  s    
	 YY--/
1a#C1I9= 	MHcMM8S/*#y)zzQ3<<+???!0044SXXrB EMM % % #

 #		 #

),			" rJ   c                   g }| j                         D ]4  \  }}t        |t              r	d}d}d}d}n| j                  |j                     D cg c]&  }t        | j                  |j                              ( }}| j                  |j                     D cg c]&  }t        | j                  |j                              ( }}t        |      t        |      k(  sJ t        | j                  |j                           }t        |j                      d}|j                  t        ||j                  ||||j                               7 t         j"                  j$                  j'                         }	|	j(                  dk(  rDt*        j,                  j.                  g}
t*        j,                  j0                  }dt3               i}d}n|	j(                  dk(  sJ d       |	j4                  d	k  sJ d
       t*        j,                  j6                  g}
t*        j,                  j8                  }t:        j<                  j?                  |	      }d|
d	   vrAdD ]<  \  }}|j@                  |k\  s|jB                  |k\  s&|
j                  d| |         n |
j                  d       d|jD                  i}tG        d	|	j4                        }|
j                  d       |
j                  d       t*        j,                  jH                  s|
j                  d       t*        j,                  jJ                  r|
j                  d       d| jL                  v r|
j                  d       tO        |djQ                  |
      |||      S c c}w c c}w )z)Compute metadata required by codecache.pyNlongr  )shaper  rA  r  cpuparallelismcudazonly cpu/cuda supportedr   zonly default device supportedcuda_capability))      )r   r   )      )r  r   )r  r*   cuda_capability_user_contextstrict_float
no_runtime
no_assertsdebug64large_buffers-)target	schedulerscheduler_flagscuda_device))r  r=   r1   r  r   r4   r  r  r  ro   r  r3   r   r2  r   r  r)   r  r  get_current_device_or_throwtyper   halide
cpu_targetscheduler_cpur$   r  
gpu_targetscheduler_cudar?   r  get_device_propertiesmajorminormulti_processor_countrC   assertsr	  ra   r    r  )rP   argtypesr  r  r  r  rA  r   r   current_devicer  schdulerr  r  
capabilityr  r  s                    rH   halide_kernel_metazHalideKernel.halide_kernel_meta  s$   ))+ 	FAs#w' "33CHH= $..qvv67  "33CHH= $..qxx89  5zS[000t22388<='		2315OOHH!! \\	%	: **FFH%'mm../F}}22H35O K!&&&0K2KK0!''1,M.MM,mm../F}}33H99.IJ q	1$L LE5!''50Z5E5E5N(8w&GH MM.)z??O
 a!5!56K 	n% 	l#}}$$MM,'==MM'"4###MM/*88F#+#
 	
Cs   +M9+M>c                
     j                   j                  rt        d       j                         }t	               }|j                  dd       |j                           j                         D ]  \  }}t        |t              r,|j                  |j                   d j                   d       B|j                  sJ |       d|j                  v rdnd	}t        |j                        }t!         j"                  |j                           }|j                  |j                   d
| d| d| d        |j                  d       |j                           j                         D ]/  \  }}|j                  |j                   d|j                          1  j                   j%                         D ]  \  }	}
|j                  |	 d
|
         |j                   j&                          fd} j(                  j*                  D ]C  }t        |t,              r t.        j0                  j3                  ||      }|j                  |       E |j                  d       |j                  d        j                         D ]  \  }}t        |t              rWt4        j6                  j8                  j;                  |j<                  d      }|j                  |j                   d| d       n j"                  |j                     }g }t?        |      D ]   \  }} jA                  t4        j6                  j8                  j;                  |jB                  d      |      }|jE                  d| d       d|j                  vsp|j                  |j                   d| d       	 |j                  |j                   d| dtG        |jH                         d       	 |j                  |j                   d| dtG        |jB                         d        |j                  |j                   ddjM                  |       d        |jO                  d       |j                  djQ                                |jR                  rk|j                  dtU        jV                  |jR                        d|jX                  d |jR                  d|jZ                  d!	d       |j]                         S |j                  d"|jX                  d#d       |j]                         S # tJ        $ r Y Rw xY w# tJ        $ r Y +w xY w)$z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r<   outzhl.OutputBufferzhl.InputBufferrY  r   r_   z&
            def generate(g):
        z = g.c                    j                   j                  | j                  d         }|j                  J |       t	        |      S )Nr*   )r  r  rR  r  r  )rT  r  rP   s     rH   update_indexz1HalideKernel.codegen_kernel.<locals>.update_index  s=    ((&&qwwqz2C==,1c1,s8OrJ   r  zassert g.using_autoscheduler()r*   r  z.set_estimate(rW  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([rX  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/rp   r#  rL   r!  r.   splice	do_indentr  r=   r1   r  r   ra   r  r   r   ro   r  aliasesr:  r  _linesr  r  r  rs  r)   r  r  r  rc   rZ  _autoscheduler_workaroundsr  r2  r>   r  r  r  do_unindentrstripr  r   find_libautoscheduler  r  getvalue)rP   r   metacoder  r  argclsargtypendimr  r  r(  r   hintr  range_hintsrM  dims   `                 rH   codegen_kernelzHalideKernel.codegen_kernel  s   99$$/00&&(  	 
	
 	))+ 	LFAs#w'#((+=d>N>N=OqQRzz&3&z.3sxx.?*EU%cii0411#((;<#((3vhay4&JK	L 		

 	))+ 	9FAsNNchhZuSXXJ78	9		))+ 	-HCNNcU#cU+,	-D&&'	
 II$$ 	!D$$(5599,MNN4 		!
 	r78))+ 	XFAs #w'ww''11#((Q1G#((>$qAB--chh7 'o !FAs::((22388a2H$D  &&dV1'=>CHH,#((5<'HI! NN#&88*E!M#cjj/ARRS T
! NN#&88*E!M#chh-PQ R!& #((+;DIIk<R;SSUVW;	X> 	 		
 >>KK$$3$H$H$X#[ \((, 7<<@NN;MRPTPdPdOg h	  #  8 }} KK::>++ I
    }}]  ) ! !  ) ! !s$   &7T#7T3#	T0/T03	U ?U c                    t        |      dk(  r^t        j                  j                  dk(  rAt        j
                  j                  j                         j                  dk(  rt        d|       } | S )Nr*   Anderson2021r  r   )
ro   r   r  r  r)   r  r  r  r  rC   )r   r  s     rH   r-  z'HalideKernel._autoscheduler_workaroundsS  sT     IN,,>!!==?DDN Aq	ArJ   c                   t         j                  j                  }| j                         D cg c]  \  }}|j                  |  }}}t         j                  j
                  j                         }|j                  dk(  r;|j                  |j                  t         j                        }|j                  |       |j                  ||d       yc c}}w )zCodegen a call to this kernelNr  F)r  )r)   r  wrapper_coder  r  r  r  r  write_get_raw_streamr  r2  generate_kernel_call)	rP   r   r  wrapperr   r  	call_argsr  stream_names	            rH   call_kernelzHalideKernel.call_kernel^  s    ''&&*.*=*=*?X33<<CWsVX	X**FFH&(!66~7K7KQWWUK[)$$ 	% 	
 Ys   CCc                     yr  rg   )rP   r  s     rH   generate_assertzHalideKernel.generate_assertl  s    rJ   c                     y r   rg   )rP   rc   r  loweruppers        rH   check_boundszHalideKernel.check_boundso  s     	rJ   )ra   r  r  zOptional[OrderedSet[str]]rT   rU   r   )r=  zSequence[sympy.Expr])r  r  )r  r  r  r  rl  r   r  )r   r  r  r  )r   r  )
r   r  r  r  r   r,   r  r:   rT   rU   )
r   r  r   r  r  r9   r   +Union[CSEVariable, Tuple[CSEVariable, ...]]rT   rK  )r  zTuple[torch.dtype, ...]r  zUCallable[[Tuple[CSEVariable, ...], Tuple[CSEVariable, ...]], Tuple[CSEVariable, ...]]r  Tuple[CSEVariable, ...]rT   rL  )rT   r  )rT   r    )rc   r  r  r  rH  r   rI  r   )+rV   rW   rX   r   	overridestexprr  r  r!   DEFAULTrO   r  rK  rU  r;  r  r`  r  r  r}  r  r  r  rM  r  r  r  r  r  r   unknownr  r  r  r  r!  r:  r   r-  rD  rF  rJ  rY   rZ   s   @rH   r  r    s   I).E&. 04$,,&*$+ $+ -	$+ 
$+L/iV*BNN%f:P(
..
#HJ? SW66 *63>6FO6	6::: : &	:
 ;: 
5:x$LP"'P"
P" -P" 
!P"f *=)<)<)>	D"HQ
fwr  
&09=FJrJ   r  c                  0    e Zd ZdZdZeZedd       Zd Z	y)HalideSchedulingr   r   c                    t         j                  t        j                  t        j                  t        j
                  g      }t        j                  j                  rd |t        j                  <   |S r   )
r'  r(  r+   TUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERREDUCE_TO_SINGLE_ELEMENTr   r  scan_kernelsSCAN)r   devicer  s      rH   get_backend_featuresz%HalideScheduling.get_backend_features{  sT    ..6677
 ==%%*.F>&&'rJ   c                   t         j                  j                  }||j                  v r|j                  |   }|S d|j	                          }||j                  |<   |j                  d       t               }|j                  d|j                         d       |j                  |d       |j                  d       t        ||      \  }}| d| }	|j                  ||j                         |	       t        d	      rt        |d
|       |S )z6Codegen kernel definition to go in output wrapper codehalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''Tr$  z''')
kernel_metadatar  )r)   r  r>  src_to_kernelnext_kernel_suffixadd_import_oncer.   r  r!  r)  r#   define_kernelr1  r   r   )
rP   src_codenode_scheduler`   rA  kernel_namecompile_wrapperoriginsdetailed_originsmetadata_comments
             rH   rb  zHalideScheduling.define_kernel  s"   ''&&w,,,!//9K. + +7+E+E+G*HIK.9G!!(+##W -.O%%'(A(A(C'FeL ""84"8%%f-(;M7(S%G%")"-=,>?!!_5579I ''89#KX>rJ   N)rY  ztorch.device)
rV   rW   rX   
int32_type
int64_typer  kernel_typer  rZ  rb  rg   rJ   rH   rR  rR  u  s(    JJK
 
rJ   rR  )r  r   rT   zOpsHandler[str])u
__future__r   dataclassesr%  r  loggingr  collectionsr   mathr   typingr   r   r   r	   r
   r   r   r   r   r   r?   torch._logging_prims_commonr   utils._sympy.functionsr   r   utils._sympy.symbolr   r   utils._sympy.value_rangesr   r  r   r   	codecacher   r   metricsr   r   ops_handlerr   r   runtime.hintsr   r    r!   utilsr"   r#   r$   r%   r&   virtualizedr'   rL  r(   r)   commonr+   r,   r-   r.   r/   r0   r1   r2   cppr3   	cpp_utilsr4   simdr5   r6   r7   torch.utils._ordered_setr8   r9   r:   	getLoggerrV   r^  rI   RuntimeErrorrL   r\   r   rN  pexprr   r   r   r   float64rr  int16r   rA   uint8uint16uint32uint64r   r   r   r   r  r  	dataclassr  r  rq  r  rR  rg   rJ   rH   <module>r     s   "     	 # 
 
 
    - ? 7 4  ' ) B 6 F F  5 4	 	 	   ; ; 36g!
F, F
zMM zMz 	 
JJ	NNO	MM>	MM>	MM>	JJ	KK	KK	KK	KK	LL-	LL-	LL-"2k 2F
P PD + + +>
 W: Wt.~ .rJ   