Home | History | Annotate | Download | only in u32
      1 
      2 
      3 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
      4 __attribute__((reqd_work_group_size((1 << 4) * 1, 1, 1))) void
      5 hs_kernel_bs_0(__global uint const* const restrict vin,
      6                __global uint* const restrict vout)
      7 {
      8   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
      9                         (get_local_id(0) & ((1 << 4) - 1));
     10   uint r1 = vin[gmem_idx + (1 << 4) * 0];
     11   uint r2 = vin[gmem_idx + (1 << 4) * 1];
     12   uint r3 = vin[gmem_idx + (1 << 4) * 2];
     13   uint r4 = vin[gmem_idx + (1 << 4) * 3];
     14   uint r5 = vin[gmem_idx + (1 << 4) * 4];
     15   uint r6 = vin[gmem_idx + (1 << 4) * 5];
     16   uint r7 = vin[gmem_idx + (1 << 4) * 6];
     17   uint r8 = vin[gmem_idx + (1 << 4) * 7];
     18   {
     19     uint const t = min(r1, r5);
     20     r5 = max(r1, r5);
     21     r1 = t;
     22   };
     23   {
     24     uint const t = min(r2, r6);
     25     r6 = max(r2, r6);
     26     r2 = t;
     27   };
     28   {
     29     uint const t = min(r3, r7);
     30     r7 = max(r3, r7);
     31     r3 = t;
     32   };
     33   {
     34     uint const t = min(r4, r8);
     35     r8 = max(r4, r8);
     36     r4 = t;
     37   };
     38   {
     39     uint const t = min(r1, r3);
     40     r3 = max(r1, r3);
     41     r1 = t;
     42   };
     43   {
     44     uint const t = min(r2, r4);
     45     r4 = max(r2, r4);
     46     r2 = t;
     47   };
     48   {
     49     uint const t = min(r5, r7);
     50     r7 = max(r5, r7);
     51     r5 = t;
     52   };
     53   {
     54     uint const t = min(r6, r8);
     55     r8 = max(r6, r8);
     56     r6 = t;
     57   };
     58   {
     59     uint const t = min(r3, r5);
     60     r5 = max(r3, r5);
     61     r3 = t;
     62   };
     63   {
     64     uint const t = min(r4, r6);
     65     r6 = max(r4, r6);
     66     r4 = t;
     67   };
     68   {
     69     uint const t = min(r1, r2);
     70     r2 = max(r1, r2);
     71     r1 = t;
     72   };
     73   {
     74     uint const t = min(r3, r4);
     75     r4 = max(r3, r4);
     76     r3 = t;
     77   };
     78   {
     79     uint const t = min(r5, r6);
     80     r6 = max(r5, r6);
     81     r5 = t;
     82   };
     83   {
     84     uint const t = min(r7, r8);
     85     r8 = max(r7, r8);
     86     r7 = t;
     87   };
     88   {
     89     uint const t = min(r2, r5);
     90     r5 = max(r2, r5);
     91     r2 = t;
     92   };
     93   {
     94     uint const t = min(r4, r7);
     95     r7 = max(r4, r7);
     96     r4 = t;
     97   };
     98   {
     99     uint const t = min(r2, r3);
    100     r3 = max(r2, r3);
    101     r2 = t;
    102   };
    103   {
    104     uint const t = min(r4, r5);
    105     r5 = max(r4, r5);
    106     r4 = t;
    107   };
    108   {
    109     uint const t = min(r6, r7);
    110     r7 = max(r6, r7);
    111     r6 = t;
    112   };
    113   {
    114     uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
    115     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
    116     ;
    117     {
    118       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
    119       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
    120       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
    121       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    122     };
    123     {
    124       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
    125       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
    126       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
    127       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    128     };
    129     {
    130       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
    131       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
    132       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
    133       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    134     };
    135     {
    136       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
    137       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
    138       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
    139       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    140     };
    141   }
    142   {
    143     uint const t = min(r1, r5);
    144     r5 = max(r1, r5);
    145     r1 = t;
    146   };
    147   {
    148     uint const t = min(r3, r7);
    149     r7 = max(r3, r7);
    150     r3 = t;
    151   };
    152   {
    153     uint const t = min(r1, r3);
    154     r3 = max(r1, r3);
    155     r1 = t;
    156   };
    157   {
    158     uint const t = min(r5, r7);
    159     r7 = max(r5, r7);
    160     r5 = t;
    161   };
    162   {
    163     uint const t = min(r2, r6);
    164     r6 = max(r2, r6);
    165     r2 = t;
    166   };
    167   {
    168     uint const t = min(r4, r8);
    169     r8 = max(r4, r8);
    170     r4 = t;
    171   };
    172   {
    173     uint const t = min(r2, r4);
    174     r4 = max(r2, r4);
    175     r2 = t;
    176   };
    177   {
    178     uint const t = min(r6, r8);
    179     r8 = max(r6, r8);
    180     r6 = t;
    181   };
    182   {
    183     uint const t = min(r1, r2);
    184     r2 = max(r1, r2);
    185     r1 = t;
    186   };
    187   {
    188     uint const t = min(r3, r4);
    189     r4 = max(r3, r4);
    190     r3 = t;
    191   };
    192   {
    193     uint const t = min(r5, r6);
    194     r6 = max(r5, r6);
    195     r5 = t;
    196   };
    197   {
    198     uint const t = min(r7, r8);
    199     r8 = max(r7, r8);
    200     r7 = t;
    201   };
    202   {
    203     uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
    204     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
    205     ;
    206     {
    207       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
    208       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
    209       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
    210       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    211     };
    212     {
    213       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
    214       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
    215       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
    216       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    217     };
    218     {
    219       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
    220       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
    221       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
    222       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    223     };
    224     {
    225       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
    226       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
    227       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
    228       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    229     };
    230   }
    231   {
    232     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
    233     int const t_lt = get_sub_group_local_id() < half_lane_idx;
    234     ;
    235     {
    236       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
    237       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
    238     };
    239     {
    240       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
    241       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
    242     };
    243     {
    244       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
    245       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
    246     };
    247     {
    248       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
    249       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
    250     };
    251     {
    252       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
    253       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    254     };
    255     {
    256       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
    257       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    258     };
    259     {
    260       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
    261       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    262     };
    263     {
    264       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
    265       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    266     };
    267   }
    268   {
    269     uint const t = min(r1, r5);
    270     r5 = max(r1, r5);
    271     r1 = t;
    272   };
    273   {
    274     uint const t = min(r3, r7);
    275     r7 = max(r3, r7);
    276     r3 = t;
    277   };
    278   {
    279     uint const t = min(r1, r3);
    280     r3 = max(r1, r3);
    281     r1 = t;
    282   };
    283   {
    284     uint const t = min(r5, r7);
    285     r7 = max(r5, r7);
    286     r5 = t;
    287   };
    288   {
    289     uint const t = min(r2, r6);
    290     r6 = max(r2, r6);
    291     r2 = t;
    292   };
    293   {
    294     uint const t = min(r4, r8);
    295     r8 = max(r4, r8);
    296     r4 = t;
    297   };
    298   {
    299     uint const t = min(r2, r4);
    300     r4 = max(r2, r4);
    301     r2 = t;
    302   };
    303   {
    304     uint const t = min(r6, r8);
    305     r8 = max(r6, r8);
    306     r6 = t;
    307   };
    308   {
    309     uint const t = min(r1, r2);
    310     r2 = max(r1, r2);
    311     r1 = t;
    312   };
    313   {
    314     uint const t = min(r3, r4);
    315     r4 = max(r3, r4);
    316     r3 = t;
    317   };
    318   {
    319     uint const t = min(r5, r6);
    320     r6 = max(r5, r6);
    321     r5 = t;
    322   };
    323   {
    324     uint const t = min(r7, r8);
    325     r8 = max(r7, r8);
    326     r7 = t;
    327   };
    328   {
    329     uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
    330     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
    331     ;
    332     {
    333       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
    334       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
    335       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
    336       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    337     };
    338     {
    339       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
    340       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
    341       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
    342       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    343     };
    344     {
    345       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
    346       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
    347       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
    348       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    349     };
    350     {
    351       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
    352       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
    353       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
    354       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    355     };
    356   }
    357   {
    358     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
    359     int const t_lt = get_sub_group_local_id() < half_lane_idx;
    360     ;
    361     {
    362       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
    363       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
    364     };
    365     {
    366       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
    367       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
    368     };
    369     {
    370       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
    371       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
    372     };
    373     {
    374       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
    375       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
    376     };
    377     {
    378       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
    379       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    380     };
    381     {
    382       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
    383       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    384     };
    385     {
    386       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
    387       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    388     };
    389     {
    390       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
    391       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    392     };
    393   }
    394   {
    395     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
    396     int const t_lt = get_sub_group_local_id() < half_lane_idx;
    397     ;
    398     {
    399       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
    400       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
    401     };
    402     {
    403       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
    404       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
    405     };
    406     {
    407       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
    408       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
    409     };
    410     {
    411       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
    412       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
    413     };
    414     {
    415       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
    416       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    417     };
    418     {
    419       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
    420       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    421     };
    422     {
    423       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
    424       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    425     };
    426     {
    427       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
    428       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    429     };
    430   }
    431   {
    432     uint const t = min(r1, r5);
    433     r5 = max(r1, r5);
    434     r1 = t;
    435   };
    436   {
    437     uint const t = min(r3, r7);
    438     r7 = max(r3, r7);
    439     r3 = t;
    440   };
    441   {
    442     uint const t = min(r1, r3);
    443     r3 = max(r1, r3);
    444     r1 = t;
    445   };
    446   {
    447     uint const t = min(r5, r7);
    448     r7 = max(r5, r7);
    449     r5 = t;
    450   };
    451   {
    452     uint const t = min(r2, r6);
    453     r6 = max(r2, r6);
    454     r2 = t;
    455   };
    456   {
    457     uint const t = min(r4, r8);
    458     r8 = max(r4, r8);
    459     r4 = t;
    460   };
    461   {
    462     uint const t = min(r2, r4);
    463     r4 = max(r2, r4);
    464     r2 = t;
    465   };
    466   {
    467     uint const t = min(r6, r8);
    468     r8 = max(r6, r8);
    469     r6 = t;
    470   };
    471   {
    472     uint const t = min(r1, r2);
    473     r2 = max(r1, r2);
    474     r1 = t;
    475   };
    476   {
    477     uint const t = min(r3, r4);
    478     r4 = max(r3, r4);
    479     r3 = t;
    480   };
    481   {
    482     uint const t = min(r5, r6);
    483     r6 = max(r5, r6);
    484     r5 = t;
    485   };
    486   {
    487     uint const t = min(r7, r8);
    488     r8 = max(r7, r8);
    489     r7 = t;
    490   };
    491   {
    492     uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
    493     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
    494     ;
    495     {
    496       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
    497       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
    498       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
    499       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    500     };
    501     {
    502       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
    503       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
    504       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
    505       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    506     };
    507     {
    508       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
    509       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
    510       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
    511       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    512     };
    513     {
    514       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
    515       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
    516       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
    517       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    518     };
    519   }
    520   {
    521     uint const half_lane_idx = get_sub_group_local_id() ^ 4;
    522     int const t_lt = get_sub_group_local_id() < half_lane_idx;
    523     ;
    524     {
    525       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
    526       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
    527     };
    528     {
    529       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
    530       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
    531     };
    532     {
    533       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
    534       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
    535     };
    536     {
    537       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
    538       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
    539     };
    540     {
    541       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
    542       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    543     };
    544     {
    545       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
    546       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    547     };
    548     {
    549       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
    550       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    551     };
    552     {
    553       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
    554       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    555     };
    556   }
    557   {
    558     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
    559     int const t_lt = get_sub_group_local_id() < half_lane_idx;
    560     ;
    561     {
    562       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
    563       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
    564     };
    565     {
    566       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
    567       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
    568     };
    569     {
    570       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
    571       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
    572     };
    573     {
    574       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
    575       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
    576     };
    577     {
    578       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
    579       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    580     };
    581     {
    582       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
    583       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    584     };
    585     {
    586       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
    587       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    588     };
    589     {
    590       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
    591       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    592     };
    593   }
    594   {
    595     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
    596     int const t_lt = get_sub_group_local_id() < half_lane_idx;
    597     ;
    598     {
    599       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
    600       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
    601     };
    602     {
    603       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
    604       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
    605     };
    606     {
    607       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
    608       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
    609     };
    610     {
    611       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
    612       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
    613     };
    614     {
    615       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
    616       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    617     };
    618     {
    619       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
    620       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    621     };
    622     {
    623       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
    624       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    625     };
    626     {
    627       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
    628       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    629     };
    630   }
    631   {
    632     uint const t = min(r1, r5);
    633     r5 = max(r1, r5);
    634     r1 = t;
    635   };
    636   {
    637     uint const t = min(r3, r7);
    638     r7 = max(r3, r7);
    639     r3 = t;
    640   };
    641   {
    642     uint const t = min(r1, r3);
    643     r3 = max(r1, r3);
    644     r1 = t;
    645   };
    646   {
    647     uint const t = min(r5, r7);
    648     r7 = max(r5, r7);
    649     r5 = t;
    650   };
    651   {
    652     uint const t = min(r2, r6);
    653     r6 = max(r2, r6);
    654     r2 = t;
    655   };
    656   {
    657     uint const t = min(r4, r8);
    658     r8 = max(r4, r8);
    659     r4 = t;
    660   };
    661   {
    662     uint const t = min(r2, r4);
    663     r4 = max(r2, r4);
    664     r2 = t;
    665   };
    666   {
    667     uint const t = min(r6, r8);
    668     r8 = max(r6, r8);
    669     r6 = t;
    670   };
    671   {
    672     uint const t = min(r1, r2);
    673     r2 = max(r1, r2);
    674     r1 = t;
    675   };
    676   {
    677     uint const t = min(r3, r4);
    678     r4 = max(r3, r4);
    679     r3 = t;
    680   };
    681   {
    682     uint const t = min(r5, r6);
    683     r6 = max(r5, r6);
    684     r5 = t;
    685   };
    686   {
    687     uint const t = min(r7, r8);
    688     r8 = max(r7, r8);
    689     r7 = t;
    690   };
    691   vout[gmem_idx + (1 << 4) * 0] = r1;
    692   vout[gmem_idx + (1 << 4) * 1] = r2;
    693   vout[gmem_idx + (1 << 4) * 2] = r3;
    694   vout[gmem_idx + (1 << 4) * 3] = r4;
    695   vout[gmem_idx + (1 << 4) * 4] = r5;
    696   vout[gmem_idx + (1 << 4) * 5] = r6;
    697   vout[gmem_idx + (1 << 4) * 6] = r7;
    698   vout[gmem_idx + (1 << 4) * 7] = r8;
    699 }
    700 
    701 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
    702 __attribute__((reqd_work_group_size((1 << 4) * 2, 1, 1))) void
    703 hs_kernel_bs_1(__global uint const* const restrict vin,
    704                __global uint* const restrict vout)
    705 {
    706   __local struct
    707   {
    708     uint m[32 * 8];
    709   } shared;
    710 
    711   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
    712                         (get_local_id(0) & ((1 << 4) - 1));
    713   uint r1 = vin[gmem_idx + (1 << 4) * 0];
    714   uint r2 = vin[gmem_idx + (1 << 4) * 1];
    715   uint r3 = vin[gmem_idx + (1 << 4) * 2];
    716   uint r4 = vin[gmem_idx + (1 << 4) * 3];
    717   uint r5 = vin[gmem_idx + (1 << 4) * 4];
    718   uint r6 = vin[gmem_idx + (1 << 4) * 5];
    719   uint r7 = vin[gmem_idx + (1 << 4) * 6];
    720   uint r8 = vin[gmem_idx + (1 << 4) * 7];
    721   {
    722     uint const t = min(r1, r5);
    723     r5 = max(r1, r5);
    724     r1 = t;
    725   };
    726   {
    727     uint const t = min(r2, r6);
    728     r6 = max(r2, r6);
    729     r2 = t;
    730   };
    731   {
    732     uint const t = min(r3, r7);
    733     r7 = max(r3, r7);
    734     r3 = t;
    735   };
    736   {
    737     uint const t = min(r4, r8);
    738     r8 = max(r4, r8);
    739     r4 = t;
    740   };
    741   {
    742     uint const t = min(r1, r3);
    743     r3 = max(r1, r3);
    744     r1 = t;
    745   };
    746   {
    747     uint const t = min(r2, r4);
    748     r4 = max(r2, r4);
    749     r2 = t;
    750   };
    751   {
    752     uint const t = min(r5, r7);
    753     r7 = max(r5, r7);
    754     r5 = t;
    755   };
    756   {
    757     uint const t = min(r6, r8);
    758     r8 = max(r6, r8);
    759     r6 = t;
    760   };
    761   {
    762     uint const t = min(r3, r5);
    763     r5 = max(r3, r5);
    764     r3 = t;
    765   };
    766   {
    767     uint const t = min(r4, r6);
    768     r6 = max(r4, r6);
    769     r4 = t;
    770   };
    771   {
    772     uint const t = min(r1, r2);
    773     r2 = max(r1, r2);
    774     r1 = t;
    775   };
    776   {
    777     uint const t = min(r3, r4);
    778     r4 = max(r3, r4);
    779     r3 = t;
    780   };
    781   {
    782     uint const t = min(r5, r6);
    783     r6 = max(r5, r6);
    784     r5 = t;
    785   };
    786   {
    787     uint const t = min(r7, r8);
    788     r8 = max(r7, r8);
    789     r7 = t;
    790   };
    791   {
    792     uint const t = min(r2, r5);
    793     r5 = max(r2, r5);
    794     r2 = t;
    795   };
    796   {
    797     uint const t = min(r4, r7);
    798     r7 = max(r4, r7);
    799     r4 = t;
    800   };
    801   {
    802     uint const t = min(r2, r3);
    803     r3 = max(r2, r3);
    804     r2 = t;
    805   };
    806   {
    807     uint const t = min(r4, r5);
    808     r5 = max(r4, r5);
    809     r4 = t;
    810   };
    811   {
    812     uint const t = min(r6, r7);
    813     r7 = max(r6, r7);
    814     r6 = t;
    815   };
    816   {
    817     uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
    818     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
    819     ;
    820     {
    821       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
    822       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
    823       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
    824       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    825     };
    826     {
    827       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
    828       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
    829       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
    830       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    831     };
    832     {
    833       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
    834       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
    835       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
    836       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    837     };
    838     {
    839       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
    840       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
    841       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
    842       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    843     };
    844   }
    845   {
    846     uint const t = min(r1, r5);
    847     r5 = max(r1, r5);
    848     r1 = t;
    849   };
    850   {
    851     uint const t = min(r3, r7);
    852     r7 = max(r3, r7);
    853     r3 = t;
    854   };
    855   {
    856     uint const t = min(r1, r3);
    857     r3 = max(r1, r3);
    858     r1 = t;
    859   };
    860   {
    861     uint const t = min(r5, r7);
    862     r7 = max(r5, r7);
    863     r5 = t;
    864   };
    865   {
    866     uint const t = min(r2, r6);
    867     r6 = max(r2, r6);
    868     r2 = t;
    869   };
    870   {
    871     uint const t = min(r4, r8);
    872     r8 = max(r4, r8);
    873     r4 = t;
    874   };
    875   {
    876     uint const t = min(r2, r4);
    877     r4 = max(r2, r4);
    878     r2 = t;
    879   };
    880   {
    881     uint const t = min(r6, r8);
    882     r8 = max(r6, r8);
    883     r6 = t;
    884   };
    885   {
    886     uint const t = min(r1, r2);
    887     r2 = max(r1, r2);
    888     r1 = t;
    889   };
    890   {
    891     uint const t = min(r3, r4);
    892     r4 = max(r3, r4);
    893     r3 = t;
    894   };
    895   {
    896     uint const t = min(r5, r6);
    897     r6 = max(r5, r6);
    898     r5 = t;
    899   };
    900   {
    901     uint const t = min(r7, r8);
    902     r8 = max(r7, r8);
    903     r7 = t;
    904   };
    905   {
    906     uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
    907     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
    908     ;
    909     {
    910       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
    911       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
    912       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
    913       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    914     };
    915     {
    916       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
    917       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
    918       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
    919       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    920     };
    921     {
    922       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
    923       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
    924       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
    925       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    926     };
    927     {
    928       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
    929       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
    930       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
    931       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    932     };
    933   }
    934   {
    935     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
    936     int const t_lt = get_sub_group_local_id() < half_lane_idx;
    937     ;
    938     {
    939       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
    940       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
    941     };
    942     {
    943       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
    944       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
    945     };
    946     {
    947       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
    948       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
    949     };
    950     {
    951       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
    952       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
    953     };
    954     {
    955       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
    956       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
    957     };
    958     {
    959       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
    960       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
    961     };
    962     {
    963       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
    964       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
    965     };
    966     {
    967       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
    968       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
    969     };
    970   }
    971   {
    972     uint const t = min(r1, r5);
    973     r5 = max(r1, r5);
    974     r1 = t;
    975   };
    976   {
    977     uint const t = min(r3, r7);
    978     r7 = max(r3, r7);
    979     r3 = t;
    980   };
    981   {
    982     uint const t = min(r1, r3);
    983     r3 = max(r1, r3);
    984     r1 = t;
    985   };
    986   {
    987     uint const t = min(r5, r7);
    988     r7 = max(r5, r7);
    989     r5 = t;
    990   };
    991   {
    992     uint const t = min(r2, r6);
    993     r6 = max(r2, r6);
    994     r2 = t;
    995   };
    996   {
    997     uint const t = min(r4, r8);
    998     r8 = max(r4, r8);
    999     r4 = t;
   1000   };
   1001   {
   1002     uint const t = min(r2, r4);
   1003     r4 = max(r2, r4);
   1004     r2 = t;
   1005   };
   1006   {
   1007     uint const t = min(r6, r8);
   1008     r8 = max(r6, r8);
   1009     r6 = t;
   1010   };
   1011   {
   1012     uint const t = min(r1, r2);
   1013     r2 = max(r1, r2);
   1014     r1 = t;
   1015   };
   1016   {
   1017     uint const t = min(r3, r4);
   1018     r4 = max(r3, r4);
   1019     r3 = t;
   1020   };
   1021   {
   1022     uint const t = min(r5, r6);
   1023     r6 = max(r5, r6);
   1024     r5 = t;
   1025   };
   1026   {
   1027     uint const t = min(r7, r8);
   1028     r8 = max(r7, r8);
   1029     r7 = t;
   1030   };
   1031   {
   1032     uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
   1033     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   1034     ;
   1035     {
   1036       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   1037       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   1038       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   1039       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1040     };
   1041     {
   1042       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   1043       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   1044       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   1045       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1046     };
   1047     {
   1048       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   1049       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   1050       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   1051       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1052     };
   1053     {
   1054       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   1055       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   1056       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   1057       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1058     };
   1059   }
   1060   {
   1061     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   1062     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1063     ;
   1064     {
   1065       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1066       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1067     };
   1068     {
   1069       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1070       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1071     };
   1072     {
   1073       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1074       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1075     };
   1076     {
   1077       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1078       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1079     };
   1080     {
   1081       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1082       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1083     };
   1084     {
   1085       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1086       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1087     };
   1088     {
   1089       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1090       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1091     };
   1092     {
   1093       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1094       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1095     };
   1096   }
   1097   {
   1098     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   1099     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1100     ;
   1101     {
   1102       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1103       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1104     };
   1105     {
   1106       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1107       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1108     };
   1109     {
   1110       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1111       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1112     };
   1113     {
   1114       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1115       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1116     };
   1117     {
   1118       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1119       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1120     };
   1121     {
   1122       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1123       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1124     };
   1125     {
   1126       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1127       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1128     };
   1129     {
   1130       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1131       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1132     };
   1133   }
   1134   {
   1135     uint const t = min(r1, r5);
   1136     r5 = max(r1, r5);
   1137     r1 = t;
   1138   };
   1139   {
   1140     uint const t = min(r3, r7);
   1141     r7 = max(r3, r7);
   1142     r3 = t;
   1143   };
   1144   {
   1145     uint const t = min(r1, r3);
   1146     r3 = max(r1, r3);
   1147     r1 = t;
   1148   };
   1149   {
   1150     uint const t = min(r5, r7);
   1151     r7 = max(r5, r7);
   1152     r5 = t;
   1153   };
   1154   {
   1155     uint const t = min(r2, r6);
   1156     r6 = max(r2, r6);
   1157     r2 = t;
   1158   };
   1159   {
   1160     uint const t = min(r4, r8);
   1161     r8 = max(r4, r8);
   1162     r4 = t;
   1163   };
   1164   {
   1165     uint const t = min(r2, r4);
   1166     r4 = max(r2, r4);
   1167     r2 = t;
   1168   };
   1169   {
   1170     uint const t = min(r6, r8);
   1171     r8 = max(r6, r8);
   1172     r6 = t;
   1173   };
   1174   {
   1175     uint const t = min(r1, r2);
   1176     r2 = max(r1, r2);
   1177     r1 = t;
   1178   };
   1179   {
   1180     uint const t = min(r3, r4);
   1181     r4 = max(r3, r4);
   1182     r3 = t;
   1183   };
   1184   {
   1185     uint const t = min(r5, r6);
   1186     r6 = max(r5, r6);
   1187     r5 = t;
   1188   };
   1189   {
   1190     uint const t = min(r7, r8);
   1191     r8 = max(r7, r8);
   1192     r7 = t;
   1193   };
   1194   {
   1195     uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
   1196     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   1197     ;
   1198     {
   1199       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   1200       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   1201       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   1202       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1203     };
   1204     {
   1205       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   1206       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   1207       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   1208       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1209     };
   1210     {
   1211       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   1212       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   1213       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   1214       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1215     };
   1216     {
   1217       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   1218       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   1219       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   1220       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1221     };
   1222   }
   1223   {
   1224     uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   1225     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1226     ;
   1227     {
   1228       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1229       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1230     };
   1231     {
   1232       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1233       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1234     };
   1235     {
   1236       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1237       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1238     };
   1239     {
   1240       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1241       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1242     };
   1243     {
   1244       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1245       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1246     };
   1247     {
   1248       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1249       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1250     };
   1251     {
   1252       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1253       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1254     };
   1255     {
   1256       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1257       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1258     };
   1259   }
   1260   {
   1261     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   1262     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1263     ;
   1264     {
   1265       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1266       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1267     };
   1268     {
   1269       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1270       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1271     };
   1272     {
   1273       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1274       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1275     };
   1276     {
   1277       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1278       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1279     };
   1280     {
   1281       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1282       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1283     };
   1284     {
   1285       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1286       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1287     };
   1288     {
   1289       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1290       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1291     };
   1292     {
   1293       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1294       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1295     };
   1296   }
   1297   {
   1298     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   1299     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1300     ;
   1301     {
   1302       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1303       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1304     };
   1305     {
   1306       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1307       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1308     };
   1309     {
   1310       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1311       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1312     };
   1313     {
   1314       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1315       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1316     };
   1317     {
   1318       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1319       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1320     };
   1321     {
   1322       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1323       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1324     };
   1325     {
   1326       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1327       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1328     };
   1329     {
   1330       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1331       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1332     };
   1333   }
   1334   {
   1335     uint const t = min(r1, r5);
   1336     r5 = max(r1, r5);
   1337     r1 = t;
   1338   };
   1339   {
   1340     uint const t = min(r3, r7);
   1341     r7 = max(r3, r7);
   1342     r3 = t;
   1343   };
   1344   {
   1345     uint const t = min(r1, r3);
   1346     r3 = max(r1, r3);
   1347     r1 = t;
   1348   };
   1349   {
   1350     uint const t = min(r5, r7);
   1351     r7 = max(r5, r7);
   1352     r5 = t;
   1353   };
   1354   {
   1355     uint const t = min(r2, r6);
   1356     r6 = max(r2, r6);
   1357     r2 = t;
   1358   };
   1359   {
   1360     uint const t = min(r4, r8);
   1361     r8 = max(r4, r8);
   1362     r4 = t;
   1363   };
   1364   {
   1365     uint const t = min(r2, r4);
   1366     r4 = max(r2, r4);
   1367     r2 = t;
   1368   };
   1369   {
   1370     uint const t = min(r6, r8);
   1371     r8 = max(r6, r8);
   1372     r6 = t;
   1373   };
   1374   {
   1375     uint const t = min(r1, r2);
   1376     r2 = max(r1, r2);
   1377     r1 = t;
   1378   };
   1379   {
   1380     uint const t = min(r3, r4);
   1381     r4 = max(r3, r4);
   1382     r3 = t;
   1383   };
   1384   {
   1385     uint const t = min(r5, r6);
   1386     r6 = max(r5, r6);
   1387     r5 = t;
   1388   };
   1389   {
   1390     uint const t = min(r7, r8);
   1391     r8 = max(r7, r8);
   1392     r7 = t;
   1393   };
   1394   uint const smem_l_idx =
   1395     get_sub_group_id() * ((1 << 4) * 2) + get_sub_group_local_id();
   1396   uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 2) +
   1397                           (get_sub_group_local_id() ^ ((1 << 4) - 1));
   1398   shared.m[get_local_id(0) + (2 * (1 << 4) * 0)] = r1;
   1399   shared.m[get_local_id(0) + (2 * (1 << 4) * 1)] = r8;
   1400   shared.m[get_local_id(0) + (2 * (1 << 4) * 2)] = r2;
   1401   shared.m[get_local_id(0) + (2 * (1 << 4) * 3)] = r7;
   1402   shared.m[get_local_id(0) + (2 * (1 << 4) * 4)] = r3;
   1403   shared.m[get_local_id(0) + (2 * (1 << 4) * 5)] = r6;
   1404   shared.m[get_local_id(0) + (2 * (1 << 4) * 6)] = r4;
   1405   shared.m[get_local_id(0) + (2 * (1 << 4) * 7)] = r5;
   1406   barrier(CLK_LOCAL_MEM_FENCE);
   1407   {
   1408     {
   1409       uint r0_1 = shared.m[smem_l_idx + (0)];
   1410       uint r0_2 = shared.m[smem_r_idx + (16)];
   1411       {
   1412         uint const t = min(r0_1, r0_2);
   1413         r0_2 = max(r0_1, r0_2);
   1414         r0_1 = t;
   1415       };
   1416       shared.m[smem_l_idx + (0)] = r0_1;
   1417       shared.m[smem_r_idx + (16)] = r0_2;
   1418     }
   1419     {
   1420       uint r0_1 = shared.m[smem_l_idx + (64)];
   1421       uint r0_2 = shared.m[smem_r_idx + (80)];
   1422       {
   1423         uint const t = min(r0_1, r0_2);
   1424         r0_2 = max(r0_1, r0_2);
   1425         r0_1 = t;
   1426       };
   1427       shared.m[smem_l_idx + (64)] = r0_1;
   1428       shared.m[smem_r_idx + (80)] = r0_2;
   1429     }
   1430     {
   1431       uint r0_1 = shared.m[smem_l_idx + (128)];
   1432       uint r0_2 = shared.m[smem_r_idx + (144)];
   1433       {
   1434         uint const t = min(r0_1, r0_2);
   1435         r0_2 = max(r0_1, r0_2);
   1436         r0_1 = t;
   1437       };
   1438       shared.m[smem_l_idx + (128)] = r0_1;
   1439       shared.m[smem_r_idx + (144)] = r0_2;
   1440     }
   1441     {
   1442       uint r0_1 = shared.m[smem_l_idx + (192)];
   1443       uint r0_2 = shared.m[smem_r_idx + (208)];
   1444       {
   1445         uint const t = min(r0_1, r0_2);
   1446         r0_2 = max(r0_1, r0_2);
   1447         r0_1 = t;
   1448       };
   1449       shared.m[smem_l_idx + (192)] = r0_1;
   1450       shared.m[smem_r_idx + (208)] = r0_2;
   1451     }
   1452   }
   1453   barrier(CLK_LOCAL_MEM_FENCE);
   1454   r1 = shared.m[get_local_id(0) + (2 * (1 << 4) * 0)];
   1455   r8 = shared.m[get_local_id(0) + (2 * (1 << 4) * 1)];
   1456   r2 = shared.m[get_local_id(0) + (2 * (1 << 4) * 2)];
   1457   r7 = shared.m[get_local_id(0) + (2 * (1 << 4) * 3)];
   1458   r3 = shared.m[get_local_id(0) + (2 * (1 << 4) * 4)];
   1459   r6 = shared.m[get_local_id(0) + (2 * (1 << 4) * 5)];
   1460   r4 = shared.m[get_local_id(0) + (2 * (1 << 4) * 6)];
   1461   r5 = shared.m[get_local_id(0) + (2 * (1 << 4) * 7)];
   1462   {
   1463     {
   1464       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   1465       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1466       ;
   1467       {
   1468         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1469         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1470       };
   1471       {
   1472         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1473         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1474       };
   1475       {
   1476         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1477         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1478       };
   1479       {
   1480         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1481         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1482       };
   1483       {
   1484         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1485         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1486       };
   1487       {
   1488         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1489         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1490       };
   1491       {
   1492         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1493         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1494       };
   1495       {
   1496         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1497         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1498       };
   1499     }
   1500     {
   1501       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   1502       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1503       ;
   1504       {
   1505         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1506         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1507       };
   1508       {
   1509         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1510         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1511       };
   1512       {
   1513         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1514         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1515       };
   1516       {
   1517         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1518         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1519       };
   1520       {
   1521         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1522         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1523       };
   1524       {
   1525         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1526         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1527       };
   1528       {
   1529         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1530         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1531       };
   1532       {
   1533         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1534         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1535       };
   1536     }
   1537     {
   1538       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   1539       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1540       ;
   1541       {
   1542         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1543         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1544       };
   1545       {
   1546         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1547         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1548       };
   1549       {
   1550         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1551         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1552       };
   1553       {
   1554         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1555         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1556       };
   1557       {
   1558         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1559         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1560       };
   1561       {
   1562         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1563         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1564       };
   1565       {
   1566         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1567         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1568       };
   1569       {
   1570         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1571         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1572       };
   1573     }
   1574     {
   1575       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   1576       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1577       ;
   1578       {
   1579         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1580         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1581       };
   1582       {
   1583         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1584         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1585       };
   1586       {
   1587         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1588         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1589       };
   1590       {
   1591         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1592         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1593       };
   1594       {
   1595         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1596         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1597       };
   1598       {
   1599         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1600         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1601       };
   1602       {
   1603         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1604         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1605       };
   1606       {
   1607         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1608         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1609       };
   1610     }
   1611     {
   1612       uint const t = min(r1, r5);
   1613       r5 = max(r1, r5);
   1614       r1 = t;
   1615     };
   1616     {
   1617       uint const t = min(r3, r7);
   1618       r7 = max(r3, r7);
   1619       r3 = t;
   1620     };
   1621     {
   1622       uint const t = min(r1, r3);
   1623       r3 = max(r1, r3);
   1624       r1 = t;
   1625     };
   1626     {
   1627       uint const t = min(r5, r7);
   1628       r7 = max(r5, r7);
   1629       r5 = t;
   1630     };
   1631     {
   1632       uint const t = min(r2, r6);
   1633       r6 = max(r2, r6);
   1634       r2 = t;
   1635     };
   1636     {
   1637       uint const t = min(r4, r8);
   1638       r8 = max(r4, r8);
   1639       r4 = t;
   1640     };
   1641     {
   1642       uint const t = min(r2, r4);
   1643       r4 = max(r2, r4);
   1644       r2 = t;
   1645     };
   1646     {
   1647       uint const t = min(r6, r8);
   1648       r8 = max(r6, r8);
   1649       r6 = t;
   1650     };
   1651     {
   1652       uint const t = min(r1, r2);
   1653       r2 = max(r1, r2);
   1654       r1 = t;
   1655     };
   1656     {
   1657       uint const t = min(r3, r4);
   1658       r4 = max(r3, r4);
   1659       r3 = t;
   1660     };
   1661     {
   1662       uint const t = min(r5, r6);
   1663       r6 = max(r5, r6);
   1664       r5 = t;
   1665     };
   1666     {
   1667       uint const t = min(r7, r8);
   1668       r8 = max(r7, r8);
   1669       r7 = t;
   1670     };
   1671   }
   1672   vout[gmem_idx + (1 << 4) * 0] = r1;
   1673   vout[gmem_idx + (1 << 4) * 1] = r2;
   1674   vout[gmem_idx + (1 << 4) * 2] = r3;
   1675   vout[gmem_idx + (1 << 4) * 3] = r4;
   1676   vout[gmem_idx + (1 << 4) * 4] = r5;
   1677   vout[gmem_idx + (1 << 4) * 5] = r6;
   1678   vout[gmem_idx + (1 << 4) * 6] = r7;
   1679   vout[gmem_idx + (1 << 4) * 7] = r8;
   1680 }
   1681 
   1682 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   1683 __attribute__((reqd_work_group_size((1 << 4) * 4, 1, 1))) void
   1684 hs_kernel_bs_2(__global uint const* const restrict vin,
   1685                __global uint* const restrict vout)
   1686 {
   1687   __local struct
   1688   {
   1689     uint m[64 * 8];
   1690   } shared;
   1691 
   1692   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   1693                         (get_local_id(0) & ((1 << 4) - 1));
   1694   uint r1 = vin[gmem_idx + (1 << 4) * 0];
   1695   uint r2 = vin[gmem_idx + (1 << 4) * 1];
   1696   uint r3 = vin[gmem_idx + (1 << 4) * 2];
   1697   uint r4 = vin[gmem_idx + (1 << 4) * 3];
   1698   uint r5 = vin[gmem_idx + (1 << 4) * 4];
   1699   uint r6 = vin[gmem_idx + (1 << 4) * 5];
   1700   uint r7 = vin[gmem_idx + (1 << 4) * 6];
   1701   uint r8 = vin[gmem_idx + (1 << 4) * 7];
   1702   {
   1703     uint const t = min(r1, r5);
   1704     r5 = max(r1, r5);
   1705     r1 = t;
   1706   };
   1707   {
   1708     uint const t = min(r2, r6);
   1709     r6 = max(r2, r6);
   1710     r2 = t;
   1711   };
   1712   {
   1713     uint const t = min(r3, r7);
   1714     r7 = max(r3, r7);
   1715     r3 = t;
   1716   };
   1717   {
   1718     uint const t = min(r4, r8);
   1719     r8 = max(r4, r8);
   1720     r4 = t;
   1721   };
   1722   {
   1723     uint const t = min(r1, r3);
   1724     r3 = max(r1, r3);
   1725     r1 = t;
   1726   };
   1727   {
   1728     uint const t = min(r2, r4);
   1729     r4 = max(r2, r4);
   1730     r2 = t;
   1731   };
   1732   {
   1733     uint const t = min(r5, r7);
   1734     r7 = max(r5, r7);
   1735     r5 = t;
   1736   };
   1737   {
   1738     uint const t = min(r6, r8);
   1739     r8 = max(r6, r8);
   1740     r6 = t;
   1741   };
   1742   {
   1743     uint const t = min(r3, r5);
   1744     r5 = max(r3, r5);
   1745     r3 = t;
   1746   };
   1747   {
   1748     uint const t = min(r4, r6);
   1749     r6 = max(r4, r6);
   1750     r4 = t;
   1751   };
   1752   {
   1753     uint const t = min(r1, r2);
   1754     r2 = max(r1, r2);
   1755     r1 = t;
   1756   };
   1757   {
   1758     uint const t = min(r3, r4);
   1759     r4 = max(r3, r4);
   1760     r3 = t;
   1761   };
   1762   {
   1763     uint const t = min(r5, r6);
   1764     r6 = max(r5, r6);
   1765     r5 = t;
   1766   };
   1767   {
   1768     uint const t = min(r7, r8);
   1769     r8 = max(r7, r8);
   1770     r7 = t;
   1771   };
   1772   {
   1773     uint const t = min(r2, r5);
   1774     r5 = max(r2, r5);
   1775     r2 = t;
   1776   };
   1777   {
   1778     uint const t = min(r4, r7);
   1779     r7 = max(r4, r7);
   1780     r4 = t;
   1781   };
   1782   {
   1783     uint const t = min(r2, r3);
   1784     r3 = max(r2, r3);
   1785     r2 = t;
   1786   };
   1787   {
   1788     uint const t = min(r4, r5);
   1789     r5 = max(r4, r5);
   1790     r4 = t;
   1791   };
   1792   {
   1793     uint const t = min(r6, r7);
   1794     r7 = max(r6, r7);
   1795     r6 = t;
   1796   };
   1797   {
   1798     uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
   1799     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   1800     ;
   1801     {
   1802       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   1803       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   1804       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   1805       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1806     };
   1807     {
   1808       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   1809       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   1810       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   1811       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1812     };
   1813     {
   1814       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   1815       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   1816       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   1817       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1818     };
   1819     {
   1820       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   1821       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   1822       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   1823       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1824     };
   1825   }
   1826   {
   1827     uint const t = min(r1, r5);
   1828     r5 = max(r1, r5);
   1829     r1 = t;
   1830   };
   1831   {
   1832     uint const t = min(r3, r7);
   1833     r7 = max(r3, r7);
   1834     r3 = t;
   1835   };
   1836   {
   1837     uint const t = min(r1, r3);
   1838     r3 = max(r1, r3);
   1839     r1 = t;
   1840   };
   1841   {
   1842     uint const t = min(r5, r7);
   1843     r7 = max(r5, r7);
   1844     r5 = t;
   1845   };
   1846   {
   1847     uint const t = min(r2, r6);
   1848     r6 = max(r2, r6);
   1849     r2 = t;
   1850   };
   1851   {
   1852     uint const t = min(r4, r8);
   1853     r8 = max(r4, r8);
   1854     r4 = t;
   1855   };
   1856   {
   1857     uint const t = min(r2, r4);
   1858     r4 = max(r2, r4);
   1859     r2 = t;
   1860   };
   1861   {
   1862     uint const t = min(r6, r8);
   1863     r8 = max(r6, r8);
   1864     r6 = t;
   1865   };
   1866   {
   1867     uint const t = min(r1, r2);
   1868     r2 = max(r1, r2);
   1869     r1 = t;
   1870   };
   1871   {
   1872     uint const t = min(r3, r4);
   1873     r4 = max(r3, r4);
   1874     r3 = t;
   1875   };
   1876   {
   1877     uint const t = min(r5, r6);
   1878     r6 = max(r5, r6);
   1879     r5 = t;
   1880   };
   1881   {
   1882     uint const t = min(r7, r8);
   1883     r8 = max(r7, r8);
   1884     r7 = t;
   1885   };
   1886   {
   1887     uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
   1888     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   1889     ;
   1890     {
   1891       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   1892       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   1893       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   1894       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1895     };
   1896     {
   1897       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   1898       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   1899       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   1900       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1901     };
   1902     {
   1903       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   1904       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   1905       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   1906       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1907     };
   1908     {
   1909       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   1910       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   1911       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   1912       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1913     };
   1914   }
   1915   {
   1916     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   1917     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   1918     ;
   1919     {
   1920       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   1921       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   1922     };
   1923     {
   1924       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   1925       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   1926     };
   1927     {
   1928       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   1929       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   1930     };
   1931     {
   1932       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   1933       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   1934     };
   1935     {
   1936       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   1937       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   1938     };
   1939     {
   1940       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   1941       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   1942     };
   1943     {
   1944       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   1945       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   1946     };
   1947     {
   1948       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   1949       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   1950     };
   1951   }
   1952   {
   1953     uint const t = min(r1, r5);
   1954     r5 = max(r1, r5);
   1955     r1 = t;
   1956   };
   1957   {
   1958     uint const t = min(r3, r7);
   1959     r7 = max(r3, r7);
   1960     r3 = t;
   1961   };
   1962   {
   1963     uint const t = min(r1, r3);
   1964     r3 = max(r1, r3);
   1965     r1 = t;
   1966   };
   1967   {
   1968     uint const t = min(r5, r7);
   1969     r7 = max(r5, r7);
   1970     r5 = t;
   1971   };
   1972   {
   1973     uint const t = min(r2, r6);
   1974     r6 = max(r2, r6);
   1975     r2 = t;
   1976   };
   1977   {
   1978     uint const t = min(r4, r8);
   1979     r8 = max(r4, r8);
   1980     r4 = t;
   1981   };
   1982   {
   1983     uint const t = min(r2, r4);
   1984     r4 = max(r2, r4);
   1985     r2 = t;
   1986   };
   1987   {
   1988     uint const t = min(r6, r8);
   1989     r8 = max(r6, r8);
   1990     r6 = t;
   1991   };
   1992   {
   1993     uint const t = min(r1, r2);
   1994     r2 = max(r1, r2);
   1995     r1 = t;
   1996   };
   1997   {
   1998     uint const t = min(r3, r4);
   1999     r4 = max(r3, r4);
   2000     r3 = t;
   2001   };
   2002   {
   2003     uint const t = min(r5, r6);
   2004     r6 = max(r5, r6);
   2005     r5 = t;
   2006   };
   2007   {
   2008     uint const t = min(r7, r8);
   2009     r8 = max(r7, r8);
   2010     r7 = t;
   2011   };
   2012   {
   2013     uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
   2014     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   2015     ;
   2016     {
   2017       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   2018       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   2019       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   2020       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2021     };
   2022     {
   2023       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   2024       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   2025       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   2026       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2027     };
   2028     {
   2029       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   2030       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   2031       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   2032       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2033     };
   2034     {
   2035       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   2036       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   2037       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   2038       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2039     };
   2040   }
   2041   {
   2042     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   2043     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2044     ;
   2045     {
   2046       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2047       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2048     };
   2049     {
   2050       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2051       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2052     };
   2053     {
   2054       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2055       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2056     };
   2057     {
   2058       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2059       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2060     };
   2061     {
   2062       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2063       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2064     };
   2065     {
   2066       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2067       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2068     };
   2069     {
   2070       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2071       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2072     };
   2073     {
   2074       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2075       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2076     };
   2077   }
   2078   {
   2079     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   2080     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2081     ;
   2082     {
   2083       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2084       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2085     };
   2086     {
   2087       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2088       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2089     };
   2090     {
   2091       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2092       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2093     };
   2094     {
   2095       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2096       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2097     };
   2098     {
   2099       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2100       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2101     };
   2102     {
   2103       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2104       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2105     };
   2106     {
   2107       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2108       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2109     };
   2110     {
   2111       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2112       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2113     };
   2114   }
   2115   {
   2116     uint const t = min(r1, r5);
   2117     r5 = max(r1, r5);
   2118     r1 = t;
   2119   };
   2120   {
   2121     uint const t = min(r3, r7);
   2122     r7 = max(r3, r7);
   2123     r3 = t;
   2124   };
   2125   {
   2126     uint const t = min(r1, r3);
   2127     r3 = max(r1, r3);
   2128     r1 = t;
   2129   };
   2130   {
   2131     uint const t = min(r5, r7);
   2132     r7 = max(r5, r7);
   2133     r5 = t;
   2134   };
   2135   {
   2136     uint const t = min(r2, r6);
   2137     r6 = max(r2, r6);
   2138     r2 = t;
   2139   };
   2140   {
   2141     uint const t = min(r4, r8);
   2142     r8 = max(r4, r8);
   2143     r4 = t;
   2144   };
   2145   {
   2146     uint const t = min(r2, r4);
   2147     r4 = max(r2, r4);
   2148     r2 = t;
   2149   };
   2150   {
   2151     uint const t = min(r6, r8);
   2152     r8 = max(r6, r8);
   2153     r6 = t;
   2154   };
   2155   {
   2156     uint const t = min(r1, r2);
   2157     r2 = max(r1, r2);
   2158     r1 = t;
   2159   };
   2160   {
   2161     uint const t = min(r3, r4);
   2162     r4 = max(r3, r4);
   2163     r3 = t;
   2164   };
   2165   {
   2166     uint const t = min(r5, r6);
   2167     r6 = max(r5, r6);
   2168     r5 = t;
   2169   };
   2170   {
   2171     uint const t = min(r7, r8);
   2172     r8 = max(r7, r8);
   2173     r7 = t;
   2174   };
   2175   {
   2176     uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
   2177     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   2178     ;
   2179     {
   2180       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   2181       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   2182       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   2183       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2184     };
   2185     {
   2186       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   2187       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   2188       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   2189       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2190     };
   2191     {
   2192       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   2193       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   2194       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   2195       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2196     };
   2197     {
   2198       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   2199       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   2200       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   2201       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2202     };
   2203   }
   2204   {
   2205     uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   2206     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2207     ;
   2208     {
   2209       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2210       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2211     };
   2212     {
   2213       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2214       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2215     };
   2216     {
   2217       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2218       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2219     };
   2220     {
   2221       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2222       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2223     };
   2224     {
   2225       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2226       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2227     };
   2228     {
   2229       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2230       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2231     };
   2232     {
   2233       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2234       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2235     };
   2236     {
   2237       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2238       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2239     };
   2240   }
   2241   {
   2242     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   2243     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2244     ;
   2245     {
   2246       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2247       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2248     };
   2249     {
   2250       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2251       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2252     };
   2253     {
   2254       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2255       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2256     };
   2257     {
   2258       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2259       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2260     };
   2261     {
   2262       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2263       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2264     };
   2265     {
   2266       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2267       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2268     };
   2269     {
   2270       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2271       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2272     };
   2273     {
   2274       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2275       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2276     };
   2277   }
   2278   {
   2279     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   2280     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2281     ;
   2282     {
   2283       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2284       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2285     };
   2286     {
   2287       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2288       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2289     };
   2290     {
   2291       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2292       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2293     };
   2294     {
   2295       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2296       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2297     };
   2298     {
   2299       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2300       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2301     };
   2302     {
   2303       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2304       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2305     };
   2306     {
   2307       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2308       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2309     };
   2310     {
   2311       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2312       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2313     };
   2314   }
   2315   {
   2316     uint const t = min(r1, r5);
   2317     r5 = max(r1, r5);
   2318     r1 = t;
   2319   };
   2320   {
   2321     uint const t = min(r3, r7);
   2322     r7 = max(r3, r7);
   2323     r3 = t;
   2324   };
   2325   {
   2326     uint const t = min(r1, r3);
   2327     r3 = max(r1, r3);
   2328     r1 = t;
   2329   };
   2330   {
   2331     uint const t = min(r5, r7);
   2332     r7 = max(r5, r7);
   2333     r5 = t;
   2334   };
   2335   {
   2336     uint const t = min(r2, r6);
   2337     r6 = max(r2, r6);
   2338     r2 = t;
   2339   };
   2340   {
   2341     uint const t = min(r4, r8);
   2342     r8 = max(r4, r8);
   2343     r4 = t;
   2344   };
   2345   {
   2346     uint const t = min(r2, r4);
   2347     r4 = max(r2, r4);
   2348     r2 = t;
   2349   };
   2350   {
   2351     uint const t = min(r6, r8);
   2352     r8 = max(r6, r8);
   2353     r6 = t;
   2354   };
   2355   {
   2356     uint const t = min(r1, r2);
   2357     r2 = max(r1, r2);
   2358     r1 = t;
   2359   };
   2360   {
   2361     uint const t = min(r3, r4);
   2362     r4 = max(r3, r4);
   2363     r3 = t;
   2364   };
   2365   {
   2366     uint const t = min(r5, r6);
   2367     r6 = max(r5, r6);
   2368     r5 = t;
   2369   };
   2370   {
   2371     uint const t = min(r7, r8);
   2372     r8 = max(r7, r8);
   2373     r7 = t;
   2374   };
   2375   uint const smem_l_idx =
   2376     get_sub_group_id() * ((1 << 4) * 4) + get_sub_group_local_id();
   2377   uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 4) +
   2378                           (get_sub_group_local_id() ^ ((1 << 4) - 1));
   2379   shared.m[get_local_id(0) + (4 * (1 << 4) * 0)] = r1;
   2380   shared.m[get_local_id(0) + (4 * (1 << 4) * 1)] = r8;
   2381   shared.m[get_local_id(0) + (4 * (1 << 4) * 2)] = r2;
   2382   shared.m[get_local_id(0) + (4 * (1 << 4) * 3)] = r7;
   2383   shared.m[get_local_id(0) + (4 * (1 << 4) * 4)] = r3;
   2384   shared.m[get_local_id(0) + (4 * (1 << 4) * 5)] = r6;
   2385   shared.m[get_local_id(0) + (4 * (1 << 4) * 6)] = r4;
   2386   shared.m[get_local_id(0) + (4 * (1 << 4) * 7)] = r5;
   2387   barrier(CLK_LOCAL_MEM_FENCE);
   2388   {
   2389     {
   2390       uint r0_1 = shared.m[smem_l_idx + (0)];
   2391       uint r0_2 = shared.m[smem_r_idx + (16)];
   2392       {
   2393         uint const t = min(r0_1, r0_2);
   2394         r0_2 = max(r0_1, r0_2);
   2395         r0_1 = t;
   2396       };
   2397       shared.m[smem_l_idx + (0)] = r0_1;
   2398       shared.m[smem_r_idx + (16)] = r0_2;
   2399     }
   2400     {
   2401       uint r1_1 = shared.m[smem_l_idx + (32)];
   2402       uint r1_2 = shared.m[smem_r_idx + (48)];
   2403       {
   2404         uint const t = min(r1_1, r1_2);
   2405         r1_2 = max(r1_1, r1_2);
   2406         r1_1 = t;
   2407       };
   2408       shared.m[smem_l_idx + (32)] = r1_1;
   2409       shared.m[smem_r_idx + (48)] = r1_2;
   2410     }
   2411     {
   2412       uint r0_1 = shared.m[smem_l_idx + (256)];
   2413       uint r0_2 = shared.m[smem_r_idx + (272)];
   2414       {
   2415         uint const t = min(r0_1, r0_2);
   2416         r0_2 = max(r0_1, r0_2);
   2417         r0_1 = t;
   2418       };
   2419       shared.m[smem_l_idx + (256)] = r0_1;
   2420       shared.m[smem_r_idx + (272)] = r0_2;
   2421     }
   2422     {
   2423       uint r1_1 = shared.m[smem_l_idx + (288)];
   2424       uint r1_2 = shared.m[smem_r_idx + (304)];
   2425       {
   2426         uint const t = min(r1_1, r1_2);
   2427         r1_2 = max(r1_1, r1_2);
   2428         r1_1 = t;
   2429       };
   2430       shared.m[smem_l_idx + (288)] = r1_1;
   2431       shared.m[smem_r_idx + (304)] = r1_2;
   2432     }
   2433   }
   2434   barrier(CLK_LOCAL_MEM_FENCE);
   2435   r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)];
   2436   r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)];
   2437   r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)];
   2438   r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)];
   2439   r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)];
   2440   r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)];
   2441   r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)];
   2442   r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)];
   2443   {
   2444     {
   2445       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   2446       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2447       ;
   2448       {
   2449         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2450         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2451       };
   2452       {
   2453         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2454         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2455       };
   2456       {
   2457         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2458         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2459       };
   2460       {
   2461         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2462         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2463       };
   2464       {
   2465         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2466         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2467       };
   2468       {
   2469         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2470         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2471       };
   2472       {
   2473         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2474         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2475       };
   2476       {
   2477         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2478         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2479       };
   2480     }
   2481     {
   2482       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   2483       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2484       ;
   2485       {
   2486         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2487         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2488       };
   2489       {
   2490         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2491         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2492       };
   2493       {
   2494         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2495         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2496       };
   2497       {
   2498         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2499         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2500       };
   2501       {
   2502         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2503         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2504       };
   2505       {
   2506         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2507         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2508       };
   2509       {
   2510         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2511         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2512       };
   2513       {
   2514         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2515         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2516       };
   2517     }
   2518     {
   2519       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   2520       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2521       ;
   2522       {
   2523         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2524         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2525       };
   2526       {
   2527         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2528         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2529       };
   2530       {
   2531         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2532         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2533       };
   2534       {
   2535         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2536         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2537       };
   2538       {
   2539         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2540         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2541       };
   2542       {
   2543         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2544         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2545       };
   2546       {
   2547         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2548         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2549       };
   2550       {
   2551         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2552         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2553       };
   2554     }
   2555     {
   2556       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   2557       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2558       ;
   2559       {
   2560         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2561         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2562       };
   2563       {
   2564         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2565         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2566       };
   2567       {
   2568         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2569         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2570       };
   2571       {
   2572         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2573         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2574       };
   2575       {
   2576         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2577         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2578       };
   2579       {
   2580         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2581         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2582       };
   2583       {
   2584         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2585         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2586       };
   2587       {
   2588         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2589         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2590       };
   2591     }
   2592     {
   2593       uint const t = min(r1, r5);
   2594       r5 = max(r1, r5);
   2595       r1 = t;
   2596     };
   2597     {
   2598       uint const t = min(r3, r7);
   2599       r7 = max(r3, r7);
   2600       r3 = t;
   2601     };
   2602     {
   2603       uint const t = min(r1, r3);
   2604       r3 = max(r1, r3);
   2605       r1 = t;
   2606     };
   2607     {
   2608       uint const t = min(r5, r7);
   2609       r7 = max(r5, r7);
   2610       r5 = t;
   2611     };
   2612     {
   2613       uint const t = min(r2, r6);
   2614       r6 = max(r2, r6);
   2615       r2 = t;
   2616     };
   2617     {
   2618       uint const t = min(r4, r8);
   2619       r8 = max(r4, r8);
   2620       r4 = t;
   2621     };
   2622     {
   2623       uint const t = min(r2, r4);
   2624       r4 = max(r2, r4);
   2625       r2 = t;
   2626     };
   2627     {
   2628       uint const t = min(r6, r8);
   2629       r8 = max(r6, r8);
   2630       r6 = t;
   2631     };
   2632     {
   2633       uint const t = min(r1, r2);
   2634       r2 = max(r1, r2);
   2635       r1 = t;
   2636     };
   2637     {
   2638       uint const t = min(r3, r4);
   2639       r4 = max(r3, r4);
   2640       r3 = t;
   2641     };
   2642     {
   2643       uint const t = min(r5, r6);
   2644       r6 = max(r5, r6);
   2645       r5 = t;
   2646     };
   2647     {
   2648       uint const t = min(r7, r8);
   2649       r8 = max(r7, r8);
   2650       r7 = t;
   2651     };
   2652   }
   2653   shared.m[get_local_id(0) + (4 * (1 << 4) * 0)] = r1;
   2654   shared.m[get_local_id(0) + (4 * (1 << 4) * 1)] = r8;
   2655   shared.m[get_local_id(0) + (4 * (1 << 4) * 2)] = r2;
   2656   shared.m[get_local_id(0) + (4 * (1 << 4) * 3)] = r7;
   2657   shared.m[get_local_id(0) + (4 * (1 << 4) * 4)] = r3;
   2658   shared.m[get_local_id(0) + (4 * (1 << 4) * 5)] = r6;
   2659   shared.m[get_local_id(0) + (4 * (1 << 4) * 6)] = r4;
   2660   shared.m[get_local_id(0) + (4 * (1 << 4) * 7)] = r5;
   2661   barrier(CLK_LOCAL_MEM_FENCE);
   2662   {
   2663     {
   2664       uint r0_1 = shared.m[smem_l_idx + (0)];
   2665       uint r0_2 = shared.m[smem_l_idx + (16)];
   2666       uint r0_3 = shared.m[smem_r_idx + (32)];
   2667       uint r0_4 = shared.m[smem_r_idx + (48)];
   2668       {
   2669         uint const t = min(r0_2, r0_3);
   2670         r0_3 = max(r0_2, r0_3);
   2671         r0_2 = t;
   2672       };
   2673       {
   2674         uint const t = min(r0_1, r0_4);
   2675         r0_4 = max(r0_1, r0_4);
   2676         r0_1 = t;
   2677       };
   2678       {
   2679         uint const t = min(r0_3, r0_4);
   2680         r0_4 = max(r0_3, r0_4);
   2681         r0_3 = t;
   2682       };
   2683       {
   2684         uint const t = min(r0_1, r0_2);
   2685         r0_2 = max(r0_1, r0_2);
   2686         r0_1 = t;
   2687       };
   2688       shared.m[smem_l_idx + (0)] = r0_1;
   2689       shared.m[smem_l_idx + (16)] = r0_2;
   2690       shared.m[smem_r_idx + (32)] = r0_3;
   2691       shared.m[smem_r_idx + (48)] = r0_4;
   2692     }
   2693     {
   2694       uint r0_1 = shared.m[smem_l_idx + (256)];
   2695       uint r0_2 = shared.m[smem_l_idx + (272)];
   2696       uint r0_3 = shared.m[smem_r_idx + (288)];
   2697       uint r0_4 = shared.m[smem_r_idx + (304)];
   2698       {
   2699         uint const t = min(r0_2, r0_3);
   2700         r0_3 = max(r0_2, r0_3);
   2701         r0_2 = t;
   2702       };
   2703       {
   2704         uint const t = min(r0_1, r0_4);
   2705         r0_4 = max(r0_1, r0_4);
   2706         r0_1 = t;
   2707       };
   2708       {
   2709         uint const t = min(r0_3, r0_4);
   2710         r0_4 = max(r0_3, r0_4);
   2711         r0_3 = t;
   2712       };
   2713       {
   2714         uint const t = min(r0_1, r0_2);
   2715         r0_2 = max(r0_1, r0_2);
   2716         r0_1 = t;
   2717       };
   2718       shared.m[smem_l_idx + (256)] = r0_1;
   2719       shared.m[smem_l_idx + (272)] = r0_2;
   2720       shared.m[smem_r_idx + (288)] = r0_3;
   2721       shared.m[smem_r_idx + (304)] = r0_4;
   2722     }
   2723   }
   2724   barrier(CLK_LOCAL_MEM_FENCE);
   2725   r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)];
   2726   r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)];
   2727   r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)];
   2728   r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)];
   2729   r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)];
   2730   r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)];
   2731   r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)];
   2732   r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)];
   2733   {
   2734     {
   2735       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   2736       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2737       ;
   2738       {
   2739         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2740         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2741       };
   2742       {
   2743         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2744         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2745       };
   2746       {
   2747         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2748         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2749       };
   2750       {
   2751         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2752         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2753       };
   2754       {
   2755         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2756         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2757       };
   2758       {
   2759         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2760         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2761       };
   2762       {
   2763         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2764         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2765       };
   2766       {
   2767         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2768         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2769       };
   2770     }
   2771     {
   2772       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   2773       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2774       ;
   2775       {
   2776         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2777         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2778       };
   2779       {
   2780         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2781         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2782       };
   2783       {
   2784         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2785         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2786       };
   2787       {
   2788         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2789         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2790       };
   2791       {
   2792         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2793         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2794       };
   2795       {
   2796         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2797         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2798       };
   2799       {
   2800         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2801         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2802       };
   2803       {
   2804         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2805         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2806       };
   2807     }
   2808     {
   2809       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   2810       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2811       ;
   2812       {
   2813         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2814         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2815       };
   2816       {
   2817         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2818         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2819       };
   2820       {
   2821         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2822         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2823       };
   2824       {
   2825         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2826         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2827       };
   2828       {
   2829         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2830         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2831       };
   2832       {
   2833         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2834         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2835       };
   2836       {
   2837         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2838         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2839       };
   2840       {
   2841         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2842         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2843       };
   2844     }
   2845     {
   2846       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   2847       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   2848       ;
   2849       {
   2850         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   2851         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   2852       };
   2853       {
   2854         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   2855         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   2856       };
   2857       {
   2858         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   2859         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   2860       };
   2861       {
   2862         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   2863         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   2864       };
   2865       {
   2866         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   2867         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   2868       };
   2869       {
   2870         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   2871         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   2872       };
   2873       {
   2874         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   2875         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   2876       };
   2877       {
   2878         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   2879         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   2880       };
   2881     }
   2882     {
   2883       uint const t = min(r1, r5);
   2884       r5 = max(r1, r5);
   2885       r1 = t;
   2886     };
   2887     {
   2888       uint const t = min(r3, r7);
   2889       r7 = max(r3, r7);
   2890       r3 = t;
   2891     };
   2892     {
   2893       uint const t = min(r1, r3);
   2894       r3 = max(r1, r3);
   2895       r1 = t;
   2896     };
   2897     {
   2898       uint const t = min(r5, r7);
   2899       r7 = max(r5, r7);
   2900       r5 = t;
   2901     };
   2902     {
   2903       uint const t = min(r2, r6);
   2904       r6 = max(r2, r6);
   2905       r2 = t;
   2906     };
   2907     {
   2908       uint const t = min(r4, r8);
   2909       r8 = max(r4, r8);
   2910       r4 = t;
   2911     };
   2912     {
   2913       uint const t = min(r2, r4);
   2914       r4 = max(r2, r4);
   2915       r2 = t;
   2916     };
   2917     {
   2918       uint const t = min(r6, r8);
   2919       r8 = max(r6, r8);
   2920       r6 = t;
   2921     };
   2922     {
   2923       uint const t = min(r1, r2);
   2924       r2 = max(r1, r2);
   2925       r1 = t;
   2926     };
   2927     {
   2928       uint const t = min(r3, r4);
   2929       r4 = max(r3, r4);
   2930       r3 = t;
   2931     };
   2932     {
   2933       uint const t = min(r5, r6);
   2934       r6 = max(r5, r6);
   2935       r5 = t;
   2936     };
   2937     {
   2938       uint const t = min(r7, r8);
   2939       r8 = max(r7, r8);
   2940       r7 = t;
   2941     };
   2942   }
   2943   vout[gmem_idx + (1 << 4) * 0] = r1;
   2944   vout[gmem_idx + (1 << 4) * 1] = r2;
   2945   vout[gmem_idx + (1 << 4) * 2] = r3;
   2946   vout[gmem_idx + (1 << 4) * 3] = r4;
   2947   vout[gmem_idx + (1 << 4) * 4] = r5;
   2948   vout[gmem_idx + (1 << 4) * 5] = r6;
   2949   vout[gmem_idx + (1 << 4) * 6] = r7;
   2950   vout[gmem_idx + (1 << 4) * 7] = r8;
   2951 }
   2952 
   2953 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   2954 __attribute__((reqd_work_group_size((1 << 4) * 8, 1, 1))) void
   2955 hs_kernel_bs_3(__global uint const* const restrict vin,
   2956                __global uint* const restrict vout)
   2957 {
   2958   __local struct
   2959   {
   2960     uint m[128 * 8];
   2961   } shared;
   2962 
   2963   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   2964                         (get_local_id(0) & ((1 << 4) - 1));
   2965   uint r1 = vin[gmem_idx + (1 << 4) * 0];
   2966   uint r2 = vin[gmem_idx + (1 << 4) * 1];
   2967   uint r3 = vin[gmem_idx + (1 << 4) * 2];
   2968   uint r4 = vin[gmem_idx + (1 << 4) * 3];
   2969   uint r5 = vin[gmem_idx + (1 << 4) * 4];
   2970   uint r6 = vin[gmem_idx + (1 << 4) * 5];
   2971   uint r7 = vin[gmem_idx + (1 << 4) * 6];
   2972   uint r8 = vin[gmem_idx + (1 << 4) * 7];
   2973   {
   2974     uint const t = min(r1, r5);
   2975     r5 = max(r1, r5);
   2976     r1 = t;
   2977   };
   2978   {
   2979     uint const t = min(r2, r6);
   2980     r6 = max(r2, r6);
   2981     r2 = t;
   2982   };
   2983   {
   2984     uint const t = min(r3, r7);
   2985     r7 = max(r3, r7);
   2986     r3 = t;
   2987   };
   2988   {
   2989     uint const t = min(r4, r8);
   2990     r8 = max(r4, r8);
   2991     r4 = t;
   2992   };
   2993   {
   2994     uint const t = min(r1, r3);
   2995     r3 = max(r1, r3);
   2996     r1 = t;
   2997   };
   2998   {
   2999     uint const t = min(r2, r4);
   3000     r4 = max(r2, r4);
   3001     r2 = t;
   3002   };
   3003   {
   3004     uint const t = min(r5, r7);
   3005     r7 = max(r5, r7);
   3006     r5 = t;
   3007   };
   3008   {
   3009     uint const t = min(r6, r8);
   3010     r8 = max(r6, r8);
   3011     r6 = t;
   3012   };
   3013   {
   3014     uint const t = min(r3, r5);
   3015     r5 = max(r3, r5);
   3016     r3 = t;
   3017   };
   3018   {
   3019     uint const t = min(r4, r6);
   3020     r6 = max(r4, r6);
   3021     r4 = t;
   3022   };
   3023   {
   3024     uint const t = min(r1, r2);
   3025     r2 = max(r1, r2);
   3026     r1 = t;
   3027   };
   3028   {
   3029     uint const t = min(r3, r4);
   3030     r4 = max(r3, r4);
   3031     r3 = t;
   3032   };
   3033   {
   3034     uint const t = min(r5, r6);
   3035     r6 = max(r5, r6);
   3036     r5 = t;
   3037   };
   3038   {
   3039     uint const t = min(r7, r8);
   3040     r8 = max(r7, r8);
   3041     r7 = t;
   3042   };
   3043   {
   3044     uint const t = min(r2, r5);
   3045     r5 = max(r2, r5);
   3046     r2 = t;
   3047   };
   3048   {
   3049     uint const t = min(r4, r7);
   3050     r7 = max(r4, r7);
   3051     r4 = t;
   3052   };
   3053   {
   3054     uint const t = min(r2, r3);
   3055     r3 = max(r2, r3);
   3056     r2 = t;
   3057   };
   3058   {
   3059     uint const t = min(r4, r5);
   3060     r5 = max(r4, r5);
   3061     r4 = t;
   3062   };
   3063   {
   3064     uint const t = min(r6, r7);
   3065     r7 = max(r6, r7);
   3066     r6 = t;
   3067   };
   3068   {
   3069     uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
   3070     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   3071     ;
   3072     {
   3073       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   3074       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   3075       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   3076       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3077     };
   3078     {
   3079       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   3080       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   3081       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   3082       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3083     };
   3084     {
   3085       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   3086       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   3087       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   3088       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3089     };
   3090     {
   3091       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   3092       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   3093       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   3094       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3095     };
   3096   }
   3097   {
   3098     uint const t = min(r1, r5);
   3099     r5 = max(r1, r5);
   3100     r1 = t;
   3101   };
   3102   {
   3103     uint const t = min(r3, r7);
   3104     r7 = max(r3, r7);
   3105     r3 = t;
   3106   };
   3107   {
   3108     uint const t = min(r1, r3);
   3109     r3 = max(r1, r3);
   3110     r1 = t;
   3111   };
   3112   {
   3113     uint const t = min(r5, r7);
   3114     r7 = max(r5, r7);
   3115     r5 = t;
   3116   };
   3117   {
   3118     uint const t = min(r2, r6);
   3119     r6 = max(r2, r6);
   3120     r2 = t;
   3121   };
   3122   {
   3123     uint const t = min(r4, r8);
   3124     r8 = max(r4, r8);
   3125     r4 = t;
   3126   };
   3127   {
   3128     uint const t = min(r2, r4);
   3129     r4 = max(r2, r4);
   3130     r2 = t;
   3131   };
   3132   {
   3133     uint const t = min(r6, r8);
   3134     r8 = max(r6, r8);
   3135     r6 = t;
   3136   };
   3137   {
   3138     uint const t = min(r1, r2);
   3139     r2 = max(r1, r2);
   3140     r1 = t;
   3141   };
   3142   {
   3143     uint const t = min(r3, r4);
   3144     r4 = max(r3, r4);
   3145     r3 = t;
   3146   };
   3147   {
   3148     uint const t = min(r5, r6);
   3149     r6 = max(r5, r6);
   3150     r5 = t;
   3151   };
   3152   {
   3153     uint const t = min(r7, r8);
   3154     r8 = max(r7, r8);
   3155     r7 = t;
   3156   };
   3157   {
   3158     uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
   3159     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   3160     ;
   3161     {
   3162       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   3163       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   3164       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   3165       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3166     };
   3167     {
   3168       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   3169       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   3170       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   3171       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3172     };
   3173     {
   3174       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   3175       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   3176       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   3177       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3178     };
   3179     {
   3180       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   3181       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   3182       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   3183       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3184     };
   3185   }
   3186   {
   3187     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   3188     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3189     ;
   3190     {
   3191       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3192       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3193     };
   3194     {
   3195       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3196       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3197     };
   3198     {
   3199       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3200       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3201     };
   3202     {
   3203       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3204       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3205     };
   3206     {
   3207       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3208       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3209     };
   3210     {
   3211       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3212       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3213     };
   3214     {
   3215       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3216       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3217     };
   3218     {
   3219       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3220       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3221     };
   3222   }
   3223   {
   3224     uint const t = min(r1, r5);
   3225     r5 = max(r1, r5);
   3226     r1 = t;
   3227   };
   3228   {
   3229     uint const t = min(r3, r7);
   3230     r7 = max(r3, r7);
   3231     r3 = t;
   3232   };
   3233   {
   3234     uint const t = min(r1, r3);
   3235     r3 = max(r1, r3);
   3236     r1 = t;
   3237   };
   3238   {
   3239     uint const t = min(r5, r7);
   3240     r7 = max(r5, r7);
   3241     r5 = t;
   3242   };
   3243   {
   3244     uint const t = min(r2, r6);
   3245     r6 = max(r2, r6);
   3246     r2 = t;
   3247   };
   3248   {
   3249     uint const t = min(r4, r8);
   3250     r8 = max(r4, r8);
   3251     r4 = t;
   3252   };
   3253   {
   3254     uint const t = min(r2, r4);
   3255     r4 = max(r2, r4);
   3256     r2 = t;
   3257   };
   3258   {
   3259     uint const t = min(r6, r8);
   3260     r8 = max(r6, r8);
   3261     r6 = t;
   3262   };
   3263   {
   3264     uint const t = min(r1, r2);
   3265     r2 = max(r1, r2);
   3266     r1 = t;
   3267   };
   3268   {
   3269     uint const t = min(r3, r4);
   3270     r4 = max(r3, r4);
   3271     r3 = t;
   3272   };
   3273   {
   3274     uint const t = min(r5, r6);
   3275     r6 = max(r5, r6);
   3276     r5 = t;
   3277   };
   3278   {
   3279     uint const t = min(r7, r8);
   3280     r8 = max(r7, r8);
   3281     r7 = t;
   3282   };
   3283   {
   3284     uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
   3285     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   3286     ;
   3287     {
   3288       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   3289       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   3290       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   3291       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3292     };
   3293     {
   3294       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   3295       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   3296       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   3297       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3298     };
   3299     {
   3300       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   3301       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   3302       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   3303       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3304     };
   3305     {
   3306       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   3307       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   3308       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   3309       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3310     };
   3311   }
   3312   {
   3313     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   3314     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3315     ;
   3316     {
   3317       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3318       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3319     };
   3320     {
   3321       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3322       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3323     };
   3324     {
   3325       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3326       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3327     };
   3328     {
   3329       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3330       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3331     };
   3332     {
   3333       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3334       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3335     };
   3336     {
   3337       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3338       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3339     };
   3340     {
   3341       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3342       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3343     };
   3344     {
   3345       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3346       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3347     };
   3348   }
   3349   {
   3350     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   3351     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3352     ;
   3353     {
   3354       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3355       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3356     };
   3357     {
   3358       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3359       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3360     };
   3361     {
   3362       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3363       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3364     };
   3365     {
   3366       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3367       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3368     };
   3369     {
   3370       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3371       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3372     };
   3373     {
   3374       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3375       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3376     };
   3377     {
   3378       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3379       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3380     };
   3381     {
   3382       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3383       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3384     };
   3385   }
   3386   {
   3387     uint const t = min(r1, r5);
   3388     r5 = max(r1, r5);
   3389     r1 = t;
   3390   };
   3391   {
   3392     uint const t = min(r3, r7);
   3393     r7 = max(r3, r7);
   3394     r3 = t;
   3395   };
   3396   {
   3397     uint const t = min(r1, r3);
   3398     r3 = max(r1, r3);
   3399     r1 = t;
   3400   };
   3401   {
   3402     uint const t = min(r5, r7);
   3403     r7 = max(r5, r7);
   3404     r5 = t;
   3405   };
   3406   {
   3407     uint const t = min(r2, r6);
   3408     r6 = max(r2, r6);
   3409     r2 = t;
   3410   };
   3411   {
   3412     uint const t = min(r4, r8);
   3413     r8 = max(r4, r8);
   3414     r4 = t;
   3415   };
   3416   {
   3417     uint const t = min(r2, r4);
   3418     r4 = max(r2, r4);
   3419     r2 = t;
   3420   };
   3421   {
   3422     uint const t = min(r6, r8);
   3423     r8 = max(r6, r8);
   3424     r6 = t;
   3425   };
   3426   {
   3427     uint const t = min(r1, r2);
   3428     r2 = max(r1, r2);
   3429     r1 = t;
   3430   };
   3431   {
   3432     uint const t = min(r3, r4);
   3433     r4 = max(r3, r4);
   3434     r3 = t;
   3435   };
   3436   {
   3437     uint const t = min(r5, r6);
   3438     r6 = max(r5, r6);
   3439     r5 = t;
   3440   };
   3441   {
   3442     uint const t = min(r7, r8);
   3443     r8 = max(r7, r8);
   3444     r7 = t;
   3445   };
   3446   {
   3447     uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
   3448     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   3449     ;
   3450     {
   3451       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   3452       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   3453       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   3454       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3455     };
   3456     {
   3457       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   3458       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   3459       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   3460       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3461     };
   3462     {
   3463       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   3464       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   3465       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   3466       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3467     };
   3468     {
   3469       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   3470       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   3471       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   3472       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3473     };
   3474   }
   3475   {
   3476     uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   3477     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3478     ;
   3479     {
   3480       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3481       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3482     };
   3483     {
   3484       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3485       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3486     };
   3487     {
   3488       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3489       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3490     };
   3491     {
   3492       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3493       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3494     };
   3495     {
   3496       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3497       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3498     };
   3499     {
   3500       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3501       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3502     };
   3503     {
   3504       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3505       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3506     };
   3507     {
   3508       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3509       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3510     };
   3511   }
   3512   {
   3513     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   3514     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3515     ;
   3516     {
   3517       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3518       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3519     };
   3520     {
   3521       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3522       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3523     };
   3524     {
   3525       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3526       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3527     };
   3528     {
   3529       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3530       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3531     };
   3532     {
   3533       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3534       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3535     };
   3536     {
   3537       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3538       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3539     };
   3540     {
   3541       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3542       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3543     };
   3544     {
   3545       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3546       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3547     };
   3548   }
   3549   {
   3550     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   3551     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3552     ;
   3553     {
   3554       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3555       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3556     };
   3557     {
   3558       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3559       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3560     };
   3561     {
   3562       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3563       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3564     };
   3565     {
   3566       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3567       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3568     };
   3569     {
   3570       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3571       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3572     };
   3573     {
   3574       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3575       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3576     };
   3577     {
   3578       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3579       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3580     };
   3581     {
   3582       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3583       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3584     };
   3585   }
   3586   {
   3587     uint const t = min(r1, r5);
   3588     r5 = max(r1, r5);
   3589     r1 = t;
   3590   };
   3591   {
   3592     uint const t = min(r3, r7);
   3593     r7 = max(r3, r7);
   3594     r3 = t;
   3595   };
   3596   {
   3597     uint const t = min(r1, r3);
   3598     r3 = max(r1, r3);
   3599     r1 = t;
   3600   };
   3601   {
   3602     uint const t = min(r5, r7);
   3603     r7 = max(r5, r7);
   3604     r5 = t;
   3605   };
   3606   {
   3607     uint const t = min(r2, r6);
   3608     r6 = max(r2, r6);
   3609     r2 = t;
   3610   };
   3611   {
   3612     uint const t = min(r4, r8);
   3613     r8 = max(r4, r8);
   3614     r4 = t;
   3615   };
   3616   {
   3617     uint const t = min(r2, r4);
   3618     r4 = max(r2, r4);
   3619     r2 = t;
   3620   };
   3621   {
   3622     uint const t = min(r6, r8);
   3623     r8 = max(r6, r8);
   3624     r6 = t;
   3625   };
   3626   {
   3627     uint const t = min(r1, r2);
   3628     r2 = max(r1, r2);
   3629     r1 = t;
   3630   };
   3631   {
   3632     uint const t = min(r3, r4);
   3633     r4 = max(r3, r4);
   3634     r3 = t;
   3635   };
   3636   {
   3637     uint const t = min(r5, r6);
   3638     r6 = max(r5, r6);
   3639     r5 = t;
   3640   };
   3641   {
   3642     uint const t = min(r7, r8);
   3643     r8 = max(r7, r8);
   3644     r7 = t;
   3645   };
   3646   uint const smem_l_idx =
   3647     get_sub_group_id() * ((1 << 4) * 8) + get_sub_group_local_id();
   3648   uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 8) +
   3649                           (get_sub_group_local_id() ^ ((1 << 4) - 1));
   3650   shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1;
   3651   shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8;
   3652   shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2;
   3653   shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7;
   3654   shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3;
   3655   shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6;
   3656   shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4;
   3657   shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5;
   3658   barrier(CLK_LOCAL_MEM_FENCE);
   3659   {
   3660     {
   3661       uint r0_1 = shared.m[smem_l_idx + (0)];
   3662       uint r0_2 = shared.m[smem_r_idx + (16)];
   3663       {
   3664         uint const t = min(r0_1, r0_2);
   3665         r0_2 = max(r0_1, r0_2);
   3666         r0_1 = t;
   3667       };
   3668       shared.m[smem_l_idx + (0)] = r0_1;
   3669       shared.m[smem_r_idx + (16)] = r0_2;
   3670     }
   3671     {
   3672       uint r1_1 = shared.m[smem_l_idx + (32)];
   3673       uint r1_2 = shared.m[smem_r_idx + (48)];
   3674       {
   3675         uint const t = min(r1_1, r1_2);
   3676         r1_2 = max(r1_1, r1_2);
   3677         r1_1 = t;
   3678       };
   3679       shared.m[smem_l_idx + (32)] = r1_1;
   3680       shared.m[smem_r_idx + (48)] = r1_2;
   3681     }
   3682     {
   3683       uint r2_1 = shared.m[smem_l_idx + (64)];
   3684       uint r2_2 = shared.m[smem_r_idx + (80)];
   3685       {
   3686         uint const t = min(r2_1, r2_2);
   3687         r2_2 = max(r2_1, r2_2);
   3688         r2_1 = t;
   3689       };
   3690       shared.m[smem_l_idx + (64)] = r2_1;
   3691       shared.m[smem_r_idx + (80)] = r2_2;
   3692     }
   3693     {
   3694       uint r3_1 = shared.m[smem_l_idx + (96)];
   3695       uint r3_2 = shared.m[smem_r_idx + (112)];
   3696       {
   3697         uint const t = min(r3_1, r3_2);
   3698         r3_2 = max(r3_1, r3_2);
   3699         r3_1 = t;
   3700       };
   3701       shared.m[smem_l_idx + (96)] = r3_1;
   3702       shared.m[smem_r_idx + (112)] = r3_2;
   3703     }
   3704   }
   3705   barrier(CLK_LOCAL_MEM_FENCE);
   3706   r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)];
   3707   r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)];
   3708   r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)];
   3709   r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)];
   3710   r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)];
   3711   r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)];
   3712   r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)];
   3713   r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)];
   3714   {
   3715     {
   3716       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   3717       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3718       ;
   3719       {
   3720         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3721         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3722       };
   3723       {
   3724         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3725         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3726       };
   3727       {
   3728         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3729         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3730       };
   3731       {
   3732         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3733         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3734       };
   3735       {
   3736         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3737         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3738       };
   3739       {
   3740         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3741         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3742       };
   3743       {
   3744         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3745         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3746       };
   3747       {
   3748         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3749         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3750       };
   3751     }
   3752     {
   3753       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   3754       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3755       ;
   3756       {
   3757         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3758         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3759       };
   3760       {
   3761         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3762         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3763       };
   3764       {
   3765         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3766         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3767       };
   3768       {
   3769         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3770         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3771       };
   3772       {
   3773         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3774         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3775       };
   3776       {
   3777         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3778         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3779       };
   3780       {
   3781         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3782         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3783       };
   3784       {
   3785         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3786         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3787       };
   3788     }
   3789     {
   3790       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   3791       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3792       ;
   3793       {
   3794         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3795         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3796       };
   3797       {
   3798         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3799         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3800       };
   3801       {
   3802         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3803         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3804       };
   3805       {
   3806         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3807         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3808       };
   3809       {
   3810         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3811         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3812       };
   3813       {
   3814         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3815         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3816       };
   3817       {
   3818         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3819         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3820       };
   3821       {
   3822         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3823         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3824       };
   3825     }
   3826     {
   3827       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   3828       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   3829       ;
   3830       {
   3831         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   3832         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   3833       };
   3834       {
   3835         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   3836         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   3837       };
   3838       {
   3839         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   3840         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   3841       };
   3842       {
   3843         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   3844         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   3845       };
   3846       {
   3847         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   3848         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   3849       };
   3850       {
   3851         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   3852         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   3853       };
   3854       {
   3855         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   3856         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   3857       };
   3858       {
   3859         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   3860         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   3861       };
   3862     }
   3863     {
   3864       uint const t = min(r1, r5);
   3865       r5 = max(r1, r5);
   3866       r1 = t;
   3867     };
   3868     {
   3869       uint const t = min(r3, r7);
   3870       r7 = max(r3, r7);
   3871       r3 = t;
   3872     };
   3873     {
   3874       uint const t = min(r1, r3);
   3875       r3 = max(r1, r3);
   3876       r1 = t;
   3877     };
   3878     {
   3879       uint const t = min(r5, r7);
   3880       r7 = max(r5, r7);
   3881       r5 = t;
   3882     };
   3883     {
   3884       uint const t = min(r2, r6);
   3885       r6 = max(r2, r6);
   3886       r2 = t;
   3887     };
   3888     {
   3889       uint const t = min(r4, r8);
   3890       r8 = max(r4, r8);
   3891       r4 = t;
   3892     };
   3893     {
   3894       uint const t = min(r2, r4);
   3895       r4 = max(r2, r4);
   3896       r2 = t;
   3897     };
   3898     {
   3899       uint const t = min(r6, r8);
   3900       r8 = max(r6, r8);
   3901       r6 = t;
   3902     };
   3903     {
   3904       uint const t = min(r1, r2);
   3905       r2 = max(r1, r2);
   3906       r1 = t;
   3907     };
   3908     {
   3909       uint const t = min(r3, r4);
   3910       r4 = max(r3, r4);
   3911       r3 = t;
   3912     };
   3913     {
   3914       uint const t = min(r5, r6);
   3915       r6 = max(r5, r6);
   3916       r5 = t;
   3917     };
   3918     {
   3919       uint const t = min(r7, r8);
   3920       r8 = max(r7, r8);
   3921       r7 = t;
   3922     };
   3923   }
   3924   shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1;
   3925   shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8;
   3926   shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2;
   3927   shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7;
   3928   shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3;
   3929   shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6;
   3930   shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4;
   3931   shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5;
   3932   barrier(CLK_LOCAL_MEM_FENCE);
   3933   {
   3934     {
   3935       uint r0_1 = shared.m[smem_l_idx + (0)];
   3936       uint r0_2 = shared.m[smem_l_idx + (16)];
   3937       uint r0_3 = shared.m[smem_r_idx + (32)];
   3938       uint r0_4 = shared.m[smem_r_idx + (48)];
   3939       {
   3940         uint const t = min(r0_2, r0_3);
   3941         r0_3 = max(r0_2, r0_3);
   3942         r0_2 = t;
   3943       };
   3944       {
   3945         uint const t = min(r0_1, r0_4);
   3946         r0_4 = max(r0_1, r0_4);
   3947         r0_1 = t;
   3948       };
   3949       {
   3950         uint const t = min(r0_3, r0_4);
   3951         r0_4 = max(r0_3, r0_4);
   3952         r0_3 = t;
   3953       };
   3954       {
   3955         uint const t = min(r0_1, r0_2);
   3956         r0_2 = max(r0_1, r0_2);
   3957         r0_1 = t;
   3958       };
   3959       shared.m[smem_l_idx + (0)] = r0_1;
   3960       shared.m[smem_l_idx + (16)] = r0_2;
   3961       shared.m[smem_r_idx + (32)] = r0_3;
   3962       shared.m[smem_r_idx + (48)] = r0_4;
   3963     }
   3964     {
   3965       uint r1_1 = shared.m[smem_l_idx + (64)];
   3966       uint r1_2 = shared.m[smem_l_idx + (80)];
   3967       uint r1_3 = shared.m[smem_r_idx + (96)];
   3968       uint r1_4 = shared.m[smem_r_idx + (112)];
   3969       {
   3970         uint const t = min(r1_2, r1_3);
   3971         r1_3 = max(r1_2, r1_3);
   3972         r1_2 = t;
   3973       };
   3974       {
   3975         uint const t = min(r1_1, r1_4);
   3976         r1_4 = max(r1_1, r1_4);
   3977         r1_1 = t;
   3978       };
   3979       {
   3980         uint const t = min(r1_3, r1_4);
   3981         r1_4 = max(r1_3, r1_4);
   3982         r1_3 = t;
   3983       };
   3984       {
   3985         uint const t = min(r1_1, r1_2);
   3986         r1_2 = max(r1_1, r1_2);
   3987         r1_1 = t;
   3988       };
   3989       shared.m[smem_l_idx + (64)] = r1_1;
   3990       shared.m[smem_l_idx + (80)] = r1_2;
   3991       shared.m[smem_r_idx + (96)] = r1_3;
   3992       shared.m[smem_r_idx + (112)] = r1_4;
   3993     }
   3994   }
   3995   barrier(CLK_LOCAL_MEM_FENCE);
   3996   r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)];
   3997   r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)];
   3998   r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)];
   3999   r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)];
   4000   r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)];
   4001   r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)];
   4002   r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)];
   4003   r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)];
   4004   {
   4005     {
   4006       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   4007       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4008       ;
   4009       {
   4010         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4011         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4012       };
   4013       {
   4014         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4015         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4016       };
   4017       {
   4018         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4019         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4020       };
   4021       {
   4022         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4023         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4024       };
   4025       {
   4026         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4027         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4028       };
   4029       {
   4030         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4031         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4032       };
   4033       {
   4034         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4035         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4036       };
   4037       {
   4038         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4039         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4040       };
   4041     }
   4042     {
   4043       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   4044       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4045       ;
   4046       {
   4047         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4048         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4049       };
   4050       {
   4051         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4052         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4053       };
   4054       {
   4055         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4056         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4057       };
   4058       {
   4059         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4060         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4061       };
   4062       {
   4063         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4064         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4065       };
   4066       {
   4067         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4068         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4069       };
   4070       {
   4071         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4072         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4073       };
   4074       {
   4075         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4076         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4077       };
   4078     }
   4079     {
   4080       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   4081       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4082       ;
   4083       {
   4084         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4085         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4086       };
   4087       {
   4088         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4089         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4090       };
   4091       {
   4092         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4093         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4094       };
   4095       {
   4096         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4097         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4098       };
   4099       {
   4100         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4101         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4102       };
   4103       {
   4104         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4105         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4106       };
   4107       {
   4108         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4109         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4110       };
   4111       {
   4112         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4113         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4114       };
   4115     }
   4116     {
   4117       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   4118       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4119       ;
   4120       {
   4121         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4122         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4123       };
   4124       {
   4125         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4126         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4127       };
   4128       {
   4129         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4130         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4131       };
   4132       {
   4133         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4134         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4135       };
   4136       {
   4137         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4138         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4139       };
   4140       {
   4141         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4142         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4143       };
   4144       {
   4145         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4146         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4147       };
   4148       {
   4149         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4150         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4151       };
   4152     }
   4153     {
   4154       uint const t = min(r1, r5);
   4155       r5 = max(r1, r5);
   4156       r1 = t;
   4157     };
   4158     {
   4159       uint const t = min(r3, r7);
   4160       r7 = max(r3, r7);
   4161       r3 = t;
   4162     };
   4163     {
   4164       uint const t = min(r1, r3);
   4165       r3 = max(r1, r3);
   4166       r1 = t;
   4167     };
   4168     {
   4169       uint const t = min(r5, r7);
   4170       r7 = max(r5, r7);
   4171       r5 = t;
   4172     };
   4173     {
   4174       uint const t = min(r2, r6);
   4175       r6 = max(r2, r6);
   4176       r2 = t;
   4177     };
   4178     {
   4179       uint const t = min(r4, r8);
   4180       r8 = max(r4, r8);
   4181       r4 = t;
   4182     };
   4183     {
   4184       uint const t = min(r2, r4);
   4185       r4 = max(r2, r4);
   4186       r2 = t;
   4187     };
   4188     {
   4189       uint const t = min(r6, r8);
   4190       r8 = max(r6, r8);
   4191       r6 = t;
   4192     };
   4193     {
   4194       uint const t = min(r1, r2);
   4195       r2 = max(r1, r2);
   4196       r1 = t;
   4197     };
   4198     {
   4199       uint const t = min(r3, r4);
   4200       r4 = max(r3, r4);
   4201       r3 = t;
   4202     };
   4203     {
   4204       uint const t = min(r5, r6);
   4205       r6 = max(r5, r6);
   4206       r5 = t;
   4207     };
   4208     {
   4209       uint const t = min(r7, r8);
   4210       r8 = max(r7, r8);
   4211       r7 = t;
   4212     };
   4213   }
   4214   shared.m[get_local_id(0) + (8 * (1 << 4) * 0)] = r1;
   4215   shared.m[get_local_id(0) + (8 * (1 << 4) * 1)] = r8;
   4216   shared.m[get_local_id(0) + (8 * (1 << 4) * 2)] = r2;
   4217   shared.m[get_local_id(0) + (8 * (1 << 4) * 3)] = r7;
   4218   shared.m[get_local_id(0) + (8 * (1 << 4) * 4)] = r3;
   4219   shared.m[get_local_id(0) + (8 * (1 << 4) * 5)] = r6;
   4220   shared.m[get_local_id(0) + (8 * (1 << 4) * 6)] = r4;
   4221   shared.m[get_local_id(0) + (8 * (1 << 4) * 7)] = r5;
   4222   barrier(CLK_LOCAL_MEM_FENCE);
   4223   {
   4224     {
   4225       uint r0_1 = shared.m[smem_l_idx + (0)];
   4226       uint r0_2 = shared.m[smem_l_idx + (16)];
   4227       uint r0_3 = shared.m[smem_l_idx + (32)];
   4228       uint r0_4 = shared.m[smem_l_idx + (48)];
   4229       uint r0_5 = shared.m[smem_r_idx + (64)];
   4230       uint r0_6 = shared.m[smem_r_idx + (80)];
   4231       uint r0_7 = shared.m[smem_r_idx + (96)];
   4232       uint r0_8 = shared.m[smem_r_idx + (112)];
   4233       {
   4234         uint const t = min(r0_4, r0_5);
   4235         r0_5 = max(r0_4, r0_5);
   4236         r0_4 = t;
   4237       };
   4238       {
   4239         uint const t = min(r0_3, r0_6);
   4240         r0_6 = max(r0_3, r0_6);
   4241         r0_3 = t;
   4242       };
   4243       {
   4244         uint const t = min(r0_2, r0_7);
   4245         r0_7 = max(r0_2, r0_7);
   4246         r0_2 = t;
   4247       };
   4248       {
   4249         uint const t = min(r0_1, r0_8);
   4250         r0_8 = max(r0_1, r0_8);
   4251         r0_1 = t;
   4252       };
   4253       {
   4254         uint const t = min(r0_5, r0_7);
   4255         r0_7 = max(r0_5, r0_7);
   4256         r0_5 = t;
   4257       };
   4258       {
   4259         uint const t = min(r0_6, r0_8);
   4260         r0_8 = max(r0_6, r0_8);
   4261         r0_6 = t;
   4262       };
   4263       {
   4264         uint const t = min(r0_5, r0_6);
   4265         r0_6 = max(r0_5, r0_6);
   4266         r0_5 = t;
   4267       };
   4268       {
   4269         uint const t = min(r0_7, r0_8);
   4270         r0_8 = max(r0_7, r0_8);
   4271         r0_7 = t;
   4272       };
   4273       {
   4274         uint const t = min(r0_1, r0_3);
   4275         r0_3 = max(r0_1, r0_3);
   4276         r0_1 = t;
   4277       };
   4278       {
   4279         uint const t = min(r0_2, r0_4);
   4280         r0_4 = max(r0_2, r0_4);
   4281         r0_2 = t;
   4282       };
   4283       {
   4284         uint const t = min(r0_1, r0_2);
   4285         r0_2 = max(r0_1, r0_2);
   4286         r0_1 = t;
   4287       };
   4288       {
   4289         uint const t = min(r0_3, r0_4);
   4290         r0_4 = max(r0_3, r0_4);
   4291         r0_3 = t;
   4292       };
   4293       shared.m[smem_l_idx + (0)] = r0_1;
   4294       shared.m[smem_l_idx + (16)] = r0_2;
   4295       shared.m[smem_l_idx + (32)] = r0_3;
   4296       shared.m[smem_l_idx + (48)] = r0_4;
   4297       shared.m[smem_r_idx + (64)] = r0_5;
   4298       shared.m[smem_r_idx + (80)] = r0_6;
   4299       shared.m[smem_r_idx + (96)] = r0_7;
   4300       shared.m[smem_r_idx + (112)] = r0_8;
   4301     }
   4302   }
   4303   barrier(CLK_LOCAL_MEM_FENCE);
   4304   r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)];
   4305   r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)];
   4306   r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)];
   4307   r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)];
   4308   r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)];
   4309   r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)];
   4310   r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)];
   4311   r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)];
   4312   {
   4313     {
   4314       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   4315       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4316       ;
   4317       {
   4318         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4319         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4320       };
   4321       {
   4322         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4323         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4324       };
   4325       {
   4326         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4327         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4328       };
   4329       {
   4330         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4331         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4332       };
   4333       {
   4334         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4335         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4336       };
   4337       {
   4338         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4339         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4340       };
   4341       {
   4342         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4343         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4344       };
   4345       {
   4346         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4347         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4348       };
   4349     }
   4350     {
   4351       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   4352       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4353       ;
   4354       {
   4355         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4356         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4357       };
   4358       {
   4359         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4360         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4361       };
   4362       {
   4363         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4364         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4365       };
   4366       {
   4367         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4368         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4369       };
   4370       {
   4371         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4372         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4373       };
   4374       {
   4375         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4376         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4377       };
   4378       {
   4379         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4380         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4381       };
   4382       {
   4383         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4384         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4385       };
   4386     }
   4387     {
   4388       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   4389       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4390       ;
   4391       {
   4392         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4393         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4394       };
   4395       {
   4396         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4397         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4398       };
   4399       {
   4400         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4401         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4402       };
   4403       {
   4404         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4405         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4406       };
   4407       {
   4408         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4409         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4410       };
   4411       {
   4412         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4413         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4414       };
   4415       {
   4416         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4417         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4418       };
   4419       {
   4420         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4421         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4422       };
   4423     }
   4424     {
   4425       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   4426       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4427       ;
   4428       {
   4429         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4430         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4431       };
   4432       {
   4433         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4434         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4435       };
   4436       {
   4437         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4438         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4439       };
   4440       {
   4441         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4442         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4443       };
   4444       {
   4445         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4446         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4447       };
   4448       {
   4449         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4450         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4451       };
   4452       {
   4453         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4454         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4455       };
   4456       {
   4457         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4458         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4459       };
   4460     }
   4461     {
   4462       uint const t = min(r1, r5);
   4463       r5 = max(r1, r5);
   4464       r1 = t;
   4465     };
   4466     {
   4467       uint const t = min(r3, r7);
   4468       r7 = max(r3, r7);
   4469       r3 = t;
   4470     };
   4471     {
   4472       uint const t = min(r1, r3);
   4473       r3 = max(r1, r3);
   4474       r1 = t;
   4475     };
   4476     {
   4477       uint const t = min(r5, r7);
   4478       r7 = max(r5, r7);
   4479       r5 = t;
   4480     };
   4481     {
   4482       uint const t = min(r2, r6);
   4483       r6 = max(r2, r6);
   4484       r2 = t;
   4485     };
   4486     {
   4487       uint const t = min(r4, r8);
   4488       r8 = max(r4, r8);
   4489       r4 = t;
   4490     };
   4491     {
   4492       uint const t = min(r2, r4);
   4493       r4 = max(r2, r4);
   4494       r2 = t;
   4495     };
   4496     {
   4497       uint const t = min(r6, r8);
   4498       r8 = max(r6, r8);
   4499       r6 = t;
   4500     };
   4501     {
   4502       uint const t = min(r1, r2);
   4503       r2 = max(r1, r2);
   4504       r1 = t;
   4505     };
   4506     {
   4507       uint const t = min(r3, r4);
   4508       r4 = max(r3, r4);
   4509       r3 = t;
   4510     };
   4511     {
   4512       uint const t = min(r5, r6);
   4513       r6 = max(r5, r6);
   4514       r5 = t;
   4515     };
   4516     {
   4517       uint const t = min(r7, r8);
   4518       r8 = max(r7, r8);
   4519       r7 = t;
   4520     };
   4521   }
   4522   vout[gmem_idx + (1 << 4) * 0] = r1;
   4523   vout[gmem_idx + (1 << 4) * 1] = r2;
   4524   vout[gmem_idx + (1 << 4) * 2] = r3;
   4525   vout[gmem_idx + (1 << 4) * 3] = r4;
   4526   vout[gmem_idx + (1 << 4) * 4] = r5;
   4527   vout[gmem_idx + (1 << 4) * 5] = r6;
   4528   vout[gmem_idx + (1 << 4) * 6] = r7;
   4529   vout[gmem_idx + (1 << 4) * 7] = r8;
   4530 }
   4531 
   4532 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   4533 __attribute__((reqd_work_group_size((1 << 4) * 16, 1, 1))) void
   4534 hs_kernel_bs_4(__global uint const* const restrict vin,
   4535                __global uint* const restrict vout)
   4536 {
   4537   __local struct
   4538   {
   4539     uint m[256 * 8];
   4540   } shared;
   4541 
   4542   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   4543                         (get_local_id(0) & ((1 << 4) - 1));
   4544   uint r1 = vin[gmem_idx + (1 << 4) * 0];
   4545   uint r2 = vin[gmem_idx + (1 << 4) * 1];
   4546   uint r3 = vin[gmem_idx + (1 << 4) * 2];
   4547   uint r4 = vin[gmem_idx + (1 << 4) * 3];
   4548   uint r5 = vin[gmem_idx + (1 << 4) * 4];
   4549   uint r6 = vin[gmem_idx + (1 << 4) * 5];
   4550   uint r7 = vin[gmem_idx + (1 << 4) * 6];
   4551   uint r8 = vin[gmem_idx + (1 << 4) * 7];
   4552   {
   4553     uint const t = min(r1, r5);
   4554     r5 = max(r1, r5);
   4555     r1 = t;
   4556   };
   4557   {
   4558     uint const t = min(r2, r6);
   4559     r6 = max(r2, r6);
   4560     r2 = t;
   4561   };
   4562   {
   4563     uint const t = min(r3, r7);
   4564     r7 = max(r3, r7);
   4565     r3 = t;
   4566   };
   4567   {
   4568     uint const t = min(r4, r8);
   4569     r8 = max(r4, r8);
   4570     r4 = t;
   4571   };
   4572   {
   4573     uint const t = min(r1, r3);
   4574     r3 = max(r1, r3);
   4575     r1 = t;
   4576   };
   4577   {
   4578     uint const t = min(r2, r4);
   4579     r4 = max(r2, r4);
   4580     r2 = t;
   4581   };
   4582   {
   4583     uint const t = min(r5, r7);
   4584     r7 = max(r5, r7);
   4585     r5 = t;
   4586   };
   4587   {
   4588     uint const t = min(r6, r8);
   4589     r8 = max(r6, r8);
   4590     r6 = t;
   4591   };
   4592   {
   4593     uint const t = min(r3, r5);
   4594     r5 = max(r3, r5);
   4595     r3 = t;
   4596   };
   4597   {
   4598     uint const t = min(r4, r6);
   4599     r6 = max(r4, r6);
   4600     r4 = t;
   4601   };
   4602   {
   4603     uint const t = min(r1, r2);
   4604     r2 = max(r1, r2);
   4605     r1 = t;
   4606   };
   4607   {
   4608     uint const t = min(r3, r4);
   4609     r4 = max(r3, r4);
   4610     r3 = t;
   4611   };
   4612   {
   4613     uint const t = min(r5, r6);
   4614     r6 = max(r5, r6);
   4615     r5 = t;
   4616   };
   4617   {
   4618     uint const t = min(r7, r8);
   4619     r8 = max(r7, r8);
   4620     r7 = t;
   4621   };
   4622   {
   4623     uint const t = min(r2, r5);
   4624     r5 = max(r2, r5);
   4625     r2 = t;
   4626   };
   4627   {
   4628     uint const t = min(r4, r7);
   4629     r7 = max(r4, r7);
   4630     r4 = t;
   4631   };
   4632   {
   4633     uint const t = min(r2, r3);
   4634     r3 = max(r2, r3);
   4635     r2 = t;
   4636   };
   4637   {
   4638     uint const t = min(r4, r5);
   4639     r5 = max(r4, r5);
   4640     r4 = t;
   4641   };
   4642   {
   4643     uint const t = min(r6, r7);
   4644     r7 = max(r6, r7);
   4645     r6 = t;
   4646   };
   4647   {
   4648     uint const flip_lane_idx = get_sub_group_local_id() ^ 1;
   4649     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   4650     ;
   4651     {
   4652       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   4653       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   4654       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   4655       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4656     };
   4657     {
   4658       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   4659       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   4660       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   4661       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4662     };
   4663     {
   4664       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   4665       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   4666       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   4667       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4668     };
   4669     {
   4670       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   4671       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   4672       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   4673       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4674     };
   4675   }
   4676   {
   4677     uint const t = min(r1, r5);
   4678     r5 = max(r1, r5);
   4679     r1 = t;
   4680   };
   4681   {
   4682     uint const t = min(r3, r7);
   4683     r7 = max(r3, r7);
   4684     r3 = t;
   4685   };
   4686   {
   4687     uint const t = min(r1, r3);
   4688     r3 = max(r1, r3);
   4689     r1 = t;
   4690   };
   4691   {
   4692     uint const t = min(r5, r7);
   4693     r7 = max(r5, r7);
   4694     r5 = t;
   4695   };
   4696   {
   4697     uint const t = min(r2, r6);
   4698     r6 = max(r2, r6);
   4699     r2 = t;
   4700   };
   4701   {
   4702     uint const t = min(r4, r8);
   4703     r8 = max(r4, r8);
   4704     r4 = t;
   4705   };
   4706   {
   4707     uint const t = min(r2, r4);
   4708     r4 = max(r2, r4);
   4709     r2 = t;
   4710   };
   4711   {
   4712     uint const t = min(r6, r8);
   4713     r8 = max(r6, r8);
   4714     r6 = t;
   4715   };
   4716   {
   4717     uint const t = min(r1, r2);
   4718     r2 = max(r1, r2);
   4719     r1 = t;
   4720   };
   4721   {
   4722     uint const t = min(r3, r4);
   4723     r4 = max(r3, r4);
   4724     r3 = t;
   4725   };
   4726   {
   4727     uint const t = min(r5, r6);
   4728     r6 = max(r5, r6);
   4729     r5 = t;
   4730   };
   4731   {
   4732     uint const t = min(r7, r8);
   4733     r8 = max(r7, r8);
   4734     r7 = t;
   4735   };
   4736   {
   4737     uint const flip_lane_idx = get_sub_group_local_id() ^ 3;
   4738     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   4739     ;
   4740     {
   4741       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   4742       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   4743       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   4744       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4745     };
   4746     {
   4747       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   4748       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   4749       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   4750       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4751     };
   4752     {
   4753       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   4754       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   4755       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   4756       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4757     };
   4758     {
   4759       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   4760       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   4761       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   4762       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4763     };
   4764   }
   4765   {
   4766     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   4767     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4768     ;
   4769     {
   4770       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4771       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4772     };
   4773     {
   4774       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4775       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4776     };
   4777     {
   4778       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4779       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4780     };
   4781     {
   4782       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4783       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4784     };
   4785     {
   4786       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4787       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4788     };
   4789     {
   4790       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4791       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4792     };
   4793     {
   4794       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4795       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4796     };
   4797     {
   4798       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4799       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4800     };
   4801   }
   4802   {
   4803     uint const t = min(r1, r5);
   4804     r5 = max(r1, r5);
   4805     r1 = t;
   4806   };
   4807   {
   4808     uint const t = min(r3, r7);
   4809     r7 = max(r3, r7);
   4810     r3 = t;
   4811   };
   4812   {
   4813     uint const t = min(r1, r3);
   4814     r3 = max(r1, r3);
   4815     r1 = t;
   4816   };
   4817   {
   4818     uint const t = min(r5, r7);
   4819     r7 = max(r5, r7);
   4820     r5 = t;
   4821   };
   4822   {
   4823     uint const t = min(r2, r6);
   4824     r6 = max(r2, r6);
   4825     r2 = t;
   4826   };
   4827   {
   4828     uint const t = min(r4, r8);
   4829     r8 = max(r4, r8);
   4830     r4 = t;
   4831   };
   4832   {
   4833     uint const t = min(r2, r4);
   4834     r4 = max(r2, r4);
   4835     r2 = t;
   4836   };
   4837   {
   4838     uint const t = min(r6, r8);
   4839     r8 = max(r6, r8);
   4840     r6 = t;
   4841   };
   4842   {
   4843     uint const t = min(r1, r2);
   4844     r2 = max(r1, r2);
   4845     r1 = t;
   4846   };
   4847   {
   4848     uint const t = min(r3, r4);
   4849     r4 = max(r3, r4);
   4850     r3 = t;
   4851   };
   4852   {
   4853     uint const t = min(r5, r6);
   4854     r6 = max(r5, r6);
   4855     r5 = t;
   4856   };
   4857   {
   4858     uint const t = min(r7, r8);
   4859     r8 = max(r7, r8);
   4860     r7 = t;
   4861   };
   4862   {
   4863     uint const flip_lane_idx = get_sub_group_local_id() ^ 7;
   4864     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   4865     ;
   4866     {
   4867       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   4868       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   4869       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   4870       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4871     };
   4872     {
   4873       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   4874       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   4875       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   4876       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4877     };
   4878     {
   4879       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   4880       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   4881       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   4882       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4883     };
   4884     {
   4885       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   4886       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   4887       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   4888       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4889     };
   4890   }
   4891   {
   4892     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   4893     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4894     ;
   4895     {
   4896       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4897       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4898     };
   4899     {
   4900       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4901       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4902     };
   4903     {
   4904       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4905       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4906     };
   4907     {
   4908       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4909       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4910     };
   4911     {
   4912       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4913       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4914     };
   4915     {
   4916       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4917       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4918     };
   4919     {
   4920       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4921       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4922     };
   4923     {
   4924       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4925       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4926     };
   4927   }
   4928   {
   4929     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   4930     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   4931     ;
   4932     {
   4933       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   4934       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   4935     };
   4936     {
   4937       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   4938       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   4939     };
   4940     {
   4941       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   4942       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   4943     };
   4944     {
   4945       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   4946       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   4947     };
   4948     {
   4949       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   4950       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   4951     };
   4952     {
   4953       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   4954       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   4955     };
   4956     {
   4957       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   4958       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   4959     };
   4960     {
   4961       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   4962       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   4963     };
   4964   }
   4965   {
   4966     uint const t = min(r1, r5);
   4967     r5 = max(r1, r5);
   4968     r1 = t;
   4969   };
   4970   {
   4971     uint const t = min(r3, r7);
   4972     r7 = max(r3, r7);
   4973     r3 = t;
   4974   };
   4975   {
   4976     uint const t = min(r1, r3);
   4977     r3 = max(r1, r3);
   4978     r1 = t;
   4979   };
   4980   {
   4981     uint const t = min(r5, r7);
   4982     r7 = max(r5, r7);
   4983     r5 = t;
   4984   };
   4985   {
   4986     uint const t = min(r2, r6);
   4987     r6 = max(r2, r6);
   4988     r2 = t;
   4989   };
   4990   {
   4991     uint const t = min(r4, r8);
   4992     r8 = max(r4, r8);
   4993     r4 = t;
   4994   };
   4995   {
   4996     uint const t = min(r2, r4);
   4997     r4 = max(r2, r4);
   4998     r2 = t;
   4999   };
   5000   {
   5001     uint const t = min(r6, r8);
   5002     r8 = max(r6, r8);
   5003     r6 = t;
   5004   };
   5005   {
   5006     uint const t = min(r1, r2);
   5007     r2 = max(r1, r2);
   5008     r1 = t;
   5009   };
   5010   {
   5011     uint const t = min(r3, r4);
   5012     r4 = max(r3, r4);
   5013     r3 = t;
   5014   };
   5015   {
   5016     uint const t = min(r5, r6);
   5017     r6 = max(r5, r6);
   5018     r5 = t;
   5019   };
   5020   {
   5021     uint const t = min(r7, r8);
   5022     r8 = max(r7, r8);
   5023     r7 = t;
   5024   };
   5025   {
   5026     uint const flip_lane_idx = get_sub_group_local_id() ^ 15;
   5027     int const t_lt = get_sub_group_local_id() < flip_lane_idx;
   5028     ;
   5029     {
   5030       uint const ta = intel_sub_group_shuffle(r1, flip_lane_idx);
   5031       uint const tb = intel_sub_group_shuffle(r8, flip_lane_idx);
   5032       r1 = ((r1 <= tb) ^ t_lt) ? tb : r1;
   5033       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5034     };
   5035     {
   5036       uint const ta = intel_sub_group_shuffle(r2, flip_lane_idx);
   5037       uint const tb = intel_sub_group_shuffle(r7, flip_lane_idx);
   5038       r2 = ((r2 <= tb) ^ t_lt) ? tb : r2;
   5039       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5040     };
   5041     {
   5042       uint const ta = intel_sub_group_shuffle(r3, flip_lane_idx);
   5043       uint const tb = intel_sub_group_shuffle(r6, flip_lane_idx);
   5044       r3 = ((r3 <= tb) ^ t_lt) ? tb : r3;
   5045       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5046     };
   5047     {
   5048       uint const ta = intel_sub_group_shuffle(r4, flip_lane_idx);
   5049       uint const tb = intel_sub_group_shuffle(r5, flip_lane_idx);
   5050       r4 = ((r4 <= tb) ^ t_lt) ? tb : r4;
   5051       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5052     };
   5053   }
   5054   {
   5055     uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   5056     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5057     ;
   5058     {
   5059       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5060       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5061     };
   5062     {
   5063       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5064       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5065     };
   5066     {
   5067       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5068       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5069     };
   5070     {
   5071       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5072       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5073     };
   5074     {
   5075       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5076       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5077     };
   5078     {
   5079       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5080       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5081     };
   5082     {
   5083       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5084       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5085     };
   5086     {
   5087       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5088       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5089     };
   5090   }
   5091   {
   5092     uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   5093     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5094     ;
   5095     {
   5096       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5097       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5098     };
   5099     {
   5100       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5101       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5102     };
   5103     {
   5104       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5105       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5106     };
   5107     {
   5108       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5109       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5110     };
   5111     {
   5112       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5113       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5114     };
   5115     {
   5116       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5117       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5118     };
   5119     {
   5120       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5121       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5122     };
   5123     {
   5124       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5125       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5126     };
   5127   }
   5128   {
   5129     uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   5130     int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5131     ;
   5132     {
   5133       uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5134       r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5135     };
   5136     {
   5137       uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5138       r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5139     };
   5140     {
   5141       uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5142       r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5143     };
   5144     {
   5145       uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5146       r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5147     };
   5148     {
   5149       uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5150       r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5151     };
   5152     {
   5153       uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5154       r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5155     };
   5156     {
   5157       uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5158       r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5159     };
   5160     {
   5161       uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5162       r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5163     };
   5164   }
   5165   {
   5166     uint const t = min(r1, r5);
   5167     r5 = max(r1, r5);
   5168     r1 = t;
   5169   };
   5170   {
   5171     uint const t = min(r3, r7);
   5172     r7 = max(r3, r7);
   5173     r3 = t;
   5174   };
   5175   {
   5176     uint const t = min(r1, r3);
   5177     r3 = max(r1, r3);
   5178     r1 = t;
   5179   };
   5180   {
   5181     uint const t = min(r5, r7);
   5182     r7 = max(r5, r7);
   5183     r5 = t;
   5184   };
   5185   {
   5186     uint const t = min(r2, r6);
   5187     r6 = max(r2, r6);
   5188     r2 = t;
   5189   };
   5190   {
   5191     uint const t = min(r4, r8);
   5192     r8 = max(r4, r8);
   5193     r4 = t;
   5194   };
   5195   {
   5196     uint const t = min(r2, r4);
   5197     r4 = max(r2, r4);
   5198     r2 = t;
   5199   };
   5200   {
   5201     uint const t = min(r6, r8);
   5202     r8 = max(r6, r8);
   5203     r6 = t;
   5204   };
   5205   {
   5206     uint const t = min(r1, r2);
   5207     r2 = max(r1, r2);
   5208     r1 = t;
   5209   };
   5210   {
   5211     uint const t = min(r3, r4);
   5212     r4 = max(r3, r4);
   5213     r3 = t;
   5214   };
   5215   {
   5216     uint const t = min(r5, r6);
   5217     r6 = max(r5, r6);
   5218     r5 = t;
   5219   };
   5220   {
   5221     uint const t = min(r7, r8);
   5222     r8 = max(r7, r8);
   5223     r7 = t;
   5224   };
   5225   uint const smem_l_idx =
   5226     get_sub_group_id() * ((1 << 4) * 16) + get_sub_group_local_id();
   5227   uint const smem_r_idx = (get_sub_group_id() ^ 1) * ((1 << 4) * 16) +
   5228                           (get_sub_group_local_id() ^ ((1 << 4) - 1));
   5229   shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
   5230   shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
   5231   shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
   5232   shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
   5233   shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
   5234   shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
   5235   shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
   5236   shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
   5237   barrier(CLK_LOCAL_MEM_FENCE);
   5238   if (get_sub_group_id() < 8) {
   5239     {
   5240       uint r0_1 = shared.m[smem_l_idx + (0)];
   5241       uint r0_2 = shared.m[smem_r_idx + (16)];
   5242       {
   5243         uint const t = min(r0_1, r0_2);
   5244         r0_2 = max(r0_1, r0_2);
   5245         r0_1 = t;
   5246       };
   5247       shared.m[smem_l_idx + (0)] = r0_1;
   5248       shared.m[smem_r_idx + (16)] = r0_2;
   5249     }
   5250     {
   5251       uint r1_1 = shared.m[smem_l_idx + (32)];
   5252       uint r1_2 = shared.m[smem_r_idx + (48)];
   5253       {
   5254         uint const t = min(r1_1, r1_2);
   5255         r1_2 = max(r1_1, r1_2);
   5256         r1_1 = t;
   5257       };
   5258       shared.m[smem_l_idx + (32)] = r1_1;
   5259       shared.m[smem_r_idx + (48)] = r1_2;
   5260     }
   5261     {
   5262       uint r2_1 = shared.m[smem_l_idx + (64)];
   5263       uint r2_2 = shared.m[smem_r_idx + (80)];
   5264       {
   5265         uint const t = min(r2_1, r2_2);
   5266         r2_2 = max(r2_1, r2_2);
   5267         r2_1 = t;
   5268       };
   5269       shared.m[smem_l_idx + (64)] = r2_1;
   5270       shared.m[smem_r_idx + (80)] = r2_2;
   5271     }
   5272     {
   5273       uint r3_1 = shared.m[smem_l_idx + (96)];
   5274       uint r3_2 = shared.m[smem_r_idx + (112)];
   5275       {
   5276         uint const t = min(r3_1, r3_2);
   5277         r3_2 = max(r3_1, r3_2);
   5278         r3_1 = t;
   5279       };
   5280       shared.m[smem_l_idx + (96)] = r3_1;
   5281       shared.m[smem_r_idx + (112)] = r3_2;
   5282     }
   5283     {
   5284       uint r4_1 = shared.m[smem_l_idx + (128)];
   5285       uint r4_2 = shared.m[smem_r_idx + (144)];
   5286       {
   5287         uint const t = min(r4_1, r4_2);
   5288         r4_2 = max(r4_1, r4_2);
   5289         r4_1 = t;
   5290       };
   5291       shared.m[smem_l_idx + (128)] = r4_1;
   5292       shared.m[smem_r_idx + (144)] = r4_2;
   5293     }
   5294     {
   5295       uint r5_1 = shared.m[smem_l_idx + (160)];
   5296       uint r5_2 = shared.m[smem_r_idx + (176)];
   5297       {
   5298         uint const t = min(r5_1, r5_2);
   5299         r5_2 = max(r5_1, r5_2);
   5300         r5_1 = t;
   5301       };
   5302       shared.m[smem_l_idx + (160)] = r5_1;
   5303       shared.m[smem_r_idx + (176)] = r5_2;
   5304     }
   5305     {
   5306       uint r6_1 = shared.m[smem_l_idx + (192)];
   5307       uint r6_2 = shared.m[smem_r_idx + (208)];
   5308       {
   5309         uint const t = min(r6_1, r6_2);
   5310         r6_2 = max(r6_1, r6_2);
   5311         r6_1 = t;
   5312       };
   5313       shared.m[smem_l_idx + (192)] = r6_1;
   5314       shared.m[smem_r_idx + (208)] = r6_2;
   5315     }
   5316     {
   5317       uint r7_1 = shared.m[smem_l_idx + (224)];
   5318       uint r7_2 = shared.m[smem_r_idx + (240)];
   5319       {
   5320         uint const t = min(r7_1, r7_2);
   5321         r7_2 = max(r7_1, r7_2);
   5322         r7_1 = t;
   5323       };
   5324       shared.m[smem_l_idx + (224)] = r7_1;
   5325       shared.m[smem_r_idx + (240)] = r7_2;
   5326     }
   5327   }
   5328   barrier(CLK_LOCAL_MEM_FENCE);
   5329   r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
   5330   r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
   5331   r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
   5332   r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
   5333   r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
   5334   r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
   5335   r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
   5336   r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
   5337   {
   5338     {
   5339       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   5340       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5341       ;
   5342       {
   5343         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5344         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5345       };
   5346       {
   5347         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5348         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5349       };
   5350       {
   5351         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5352         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5353       };
   5354       {
   5355         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5356         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5357       };
   5358       {
   5359         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5360         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5361       };
   5362       {
   5363         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5364         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5365       };
   5366       {
   5367         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5368         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5369       };
   5370       {
   5371         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5372         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5373       };
   5374     }
   5375     {
   5376       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   5377       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5378       ;
   5379       {
   5380         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5381         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5382       };
   5383       {
   5384         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5385         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5386       };
   5387       {
   5388         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5389         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5390       };
   5391       {
   5392         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5393         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5394       };
   5395       {
   5396         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5397         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5398       };
   5399       {
   5400         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5401         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5402       };
   5403       {
   5404         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5405         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5406       };
   5407       {
   5408         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5409         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5410       };
   5411     }
   5412     {
   5413       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   5414       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5415       ;
   5416       {
   5417         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5418         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5419       };
   5420       {
   5421         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5422         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5423       };
   5424       {
   5425         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5426         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5427       };
   5428       {
   5429         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5430         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5431       };
   5432       {
   5433         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5434         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5435       };
   5436       {
   5437         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5438         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5439       };
   5440       {
   5441         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5442         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5443       };
   5444       {
   5445         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5446         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5447       };
   5448     }
   5449     {
   5450       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   5451       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5452       ;
   5453       {
   5454         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5455         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5456       };
   5457       {
   5458         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5459         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5460       };
   5461       {
   5462         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5463         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5464       };
   5465       {
   5466         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5467         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5468       };
   5469       {
   5470         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5471         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5472       };
   5473       {
   5474         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5475         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5476       };
   5477       {
   5478         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5479         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5480       };
   5481       {
   5482         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5483         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5484       };
   5485     }
   5486     {
   5487       uint const t = min(r1, r5);
   5488       r5 = max(r1, r5);
   5489       r1 = t;
   5490     };
   5491     {
   5492       uint const t = min(r3, r7);
   5493       r7 = max(r3, r7);
   5494       r3 = t;
   5495     };
   5496     {
   5497       uint const t = min(r1, r3);
   5498       r3 = max(r1, r3);
   5499       r1 = t;
   5500     };
   5501     {
   5502       uint const t = min(r5, r7);
   5503       r7 = max(r5, r7);
   5504       r5 = t;
   5505     };
   5506     {
   5507       uint const t = min(r2, r6);
   5508       r6 = max(r2, r6);
   5509       r2 = t;
   5510     };
   5511     {
   5512       uint const t = min(r4, r8);
   5513       r8 = max(r4, r8);
   5514       r4 = t;
   5515     };
   5516     {
   5517       uint const t = min(r2, r4);
   5518       r4 = max(r2, r4);
   5519       r2 = t;
   5520     };
   5521     {
   5522       uint const t = min(r6, r8);
   5523       r8 = max(r6, r8);
   5524       r6 = t;
   5525     };
   5526     {
   5527       uint const t = min(r1, r2);
   5528       r2 = max(r1, r2);
   5529       r1 = t;
   5530     };
   5531     {
   5532       uint const t = min(r3, r4);
   5533       r4 = max(r3, r4);
   5534       r3 = t;
   5535     };
   5536     {
   5537       uint const t = min(r5, r6);
   5538       r6 = max(r5, r6);
   5539       r5 = t;
   5540     };
   5541     {
   5542       uint const t = min(r7, r8);
   5543       r8 = max(r7, r8);
   5544       r7 = t;
   5545     };
   5546   }
   5547   shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
   5548   shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
   5549   shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
   5550   shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
   5551   shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
   5552   shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
   5553   shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
   5554   shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
   5555   barrier(CLK_LOCAL_MEM_FENCE);
   5556   if (get_sub_group_id() < 8) {
   5557     {
   5558       uint r0_1 = shared.m[smem_l_idx + (0)];
   5559       uint r0_2 = shared.m[smem_l_idx + (16)];
   5560       uint r0_3 = shared.m[smem_r_idx + (32)];
   5561       uint r0_4 = shared.m[smem_r_idx + (48)];
   5562       {
   5563         uint const t = min(r0_2, r0_3);
   5564         r0_3 = max(r0_2, r0_3);
   5565         r0_2 = t;
   5566       };
   5567       {
   5568         uint const t = min(r0_1, r0_4);
   5569         r0_4 = max(r0_1, r0_4);
   5570         r0_1 = t;
   5571       };
   5572       {
   5573         uint const t = min(r0_3, r0_4);
   5574         r0_4 = max(r0_3, r0_4);
   5575         r0_3 = t;
   5576       };
   5577       {
   5578         uint const t = min(r0_1, r0_2);
   5579         r0_2 = max(r0_1, r0_2);
   5580         r0_1 = t;
   5581       };
   5582       shared.m[smem_l_idx + (0)] = r0_1;
   5583       shared.m[smem_l_idx + (16)] = r0_2;
   5584       shared.m[smem_r_idx + (32)] = r0_3;
   5585       shared.m[smem_r_idx + (48)] = r0_4;
   5586     }
   5587     {
   5588       uint r1_1 = shared.m[smem_l_idx + (64)];
   5589       uint r1_2 = shared.m[smem_l_idx + (80)];
   5590       uint r1_3 = shared.m[smem_r_idx + (96)];
   5591       uint r1_4 = shared.m[smem_r_idx + (112)];
   5592       {
   5593         uint const t = min(r1_2, r1_3);
   5594         r1_3 = max(r1_2, r1_3);
   5595         r1_2 = t;
   5596       };
   5597       {
   5598         uint const t = min(r1_1, r1_4);
   5599         r1_4 = max(r1_1, r1_4);
   5600         r1_1 = t;
   5601       };
   5602       {
   5603         uint const t = min(r1_3, r1_4);
   5604         r1_4 = max(r1_3, r1_4);
   5605         r1_3 = t;
   5606       };
   5607       {
   5608         uint const t = min(r1_1, r1_2);
   5609         r1_2 = max(r1_1, r1_2);
   5610         r1_1 = t;
   5611       };
   5612       shared.m[smem_l_idx + (64)] = r1_1;
   5613       shared.m[smem_l_idx + (80)] = r1_2;
   5614       shared.m[smem_r_idx + (96)] = r1_3;
   5615       shared.m[smem_r_idx + (112)] = r1_4;
   5616     }
   5617     {
   5618       uint r2_1 = shared.m[smem_l_idx + (128)];
   5619       uint r2_2 = shared.m[smem_l_idx + (144)];
   5620       uint r2_3 = shared.m[smem_r_idx + (160)];
   5621       uint r2_4 = shared.m[smem_r_idx + (176)];
   5622       {
   5623         uint const t = min(r2_2, r2_3);
   5624         r2_3 = max(r2_2, r2_3);
   5625         r2_2 = t;
   5626       };
   5627       {
   5628         uint const t = min(r2_1, r2_4);
   5629         r2_4 = max(r2_1, r2_4);
   5630         r2_1 = t;
   5631       };
   5632       {
   5633         uint const t = min(r2_3, r2_4);
   5634         r2_4 = max(r2_3, r2_4);
   5635         r2_3 = t;
   5636       };
   5637       {
   5638         uint const t = min(r2_1, r2_2);
   5639         r2_2 = max(r2_1, r2_2);
   5640         r2_1 = t;
   5641       };
   5642       shared.m[smem_l_idx + (128)] = r2_1;
   5643       shared.m[smem_l_idx + (144)] = r2_2;
   5644       shared.m[smem_r_idx + (160)] = r2_3;
   5645       shared.m[smem_r_idx + (176)] = r2_4;
   5646     }
   5647     {
   5648       uint r3_1 = shared.m[smem_l_idx + (192)];
   5649       uint r3_2 = shared.m[smem_l_idx + (208)];
   5650       uint r3_3 = shared.m[smem_r_idx + (224)];
   5651       uint r3_4 = shared.m[smem_r_idx + (240)];
   5652       {
   5653         uint const t = min(r3_2, r3_3);
   5654         r3_3 = max(r3_2, r3_3);
   5655         r3_2 = t;
   5656       };
   5657       {
   5658         uint const t = min(r3_1, r3_4);
   5659         r3_4 = max(r3_1, r3_4);
   5660         r3_1 = t;
   5661       };
   5662       {
   5663         uint const t = min(r3_3, r3_4);
   5664         r3_4 = max(r3_3, r3_4);
   5665         r3_3 = t;
   5666       };
   5667       {
   5668         uint const t = min(r3_1, r3_2);
   5669         r3_2 = max(r3_1, r3_2);
   5670         r3_1 = t;
   5671       };
   5672       shared.m[smem_l_idx + (192)] = r3_1;
   5673       shared.m[smem_l_idx + (208)] = r3_2;
   5674       shared.m[smem_r_idx + (224)] = r3_3;
   5675       shared.m[smem_r_idx + (240)] = r3_4;
   5676     }
   5677   }
   5678   barrier(CLK_LOCAL_MEM_FENCE);
   5679   r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
   5680   r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
   5681   r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
   5682   r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
   5683   r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
   5684   r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
   5685   r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
   5686   r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
   5687   {
   5688     {
   5689       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   5690       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5691       ;
   5692       {
   5693         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5694         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5695       };
   5696       {
   5697         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5698         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5699       };
   5700       {
   5701         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5702         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5703       };
   5704       {
   5705         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5706         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5707       };
   5708       {
   5709         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5710         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5711       };
   5712       {
   5713         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5714         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5715       };
   5716       {
   5717         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5718         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5719       };
   5720       {
   5721         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5722         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5723       };
   5724     }
   5725     {
   5726       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   5727       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5728       ;
   5729       {
   5730         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5731         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5732       };
   5733       {
   5734         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5735         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5736       };
   5737       {
   5738         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5739         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5740       };
   5741       {
   5742         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5743         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5744       };
   5745       {
   5746         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5747         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5748       };
   5749       {
   5750         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5751         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5752       };
   5753       {
   5754         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5755         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5756       };
   5757       {
   5758         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5759         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5760       };
   5761     }
   5762     {
   5763       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   5764       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5765       ;
   5766       {
   5767         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5768         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5769       };
   5770       {
   5771         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5772         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5773       };
   5774       {
   5775         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5776         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5777       };
   5778       {
   5779         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5780         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5781       };
   5782       {
   5783         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5784         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5785       };
   5786       {
   5787         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5788         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5789       };
   5790       {
   5791         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5792         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5793       };
   5794       {
   5795         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5796         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5797       };
   5798     }
   5799     {
   5800       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   5801       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   5802       ;
   5803       {
   5804         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   5805         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   5806       };
   5807       {
   5808         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   5809         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   5810       };
   5811       {
   5812         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   5813         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   5814       };
   5815       {
   5816         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   5817         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   5818       };
   5819       {
   5820         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   5821         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   5822       };
   5823       {
   5824         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   5825         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   5826       };
   5827       {
   5828         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   5829         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   5830       };
   5831       {
   5832         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   5833         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   5834       };
   5835     }
   5836     {
   5837       uint const t = min(r1, r5);
   5838       r5 = max(r1, r5);
   5839       r1 = t;
   5840     };
   5841     {
   5842       uint const t = min(r3, r7);
   5843       r7 = max(r3, r7);
   5844       r3 = t;
   5845     };
   5846     {
   5847       uint const t = min(r1, r3);
   5848       r3 = max(r1, r3);
   5849       r1 = t;
   5850     };
   5851     {
   5852       uint const t = min(r5, r7);
   5853       r7 = max(r5, r7);
   5854       r5 = t;
   5855     };
   5856     {
   5857       uint const t = min(r2, r6);
   5858       r6 = max(r2, r6);
   5859       r2 = t;
   5860     };
   5861     {
   5862       uint const t = min(r4, r8);
   5863       r8 = max(r4, r8);
   5864       r4 = t;
   5865     };
   5866     {
   5867       uint const t = min(r2, r4);
   5868       r4 = max(r2, r4);
   5869       r2 = t;
   5870     };
   5871     {
   5872       uint const t = min(r6, r8);
   5873       r8 = max(r6, r8);
   5874       r6 = t;
   5875     };
   5876     {
   5877       uint const t = min(r1, r2);
   5878       r2 = max(r1, r2);
   5879       r1 = t;
   5880     };
   5881     {
   5882       uint const t = min(r3, r4);
   5883       r4 = max(r3, r4);
   5884       r3 = t;
   5885     };
   5886     {
   5887       uint const t = min(r5, r6);
   5888       r6 = max(r5, r6);
   5889       r5 = t;
   5890     };
   5891     {
   5892       uint const t = min(r7, r8);
   5893       r8 = max(r7, r8);
   5894       r7 = t;
   5895     };
   5896   }
   5897   shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
   5898   shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
   5899   shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
   5900   shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
   5901   shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
   5902   shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
   5903   shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
   5904   shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
   5905   barrier(CLK_LOCAL_MEM_FENCE);
   5906   if (get_sub_group_id() < 8) {
   5907     {
   5908       uint r0_1 = shared.m[smem_l_idx + (0)];
   5909       uint r0_2 = shared.m[smem_l_idx + (16)];
   5910       uint r0_3 = shared.m[smem_l_idx + (32)];
   5911       uint r0_4 = shared.m[smem_l_idx + (48)];
   5912       uint r0_5 = shared.m[smem_r_idx + (64)];
   5913       uint r0_6 = shared.m[smem_r_idx + (80)];
   5914       uint r0_7 = shared.m[smem_r_idx + (96)];
   5915       uint r0_8 = shared.m[smem_r_idx + (112)];
   5916       {
   5917         uint const t = min(r0_4, r0_5);
   5918         r0_5 = max(r0_4, r0_5);
   5919         r0_4 = t;
   5920       };
   5921       {
   5922         uint const t = min(r0_3, r0_6);
   5923         r0_6 = max(r0_3, r0_6);
   5924         r0_3 = t;
   5925       };
   5926       {
   5927         uint const t = min(r0_2, r0_7);
   5928         r0_7 = max(r0_2, r0_7);
   5929         r0_2 = t;
   5930       };
   5931       {
   5932         uint const t = min(r0_1, r0_8);
   5933         r0_8 = max(r0_1, r0_8);
   5934         r0_1 = t;
   5935       };
   5936       {
   5937         uint const t = min(r0_5, r0_7);
   5938         r0_7 = max(r0_5, r0_7);
   5939         r0_5 = t;
   5940       };
   5941       {
   5942         uint const t = min(r0_6, r0_8);
   5943         r0_8 = max(r0_6, r0_8);
   5944         r0_6 = t;
   5945       };
   5946       {
   5947         uint const t = min(r0_5, r0_6);
   5948         r0_6 = max(r0_5, r0_6);
   5949         r0_5 = t;
   5950       };
   5951       {
   5952         uint const t = min(r0_7, r0_8);
   5953         r0_8 = max(r0_7, r0_8);
   5954         r0_7 = t;
   5955       };
   5956       {
   5957         uint const t = min(r0_1, r0_3);
   5958         r0_3 = max(r0_1, r0_3);
   5959         r0_1 = t;
   5960       };
   5961       {
   5962         uint const t = min(r0_2, r0_4);
   5963         r0_4 = max(r0_2, r0_4);
   5964         r0_2 = t;
   5965       };
   5966       {
   5967         uint const t = min(r0_1, r0_2);
   5968         r0_2 = max(r0_1, r0_2);
   5969         r0_1 = t;
   5970       };
   5971       {
   5972         uint const t = min(r0_3, r0_4);
   5973         r0_4 = max(r0_3, r0_4);
   5974         r0_3 = t;
   5975       };
   5976       shared.m[smem_l_idx + (0)] = r0_1;
   5977       shared.m[smem_l_idx + (16)] = r0_2;
   5978       shared.m[smem_l_idx + (32)] = r0_3;
   5979       shared.m[smem_l_idx + (48)] = r0_4;
   5980       shared.m[smem_r_idx + (64)] = r0_5;
   5981       shared.m[smem_r_idx + (80)] = r0_6;
   5982       shared.m[smem_r_idx + (96)] = r0_7;
   5983       shared.m[smem_r_idx + (112)] = r0_8;
   5984     }
   5985     {
   5986       uint r1_1 = shared.m[smem_l_idx + (128)];
   5987       uint r1_2 = shared.m[smem_l_idx + (144)];
   5988       uint r1_3 = shared.m[smem_l_idx + (160)];
   5989       uint r1_4 = shared.m[smem_l_idx + (176)];
   5990       uint r1_5 = shared.m[smem_r_idx + (192)];
   5991       uint r1_6 = shared.m[smem_r_idx + (208)];
   5992       uint r1_7 = shared.m[smem_r_idx + (224)];
   5993       uint r1_8 = shared.m[smem_r_idx + (240)];
   5994       {
   5995         uint const t = min(r1_4, r1_5);
   5996         r1_5 = max(r1_4, r1_5);
   5997         r1_4 = t;
   5998       };
   5999       {
   6000         uint const t = min(r1_3, r1_6);
   6001         r1_6 = max(r1_3, r1_6);
   6002         r1_3 = t;
   6003       };
   6004       {
   6005         uint const t = min(r1_2, r1_7);
   6006         r1_7 = max(r1_2, r1_7);
   6007         r1_2 = t;
   6008       };
   6009       {
   6010         uint const t = min(r1_1, r1_8);
   6011         r1_8 = max(r1_1, r1_8);
   6012         r1_1 = t;
   6013       };
   6014       {
   6015         uint const t = min(r1_5, r1_7);
   6016         r1_7 = max(r1_5, r1_7);
   6017         r1_5 = t;
   6018       };
   6019       {
   6020         uint const t = min(r1_6, r1_8);
   6021         r1_8 = max(r1_6, r1_8);
   6022         r1_6 = t;
   6023       };
   6024       {
   6025         uint const t = min(r1_5, r1_6);
   6026         r1_6 = max(r1_5, r1_6);
   6027         r1_5 = t;
   6028       };
   6029       {
   6030         uint const t = min(r1_7, r1_8);
   6031         r1_8 = max(r1_7, r1_8);
   6032         r1_7 = t;
   6033       };
   6034       {
   6035         uint const t = min(r1_1, r1_3);
   6036         r1_3 = max(r1_1, r1_3);
   6037         r1_1 = t;
   6038       };
   6039       {
   6040         uint const t = min(r1_2, r1_4);
   6041         r1_4 = max(r1_2, r1_4);
   6042         r1_2 = t;
   6043       };
   6044       {
   6045         uint const t = min(r1_1, r1_2);
   6046         r1_2 = max(r1_1, r1_2);
   6047         r1_1 = t;
   6048       };
   6049       {
   6050         uint const t = min(r1_3, r1_4);
   6051         r1_4 = max(r1_3, r1_4);
   6052         r1_3 = t;
   6053       };
   6054       shared.m[smem_l_idx + (128)] = r1_1;
   6055       shared.m[smem_l_idx + (144)] = r1_2;
   6056       shared.m[smem_l_idx + (160)] = r1_3;
   6057       shared.m[smem_l_idx + (176)] = r1_4;
   6058       shared.m[smem_r_idx + (192)] = r1_5;
   6059       shared.m[smem_r_idx + (208)] = r1_6;
   6060       shared.m[smem_r_idx + (224)] = r1_7;
   6061       shared.m[smem_r_idx + (240)] = r1_8;
   6062     }
   6063   }
   6064   barrier(CLK_LOCAL_MEM_FENCE);
   6065   r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
   6066   r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
   6067   r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
   6068   r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
   6069   r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
   6070   r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
   6071   r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
   6072   r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
   6073   {
   6074     {
   6075       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   6076       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6077       ;
   6078       {
   6079         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6080         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6081       };
   6082       {
   6083         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6084         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6085       };
   6086       {
   6087         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6088         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6089       };
   6090       {
   6091         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6092         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6093       };
   6094       {
   6095         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6096         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6097       };
   6098       {
   6099         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6100         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6101       };
   6102       {
   6103         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6104         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6105       };
   6106       {
   6107         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6108         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6109       };
   6110     }
   6111     {
   6112       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   6113       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6114       ;
   6115       {
   6116         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6117         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6118       };
   6119       {
   6120         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6121         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6122       };
   6123       {
   6124         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6125         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6126       };
   6127       {
   6128         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6129         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6130       };
   6131       {
   6132         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6133         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6134       };
   6135       {
   6136         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6137         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6138       };
   6139       {
   6140         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6141         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6142       };
   6143       {
   6144         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6145         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6146       };
   6147     }
   6148     {
   6149       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   6150       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6151       ;
   6152       {
   6153         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6154         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6155       };
   6156       {
   6157         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6158         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6159       };
   6160       {
   6161         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6162         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6163       };
   6164       {
   6165         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6166         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6167       };
   6168       {
   6169         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6170         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6171       };
   6172       {
   6173         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6174         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6175       };
   6176       {
   6177         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6178         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6179       };
   6180       {
   6181         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6182         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6183       };
   6184     }
   6185     {
   6186       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   6187       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6188       ;
   6189       {
   6190         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6191         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6192       };
   6193       {
   6194         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6195         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6196       };
   6197       {
   6198         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6199         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6200       };
   6201       {
   6202         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6203         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6204       };
   6205       {
   6206         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6207         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6208       };
   6209       {
   6210         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6211         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6212       };
   6213       {
   6214         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6215         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6216       };
   6217       {
   6218         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6219         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6220       };
   6221     }
   6222     {
   6223       uint const t = min(r1, r5);
   6224       r5 = max(r1, r5);
   6225       r1 = t;
   6226     };
   6227     {
   6228       uint const t = min(r3, r7);
   6229       r7 = max(r3, r7);
   6230       r3 = t;
   6231     };
   6232     {
   6233       uint const t = min(r1, r3);
   6234       r3 = max(r1, r3);
   6235       r1 = t;
   6236     };
   6237     {
   6238       uint const t = min(r5, r7);
   6239       r7 = max(r5, r7);
   6240       r5 = t;
   6241     };
   6242     {
   6243       uint const t = min(r2, r6);
   6244       r6 = max(r2, r6);
   6245       r2 = t;
   6246     };
   6247     {
   6248       uint const t = min(r4, r8);
   6249       r8 = max(r4, r8);
   6250       r4 = t;
   6251     };
   6252     {
   6253       uint const t = min(r2, r4);
   6254       r4 = max(r2, r4);
   6255       r2 = t;
   6256     };
   6257     {
   6258       uint const t = min(r6, r8);
   6259       r8 = max(r6, r8);
   6260       r6 = t;
   6261     };
   6262     {
   6263       uint const t = min(r1, r2);
   6264       r2 = max(r1, r2);
   6265       r1 = t;
   6266     };
   6267     {
   6268       uint const t = min(r3, r4);
   6269       r4 = max(r3, r4);
   6270       r3 = t;
   6271     };
   6272     {
   6273       uint const t = min(r5, r6);
   6274       r6 = max(r5, r6);
   6275       r5 = t;
   6276     };
   6277     {
   6278       uint const t = min(r7, r8);
   6279       r8 = max(r7, r8);
   6280       r7 = t;
   6281     };
   6282   }
   6283   shared.m[get_local_id(0) + (16 * (1 << 4) * 0)] = r1;
   6284   shared.m[get_local_id(0) + (16 * (1 << 4) * 1)] = r8;
   6285   shared.m[get_local_id(0) + (16 * (1 << 4) * 2)] = r2;
   6286   shared.m[get_local_id(0) + (16 * (1 << 4) * 3)] = r7;
   6287   shared.m[get_local_id(0) + (16 * (1 << 4) * 4)] = r3;
   6288   shared.m[get_local_id(0) + (16 * (1 << 4) * 5)] = r6;
   6289   shared.m[get_local_id(0) + (16 * (1 << 4) * 6)] = r4;
   6290   shared.m[get_local_id(0) + (16 * (1 << 4) * 7)] = r5;
   6291   barrier(CLK_LOCAL_MEM_FENCE);
   6292   if (get_sub_group_id() < 8) {
   6293     {
   6294       uint r0_1 = shared.m[smem_l_idx + (0)];
   6295       uint r0_2 = shared.m[smem_l_idx + (16)];
   6296       uint r0_3 = shared.m[smem_l_idx + (32)];
   6297       uint r0_4 = shared.m[smem_l_idx + (48)];
   6298       uint r0_5 = shared.m[smem_l_idx + (64)];
   6299       uint r0_6 = shared.m[smem_l_idx + (80)];
   6300       uint r0_7 = shared.m[smem_l_idx + (96)];
   6301       uint r0_8 = shared.m[smem_l_idx + (112)];
   6302       uint r0_9 = shared.m[smem_r_idx + (128)];
   6303       uint r0_10 = shared.m[smem_r_idx + (144)];
   6304       uint r0_11 = shared.m[smem_r_idx + (160)];
   6305       uint r0_12 = shared.m[smem_r_idx + (176)];
   6306       uint r0_13 = shared.m[smem_r_idx + (192)];
   6307       uint r0_14 = shared.m[smem_r_idx + (208)];
   6308       uint r0_15 = shared.m[smem_r_idx + (224)];
   6309       uint r0_16 = shared.m[smem_r_idx + (240)];
   6310       {
   6311         uint const t = min(r0_8, r0_9);
   6312         r0_9 = max(r0_8, r0_9);
   6313         r0_8 = t;
   6314       };
   6315       {
   6316         uint const t = min(r0_7, r0_10);
   6317         r0_10 = max(r0_7, r0_10);
   6318         r0_7 = t;
   6319       };
   6320       {
   6321         uint const t = min(r0_6, r0_11);
   6322         r0_11 = max(r0_6, r0_11);
   6323         r0_6 = t;
   6324       };
   6325       {
   6326         uint const t = min(r0_5, r0_12);
   6327         r0_12 = max(r0_5, r0_12);
   6328         r0_5 = t;
   6329       };
   6330       {
   6331         uint const t = min(r0_4, r0_13);
   6332         r0_13 = max(r0_4, r0_13);
   6333         r0_4 = t;
   6334       };
   6335       {
   6336         uint const t = min(r0_3, r0_14);
   6337         r0_14 = max(r0_3, r0_14);
   6338         r0_3 = t;
   6339       };
   6340       {
   6341         uint const t = min(r0_2, r0_15);
   6342         r0_15 = max(r0_2, r0_15);
   6343         r0_2 = t;
   6344       };
   6345       {
   6346         uint const t = min(r0_1, r0_16);
   6347         r0_16 = max(r0_1, r0_16);
   6348         r0_1 = t;
   6349       };
   6350       {
   6351         uint const t = min(r0_9, r0_13);
   6352         r0_13 = max(r0_9, r0_13);
   6353         r0_9 = t;
   6354       };
   6355       {
   6356         uint const t = min(r0_11, r0_15);
   6357         r0_15 = max(r0_11, r0_15);
   6358         r0_11 = t;
   6359       };
   6360       {
   6361         uint const t = min(r0_9, r0_11);
   6362         r0_11 = max(r0_9, r0_11);
   6363         r0_9 = t;
   6364       };
   6365       {
   6366         uint const t = min(r0_13, r0_15);
   6367         r0_15 = max(r0_13, r0_15);
   6368         r0_13 = t;
   6369       };
   6370       {
   6371         uint const t = min(r0_10, r0_14);
   6372         r0_14 = max(r0_10, r0_14);
   6373         r0_10 = t;
   6374       };
   6375       {
   6376         uint const t = min(r0_12, r0_16);
   6377         r0_16 = max(r0_12, r0_16);
   6378         r0_12 = t;
   6379       };
   6380       {
   6381         uint const t = min(r0_10, r0_12);
   6382         r0_12 = max(r0_10, r0_12);
   6383         r0_10 = t;
   6384       };
   6385       {
   6386         uint const t = min(r0_14, r0_16);
   6387         r0_16 = max(r0_14, r0_16);
   6388         r0_14 = t;
   6389       };
   6390       {
   6391         uint const t = min(r0_9, r0_10);
   6392         r0_10 = max(r0_9, r0_10);
   6393         r0_9 = t;
   6394       };
   6395       {
   6396         uint const t = min(r0_11, r0_12);
   6397         r0_12 = max(r0_11, r0_12);
   6398         r0_11 = t;
   6399       };
   6400       {
   6401         uint const t = min(r0_13, r0_14);
   6402         r0_14 = max(r0_13, r0_14);
   6403         r0_13 = t;
   6404       };
   6405       {
   6406         uint const t = min(r0_15, r0_16);
   6407         r0_16 = max(r0_15, r0_16);
   6408         r0_15 = t;
   6409       };
   6410       {
   6411         uint const t = min(r0_1, r0_5);
   6412         r0_5 = max(r0_1, r0_5);
   6413         r0_1 = t;
   6414       };
   6415       {
   6416         uint const t = min(r0_3, r0_7);
   6417         r0_7 = max(r0_3, r0_7);
   6418         r0_3 = t;
   6419       };
   6420       {
   6421         uint const t = min(r0_1, r0_3);
   6422         r0_3 = max(r0_1, r0_3);
   6423         r0_1 = t;
   6424       };
   6425       {
   6426         uint const t = min(r0_5, r0_7);
   6427         r0_7 = max(r0_5, r0_7);
   6428         r0_5 = t;
   6429       };
   6430       {
   6431         uint const t = min(r0_2, r0_6);
   6432         r0_6 = max(r0_2, r0_6);
   6433         r0_2 = t;
   6434       };
   6435       {
   6436         uint const t = min(r0_4, r0_8);
   6437         r0_8 = max(r0_4, r0_8);
   6438         r0_4 = t;
   6439       };
   6440       {
   6441         uint const t = min(r0_2, r0_4);
   6442         r0_4 = max(r0_2, r0_4);
   6443         r0_2 = t;
   6444       };
   6445       {
   6446         uint const t = min(r0_6, r0_8);
   6447         r0_8 = max(r0_6, r0_8);
   6448         r0_6 = t;
   6449       };
   6450       {
   6451         uint const t = min(r0_1, r0_2);
   6452         r0_2 = max(r0_1, r0_2);
   6453         r0_1 = t;
   6454       };
   6455       {
   6456         uint const t = min(r0_3, r0_4);
   6457         r0_4 = max(r0_3, r0_4);
   6458         r0_3 = t;
   6459       };
   6460       {
   6461         uint const t = min(r0_5, r0_6);
   6462         r0_6 = max(r0_5, r0_6);
   6463         r0_5 = t;
   6464       };
   6465       {
   6466         uint const t = min(r0_7, r0_8);
   6467         r0_8 = max(r0_7, r0_8);
   6468         r0_7 = t;
   6469       };
   6470       shared.m[smem_l_idx + (0)] = r0_1;
   6471       shared.m[smem_l_idx + (16)] = r0_2;
   6472       shared.m[smem_l_idx + (32)] = r0_3;
   6473       shared.m[smem_l_idx + (48)] = r0_4;
   6474       shared.m[smem_l_idx + (64)] = r0_5;
   6475       shared.m[smem_l_idx + (80)] = r0_6;
   6476       shared.m[smem_l_idx + (96)] = r0_7;
   6477       shared.m[smem_l_idx + (112)] = r0_8;
   6478       shared.m[smem_r_idx + (128)] = r0_9;
   6479       shared.m[smem_r_idx + (144)] = r0_10;
   6480       shared.m[smem_r_idx + (160)] = r0_11;
   6481       shared.m[smem_r_idx + (176)] = r0_12;
   6482       shared.m[smem_r_idx + (192)] = r0_13;
   6483       shared.m[smem_r_idx + (208)] = r0_14;
   6484       shared.m[smem_r_idx + (224)] = r0_15;
   6485       shared.m[smem_r_idx + (240)] = r0_16;
   6486     }
   6487   }
   6488   barrier(CLK_LOCAL_MEM_FENCE);
   6489   r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
   6490   r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
   6491   r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
   6492   r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
   6493   r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
   6494   r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
   6495   r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
   6496   r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
   6497   {
   6498     {
   6499       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   6500       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6501       ;
   6502       {
   6503         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6504         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6505       };
   6506       {
   6507         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6508         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6509       };
   6510       {
   6511         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6512         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6513       };
   6514       {
   6515         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6516         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6517       };
   6518       {
   6519         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6520         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6521       };
   6522       {
   6523         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6524         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6525       };
   6526       {
   6527         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6528         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6529       };
   6530       {
   6531         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6532         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6533       };
   6534     }
   6535     {
   6536       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   6537       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6538       ;
   6539       {
   6540         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6541         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6542       };
   6543       {
   6544         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6545         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6546       };
   6547       {
   6548         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6549         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6550       };
   6551       {
   6552         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6553         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6554       };
   6555       {
   6556         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6557         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6558       };
   6559       {
   6560         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6561         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6562       };
   6563       {
   6564         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6565         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6566       };
   6567       {
   6568         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6569         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6570       };
   6571     }
   6572     {
   6573       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   6574       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6575       ;
   6576       {
   6577         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6578         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6579       };
   6580       {
   6581         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6582         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6583       };
   6584       {
   6585         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6586         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6587       };
   6588       {
   6589         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6590         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6591       };
   6592       {
   6593         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6594         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6595       };
   6596       {
   6597         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6598         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6599       };
   6600       {
   6601         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6602         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6603       };
   6604       {
   6605         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6606         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6607       };
   6608     }
   6609     {
   6610       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   6611       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6612       ;
   6613       {
   6614         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6615         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6616       };
   6617       {
   6618         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6619         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6620       };
   6621       {
   6622         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6623         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6624       };
   6625       {
   6626         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6627         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6628       };
   6629       {
   6630         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6631         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6632       };
   6633       {
   6634         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6635         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6636       };
   6637       {
   6638         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6639         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6640       };
   6641       {
   6642         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6643         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6644       };
   6645     }
   6646     {
   6647       uint const t = min(r1, r5);
   6648       r5 = max(r1, r5);
   6649       r1 = t;
   6650     };
   6651     {
   6652       uint const t = min(r3, r7);
   6653       r7 = max(r3, r7);
   6654       r3 = t;
   6655     };
   6656     {
   6657       uint const t = min(r1, r3);
   6658       r3 = max(r1, r3);
   6659       r1 = t;
   6660     };
   6661     {
   6662       uint const t = min(r5, r7);
   6663       r7 = max(r5, r7);
   6664       r5 = t;
   6665     };
   6666     {
   6667       uint const t = min(r2, r6);
   6668       r6 = max(r2, r6);
   6669       r2 = t;
   6670     };
   6671     {
   6672       uint const t = min(r4, r8);
   6673       r8 = max(r4, r8);
   6674       r4 = t;
   6675     };
   6676     {
   6677       uint const t = min(r2, r4);
   6678       r4 = max(r2, r4);
   6679       r2 = t;
   6680     };
   6681     {
   6682       uint const t = min(r6, r8);
   6683       r8 = max(r6, r8);
   6684       r6 = t;
   6685     };
   6686     {
   6687       uint const t = min(r1, r2);
   6688       r2 = max(r1, r2);
   6689       r1 = t;
   6690     };
   6691     {
   6692       uint const t = min(r3, r4);
   6693       r4 = max(r3, r4);
   6694       r3 = t;
   6695     };
   6696     {
   6697       uint const t = min(r5, r6);
   6698       r6 = max(r5, r6);
   6699       r5 = t;
   6700     };
   6701     {
   6702       uint const t = min(r7, r8);
   6703       r8 = max(r7, r8);
   6704       r7 = t;
   6705     };
   6706   }
   6707   vout[gmem_idx + (1 << 4) * 0] = r1;
   6708   vout[gmem_idx + (1 << 4) * 1] = r2;
   6709   vout[gmem_idx + (1 << 4) * 2] = r3;
   6710   vout[gmem_idx + (1 << 4) * 3] = r4;
   6711   vout[gmem_idx + (1 << 4) * 4] = r5;
   6712   vout[gmem_idx + (1 << 4) * 5] = r6;
   6713   vout[gmem_idx + (1 << 4) * 6] = r7;
   6714   vout[gmem_idx + (1 << 4) * 7] = r8;
   6715 }
   6716 
   6717 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   6718 __attribute__((reqd_work_group_size((1 << 4) * 1, 1, 1))) void
   6719 hs_kernel_bc_0(__global uint* const restrict vout)
   6720 {
   6721   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   6722                         (get_local_id(0) & ((1 << 4) - 1));
   6723   uint r1 = vout[gmem_idx + (1 << 4) * 0];
   6724   uint r2 = vout[gmem_idx + (1 << 4) * 1];
   6725   uint r3 = vout[gmem_idx + (1 << 4) * 2];
   6726   uint r4 = vout[gmem_idx + (1 << 4) * 3];
   6727   uint r5 = vout[gmem_idx + (1 << 4) * 4];
   6728   uint r6 = vout[gmem_idx + (1 << 4) * 5];
   6729   uint r7 = vout[gmem_idx + (1 << 4) * 6];
   6730   uint r8 = vout[gmem_idx + (1 << 4) * 7];
   6731   {
   6732     {
   6733       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   6734       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6735       ;
   6736       {
   6737         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6738         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6739       };
   6740       {
   6741         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6742         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6743       };
   6744       {
   6745         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6746         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6747       };
   6748       {
   6749         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6750         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6751       };
   6752       {
   6753         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6754         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6755       };
   6756       {
   6757         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6758         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6759       };
   6760       {
   6761         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6762         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6763       };
   6764       {
   6765         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6766         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6767       };
   6768     }
   6769     {
   6770       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   6771       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6772       ;
   6773       {
   6774         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6775         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6776       };
   6777       {
   6778         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6779         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6780       };
   6781       {
   6782         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6783         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6784       };
   6785       {
   6786         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6787         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6788       };
   6789       {
   6790         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6791         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6792       };
   6793       {
   6794         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6795         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6796       };
   6797       {
   6798         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6799         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6800       };
   6801       {
   6802         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6803         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6804       };
   6805     }
   6806     {
   6807       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   6808       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6809       ;
   6810       {
   6811         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6812         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6813       };
   6814       {
   6815         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6816         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6817       };
   6818       {
   6819         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6820         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6821       };
   6822       {
   6823         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6824         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6825       };
   6826       {
   6827         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6828         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6829       };
   6830       {
   6831         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6832         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6833       };
   6834       {
   6835         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6836         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6837       };
   6838       {
   6839         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6840         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6841       };
   6842     }
   6843     {
   6844       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   6845       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   6846       ;
   6847       {
   6848         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   6849         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   6850       };
   6851       {
   6852         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   6853         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   6854       };
   6855       {
   6856         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   6857         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   6858       };
   6859       {
   6860         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   6861         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   6862       };
   6863       {
   6864         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   6865         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   6866       };
   6867       {
   6868         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   6869         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   6870       };
   6871       {
   6872         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   6873         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   6874       };
   6875       {
   6876         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   6877         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   6878       };
   6879     }
   6880     {
   6881       uint const t = min(r1, r5);
   6882       r5 = max(r1, r5);
   6883       r1 = t;
   6884     };
   6885     {
   6886       uint const t = min(r3, r7);
   6887       r7 = max(r3, r7);
   6888       r3 = t;
   6889     };
   6890     {
   6891       uint const t = min(r1, r3);
   6892       r3 = max(r1, r3);
   6893       r1 = t;
   6894     };
   6895     {
   6896       uint const t = min(r5, r7);
   6897       r7 = max(r5, r7);
   6898       r5 = t;
   6899     };
   6900     {
   6901       uint const t = min(r2, r6);
   6902       r6 = max(r2, r6);
   6903       r2 = t;
   6904     };
   6905     {
   6906       uint const t = min(r4, r8);
   6907       r8 = max(r4, r8);
   6908       r4 = t;
   6909     };
   6910     {
   6911       uint const t = min(r2, r4);
   6912       r4 = max(r2, r4);
   6913       r2 = t;
   6914     };
   6915     {
   6916       uint const t = min(r6, r8);
   6917       r8 = max(r6, r8);
   6918       r6 = t;
   6919     };
   6920     {
   6921       uint const t = min(r1, r2);
   6922       r2 = max(r1, r2);
   6923       r1 = t;
   6924     };
   6925     {
   6926       uint const t = min(r3, r4);
   6927       r4 = max(r3, r4);
   6928       r3 = t;
   6929     };
   6930     {
   6931       uint const t = min(r5, r6);
   6932       r6 = max(r5, r6);
   6933       r5 = t;
   6934     };
   6935     {
   6936       uint const t = min(r7, r8);
   6937       r8 = max(r7, r8);
   6938       r7 = t;
   6939     };
   6940   }
   6941   vout[gmem_idx + (1 << 4) * 0] = r1;
   6942   vout[gmem_idx + (1 << 4) * 1] = r2;
   6943   vout[gmem_idx + (1 << 4) * 2] = r3;
   6944   vout[gmem_idx + (1 << 4) * 3] = r4;
   6945   vout[gmem_idx + (1 << 4) * 4] = r5;
   6946   vout[gmem_idx + (1 << 4) * 5] = r6;
   6947   vout[gmem_idx + (1 << 4) * 6] = r7;
   6948   vout[gmem_idx + (1 << 4) * 7] = r8;
   6949 }
   6950 
   6951 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   6952 __attribute__((reqd_work_group_size((1 << 4) * 2, 1, 1))) void
   6953 hs_kernel_bc_1(__global uint* const restrict vout)
   6954 {
   6955   __local struct
   6956   {
   6957     uint m[32 * 8];
   6958   } shared;
   6959 
   6960   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   6961                         (get_local_id(0) & ((1 << 4) - 1));
   6962   uint const gmem_l_idx =
   6963     (get_global_id(0) & ~((1 << 4) * 2 - 1)) * 8 + get_local_id(0);
   6964   uint const smem_l_idx =
   6965     get_sub_group_id() * ((1 << 4) * 2) + get_sub_group_local_id();
   6966   {
   6967     {
   6968       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 0)];
   6969       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 8)];
   6970       {
   6971         uint const t = min(r0_1, r0_2);
   6972         r0_2 = max(r0_1, r0_2);
   6973         r0_1 = t;
   6974       };
   6975       shared.m[smem_l_idx + (0)] = r0_1;
   6976       shared.m[smem_l_idx + (16)] = r0_2;
   6977     }
   6978     {
   6979       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 2)];
   6980       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 10)];
   6981       {
   6982         uint const t = min(r0_1, r0_2);
   6983         r0_2 = max(r0_1, r0_2);
   6984         r0_1 = t;
   6985       };
   6986       shared.m[smem_l_idx + (64)] = r0_1;
   6987       shared.m[smem_l_idx + (80)] = r0_2;
   6988     }
   6989     {
   6990       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 4)];
   6991       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 12)];
   6992       {
   6993         uint const t = min(r0_1, r0_2);
   6994         r0_2 = max(r0_1, r0_2);
   6995         r0_1 = t;
   6996       };
   6997       shared.m[smem_l_idx + (128)] = r0_1;
   6998       shared.m[smem_l_idx + (144)] = r0_2;
   6999     }
   7000     {
   7001       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 6)];
   7002       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 14)];
   7003       {
   7004         uint const t = min(r0_1, r0_2);
   7005         r0_2 = max(r0_1, r0_2);
   7006         r0_1 = t;
   7007       };
   7008       shared.m[smem_l_idx + (192)] = r0_1;
   7009       shared.m[smem_l_idx + (208)] = r0_2;
   7010     }
   7011   }
   7012   barrier(CLK_LOCAL_MEM_FENCE);
   7013   uint r1 = shared.m[get_local_id(0) + (2 * (1 << 4) * 0)];
   7014   uint r2 = shared.m[get_local_id(0) + (2 * (1 << 4) * 1)];
   7015   uint r3 = shared.m[get_local_id(0) + (2 * (1 << 4) * 2)];
   7016   uint r4 = shared.m[get_local_id(0) + (2 * (1 << 4) * 3)];
   7017   uint r5 = shared.m[get_local_id(0) + (2 * (1 << 4) * 4)];
   7018   uint r6 = shared.m[get_local_id(0) + (2 * (1 << 4) * 5)];
   7019   uint r7 = shared.m[get_local_id(0) + (2 * (1 << 4) * 6)];
   7020   uint r8 = shared.m[get_local_id(0) + (2 * (1 << 4) * 7)];
   7021   {
   7022     {
   7023       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   7024       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7025       ;
   7026       {
   7027         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7028         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7029       };
   7030       {
   7031         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7032         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7033       };
   7034       {
   7035         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7036         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7037       };
   7038       {
   7039         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7040         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7041       };
   7042       {
   7043         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7044         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7045       };
   7046       {
   7047         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7048         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7049       };
   7050       {
   7051         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7052         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7053       };
   7054       {
   7055         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7056         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7057       };
   7058     }
   7059     {
   7060       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   7061       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7062       ;
   7063       {
   7064         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7065         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7066       };
   7067       {
   7068         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7069         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7070       };
   7071       {
   7072         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7073         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7074       };
   7075       {
   7076         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7077         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7078       };
   7079       {
   7080         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7081         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7082       };
   7083       {
   7084         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7085         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7086       };
   7087       {
   7088         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7089         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7090       };
   7091       {
   7092         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7093         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7094       };
   7095     }
   7096     {
   7097       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   7098       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7099       ;
   7100       {
   7101         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7102         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7103       };
   7104       {
   7105         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7106         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7107       };
   7108       {
   7109         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7110         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7111       };
   7112       {
   7113         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7114         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7115       };
   7116       {
   7117         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7118         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7119       };
   7120       {
   7121         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7122         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7123       };
   7124       {
   7125         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7126         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7127       };
   7128       {
   7129         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7130         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7131       };
   7132     }
   7133     {
   7134       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   7135       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7136       ;
   7137       {
   7138         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7139         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7140       };
   7141       {
   7142         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7143         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7144       };
   7145       {
   7146         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7147         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7148       };
   7149       {
   7150         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7151         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7152       };
   7153       {
   7154         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7155         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7156       };
   7157       {
   7158         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7159         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7160       };
   7161       {
   7162         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7163         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7164       };
   7165       {
   7166         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7167         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7168       };
   7169     }
   7170     {
   7171       uint const t = min(r1, r5);
   7172       r5 = max(r1, r5);
   7173       r1 = t;
   7174     };
   7175     {
   7176       uint const t = min(r3, r7);
   7177       r7 = max(r3, r7);
   7178       r3 = t;
   7179     };
   7180     {
   7181       uint const t = min(r1, r3);
   7182       r3 = max(r1, r3);
   7183       r1 = t;
   7184     };
   7185     {
   7186       uint const t = min(r5, r7);
   7187       r7 = max(r5, r7);
   7188       r5 = t;
   7189     };
   7190     {
   7191       uint const t = min(r2, r6);
   7192       r6 = max(r2, r6);
   7193       r2 = t;
   7194     };
   7195     {
   7196       uint const t = min(r4, r8);
   7197       r8 = max(r4, r8);
   7198       r4 = t;
   7199     };
   7200     {
   7201       uint const t = min(r2, r4);
   7202       r4 = max(r2, r4);
   7203       r2 = t;
   7204     };
   7205     {
   7206       uint const t = min(r6, r8);
   7207       r8 = max(r6, r8);
   7208       r6 = t;
   7209     };
   7210     {
   7211       uint const t = min(r1, r2);
   7212       r2 = max(r1, r2);
   7213       r1 = t;
   7214     };
   7215     {
   7216       uint const t = min(r3, r4);
   7217       r4 = max(r3, r4);
   7218       r3 = t;
   7219     };
   7220     {
   7221       uint const t = min(r5, r6);
   7222       r6 = max(r5, r6);
   7223       r5 = t;
   7224     };
   7225     {
   7226       uint const t = min(r7, r8);
   7227       r8 = max(r7, r8);
   7228       r7 = t;
   7229     };
   7230   }
   7231   vout[gmem_idx + (1 << 4) * 0] = r1;
   7232   vout[gmem_idx + (1 << 4) * 1] = r2;
   7233   vout[gmem_idx + (1 << 4) * 2] = r3;
   7234   vout[gmem_idx + (1 << 4) * 3] = r4;
   7235   vout[gmem_idx + (1 << 4) * 4] = r5;
   7236   vout[gmem_idx + (1 << 4) * 5] = r6;
   7237   vout[gmem_idx + (1 << 4) * 6] = r7;
   7238   vout[gmem_idx + (1 << 4) * 7] = r8;
   7239 }
   7240 
   7241 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   7242 __attribute__((reqd_work_group_size((1 << 4) * 4, 1, 1))) void
   7243 hs_kernel_bc_2(__global uint* const restrict vout)
   7244 {
   7245   __local struct
   7246   {
   7247     uint m[64 * 8];
   7248   } shared;
   7249 
   7250   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   7251                         (get_local_id(0) & ((1 << 4) - 1));
   7252   uint const gmem_l_idx =
   7253     (get_global_id(0) & ~((1 << 4) * 4 - 1)) * 8 + get_local_id(0);
   7254   uint const smem_l_idx =
   7255     get_sub_group_id() * ((1 << 4) * 4) + get_sub_group_local_id();
   7256   {
   7257     {
   7258       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 0)];
   7259       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 8)];
   7260       uint r0_3 = vout[gmem_l_idx + ((1 << 4) * 16)];
   7261       uint r0_4 = vout[gmem_l_idx + ((1 << 4) * 24)];
   7262       {
   7263         uint const t = min(r0_1, r0_3);
   7264         r0_3 = max(r0_1, r0_3);
   7265         r0_1 = t;
   7266       };
   7267       {
   7268         uint const t = min(r0_2, r0_4);
   7269         r0_4 = max(r0_2, r0_4);
   7270         r0_2 = t;
   7271       };
   7272       {
   7273         uint const t = min(r0_1, r0_2);
   7274         r0_2 = max(r0_1, r0_2);
   7275         r0_1 = t;
   7276       };
   7277       {
   7278         uint const t = min(r0_3, r0_4);
   7279         r0_4 = max(r0_3, r0_4);
   7280         r0_3 = t;
   7281       };
   7282       shared.m[smem_l_idx + (0)] = r0_1;
   7283       shared.m[smem_l_idx + (16)] = r0_2;
   7284       shared.m[smem_l_idx + (32)] = r0_3;
   7285       shared.m[smem_l_idx + (48)] = r0_4;
   7286     }
   7287     {
   7288       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 4)];
   7289       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 12)];
   7290       uint r0_3 = vout[gmem_l_idx + ((1 << 4) * 20)];
   7291       uint r0_4 = vout[gmem_l_idx + ((1 << 4) * 28)];
   7292       {
   7293         uint const t = min(r0_1, r0_3);
   7294         r0_3 = max(r0_1, r0_3);
   7295         r0_1 = t;
   7296       };
   7297       {
   7298         uint const t = min(r0_2, r0_4);
   7299         r0_4 = max(r0_2, r0_4);
   7300         r0_2 = t;
   7301       };
   7302       {
   7303         uint const t = min(r0_1, r0_2);
   7304         r0_2 = max(r0_1, r0_2);
   7305         r0_1 = t;
   7306       };
   7307       {
   7308         uint const t = min(r0_3, r0_4);
   7309         r0_4 = max(r0_3, r0_4);
   7310         r0_3 = t;
   7311       };
   7312       shared.m[smem_l_idx + (256)] = r0_1;
   7313       shared.m[smem_l_idx + (272)] = r0_2;
   7314       shared.m[smem_l_idx + (288)] = r0_3;
   7315       shared.m[smem_l_idx + (304)] = r0_4;
   7316     }
   7317   }
   7318   barrier(CLK_LOCAL_MEM_FENCE);
   7319   uint r1 = shared.m[get_local_id(0) + (4 * (1 << 4) * 0)];
   7320   uint r2 = shared.m[get_local_id(0) + (4 * (1 << 4) * 1)];
   7321   uint r3 = shared.m[get_local_id(0) + (4 * (1 << 4) * 2)];
   7322   uint r4 = shared.m[get_local_id(0) + (4 * (1 << 4) * 3)];
   7323   uint r5 = shared.m[get_local_id(0) + (4 * (1 << 4) * 4)];
   7324   uint r6 = shared.m[get_local_id(0) + (4 * (1 << 4) * 5)];
   7325   uint r7 = shared.m[get_local_id(0) + (4 * (1 << 4) * 6)];
   7326   uint r8 = shared.m[get_local_id(0) + (4 * (1 << 4) * 7)];
   7327   {
   7328     {
   7329       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   7330       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7331       ;
   7332       {
   7333         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7334         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7335       };
   7336       {
   7337         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7338         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7339       };
   7340       {
   7341         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7342         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7343       };
   7344       {
   7345         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7346         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7347       };
   7348       {
   7349         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7350         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7351       };
   7352       {
   7353         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7354         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7355       };
   7356       {
   7357         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7358         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7359       };
   7360       {
   7361         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7362         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7363       };
   7364     }
   7365     {
   7366       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   7367       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7368       ;
   7369       {
   7370         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7371         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7372       };
   7373       {
   7374         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7375         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7376       };
   7377       {
   7378         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7379         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7380       };
   7381       {
   7382         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7383         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7384       };
   7385       {
   7386         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7387         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7388       };
   7389       {
   7390         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7391         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7392       };
   7393       {
   7394         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7395         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7396       };
   7397       {
   7398         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7399         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7400       };
   7401     }
   7402     {
   7403       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   7404       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7405       ;
   7406       {
   7407         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7408         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7409       };
   7410       {
   7411         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7412         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7413       };
   7414       {
   7415         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7416         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7417       };
   7418       {
   7419         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7420         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7421       };
   7422       {
   7423         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7424         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7425       };
   7426       {
   7427         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7428         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7429       };
   7430       {
   7431         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7432         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7433       };
   7434       {
   7435         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7436         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7437       };
   7438     }
   7439     {
   7440       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   7441       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7442       ;
   7443       {
   7444         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7445         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7446       };
   7447       {
   7448         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7449         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7450       };
   7451       {
   7452         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7453         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7454       };
   7455       {
   7456         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7457         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7458       };
   7459       {
   7460         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7461         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7462       };
   7463       {
   7464         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7465         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7466       };
   7467       {
   7468         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7469         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7470       };
   7471       {
   7472         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7473         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7474       };
   7475     }
   7476     {
   7477       uint const t = min(r1, r5);
   7478       r5 = max(r1, r5);
   7479       r1 = t;
   7480     };
   7481     {
   7482       uint const t = min(r3, r7);
   7483       r7 = max(r3, r7);
   7484       r3 = t;
   7485     };
   7486     {
   7487       uint const t = min(r1, r3);
   7488       r3 = max(r1, r3);
   7489       r1 = t;
   7490     };
   7491     {
   7492       uint const t = min(r5, r7);
   7493       r7 = max(r5, r7);
   7494       r5 = t;
   7495     };
   7496     {
   7497       uint const t = min(r2, r6);
   7498       r6 = max(r2, r6);
   7499       r2 = t;
   7500     };
   7501     {
   7502       uint const t = min(r4, r8);
   7503       r8 = max(r4, r8);
   7504       r4 = t;
   7505     };
   7506     {
   7507       uint const t = min(r2, r4);
   7508       r4 = max(r2, r4);
   7509       r2 = t;
   7510     };
   7511     {
   7512       uint const t = min(r6, r8);
   7513       r8 = max(r6, r8);
   7514       r6 = t;
   7515     };
   7516     {
   7517       uint const t = min(r1, r2);
   7518       r2 = max(r1, r2);
   7519       r1 = t;
   7520     };
   7521     {
   7522       uint const t = min(r3, r4);
   7523       r4 = max(r3, r4);
   7524       r3 = t;
   7525     };
   7526     {
   7527       uint const t = min(r5, r6);
   7528       r6 = max(r5, r6);
   7529       r5 = t;
   7530     };
   7531     {
   7532       uint const t = min(r7, r8);
   7533       r8 = max(r7, r8);
   7534       r7 = t;
   7535     };
   7536   }
   7537   vout[gmem_idx + (1 << 4) * 0] = r1;
   7538   vout[gmem_idx + (1 << 4) * 1] = r2;
   7539   vout[gmem_idx + (1 << 4) * 2] = r3;
   7540   vout[gmem_idx + (1 << 4) * 3] = r4;
   7541   vout[gmem_idx + (1 << 4) * 4] = r5;
   7542   vout[gmem_idx + (1 << 4) * 5] = r6;
   7543   vout[gmem_idx + (1 << 4) * 6] = r7;
   7544   vout[gmem_idx + (1 << 4) * 7] = r8;
   7545 }
   7546 
   7547 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   7548 __attribute__((reqd_work_group_size((1 << 4) * 8, 1, 1))) void
   7549 hs_kernel_bc_3(__global uint* const restrict vout)
   7550 {
   7551   __local struct
   7552   {
   7553     uint m[128 * 8];
   7554   } shared;
   7555 
   7556   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   7557                         (get_local_id(0) & ((1 << 4) - 1));
   7558   uint const gmem_l_idx =
   7559     (get_global_id(0) & ~((1 << 4) * 8 - 1)) * 8 + get_local_id(0);
   7560   uint const smem_l_idx =
   7561     get_sub_group_id() * ((1 << 4) * 8) + get_sub_group_local_id();
   7562   {
   7563     {
   7564       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 0)];
   7565       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 8)];
   7566       uint r0_3 = vout[gmem_l_idx + ((1 << 4) * 16)];
   7567       uint r0_4 = vout[gmem_l_idx + ((1 << 4) * 24)];
   7568       uint r0_5 = vout[gmem_l_idx + ((1 << 4) * 32)];
   7569       uint r0_6 = vout[gmem_l_idx + ((1 << 4) * 40)];
   7570       uint r0_7 = vout[gmem_l_idx + ((1 << 4) * 48)];
   7571       uint r0_8 = vout[gmem_l_idx + ((1 << 4) * 56)];
   7572       {
   7573         uint const t = min(r0_1, r0_5);
   7574         r0_5 = max(r0_1, r0_5);
   7575         r0_1 = t;
   7576       };
   7577       {
   7578         uint const t = min(r0_3, r0_7);
   7579         r0_7 = max(r0_3, r0_7);
   7580         r0_3 = t;
   7581       };
   7582       {
   7583         uint const t = min(r0_1, r0_3);
   7584         r0_3 = max(r0_1, r0_3);
   7585         r0_1 = t;
   7586       };
   7587       {
   7588         uint const t = min(r0_5, r0_7);
   7589         r0_7 = max(r0_5, r0_7);
   7590         r0_5 = t;
   7591       };
   7592       {
   7593         uint const t = min(r0_2, r0_6);
   7594         r0_6 = max(r0_2, r0_6);
   7595         r0_2 = t;
   7596       };
   7597       {
   7598         uint const t = min(r0_4, r0_8);
   7599         r0_8 = max(r0_4, r0_8);
   7600         r0_4 = t;
   7601       };
   7602       {
   7603         uint const t = min(r0_2, r0_4);
   7604         r0_4 = max(r0_2, r0_4);
   7605         r0_2 = t;
   7606       };
   7607       {
   7608         uint const t = min(r0_6, r0_8);
   7609         r0_8 = max(r0_6, r0_8);
   7610         r0_6 = t;
   7611       };
   7612       {
   7613         uint const t = min(r0_1, r0_2);
   7614         r0_2 = max(r0_1, r0_2);
   7615         r0_1 = t;
   7616       };
   7617       {
   7618         uint const t = min(r0_3, r0_4);
   7619         r0_4 = max(r0_3, r0_4);
   7620         r0_3 = t;
   7621       };
   7622       {
   7623         uint const t = min(r0_5, r0_6);
   7624         r0_6 = max(r0_5, r0_6);
   7625         r0_5 = t;
   7626       };
   7627       {
   7628         uint const t = min(r0_7, r0_8);
   7629         r0_8 = max(r0_7, r0_8);
   7630         r0_7 = t;
   7631       };
   7632       shared.m[smem_l_idx + (0)] = r0_1;
   7633       shared.m[smem_l_idx + (16)] = r0_2;
   7634       shared.m[smem_l_idx + (32)] = r0_3;
   7635       shared.m[smem_l_idx + (48)] = r0_4;
   7636       shared.m[smem_l_idx + (64)] = r0_5;
   7637       shared.m[smem_l_idx + (80)] = r0_6;
   7638       shared.m[smem_l_idx + (96)] = r0_7;
   7639       shared.m[smem_l_idx + (112)] = r0_8;
   7640     }
   7641   }
   7642   barrier(CLK_LOCAL_MEM_FENCE);
   7643   uint r1 = shared.m[get_local_id(0) + (8 * (1 << 4) * 0)];
   7644   uint r2 = shared.m[get_local_id(0) + (8 * (1 << 4) * 1)];
   7645   uint r3 = shared.m[get_local_id(0) + (8 * (1 << 4) * 2)];
   7646   uint r4 = shared.m[get_local_id(0) + (8 * (1 << 4) * 3)];
   7647   uint r5 = shared.m[get_local_id(0) + (8 * (1 << 4) * 4)];
   7648   uint r6 = shared.m[get_local_id(0) + (8 * (1 << 4) * 5)];
   7649   uint r7 = shared.m[get_local_id(0) + (8 * (1 << 4) * 6)];
   7650   uint r8 = shared.m[get_local_id(0) + (8 * (1 << 4) * 7)];
   7651   {
   7652     {
   7653       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   7654       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7655       ;
   7656       {
   7657         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7658         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7659       };
   7660       {
   7661         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7662         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7663       };
   7664       {
   7665         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7666         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7667       };
   7668       {
   7669         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7670         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7671       };
   7672       {
   7673         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7674         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7675       };
   7676       {
   7677         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7678         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7679       };
   7680       {
   7681         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7682         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7683       };
   7684       {
   7685         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7686         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7687       };
   7688     }
   7689     {
   7690       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   7691       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7692       ;
   7693       {
   7694         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7695         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7696       };
   7697       {
   7698         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7699         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7700       };
   7701       {
   7702         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7703         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7704       };
   7705       {
   7706         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7707         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7708       };
   7709       {
   7710         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7711         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7712       };
   7713       {
   7714         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7715         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7716       };
   7717       {
   7718         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7719         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7720       };
   7721       {
   7722         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7723         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7724       };
   7725     }
   7726     {
   7727       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   7728       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7729       ;
   7730       {
   7731         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7732         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7733       };
   7734       {
   7735         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7736         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7737       };
   7738       {
   7739         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7740         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7741       };
   7742       {
   7743         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7744         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7745       };
   7746       {
   7747         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7748         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7749       };
   7750       {
   7751         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7752         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7753       };
   7754       {
   7755         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7756         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7757       };
   7758       {
   7759         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7760         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7761       };
   7762     }
   7763     {
   7764       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   7765       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   7766       ;
   7767       {
   7768         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   7769         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   7770       };
   7771       {
   7772         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   7773         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   7774       };
   7775       {
   7776         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   7777         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   7778       };
   7779       {
   7780         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   7781         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   7782       };
   7783       {
   7784         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   7785         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   7786       };
   7787       {
   7788         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   7789         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   7790       };
   7791       {
   7792         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   7793         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   7794       };
   7795       {
   7796         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   7797         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   7798       };
   7799     }
   7800     {
   7801       uint const t = min(r1, r5);
   7802       r5 = max(r1, r5);
   7803       r1 = t;
   7804     };
   7805     {
   7806       uint const t = min(r3, r7);
   7807       r7 = max(r3, r7);
   7808       r3 = t;
   7809     };
   7810     {
   7811       uint const t = min(r1, r3);
   7812       r3 = max(r1, r3);
   7813       r1 = t;
   7814     };
   7815     {
   7816       uint const t = min(r5, r7);
   7817       r7 = max(r5, r7);
   7818       r5 = t;
   7819     };
   7820     {
   7821       uint const t = min(r2, r6);
   7822       r6 = max(r2, r6);
   7823       r2 = t;
   7824     };
   7825     {
   7826       uint const t = min(r4, r8);
   7827       r8 = max(r4, r8);
   7828       r4 = t;
   7829     };
   7830     {
   7831       uint const t = min(r2, r4);
   7832       r4 = max(r2, r4);
   7833       r2 = t;
   7834     };
   7835     {
   7836       uint const t = min(r6, r8);
   7837       r8 = max(r6, r8);
   7838       r6 = t;
   7839     };
   7840     {
   7841       uint const t = min(r1, r2);
   7842       r2 = max(r1, r2);
   7843       r1 = t;
   7844     };
   7845     {
   7846       uint const t = min(r3, r4);
   7847       r4 = max(r3, r4);
   7848       r3 = t;
   7849     };
   7850     {
   7851       uint const t = min(r5, r6);
   7852       r6 = max(r5, r6);
   7853       r5 = t;
   7854     };
   7855     {
   7856       uint const t = min(r7, r8);
   7857       r8 = max(r7, r8);
   7858       r7 = t;
   7859     };
   7860   }
   7861   vout[gmem_idx + (1 << 4) * 0] = r1;
   7862   vout[gmem_idx + (1 << 4) * 1] = r2;
   7863   vout[gmem_idx + (1 << 4) * 2] = r3;
   7864   vout[gmem_idx + (1 << 4) * 3] = r4;
   7865   vout[gmem_idx + (1 << 4) * 4] = r5;
   7866   vout[gmem_idx + (1 << 4) * 5] = r6;
   7867   vout[gmem_idx + (1 << 4) * 6] = r7;
   7868   vout[gmem_idx + (1 << 4) * 7] = r8;
   7869 }
   7870 
   7871 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4))))
   7872 __attribute__((reqd_work_group_size((1 << 4) * 16, 1, 1))) void
   7873 hs_kernel_bc_4(__global uint* const restrict vout)
   7874 {
   7875   __local struct
   7876   {
   7877     uint m[256 * 8];
   7878   } shared;
   7879 
   7880   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   7881                         (get_local_id(0) & ((1 << 4) - 1));
   7882   uint const gmem_l_idx =
   7883     (get_global_id(0) & ~((1 << 4) * 16 - 1)) * 8 + get_local_id(0);
   7884   uint const smem_l_idx =
   7885     get_sub_group_id() * ((1 << 4) * 16) + get_sub_group_local_id();
   7886   if (get_sub_group_id() < 8) {
   7887     {
   7888       uint r0_1 = vout[gmem_l_idx + ((1 << 4) * 0)];
   7889       uint r0_2 = vout[gmem_l_idx + ((1 << 4) * 8)];
   7890       uint r0_3 = vout[gmem_l_idx + ((1 << 4) * 16)];
   7891       uint r0_4 = vout[gmem_l_idx + ((1 << 4) * 24)];
   7892       uint r0_5 = vout[gmem_l_idx + ((1 << 4) * 32)];
   7893       uint r0_6 = vout[gmem_l_idx + ((1 << 4) * 40)];
   7894       uint r0_7 = vout[gmem_l_idx + ((1 << 4) * 48)];
   7895       uint r0_8 = vout[gmem_l_idx + ((1 << 4) * 56)];
   7896       uint r0_9 = vout[gmem_l_idx + ((1 << 4) * 64)];
   7897       uint r0_10 = vout[gmem_l_idx + ((1 << 4) * 72)];
   7898       uint r0_11 = vout[gmem_l_idx + ((1 << 4) * 80)];
   7899       uint r0_12 = vout[gmem_l_idx + ((1 << 4) * 88)];
   7900       uint r0_13 = vout[gmem_l_idx + ((1 << 4) * 96)];
   7901       uint r0_14 = vout[gmem_l_idx + ((1 << 4) * 104)];
   7902       uint r0_15 = vout[gmem_l_idx + ((1 << 4) * 112)];
   7903       uint r0_16 = vout[gmem_l_idx + ((1 << 4) * 120)];
   7904       {
   7905         uint const t = min(r0_1, r0_9);
   7906         r0_9 = max(r0_1, r0_9);
   7907         r0_1 = t;
   7908       };
   7909       {
   7910         uint const t = min(r0_5, r0_13);
   7911         r0_13 = max(r0_5, r0_13);
   7912         r0_5 = t;
   7913       };
   7914       {
   7915         uint const t = min(r0_1, r0_5);
   7916         r0_5 = max(r0_1, r0_5);
   7917         r0_1 = t;
   7918       };
   7919       {
   7920         uint const t = min(r0_9, r0_13);
   7921         r0_13 = max(r0_9, r0_13);
   7922         r0_9 = t;
   7923       };
   7924       {
   7925         uint const t = min(r0_3, r0_11);
   7926         r0_11 = max(r0_3, r0_11);
   7927         r0_3 = t;
   7928       };
   7929       {
   7930         uint const t = min(r0_7, r0_15);
   7931         r0_15 = max(r0_7, r0_15);
   7932         r0_7 = t;
   7933       };
   7934       {
   7935         uint const t = min(r0_3, r0_7);
   7936         r0_7 = max(r0_3, r0_7);
   7937         r0_3 = t;
   7938       };
   7939       {
   7940         uint const t = min(r0_11, r0_15);
   7941         r0_15 = max(r0_11, r0_15);
   7942         r0_11 = t;
   7943       };
   7944       {
   7945         uint const t = min(r0_1, r0_3);
   7946         r0_3 = max(r0_1, r0_3);
   7947         r0_1 = t;
   7948       };
   7949       {
   7950         uint const t = min(r0_5, r0_7);
   7951         r0_7 = max(r0_5, r0_7);
   7952         r0_5 = t;
   7953       };
   7954       {
   7955         uint const t = min(r0_9, r0_11);
   7956         r0_11 = max(r0_9, r0_11);
   7957         r0_9 = t;
   7958       };
   7959       {
   7960         uint const t = min(r0_13, r0_15);
   7961         r0_15 = max(r0_13, r0_15);
   7962         r0_13 = t;
   7963       };
   7964       {
   7965         uint const t = min(r0_2, r0_10);
   7966         r0_10 = max(r0_2, r0_10);
   7967         r0_2 = t;
   7968       };
   7969       {
   7970         uint const t = min(r0_6, r0_14);
   7971         r0_14 = max(r0_6, r0_14);
   7972         r0_6 = t;
   7973       };
   7974       {
   7975         uint const t = min(r0_2, r0_6);
   7976         r0_6 = max(r0_2, r0_6);
   7977         r0_2 = t;
   7978       };
   7979       {
   7980         uint const t = min(r0_10, r0_14);
   7981         r0_14 = max(r0_10, r0_14);
   7982         r0_10 = t;
   7983       };
   7984       {
   7985         uint const t = min(r0_4, r0_12);
   7986         r0_12 = max(r0_4, r0_12);
   7987         r0_4 = t;
   7988       };
   7989       {
   7990         uint const t = min(r0_8, r0_16);
   7991         r0_16 = max(r0_8, r0_16);
   7992         r0_8 = t;
   7993       };
   7994       {
   7995         uint const t = min(r0_4, r0_8);
   7996         r0_8 = max(r0_4, r0_8);
   7997         r0_4 = t;
   7998       };
   7999       {
   8000         uint const t = min(r0_12, r0_16);
   8001         r0_16 = max(r0_12, r0_16);
   8002         r0_12 = t;
   8003       };
   8004       {
   8005         uint const t = min(r0_2, r0_4);
   8006         r0_4 = max(r0_2, r0_4);
   8007         r0_2 = t;
   8008       };
   8009       {
   8010         uint const t = min(r0_6, r0_8);
   8011         r0_8 = max(r0_6, r0_8);
   8012         r0_6 = t;
   8013       };
   8014       {
   8015         uint const t = min(r0_10, r0_12);
   8016         r0_12 = max(r0_10, r0_12);
   8017         r0_10 = t;
   8018       };
   8019       {
   8020         uint const t = min(r0_14, r0_16);
   8021         r0_16 = max(r0_14, r0_16);
   8022         r0_14 = t;
   8023       };
   8024       {
   8025         uint const t = min(r0_1, r0_2);
   8026         r0_2 = max(r0_1, r0_2);
   8027         r0_1 = t;
   8028       };
   8029       {
   8030         uint const t = min(r0_3, r0_4);
   8031         r0_4 = max(r0_3, r0_4);
   8032         r0_3 = t;
   8033       };
   8034       {
   8035         uint const t = min(r0_5, r0_6);
   8036         r0_6 = max(r0_5, r0_6);
   8037         r0_5 = t;
   8038       };
   8039       {
   8040         uint const t = min(r0_7, r0_8);
   8041         r0_8 = max(r0_7, r0_8);
   8042         r0_7 = t;
   8043       };
   8044       {
   8045         uint const t = min(r0_9, r0_10);
   8046         r0_10 = max(r0_9, r0_10);
   8047         r0_9 = t;
   8048       };
   8049       {
   8050         uint const t = min(r0_11, r0_12);
   8051         r0_12 = max(r0_11, r0_12);
   8052         r0_11 = t;
   8053       };
   8054       {
   8055         uint const t = min(r0_13, r0_14);
   8056         r0_14 = max(r0_13, r0_14);
   8057         r0_13 = t;
   8058       };
   8059       {
   8060         uint const t = min(r0_15, r0_16);
   8061         r0_16 = max(r0_15, r0_16);
   8062         r0_15 = t;
   8063       };
   8064       shared.m[smem_l_idx + (0)] = r0_1;
   8065       shared.m[smem_l_idx + (16)] = r0_2;
   8066       shared.m[smem_l_idx + (32)] = r0_3;
   8067       shared.m[smem_l_idx + (48)] = r0_4;
   8068       shared.m[smem_l_idx + (64)] = r0_5;
   8069       shared.m[smem_l_idx + (80)] = r0_6;
   8070       shared.m[smem_l_idx + (96)] = r0_7;
   8071       shared.m[smem_l_idx + (112)] = r0_8;
   8072       shared.m[smem_l_idx + (128)] = r0_9;
   8073       shared.m[smem_l_idx + (144)] = r0_10;
   8074       shared.m[smem_l_idx + (160)] = r0_11;
   8075       shared.m[smem_l_idx + (176)] = r0_12;
   8076       shared.m[smem_l_idx + (192)] = r0_13;
   8077       shared.m[smem_l_idx + (208)] = r0_14;
   8078       shared.m[smem_l_idx + (224)] = r0_15;
   8079       shared.m[smem_l_idx + (240)] = r0_16;
   8080     }
   8081   }
   8082   barrier(CLK_LOCAL_MEM_FENCE);
   8083   uint r1 = shared.m[get_local_id(0) + (16 * (1 << 4) * 0)];
   8084   uint r2 = shared.m[get_local_id(0) + (16 * (1 << 4) * 1)];
   8085   uint r3 = shared.m[get_local_id(0) + (16 * (1 << 4) * 2)];
   8086   uint r4 = shared.m[get_local_id(0) + (16 * (1 << 4) * 3)];
   8087   uint r5 = shared.m[get_local_id(0) + (16 * (1 << 4) * 4)];
   8088   uint r6 = shared.m[get_local_id(0) + (16 * (1 << 4) * 5)];
   8089   uint r7 = shared.m[get_local_id(0) + (16 * (1 << 4) * 6)];
   8090   uint r8 = shared.m[get_local_id(0) + (16 * (1 << 4) * 7)];
   8091   {
   8092     {
   8093       uint const half_lane_idx = get_sub_group_local_id() ^ 8;
   8094       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   8095       ;
   8096       {
   8097         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   8098         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   8099       };
   8100       {
   8101         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   8102         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   8103       };
   8104       {
   8105         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   8106         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   8107       };
   8108       {
   8109         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   8110         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   8111       };
   8112       {
   8113         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   8114         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   8115       };
   8116       {
   8117         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   8118         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   8119       };
   8120       {
   8121         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   8122         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   8123       };
   8124       {
   8125         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   8126         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   8127       };
   8128     }
   8129     {
   8130       uint const half_lane_idx = get_sub_group_local_id() ^ 4;
   8131       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   8132       ;
   8133       {
   8134         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   8135         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   8136       };
   8137       {
   8138         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   8139         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   8140       };
   8141       {
   8142         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   8143         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   8144       };
   8145       {
   8146         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   8147         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   8148       };
   8149       {
   8150         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   8151         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   8152       };
   8153       {
   8154         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   8155         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   8156       };
   8157       {
   8158         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   8159         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   8160       };
   8161       {
   8162         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   8163         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   8164       };
   8165     }
   8166     {
   8167       uint const half_lane_idx = get_sub_group_local_id() ^ 2;
   8168       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   8169       ;
   8170       {
   8171         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   8172         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   8173       };
   8174       {
   8175         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   8176         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   8177       };
   8178       {
   8179         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   8180         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   8181       };
   8182       {
   8183         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   8184         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   8185       };
   8186       {
   8187         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   8188         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   8189       };
   8190       {
   8191         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   8192         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   8193       };
   8194       {
   8195         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   8196         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   8197       };
   8198       {
   8199         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   8200         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   8201       };
   8202     }
   8203     {
   8204       uint const half_lane_idx = get_sub_group_local_id() ^ 1;
   8205       int const t_lt = get_sub_group_local_id() < half_lane_idx;
   8206       ;
   8207       {
   8208         uint const ta = intel_sub_group_shuffle(r1, half_lane_idx);
   8209         r1 = ((r1 <= ta) ^ t_lt) ? ta : r1;
   8210       };
   8211       {
   8212         uint const ta = intel_sub_group_shuffle(r2, half_lane_idx);
   8213         r2 = ((r2 <= ta) ^ t_lt) ? ta : r2;
   8214       };
   8215       {
   8216         uint const ta = intel_sub_group_shuffle(r3, half_lane_idx);
   8217         r3 = ((r3 <= ta) ^ t_lt) ? ta : r3;
   8218       };
   8219       {
   8220         uint const ta = intel_sub_group_shuffle(r4, half_lane_idx);
   8221         r4 = ((r4 <= ta) ^ t_lt) ? ta : r4;
   8222       };
   8223       {
   8224         uint const ta = intel_sub_group_shuffle(r5, half_lane_idx);
   8225         r5 = ((r5 <= ta) ^ t_lt) ? ta : r5;
   8226       };
   8227       {
   8228         uint const ta = intel_sub_group_shuffle(r6, half_lane_idx);
   8229         r6 = ((r6 <= ta) ^ t_lt) ? ta : r6;
   8230       };
   8231       {
   8232         uint const ta = intel_sub_group_shuffle(r7, half_lane_idx);
   8233         r7 = ((r7 <= ta) ^ t_lt) ? ta : r7;
   8234       };
   8235       {
   8236         uint const ta = intel_sub_group_shuffle(r8, half_lane_idx);
   8237         r8 = ((r8 <= ta) ^ t_lt) ? ta : r8;
   8238       };
   8239     }
   8240     {
   8241       uint const t = min(r1, r5);
   8242       r5 = max(r1, r5);
   8243       r1 = t;
   8244     };
   8245     {
   8246       uint const t = min(r3, r7);
   8247       r7 = max(r3, r7);
   8248       r3 = t;
   8249     };
   8250     {
   8251       uint const t = min(r1, r3);
   8252       r3 = max(r1, r3);
   8253       r1 = t;
   8254     };
   8255     {
   8256       uint const t = min(r5, r7);
   8257       r7 = max(r5, r7);
   8258       r5 = t;
   8259     };
   8260     {
   8261       uint const t = min(r2, r6);
   8262       r6 = max(r2, r6);
   8263       r2 = t;
   8264     };
   8265     {
   8266       uint const t = min(r4, r8);
   8267       r8 = max(r4, r8);
   8268       r4 = t;
   8269     };
   8270     {
   8271       uint const t = min(r2, r4);
   8272       r4 = max(r2, r4);
   8273       r2 = t;
   8274     };
   8275     {
   8276       uint const t = min(r6, r8);
   8277       r8 = max(r6, r8);
   8278       r6 = t;
   8279     };
   8280     {
   8281       uint const t = min(r1, r2);
   8282       r2 = max(r1, r2);
   8283       r1 = t;
   8284     };
   8285     {
   8286       uint const t = min(r3, r4);
   8287       r4 = max(r3, r4);
   8288       r3 = t;
   8289     };
   8290     {
   8291       uint const t = min(r5, r6);
   8292       r6 = max(r5, r6);
   8293       r5 = t;
   8294     };
   8295     {
   8296       uint const t = min(r7, r8);
   8297       r8 = max(r7, r8);
   8298       r7 = t;
   8299     };
   8300   }
   8301   vout[gmem_idx + (1 << 4) * 0] = r1;
   8302   vout[gmem_idx + (1 << 4) * 1] = r2;
   8303   vout[gmem_idx + (1 << 4) * 2] = r3;
   8304   vout[gmem_idx + (1 << 4) * 3] = r4;
   8305   vout[gmem_idx + (1 << 4) * 4] = r5;
   8306   vout[gmem_idx + (1 << 4) * 5] = r6;
   8307   vout[gmem_idx + (1 << 4) * 6] = r7;
   8308   vout[gmem_idx + (1 << 4) * 7] = r8;
   8309 }
   8310 
   8311 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
   8312 hs_kernel_fm_0_0(__global uint* const restrict vout)
   8313 {
   8314   uint const span_idx = get_global_id(1);
   8315   uint const span_stride = get_global_size(0);
   8316   uint const span_size = span_stride * 8 * 2;
   8317   uint const span_base = span_idx * span_size;
   8318   uint const span_off = get_global_id(0);
   8319   uint const span_l = span_base + span_off;
   8320   uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1;
   8321   uint r1 = vout[span_l + span_stride * 0];
   8322   uint r2 = vout[span_l + span_stride * 1];
   8323   uint r3 = vout[span_l + span_stride * 2];
   8324   uint r4 = vout[span_l + span_stride * 3];
   8325   uint r5 = vout[span_l + span_stride * 4];
   8326   uint r6 = vout[span_l + span_stride * 5];
   8327   uint r7 = vout[span_l + span_stride * 6];
   8328   uint r8 = vout[span_l + span_stride * 7];
   8329   uint r9 = vout[span_r + span_stride * 0];
   8330   {
   8331     uint const t = min(r8, r9);
   8332     r9 = max(r8, r9);
   8333     r8 = t;
   8334   };
   8335   {
   8336     uint const t = min(r1, r5);
   8337     r5 = max(r1, r5);
   8338     r1 = t;
   8339   };
   8340   {
   8341     uint const t = min(r3, r7);
   8342     r7 = max(r3, r7);
   8343     r3 = t;
   8344   };
   8345   {
   8346     uint const t = min(r1, r3);
   8347     r3 = max(r1, r3);
   8348     r1 = t;
   8349   };
   8350   {
   8351     uint const t = min(r5, r7);
   8352     r7 = max(r5, r7);
   8353     r5 = t;
   8354   };
   8355   {
   8356     uint const t = min(r2, r6);
   8357     r6 = max(r2, r6);
   8358     r2 = t;
   8359   };
   8360   {
   8361     uint const t = min(r4, r8);
   8362     r8 = max(r4, r8);
   8363     r4 = t;
   8364   };
   8365   {
   8366     uint const t = min(r2, r4);
   8367     r4 = max(r2, r4);
   8368     r2 = t;
   8369   };
   8370   {
   8371     uint const t = min(r6, r8);
   8372     r8 = max(r6, r8);
   8373     r6 = t;
   8374   };
   8375   {
   8376     uint const t = min(r1, r2);
   8377     r2 = max(r1, r2);
   8378     r1 = t;
   8379   };
   8380   {
   8381     uint const t = min(r3, r4);
   8382     r4 = max(r3, r4);
   8383     r3 = t;
   8384   };
   8385   {
   8386     uint const t = min(r5, r6);
   8387     r6 = max(r5, r6);
   8388     r5 = t;
   8389   };
   8390   {
   8391     uint const t = min(r7, r8);
   8392     r8 = max(r7, r8);
   8393     r7 = t;
   8394   };
   8395   vout[span_l + span_stride * 0] = r1;
   8396   vout[span_l + span_stride * 1] = r2;
   8397   vout[span_l + span_stride * 2] = r3;
   8398   vout[span_l + span_stride * 3] = r4;
   8399   vout[span_l + span_stride * 4] = r5;
   8400   vout[span_l + span_stride * 5] = r6;
   8401   vout[span_l + span_stride * 6] = r7;
   8402   vout[span_l + span_stride * 7] = r8;
   8403   vout[span_r + span_stride * 0] = r9;
   8404 }
   8405 
   8406 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
   8407 hs_kernel_fm_0_1(__global uint* const restrict vout)
   8408 {
   8409   uint const span_idx = get_global_id(1);
   8410   uint const span_stride = get_global_size(0);
   8411   uint const span_size = span_stride * 8 * 2;
   8412   uint const span_base = span_idx * span_size;
   8413   uint const span_off = get_global_id(0);
   8414   uint const span_l = span_base + span_off;
   8415   uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1;
   8416   uint r1 = vout[span_l + span_stride * 0];
   8417   uint r2 = vout[span_l + span_stride * 1];
   8418   uint r3 = vout[span_l + span_stride * 2];
   8419   uint r4 = vout[span_l + span_stride * 3];
   8420   uint r5 = vout[span_l + span_stride * 4];
   8421   uint r6 = vout[span_l + span_stride * 5];
   8422   uint r7 = vout[span_l + span_stride * 6];
   8423   uint r8 = vout[span_l + span_stride * 7];
   8424   uint r9 = vout[span_r + span_stride * 0];
   8425   uint r10 = vout[span_r + span_stride * 1];
   8426   {
   8427     uint const t = min(r8, r9);
   8428     r9 = max(r8, r9);
   8429     r8 = t;
   8430   };
   8431   {
   8432     uint const t = min(r7, r10);
   8433     r10 = max(r7, r10);
   8434     r7 = t;
   8435   };
   8436   {
   8437     uint const t = min(r1, r5);
   8438     r5 = max(r1, r5);
   8439     r1 = t;
   8440   };
   8441   {
   8442     uint const t = min(r3, r7);
   8443     r7 = max(r3, r7);
   8444     r3 = t;
   8445   };
   8446   {
   8447     uint const t = min(r1, r3);
   8448     r3 = max(r1, r3);
   8449     r1 = t;
   8450   };
   8451   {
   8452     uint const t = min(r5, r7);
   8453     r7 = max(r5, r7);
   8454     r5 = t;
   8455   };
   8456   {
   8457     uint const t = min(r2, r6);
   8458     r6 = max(r2, r6);
   8459     r2 = t;
   8460   };
   8461   {
   8462     uint const t = min(r4, r8);
   8463     r8 = max(r4, r8);
   8464     r4 = t;
   8465   };
   8466   {
   8467     uint const t = min(r2, r4);
   8468     r4 = max(r2, r4);
   8469     r2 = t;
   8470   };
   8471   {
   8472     uint const t = min(r6, r8);
   8473     r8 = max(r6, r8);
   8474     r6 = t;
   8475   };
   8476   {
   8477     uint const t = min(r1, r2);
   8478     r2 = max(r1, r2);
   8479     r1 = t;
   8480   };
   8481   {
   8482     uint const t = min(r3, r4);
   8483     r4 = max(r3, r4);
   8484     r3 = t;
   8485   };
   8486   {
   8487     uint const t = min(r5, r6);
   8488     r6 = max(r5, r6);
   8489     r5 = t;
   8490   };
   8491   {
   8492     uint const t = min(r7, r8);
   8493     r8 = max(r7, r8);
   8494     r7 = t;
   8495   };
   8496   {
   8497     uint const t = min(r9, r10);
   8498     r10 = max(r9, r10);
   8499     r9 = t;
   8500   };
   8501   vout[span_l + span_stride * 0] = r1;
   8502   vout[span_l + span_stride * 1] = r2;
   8503   vout[span_l + span_stride * 2] = r3;
   8504   vout[span_l + span_stride * 3] = r4;
   8505   vout[span_l + span_stride * 4] = r5;
   8506   vout[span_l + span_stride * 5] = r6;
   8507   vout[span_l + span_stride * 6] = r7;
   8508   vout[span_l + span_stride * 7] = r8;
   8509   vout[span_r + span_stride * 0] = r9;
   8510   vout[span_r + span_stride * 1] = r10;
   8511 }
   8512 
   8513 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
   8514 hs_kernel_fm_0_2(__global uint* const restrict vout)
   8515 {
   8516   uint const span_idx = get_global_id(1);
   8517   uint const span_stride = get_global_size(0);
   8518   uint const span_size = span_stride * 8 * 2;
   8519   uint const span_base = span_idx * span_size;
   8520   uint const span_off = get_global_id(0);
   8521   uint const span_l = span_base + span_off;
   8522   uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1;
   8523   uint r1 = vout[span_l + span_stride * 0];
   8524   uint r2 = vout[span_l + span_stride * 1];
   8525   uint r3 = vout[span_l + span_stride * 2];
   8526   uint r4 = vout[span_l + span_stride * 3];
   8527   uint r5 = vout[span_l + span_stride * 4];
   8528   uint r6 = vout[span_l + span_stride * 5];
   8529   uint r7 = vout[span_l + span_stride * 6];
   8530   uint r8 = vout[span_l + span_stride * 7];
   8531   uint r9 = vout[span_r + span_stride * 0];
   8532   uint r10 = vout[span_r + span_stride * 1];
   8533   uint r11 = vout[span_r + span_stride * 2];
   8534   uint r12 = vout[span_r + span_stride * 3];
   8535   {
   8536     uint const t = min(r8, r9);
   8537     r9 = max(r8, r9);
   8538     r8 = t;
   8539   };
   8540   {
   8541     uint const t = min(r7, r10);
   8542     r10 = max(r7, r10);
   8543     r7 = t;
   8544   };
   8545   {
   8546     uint const t = min(r6, r11);
   8547     r11 = max(r6, r11);
   8548     r6 = t;
   8549   };
   8550   {
   8551     uint const t = min(r5, r12);
   8552     r12 = max(r5, r12);
   8553     r5 = t;
   8554   };
   8555   {
   8556     uint const t = min(r1, r5);
   8557     r5 = max(r1, r5);
   8558     r1 = t;
   8559   };
   8560   {
   8561     uint const t = min(r3, r7);
   8562     r7 = max(r3, r7);
   8563     r3 = t;
   8564   };
   8565   {
   8566     uint const t = min(r1, r3);
   8567     r3 = max(r1, r3);
   8568     r1 = t;
   8569   };
   8570   {
   8571     uint const t = min(r5, r7);
   8572     r7 = max(r5, r7);
   8573     r5 = t;
   8574   };
   8575   {
   8576     uint const t = min(r2, r6);
   8577     r6 = max(r2, r6);
   8578     r2 = t;
   8579   };
   8580   {
   8581     uint const t = min(r4, r8);
   8582     r8 = max(r4, r8);
   8583     r4 = t;
   8584   };
   8585   {
   8586     uint const t = min(r2, r4);
   8587     r4 = max(r2, r4);
   8588     r2 = t;
   8589   };
   8590   {
   8591     uint const t = min(r6, r8);
   8592     r8 = max(r6, r8);
   8593     r6 = t;
   8594   };
   8595   {
   8596     uint const t = min(r1, r2);
   8597     r2 = max(r1, r2);
   8598     r1 = t;
   8599   };
   8600   {
   8601     uint const t = min(r3, r4);
   8602     r4 = max(r3, r4);
   8603     r3 = t;
   8604   };
   8605   {
   8606     uint const t = min(r5, r6);
   8607     r6 = max(r5, r6);
   8608     r5 = t;
   8609   };
   8610   {
   8611     uint const t = min(r7, r8);
   8612     r8 = max(r7, r8);
   8613     r7 = t;
   8614   };
   8615   {
   8616     uint const t = min(r9, r11);
   8617     r11 = max(r9, r11);
   8618     r9 = t;
   8619   };
   8620   {
   8621     uint const t = min(r10, r12);
   8622     r12 = max(r10, r12);
   8623     r10 = t;
   8624   };
   8625   {
   8626     uint const t = min(r9, r10);
   8627     r10 = max(r9, r10);
   8628     r9 = t;
   8629   };
   8630   {
   8631     uint const t = min(r11, r12);
   8632     r12 = max(r11, r12);
   8633     r11 = t;
   8634   };
   8635   vout[span_l + span_stride * 0] = r1;
   8636   vout[span_l + span_stride * 1] = r2;
   8637   vout[span_l + span_stride * 2] = r3;
   8638   vout[span_l + span_stride * 3] = r4;
   8639   vout[span_l + span_stride * 4] = r5;
   8640   vout[span_l + span_stride * 5] = r6;
   8641   vout[span_l + span_stride * 6] = r7;
   8642   vout[span_l + span_stride * 7] = r8;
   8643   vout[span_r + span_stride * 0] = r9;
   8644   vout[span_r + span_stride * 1] = r10;
   8645   vout[span_r + span_stride * 2] = r11;
   8646   vout[span_r + span_stride * 3] = r12;
   8647 }
   8648 
   8649 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
   8650 hs_kernel_fm_0_3(__global uint* const restrict vout)
   8651 {
   8652   uint const span_idx = get_global_id(1);
   8653   uint const span_stride = get_global_size(0);
   8654   uint const span_size = span_stride * 8 * 2;
   8655   uint const span_base = span_idx * span_size;
   8656   uint const span_off = get_global_id(0);
   8657   uint const span_l = span_base + span_off;
   8658   uint const span_r = span_base + span_stride * (8 + 1) - span_off - 1;
   8659   uint r1 = vout[span_l + span_stride * 0];
   8660   uint r2 = vout[span_l + span_stride * 1];
   8661   uint r3 = vout[span_l + span_stride * 2];
   8662   uint r4 = vout[span_l + span_stride * 3];
   8663   uint r5 = vout[span_l + span_stride * 4];
   8664   uint r6 = vout[span_l + span_stride * 5];
   8665   uint r7 = vout[span_l + span_stride * 6];
   8666   uint r8 = vout[span_l + span_stride * 7];
   8667   uint r9 = vout[span_r + span_stride * 0];
   8668   uint r10 = vout[span_r + span_stride * 1];
   8669   uint r11 = vout[span_r + span_stride * 2];
   8670   uint r12 = vout[span_r + span_stride * 3];
   8671   uint r13 = vout[span_r + span_stride * 4];
   8672   uint r14 = vout[span_r + span_stride * 5];
   8673   uint r15 = vout[span_r + span_stride * 6];
   8674   uint r16 = vout[span_r + span_stride * 7];
   8675   {
   8676     uint const t = min(r8, r9);
   8677     r9 = max(r8, r9);
   8678     r8 = t;
   8679   };
   8680   {
   8681     uint const t = min(r7, r10);
   8682     r10 = max(r7, r10);
   8683     r7 = t;
   8684   };
   8685   {
   8686     uint const t = min(r6, r11);
   8687     r11 = max(r6, r11);
   8688     r6 = t;
   8689   };
   8690   {
   8691     uint const t = min(r5, r12);
   8692     r12 = max(r5, r12);
   8693     r5 = t;
   8694   };
   8695   {
   8696     uint const t = min(r4, r13);
   8697     r13 = max(r4, r13);
   8698     r4 = t;
   8699   };
   8700   {
   8701     uint const t = min(r3, r14);
   8702     r14 = max(r3, r14);
   8703     r3 = t;
   8704   };
   8705   {
   8706     uint const t = min(r2, r15);
   8707     r15 = max(r2, r15);
   8708     r2 = t;
   8709   };
   8710   {
   8711     uint const t = min(r1, r16);
   8712     r16 = max(r1, r16);
   8713     r1 = t;
   8714   };
   8715   {
   8716     uint const t = min(r1, r5);
   8717     r5 = max(r1, r5);
   8718     r1 = t;
   8719   };
   8720   {
   8721     uint const t = min(r3, r7);
   8722     r7 = max(r3, r7);
   8723     r3 = t;
   8724   };
   8725   {
   8726     uint const t = min(r1, r3);
   8727     r3 = max(r1, r3);
   8728     r1 = t;
   8729   };
   8730   {
   8731     uint const t = min(r5, r7);
   8732     r7 = max(r5, r7);
   8733     r5 = t;
   8734   };
   8735   {
   8736     uint const t = min(r2, r6);
   8737     r6 = max(r2, r6);
   8738     r2 = t;
   8739   };
   8740   {
   8741     uint const t = min(r4, r8);
   8742     r8 = max(r4, r8);
   8743     r4 = t;
   8744   };
   8745   {
   8746     uint const t = min(r2, r4);
   8747     r4 = max(r2, r4);
   8748     r2 = t;
   8749   };
   8750   {
   8751     uint const t = min(r6, r8);
   8752     r8 = max(r6, r8);
   8753     r6 = t;
   8754   };
   8755   {
   8756     uint const t = min(r1, r2);
   8757     r2 = max(r1, r2);
   8758     r1 = t;
   8759   };
   8760   {
   8761     uint const t = min(r3, r4);
   8762     r4 = max(r3, r4);
   8763     r3 = t;
   8764   };
   8765   {
   8766     uint const t = min(r5, r6);
   8767     r6 = max(r5, r6);
   8768     r5 = t;
   8769   };
   8770   {
   8771     uint const t = min(r7, r8);
   8772     r8 = max(r7, r8);
   8773     r7 = t;
   8774   };
   8775   {
   8776     uint const t = min(r9, r13);
   8777     r13 = max(r9, r13);
   8778     r9 = t;
   8779   };
   8780   {
   8781     uint const t = min(r11, r15);
   8782     r15 = max(r11, r15);
   8783     r11 = t;
   8784   };
   8785   {
   8786     uint const t = min(r9, r11);
   8787     r11 = max(r9, r11);
   8788     r9 = t;
   8789   };
   8790   {
   8791     uint const t = min(r13, r15);
   8792     r15 = max(r13, r15);
   8793     r13 = t;
   8794   };
   8795   {
   8796     uint const t = min(r10, r14);
   8797     r14 = max(r10, r14);
   8798     r10 = t;
   8799   };
   8800   {
   8801     uint const t = min(r12, r16);
   8802     r16 = max(r12, r16);
   8803     r12 = t;
   8804   };
   8805   {
   8806     uint const t = min(r10, r12);
   8807     r12 = max(r10, r12);
   8808     r10 = t;
   8809   };
   8810   {
   8811     uint const t = min(r14, r16);
   8812     r16 = max(r14, r16);
   8813     r14 = t;
   8814   };
   8815   {
   8816     uint const t = min(r9, r10);
   8817     r10 = max(r9, r10);
   8818     r9 = t;
   8819   };
   8820   {
   8821     uint const t = min(r11, r12);
   8822     r12 = max(r11, r12);
   8823     r11 = t;
   8824   };
   8825   {
   8826     uint const t = min(r13, r14);
   8827     r14 = max(r13, r14);
   8828     r13 = t;
   8829   };
   8830   {
   8831     uint const t = min(r15, r16);
   8832     r16 = max(r15, r16);
   8833     r15 = t;
   8834   };
   8835   vout[span_l + span_stride * 0] = r1;
   8836   vout[span_l + span_stride * 1] = r2;
   8837   vout[span_l + span_stride * 2] = r3;
   8838   vout[span_l + span_stride * 3] = r4;
   8839   vout[span_l + span_stride * 4] = r5;
   8840   vout[span_l + span_stride * 5] = r6;
   8841   vout[span_l + span_stride * 6] = r7;
   8842   vout[span_l + span_stride * 7] = r8;
   8843   vout[span_r + span_stride * 0] = r9;
   8844   vout[span_r + span_stride * 1] = r10;
   8845   vout[span_r + span_stride * 2] = r11;
   8846   vout[span_r + span_stride * 3] = r12;
   8847   vout[span_r + span_stride * 4] = r13;
   8848   vout[span_r + span_stride * 5] = r14;
   8849   vout[span_r + span_stride * 6] = r15;
   8850   vout[span_r + span_stride * 7] = r16;
   8851 }
   8852 
   8853 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
   8854 hs_kernel_hm_0(__global uint* const restrict vout)
   8855 {
   8856   uint const span_idx = get_global_id(1);
   8857   uint const span_stride = get_global_size(0);
   8858   uint const span_size = span_stride * 8 * 2;
   8859   uint const span_base = span_idx * span_size;
   8860   uint const span_off = get_global_id(0);
   8861   uint const span_l = span_base + span_off;
   8862   uint r1 = vout[span_l + span_stride * 0];
   8863   uint r2 = vout[span_l + span_stride * 1];
   8864   uint r3 = vout[span_l + span_stride * 2];
   8865   uint r4 = vout[span_l + span_stride * 3];
   8866   uint r5 = vout[span_l + span_stride * 4];
   8867   uint r6 = vout[span_l + span_stride * 5];
   8868   uint r7 = vout[span_l + span_stride * 6];
   8869   uint r8 = vout[span_l + span_stride * 7];
   8870   uint r9 = vout[span_l + span_stride * 8];
   8871   uint r10 = vout[span_l + span_stride * 9];
   8872   uint r11 = vout[span_l + span_stride * 10];
   8873   uint r12 = vout[span_l + span_stride * 11];
   8874   uint r13 = vout[span_l + span_stride * 12];
   8875   uint r14 = vout[span_l + span_stride * 13];
   8876   uint r15 = vout[span_l + span_stride * 14];
   8877   uint r16 = vout[span_l + span_stride * 15];
   8878   {
   8879     uint const t = min(r1, r9);
   8880     r9 = max(r1, r9);
   8881     r1 = t;
   8882   };
   8883   {
   8884     uint const t = min(r5, r13);
   8885     r13 = max(r5, r13);
   8886     r5 = t;
   8887   };
   8888   {
   8889     uint const t = min(r1, r5);
   8890     r5 = max(r1, r5);
   8891     r1 = t;
   8892   };
   8893   {
   8894     uint const t = min(r9, r13);
   8895     r13 = max(r9, r13);
   8896     r9 = t;
   8897   };
   8898   {
   8899     uint const t = min(r3, r11);
   8900     r11 = max(r3, r11);
   8901     r3 = t;
   8902   };
   8903   {
   8904     uint const t = min(r7, r15);
   8905     r15 = max(r7, r15);
   8906     r7 = t;
   8907   };
   8908   {
   8909     uint const t = min(r3, r7);
   8910     r7 = max(r3, r7);
   8911     r3 = t;
   8912   };
   8913   {
   8914     uint const t = min(r11, r15);
   8915     r15 = max(r11, r15);
   8916     r11 = t;
   8917   };
   8918   {
   8919     uint const t = min(r1, r3);
   8920     r3 = max(r1, r3);
   8921     r1 = t;
   8922   };
   8923   {
   8924     uint const t = min(r5, r7);
   8925     r7 = max(r5, r7);
   8926     r5 = t;
   8927   };
   8928   {
   8929     uint const t = min(r9, r11);
   8930     r11 = max(r9, r11);
   8931     r9 = t;
   8932   };
   8933   {
   8934     uint const t = min(r13, r15);
   8935     r15 = max(r13, r15);
   8936     r13 = t;
   8937   };
   8938   {
   8939     uint const t = min(r2, r10);
   8940     r10 = max(r2, r10);
   8941     r2 = t;
   8942   };
   8943   {
   8944     uint const t = min(r6, r14);
   8945     r14 = max(r6, r14);
   8946     r6 = t;
   8947   };
   8948   {
   8949     uint const t = min(r2, r6);
   8950     r6 = max(r2, r6);
   8951     r2 = t;
   8952   };
   8953   {
   8954     uint const t = min(r10, r14);
   8955     r14 = max(r10, r14);
   8956     r10 = t;
   8957   };
   8958   {
   8959     uint const t = min(r4, r12);
   8960     r12 = max(r4, r12);
   8961     r4 = t;
   8962   };
   8963   {
   8964     uint const t = min(r8, r16);
   8965     r16 = max(r8, r16);
   8966     r8 = t;
   8967   };
   8968   {
   8969     uint const t = min(r4, r8);
   8970     r8 = max(r4, r8);
   8971     r4 = t;
   8972   };
   8973   {
   8974     uint const t = min(r12, r16);
   8975     r16 = max(r12, r16);
   8976     r12 = t;
   8977   };
   8978   {
   8979     uint const t = min(r2, r4);
   8980     r4 = max(r2, r4);
   8981     r2 = t;
   8982   };
   8983   {
   8984     uint const t = min(r6, r8);
   8985     r8 = max(r6, r8);
   8986     r6 = t;
   8987   };
   8988   {
   8989     uint const t = min(r10, r12);
   8990     r12 = max(r10, r12);
   8991     r10 = t;
   8992   };
   8993   {
   8994     uint const t = min(r14, r16);
   8995     r16 = max(r14, r16);
   8996     r14 = t;
   8997   };
   8998   {
   8999     uint const t = min(r1, r2);
   9000     r2 = max(r1, r2);
   9001     r1 = t;
   9002   };
   9003   {
   9004     uint const t = min(r3, r4);
   9005     r4 = max(r3, r4);
   9006     r3 = t;
   9007   };
   9008   {
   9009     uint const t = min(r5, r6);
   9010     r6 = max(r5, r6);
   9011     r5 = t;
   9012   };
   9013   {
   9014     uint const t = min(r7, r8);
   9015     r8 = max(r7, r8);
   9016     r7 = t;
   9017   };
   9018   {
   9019     uint const t = min(r9, r10);
   9020     r10 = max(r9, r10);
   9021     r9 = t;
   9022   };
   9023   {
   9024     uint const t = min(r11, r12);
   9025     r12 = max(r11, r12);
   9026     r11 = t;
   9027   };
   9028   {
   9029     uint const t = min(r13, r14);
   9030     r14 = max(r13, r14);
   9031     r13 = t;
   9032   };
   9033   {
   9034     uint const t = min(r15, r16);
   9035     r16 = max(r15, r16);
   9036     r15 = t;
   9037   };
   9038   vout[span_l + span_stride * 0] = r1;
   9039   vout[span_l + span_stride * 1] = r2;
   9040   vout[span_l + span_stride * 2] = r3;
   9041   vout[span_l + span_stride * 3] = r4;
   9042   vout[span_l + span_stride * 4] = r5;
   9043   vout[span_l + span_stride * 5] = r6;
   9044   vout[span_l + span_stride * 6] = r7;
   9045   vout[span_l + span_stride * 7] = r8;
   9046   vout[span_l + span_stride * 8] = r9;
   9047   vout[span_l + span_stride * 9] = r10;
   9048   vout[span_l + span_stride * 10] = r11;
   9049   vout[span_l + span_stride * 11] = r12;
   9050   vout[span_l + span_stride * 12] = r13;
   9051   vout[span_l + span_stride * 13] = r14;
   9052   vout[span_l + span_stride * 14] = r15;
   9053   vout[span_l + span_stride * 15] = r16;
   9054 }
   9055 
   9056 __kernel __attribute__((intel_reqd_sub_group_size((1 << 4)))) void
   9057 hs_kernel_transpose(__global uint* const restrict vout)
   9058 {
   9059   uint const gmem_idx = (get_global_id(0) & ~((1 << 4) - 1)) * 8 +
   9060                         (get_local_id(0) & ((1 << 4) - 1));
   9061   uint r1 = vout[gmem_idx + (1 << 4) * 0];
   9062   uint r2 = vout[gmem_idx + (1 << 4) * 1];
   9063   uint r3 = vout[gmem_idx + (1 << 4) * 2];
   9064   uint r4 = vout[gmem_idx + (1 << 4) * 3];
   9065   uint r5 = vout[gmem_idx + (1 << 4) * 4];
   9066   uint r6 = vout[gmem_idx + (1 << 4) * 5];
   9067   uint r7 = vout[gmem_idx + (1 << 4) * 6];
   9068   uint r8 = vout[gmem_idx + (1 << 4) * 7];
   9069   bool const is_lo_1 = (get_sub_group_local_id() & (1 << (1 - 1))) == 0;
   9070   bool const is_lo_2 = (get_sub_group_local_id() & (1 << (2 - 1))) == 0;
   9071   bool const is_lo_3 = (get_sub_group_local_id() & (1 << (3 - 1))) == 0;
   9072   bool const is_lo_4 = (get_sub_group_local_id() & (1 << (4 - 1))) == 0;
   9073   uint const s2_1 =
   9074     intel_sub_group_shuffle_xor(is_lo_1 ? r2 : r1, 1 << (1 - 1));
   9075   uint const s2 = is_lo_1 ? s2_1 : r2;
   9076   uint const s1 = is_lo_1 ? r1 : s2_1;
   9077   uint const s4_3 =
   9078     intel_sub_group_shuffle_xor(is_lo_1 ? r4 : r3, 1 << (1 - 1));
   9079   uint const s4 = is_lo_1 ? s4_3 : r4;
   9080   uint const s3 = is_lo_1 ? r3 : s4_3;
   9081   uint const s6_5 =
   9082     intel_sub_group_shuffle_xor(is_lo_1 ? r6 : r5, 1 << (1 - 1));
   9083   uint const s6 = is_lo_1 ? s6_5 : r6;
   9084   uint const s5 = is_lo_1 ? r5 : s6_5;
   9085   uint const s8_7 =
   9086     intel_sub_group_shuffle_xor(is_lo_1 ? r8 : r7, 1 << (1 - 1));
   9087   uint const s8 = is_lo_1 ? s8_7 : r8;
   9088   uint const s7 = is_lo_1 ? r7 : s8_7;
   9089   uint const t3_1 =
   9090     intel_sub_group_shuffle_xor(is_lo_2 ? s3 : s1, 1 << (2 - 1));
   9091   uint const t3 = is_lo_2 ? t3_1 : s3;
   9092   uint const t1 = is_lo_2 ? s1 : t3_1;
   9093   uint const t4_2 =
   9094     intel_sub_group_shuffle_xor(is_lo_2 ? s4 : s2, 1 << (2 - 1));
   9095   uint const t4 = is_lo_2 ? t4_2 : s4;
   9096   uint const t2 = is_lo_2 ? s2 : t4_2;
   9097   uint const t7_5 =
   9098     intel_sub_group_shuffle_xor(is_lo_2 ? s7 : s5, 1 << (2 - 1));
   9099   uint const t7 = is_lo_2 ? t7_5 : s7;
   9100   uint const t5 = is_lo_2 ? s5 : t7_5;
   9101   uint const t8_6 =
   9102     intel_sub_group_shuffle_xor(is_lo_2 ? s8 : s6, 1 << (2 - 1));
   9103   uint const t8 = is_lo_2 ? t8_6 : s8;
   9104   uint const t6 = is_lo_2 ? s6 : t8_6;
   9105   uint const u5_1 =
   9106     intel_sub_group_shuffle_xor(is_lo_3 ? t5 : t1, 1 << (3 - 1));
   9107   uint const u5 = is_lo_3 ? u5_1 : t5;
   9108   uint const u1 = is_lo_3 ? t1 : u5_1;
   9109   uint const u6_2 =
   9110     intel_sub_group_shuffle_xor(is_lo_3 ? t6 : t2, 1 << (3 - 1));
   9111   uint const u6 = is_lo_3 ? u6_2 : t6;
   9112   uint const u2 = is_lo_3 ? t2 : u6_2;
   9113   uint const u7_3 =
   9114     intel_sub_group_shuffle_xor(is_lo_3 ? t7 : t3, 1 << (3 - 1));
   9115   uint const u7 = is_lo_3 ? u7_3 : t7;
   9116   uint const u3 = is_lo_3 ? t3 : u7_3;
   9117   uint const u8_4 =
   9118     intel_sub_group_shuffle_xor(is_lo_3 ? t8 : t4, 1 << (3 - 1));
   9119   uint const u8 = is_lo_3 ? u8_4 : t8;
   9120   uint const u4 = is_lo_3 ? t4 : u8_4;
   9121   uint const v2_1 =
   9122     intel_sub_group_shuffle_xor(is_lo_4 ? u2 : u1, 1 << (4 - 1));
   9123   uint const v2 = is_lo_4 ? v2_1 : u2;
   9124   uint const v1 = is_lo_4 ? u1 : v2_1;
   9125   uint const v4_3 =
   9126     intel_sub_group_shuffle_xor(is_lo_4 ? u4 : u3, 1 << (4 - 1));
   9127   uint const v4 = is_lo_4 ? v4_3 : u4;
   9128   uint const v3 = is_lo_4 ? u3 : v4_3;
   9129   uint const v6_5 =
   9130     intel_sub_group_shuffle_xor(is_lo_4 ? u6 : u5, 1 << (4 - 1));
   9131   uint const v6 = is_lo_4 ? v6_5 : u6;
   9132   uint const v5 = is_lo_4 ? u5 : v6_5;
   9133   uint const v8_7 =
   9134     intel_sub_group_shuffle_xor(is_lo_4 ? u8 : u7, 1 << (4 - 1));
   9135   uint const v8 = is_lo_4 ? v8_7 : u8;
   9136   uint const v7 = is_lo_4 ? u7 : v8_7;
   9137   vout[gmem_idx + ((1 - 1) << 4)] = v1;
   9138   vout[gmem_idx + ((5 - 1) << 4)] = v2;
   9139   vout[gmem_idx + ((2 - 1) << 4)] = v3;
   9140   vout[gmem_idx + ((6 - 1) << 4)] = v4;
   9141   vout[gmem_idx + ((3 - 1) << 4)] = v5;
   9142   vout[gmem_idx + ((7 - 1) << 4)] = v6;
   9143   vout[gmem_idx + ((4 - 1) << 4)] = v7;
   9144   vout[gmem_idx + ((8 - 1) << 4)] = v8;
   9145 }
   9146