Home | History | Annotate | Download | only in aarch64
      1 // Copyright 2016, VIXL authors
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are met:
      6 //
      7 //   * Redistributions of source code must retain the above copyright notice,
      8 //     this list of conditions and the following disclaimer.
      9 //   * Redistributions in binary form must reproduce the above copyright notice,
     10 //     this list of conditions and the following disclaimer in the documentation
     11 //     and/or other materials provided with the distribution.
     12 //   * Neither the name of ARM Limited nor the names of its contributors may be
     13 //     used to endorse or promote products derived from this software without
     14 //     specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
     17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
     20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 #include <cfloat>
     28 #include <cmath>
     29 #include <cstdio>
     30 #include <cstdlib>
     31 #include <cstring>
     32 
     33 #include "test-runner.h"
     34 #include "test-utils-aarch64.h"
     35 
     36 #include "aarch64/cpu-aarch64.h"
     37 #include "aarch64/debugger-aarch64.h"
     38 #include "aarch64/disasm-aarch64.h"
     39 #include "aarch64/macro-assembler-aarch64.h"
     40 #include "aarch64/simulator-aarch64.h"
     41 
     42 namespace vixl {
     43 namespace aarch64 {
     44 // Trace tests can only work with the simulator.
     45 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     46 
     47 #define __ masm->
        // `__` (above) is assembly-style shorthand: `__ op(...)` expands to
        // `masm->op(...)`, so a variable named `masm` must be in scope at each use.
        //
        // TEST(name) prepends TRACE_ to the test name and forwards it to TEST_.
     48 #define TEST(name) TEST_(TRACE_##name)
     49 
     50 static void GenerateTestSequenceBase(MacroAssembler* masm) {
          // Emits, through `masm`, one or more forms of each integer, system and
          // load/store instruction listed below (in alphabetical order of mnemonic),
          // followed by a few branches.
          //
          // Memory operands use x0 as the base for accesses without writeback and x1
          // for pre-/post-indexed accesses. Neither register is initialised here, so
          // the caller presumably points them at valid memory first (TODO confirm).
          //
          // The scope below is sized with all of the buffer's remaining bytes and
          // kMaximumSize, presumably so that the exact amount of code emitted does
          // not have to be declared up front (TODO confirm against ExactAssemblyScope).
     51   ExactAssemblyScope guard(masm,
     52                            masm->GetBuffer()->GetRemainingBytes(),
     53                            ExactAssemblyScope::kMaximumSize);
     54 
          // Data processing, conditional operations, CRC and system instructions.
     55   __ adc(w3, w4, w5);
     56   __ adc(x6, x7, x8);
     57   __ adcs(w9, w10, w11);
     58   __ adcs(x12, x13, x14);
     59   __ add(w15, w16, w17);
     60   __ add(x18, x19, x20);
     61   __ adds(w21, w22, w23);
     62   __ adds(x24, x25, x26);
     63   __ and_(w27, w28, w29);
     64   __ and_(x2, x3, x4);
     65   __ ands(w5, w6, w7);
     66   __ ands(x8, x9, x10);
     67   __ asr(w11, w12, 0);
     68   __ asr(x13, x14, 1);
     69   __ asrv(w15, w16, w17);
     70   __ asrv(x18, x19, x20);
     71   __ bfm(w21, w22, 5, 6);
     72   __ bfm(x23, x24, 7, 8);
     73   __ bic(w25, w26, w27);
     74   __ bic(x28, x29, x2);
     75   __ bics(w3, w4, w5);
     76   __ bics(x6, x7, x8);
     77   __ ccmn(w9, w10, NoFlag, al);
     78   __ ccmn(w9, w10, NoFlag, eq);
     79   __ ccmn(w9, w10, NoFlag, ne);
     80   __ ccmn(x11, x12, CFlag, al);
     81   __ ccmn(x11, x12, CFlag, cc);
     82   __ ccmn(x11, x12, CFlag, cs);
     83   __ ccmp(w13, w14, VFlag, al);
     84   __ ccmp(w13, w14, VFlag, hi);
     85   __ ccmp(w13, w14, VFlag, ls);
     86   __ ccmp(x15, x16, CVFlag, al);
     87   __ ccmp(x15, x16, CVFlag, eq);
     88   __ ccmp(x15, x16, CVFlag, ne);
     89   __ cinc(w17, w18, cc);
     90   __ cinc(w17, w18, cs);
     91   __ cinc(x19, x20, hi);
     92   __ cinc(x19, x20, ls);
     93   __ cinv(w21, w22, eq);
     94   __ cinv(w21, w22, ne);
     95   __ cinv(x23, x24, cc);
     96   __ cinv(x23, x24, cs);
     97   __ clrex();
     98   __ cls(w25, w26);
     99   __ cls(x27, x28);
    100   __ clz(w29, w2);
    101   __ clz(x3, x4);
    102   __ cmn(w5, w6);
    103   __ cmn(x7, x8);
    104   __ cmp(w9, w10);
    105   __ cmp(x11, x12);
    106   __ cneg(w13, w14, hi);
    107   __ cneg(w13, w14, ls);
    108   __ cneg(x15, x16, eq);
    109   __ cneg(x15, x16, ne);
    110   __ crc32b(w17, w18, w19);
    111   __ crc32cb(w20, w21, w22);
    112   __ crc32ch(w23, w24, w25);
    113   __ crc32cw(w26, w27, w28);
    114   __ crc32h(w4, w5, w6);
    115   __ crc32w(w7, w8, w9);
    116   __ csel(w13, w14, w15, cc);
    117   __ csel(w13, w14, w15, cs);
    118   __ csel(x16, x17, x18, hi);
    119   __ csel(x16, x17, x18, ls);
    120   __ cset(w19, eq);
    121   __ cset(w19, ne);
    122   __ cset(x20, cc);
    123   __ cset(x20, cs);
    124   __ csetm(w21, hi);
    125   __ csetm(w21, ls);
    126   __ csetm(x22, eq);
    127   __ csetm(x22, ne);
    128   __ csinc(w23, w24, w25, cc);
    129   __ csinc(w23, w24, w25, cs);
    130   __ csinc(x26, x27, x28, hi);
    131   __ csinc(x26, x27, x28, ls);
    132   __ csinv(w29, w2, w3, eq);
    133   __ csinv(w29, w2, w3, ne);
    134   __ csinv(x4, x5, x6, cc);
    135   __ csinv(x4, x5, x6, cs);
    136   __ csneg(w7, w8, w9, hi);
    137   __ csneg(w7, w8, w9, ls);
    138   __ csneg(x10, x11, x12, eq);
    139   __ csneg(x10, x11, x12, ne);
    140   __ dc(CVAC, x0);
    141   __ dmb(InnerShareable, BarrierAll);
    142   __ dsb(InnerShareable, BarrierAll);
    143   __ eon(w13, w14, w15);
    144   __ eon(x16, x17, x18);
    145   __ eor(w19, w20, w21);
    146   __ eor(x22, x23, x24);
    147   __ extr(w25, w26, w27, 9);
    148   __ extr(x28, x29, x2, 10);
    149   __ hint(NOP);
    150   __ ic(IVAU, x0);
    151   __ isb();
          // Loads: acquire, exclusive, non-temporal, pair, single-register (plain,
          // post-index and pre-index) and unscaled-offset forms.
    152   __ ldar(w3, MemOperand(x0));
    153   __ ldar(x4, MemOperand(x0));
    154   __ ldarb(w5, MemOperand(x0));
    155   __ ldarb(x6, MemOperand(x0));
    156   __ ldarh(w7, MemOperand(x0));
    157   __ ldarh(x8, MemOperand(x0));
    158   __ ldaxp(w9, w10, MemOperand(x0));
    159   __ ldaxp(x11, x12, MemOperand(x0));
    160   __ ldaxr(w13, MemOperand(x0));
    161   __ ldaxr(x14, MemOperand(x0));
    162   __ ldaxrb(w15, MemOperand(x0));
    163   __ ldaxrb(x16, MemOperand(x0));
    164   __ ldaxrh(w17, MemOperand(x0));
    165   __ ldaxrh(x18, MemOperand(x0));
    166   __ ldnp(w19, w20, MemOperand(x0));
    167   __ ldnp(x21, x22, MemOperand(x0));
    168   __ ldp(w23, w24, MemOperand(x0));
    169   __ ldp(w23, w24, MemOperand(x1, 8, PostIndex));
    170   __ ldp(w23, w24, MemOperand(x1, 8, PreIndex));
    171   __ ldp(x25, x26, MemOperand(x0));
    172   __ ldp(x25, x26, MemOperand(x1, 16, PostIndex));
    173   __ ldp(x25, x26, MemOperand(x1, 16, PreIndex));
    174   __ ldpsw(x27, x28, MemOperand(x0));
    175   __ ldpsw(x27, x28, MemOperand(x1, 8, PostIndex));
    176   __ ldpsw(x27, x28, MemOperand(x1, 8, PreIndex));
    177   __ ldr(w29, MemOperand(x0));
    178   __ ldr(w29, MemOperand(x1, 4, PostIndex));
    179   __ ldr(w29, MemOperand(x1, 4, PreIndex));
    180   __ ldr(x2, MemOperand(x0));
    181   __ ldr(x2, MemOperand(x1, 8, PostIndex));
    182   __ ldr(x2, MemOperand(x1, 8, PreIndex));
    183   __ ldrb(w3, MemOperand(x0));
    184   __ ldrb(w3, MemOperand(x1, 1, PostIndex));
    185   __ ldrb(w3, MemOperand(x1, 1, PreIndex));
    186   __ ldrb(x4, MemOperand(x0));
    187   __ ldrb(x4, MemOperand(x1, 1, PostIndex));
    188   __ ldrb(x4, MemOperand(x1, 1, PreIndex));
    189   __ ldrh(w5, MemOperand(x0));
    190   __ ldrh(w5, MemOperand(x1, 2, PostIndex));
    191   __ ldrh(w5, MemOperand(x1, 2, PreIndex));
    192   __ ldrh(x6, MemOperand(x0));
    193   __ ldrh(x6, MemOperand(x1, 2, PostIndex));
    194   __ ldrh(x6, MemOperand(x1, 2, PreIndex));
    195   __ ldrsb(w7, MemOperand(x0));
    196   __ ldrsb(w7, MemOperand(x1, 1, PostIndex));
    197   __ ldrsb(w7, MemOperand(x1, 1, PreIndex));
    198   __ ldrsb(x8, MemOperand(x0));
    199   __ ldrsb(x8, MemOperand(x1, 1, PostIndex));
    200   __ ldrsb(x8, MemOperand(x1, 1, PreIndex));
    201   __ ldrsh(w9, MemOperand(x0));
    202   __ ldrsh(w9, MemOperand(x1, 2, PostIndex));
    203   __ ldrsh(w9, MemOperand(x1, 2, PreIndex));
    204   __ ldrsh(x10, MemOperand(x0));
    205   __ ldrsh(x10, MemOperand(x1, 2, PostIndex));
    206   __ ldrsh(x10, MemOperand(x1, 2, PreIndex));
    207   __ ldrsw(x11, MemOperand(x0));
    208   __ ldrsw(x11, MemOperand(x1, 4, PostIndex));
    209   __ ldrsw(x11, MemOperand(x1, 4, PreIndex));
    210   __ ldur(w12, MemOperand(x0, 7));
    211   __ ldur(x13, MemOperand(x0, 15));
    212   __ ldurb(w14, MemOperand(x0, 1));
    213   __ ldurb(x15, MemOperand(x0, 1));
    214   __ ldurh(w16, MemOperand(x0, 3));
    215   __ ldurh(x17, MemOperand(x0, 3));
    216   __ ldursb(w18, MemOperand(x0, 1));
    217   __ ldursb(x19, MemOperand(x0, 1));
    218   __ ldursh(w20, MemOperand(x0, 3));
    219   __ ldursh(x21, MemOperand(x0, 3));
    220   __ ldursw(x22, MemOperand(x0, 7));
    221   __ ldxp(w23, w24, MemOperand(x0));
    222   __ ldxp(x25, x26, MemOperand(x0));
    223   __ ldxr(w27, MemOperand(x0));
    224   __ ldxr(x28, MemOperand(x0));
    225   __ ldxrb(w29, MemOperand(x0));
    226   __ ldxrb(x2, MemOperand(x0));
    227   __ ldxrh(w3, MemOperand(x0));
    228   __ ldxrh(x4, MemOperand(x0));
          // Shifts, multiplies, moves, negation, logical operations, prefetches,
          // bit/byte reversal, subtract-with-carry, bitfield and divide instructions.
    229   __ lsl(w5, w6, 2);
    230   __ lsl(x7, x8, 3);
    231   __ lslv(w9, w10, w11);
    232   __ lslv(x12, x13, x14);
    233   __ lsr(w15, w16, 4);
    234   __ lsr(x17, x18, 5);
    235   __ lsrv(w19, w20, w21);
    236   __ lsrv(x22, x23, x24);
    237   __ madd(w25, w26, w27, w28);
    238   __ madd(x29, x2, x3, x4);
    239   __ mneg(w5, w6, w7);
    240   __ mneg(x8, x9, x10);
    241   __ mov(w11, w12);
    242   __ mov(x13, x14);
    243   __ movk(w15, 130);
    244   __ movk(x16, 131);
    245   __ movn(w17, 132);
    246   __ movn(x18, 133);
    247   __ movz(w19, 134);
    248   __ movz(x20, 135);
    249   __ msub(w22, w23, w24, w25);
    250   __ msub(x26, x27, x28, x29);
    251   __ mul(w2, w3, w4);
    252   __ mul(x5, x6, x7);
    253   __ mvn(w8, w9);
    254   __ mvn(x10, x11);
    255   __ neg(w12, w13);
    256   __ neg(x14, x15);
    257   __ negs(w16, w17);
    258   __ negs(x18, x19);
    259   __ ngc(w20, w21);
    260   __ ngc(x22, x23);
    261   __ ngcs(w24, w25);
    262   __ ngcs(x26, x27);
    263   __ nop();
    264   __ orn(w28, w29, w2);
    265   __ orn(x3, x4, x5);
    266   __ orr(w6, w7, w8);
    267   __ orr(x9, x10, x11);
    268   __ prfm(PLDL1KEEP, MemOperand(x0, 4));
    269   __ prfum(PLDL1KEEP, MemOperand(x0, 1));
    270   __ rbit(w12, w13);
    271   __ rbit(x14, x15);
    272   __ rev(w16, w17);
    273   __ rev(x18, x19);
    274   __ rev16(w20, w21);
    275   __ rev16(x22, x23);
    276   __ rev32(x24, x25);
    277   __ rorv(w26, w27, w28);
    278   __ rorv(x29, x2, x3);
    279   __ sbc(w4, w5, w6);
    280   __ sbc(x7, x8, x9);
    281   __ sbcs(w10, w11, w12);
    282   __ sbcs(x13, x14, x15);
    283   __ sbfiz(w16, w17, 2, 3);
    284   __ sbfiz(x18, x19, 4, 5);
    285   __ sbfx(w22, w23, 6, 7);
    286   __ sbfx(x24, x25, 8, 9);
    287   __ sdiv(w26, w27, w28);
    288   __ sdiv(x29, x2, x3);
    289   __ smulh(x12, x13, x14);
          // Stores: release, exclusive, non-temporal, pair, single-register (plain,
          // post-index and pre-index) and unscaled-offset forms.
    290   __ stlr(w18, MemOperand(x0));
    291   __ stlr(x19, MemOperand(x0));
    292   __ stlrb(w20, MemOperand(x0));
    293   __ stlrb(x21, MemOperand(x0));
    294   __ stlrh(w22, MemOperand(x0));
    295   __ stlrh(x23, MemOperand(x0));
    296   __ stlxp(w24, w25, w26, MemOperand(x0));
    297   __ stlxp(x27, x28, x29, MemOperand(x0));
    298   __ stlxr(w2, w3, MemOperand(x0));
    299   __ stlxr(x4, x5, MemOperand(x0));
    300   __ stlxrb(w6, w7, MemOperand(x0));
    301   __ stlxrb(x8, x9, MemOperand(x0));
    302   __ stlxrh(w10, w11, MemOperand(x0));
    303   __ stlxrh(x12, x13, MemOperand(x0));
    304   __ stnp(w14, w15, MemOperand(x0));
    305   __ stnp(x16, x17, MemOperand(x0));
    306   __ stp(w18, w19, MemOperand(x0));
    307   __ stp(w18, w19, MemOperand(x1, 8, PostIndex));
    308   __ stp(w18, w19, MemOperand(x1, 8, PreIndex));
    309   __ stp(x20, x21, MemOperand(x0));
    310   __ stp(x20, x21, MemOperand(x1, 16, PostIndex));
    311   __ stp(x20, x21, MemOperand(x1, 16, PreIndex));
    312   __ str(w22, MemOperand(x0));
    313   __ str(w22, MemOperand(x1, 4, PostIndex));
    314   __ str(w22, MemOperand(x1, 4, PreIndex));
    315   __ str(x23, MemOperand(x0));
    316   __ str(x23, MemOperand(x1, 8, PostIndex));
    317   __ str(x23, MemOperand(x1, 8, PreIndex));
    318   __ strb(w24, MemOperand(x0));
    319   __ strb(w24, MemOperand(x1, 1, PostIndex));
    320   __ strb(w24, MemOperand(x1, 1, PreIndex));
    321   __ strb(x25, MemOperand(x0));
    322   __ strb(x25, MemOperand(x1, 1, PostIndex));
    323   __ strb(x25, MemOperand(x1, 1, PreIndex));
    324   __ strh(w26, MemOperand(x0));
    325   __ strh(w26, MemOperand(x1, 2, PostIndex));
    326   __ strh(w26, MemOperand(x1, 2, PreIndex));
    327   __ strh(x27, MemOperand(x0));
    328   __ strh(x27, MemOperand(x1, 2, PostIndex));
    329   __ strh(x27, MemOperand(x1, 2, PreIndex));
    330   __ stur(w28, MemOperand(x0, 7));
    331   __ stur(x29, MemOperand(x0, 15));
    332   __ sturb(w2, MemOperand(x0, 1));
    333   __ sturb(x3, MemOperand(x0, 1));
    334   __ sturh(w4, MemOperand(x0, 3));
    335   __ sturh(x5, MemOperand(x0, 3));
    336   __ stxp(w6, w7, w8, MemOperand(x0));
    337   __ stxp(x9, x10, x11, MemOperand(x0));
    338   __ stxr(w12, w13, MemOperand(x0));
    339   __ stxr(x14, x15, MemOperand(x0));
    340   __ stxrb(w16, w17, MemOperand(x0));
    341   __ stxrb(x18, x19, MemOperand(x0));
    342   __ stxrh(w20, w21, MemOperand(x0));
    343   __ stxrh(x22, x23, MemOperand(x0));
          // Subtraction, sign/zero extension, test, unsigned bitfield and divide
          // instructions.
    344   __ sub(w24, w25, w26);
    345   __ sub(x27, x28, x29);
    346   __ subs(w2, w3, w4);
    347   __ subs(x5, x6, x7);
    348   __ sxtb(w8, w9);
    349   __ sxtb(x10, x11);
    350   __ sxth(w12, w13);
    351   __ sxth(x14, x15);
    352   __ sxtw(w16, w17);
    353   __ sxtw(x18, x19);
    354   __ tst(w20, w21);
    355   __ tst(x22, x23);
    356   __ ubfiz(w24, w25, 10, 11);
    357   __ ubfiz(x26, x27, 12, 13);
    358   __ ubfm(w28, w29, 14, 15);
    359   __ ubfm(x2, x3, 1, 2);
    360   __ ubfx(w4, w5, 3, 4);
    361   __ ubfx(x6, x7, 5, 6);
    362   __ udiv(w8, w9, w10);
    363   __ udiv(x11, x12, x13);
    364   __ umulh(x22, x23, x24);
    365   __ uxtb(w28, w29);
    366   __ uxtb(x2, x3);
    367   __ uxth(w4, w5);
    368   __ uxth(x6, x7);
    369   __ uxtw(w8, w9);
    370   __ uxtw(x10, x11);
    371 
    372   // Branch tests.
    373   {
    374     Label end;
    375     // Branch to the next instruction.
    376     __ b(&end);
    377     __ bind(&end);
    378   }
    379   {
    380     Label loop, end;
            // x3 - x3 is zero, so this sets the Z flag: `ne` is false on the first
            // pass through the loop and x3 itself becomes zero.
    381     __ subs(x3, x3, x3);
    382     __ bind(&loop);
    383     // Not-taken branch (the first time).
    384     // Taken branch (the second time).
    385     __ b(&end, ne);
            // x3 (zero) differs from 1, so `ne` is true when the loop is re-entered.
    386     __ cmp(x3, 1);
    387     // Backwards branch.
    388     __ b(&loop);
    389     __ bind(&end);
    390   }
    391 }
    392 
    393 
    394 static void GenerateTestSequenceFP(MacroAssembler* masm) {
          // Emits, through `masm`, scalar floating-point instructions in alphabetical
          // order of mnemonic, generally in both a double-precision (d) and a
          // single-precision (s) form, plus half-precision (h) operands for fcvt.
          // No instruction here takes a memory operand.
          //
          // The scope below is sized with all of the buffer's remaining bytes and
          // kMaximumSize, presumably so that the exact amount of code emitted does
          // not have to be declared up front (TODO confirm against ExactAssemblyScope).
    395   ExactAssemblyScope guard(masm,
    396                            masm->GetBuffer()->GetRemainingBytes(),
    397                            ExactAssemblyScope::kMaximumSize);
    398 
    399   // Scalar floating point instructions.
    400   __ fabd(d13, d2, d19);
    401   __ fabd(s8, s10, s30);
    402   __ fabs(d1, d1);
    403   __ fabs(s25, s7);
    404   __ facge(d1, d23, d16);
    405   __ facge(s4, s17, s1);
    406   __ facgt(d2, d21, d24);
    407   __ facgt(s12, s26, s12);
    408   __ fadd(d13, d11, d22);
    409   __ fadd(s27, s19, s8);
          // Comparisons: flag-setting (fccmp*, fcmp*) and mask-producing (fcm*)
          // forms, against a register or against the literal 0.0.
    410   __ fccmp(d6, d10, NoFlag, hs);
    411   __ fccmp(s29, s20, NZVFlag, ne);
    412   __ fccmpe(d10, d2, NZCFlag, al);
    413   __ fccmpe(s3, s3, NZVFlag, pl);
    414   __ fcmeq(d19, d8, d10);
    415   __ fcmeq(d0, d18, 0.0);
    416   __ fcmeq(s1, s4, s30);
    417   __ fcmeq(s22, s29, 0.0);
    418   __ fcmge(d27, d18, d1);
    419   __ fcmge(d31, d28, 0.0);
    420   __ fcmge(s31, s19, s9);
    421   __ fcmge(s1, s25, 0.0);
    422   __ fcmgt(d18, d1, d15);
    423   __ fcmgt(d3, d31, 0.0);
    424   __ fcmgt(s11, s25, s2);
    425   __ fcmgt(s17, s16, 0.0);
    426   __ fcmle(d24, d17, 0.0);
    427   __ fcmle(s11, s8, 0.0);
    428   __ fcmlt(d5, d31, 0.0);
    429   __ fcmlt(s18, s23, 0.0);
    430   __ fcmp(d10, d24);
    431   __ fcmp(d13, 0.0);
    432   __ fcmp(s18, s6);
    433   __ fcmp(s16, 0.0);
    434   __ fcmpe(d9, d17);
    435   __ fcmpe(d29, 0.0);
    436   __ fcmpe(s16, s17);
    437   __ fcmpe(s22, 0.0);
    438   __ fcsel(d10, d14, d19, gt);
    439   __ fcsel(s22, s18, s2, ge);
          // Conversions between half, single and double precision.
    440   __ fcvt(d4, h24);
    441   __ fcvt(d11, s2);
    442   __ fcvt(h8, d9);
    443   __ fcvt(h12, s1);
    444   __ fcvt(s12, d31);
    445   __ fcvt(s27, h25);
          // Floating-point to integer conversions. Each mnemonic is emitted with an
          // FP destination (d/s) and with general-purpose destinations (w/x).
    446   __ fcvtas(d28, d16);
    447   __ fcvtas(s3, s5);
    448   __ fcvtas(w18, d31);
    449   __ fcvtas(w29, s24);
    450   __ fcvtas(x9, d1);
    451   __ fcvtas(x30, s2);
    452   __ fcvtau(d14, d0);
    453   __ fcvtau(s31, s14);
    454   __ fcvtau(w16, d2);
    455   __ fcvtau(w18, s0);
    456   __ fcvtau(x26, d7);
    457   __ fcvtau(x25, s19);
    458   __ fcvtms(d30, d25);
    459   __ fcvtms(s12, s15);
    460   __ fcvtms(w9, d7);
    461   __ fcvtms(w19, s6);
    462   __ fcvtms(x6, d6);
    463   __ fcvtms(x22, s7);
    464   __ fcvtmu(d27, d0);
    465   __ fcvtmu(s8, s22);
    466   __ fcvtmu(w29, d19);
    467   __ fcvtmu(w26, s0);
    468   __ fcvtmu(x13, d5);
    469   __ fcvtmu(x5, s18);
    470   __ fcvtns(d30, d15);
    471   __ fcvtns(s10, s11);
    472   __ fcvtns(w21, d15);
    473   __ fcvtns(w18, s10);
    474   __ fcvtns(x8, d17);
    475   __ fcvtns(x17, s12);
    476   __ fcvtnu(d0, d21);
    477   __ fcvtnu(s6, s25);
    478   __ fcvtnu(w29, d11);
    479   __ fcvtnu(w25, s31);
    480   __ fcvtnu(x30, d11);
    481   __ fcvtnu(x27, s18);
    482   __ fcvtps(d11, d22);
    483   __ fcvtps(s29, s20);
    484   __ fcvtps(w15, d25);
    485   __ fcvtps(w16, s7);
    486   __ fcvtps(x13, d20);
    487   __ fcvtps(x3, s23);
    488   __ fcvtpu(d24, d1);
    489   __ fcvtpu(s14, s24);
    490   __ fcvtpu(w26, d29);
          // Note: the destination below is the zero register (wzr).
    491   __ fcvtpu(wzr, s26);
    492   __ fcvtpu(x27, d6);
    493   __ fcvtpu(x29, s14);
    494   __ fcvtxn(s12, d12);
          // fcvtzs/fcvtzu are emitted both without and with a trailing integer
          // argument (the fixed-point fractional-bit count in the A64 encoding).
    495   __ fcvtzs(d15, d0);
    496   __ fcvtzs(d13, d4, 42);
    497   __ fcvtzs(s8, s11);
    498   __ fcvtzs(s31, s6, 25);
    499   __ fcvtzs(w6, d9);
    500   __ fcvtzs(w25, d10, 20);
    501   __ fcvtzs(w9, s1);
    502   __ fcvtzs(w17, s29, 30);
    503   __ fcvtzs(x19, d2);
    504   __ fcvtzs(x22, d14, 1);
    505   __ fcvtzs(x14, s20);
    506   __ fcvtzs(x3, s30, 33);
    507   __ fcvtzu(d28, d15);
    508   __ fcvtzu(d0, d4, 3);
    509   __ fcvtzu(s2, s5);
    510   __ fcvtzu(s4, s0, 30);
    511   __ fcvtzu(w11, d4);
    512   __ fcvtzu(w7, d24, 32);
    513   __ fcvtzu(w18, s24);
    514   __ fcvtzu(w14, s27, 4);
    515   __ fcvtzu(x22, d11);
    516   __ fcvtzu(x8, d27, 52);
    517   __ fcvtzu(x7, s20);
    518   __ fcvtzu(x22, s7, 44);
          // Arithmetic, min/max, moves (register, general-purpose and immediate),
          // fused multiply-add variants, estimates, rounding and square root.
    519   __ fdiv(d6, d14, d15);
    520   __ fdiv(s26, s5, s25);
    521   __ fmadd(d18, d26, d12, d30);
    522   __ fmadd(s13, s9, s28, s4);
    523   __ fmax(d12, d5, d5);
    524   __ fmax(s12, s28, s6);
    525   __ fmaxnm(d28, d4, d2);
    526   __ fmaxnm(s6, s10, s8);
    527   __ fmin(d20, d20, d18);
    528   __ fmin(s7, s13, s16);
    529   __ fminnm(d19, d14, d30);
    530   __ fminnm(s0, s1, s1);
    531   __ fmov(d13, d6);
    532   __ fmov(d2, x17);
    533   __ fmov(d8, -2.5000);
    534   __ fmov(s5, s3);
    535   __ fmov(s25, w20);
    536   __ fmov(s21, 2.8750f);
    537   __ fmov(w18, s24);
    538   __ fmov(x18, d2);
    539   __ fmsub(d20, d30, d3, d19);
    540   __ fmsub(s5, s19, s4, s12);
    541   __ fmul(d30, d27, d23);
    542   __ fmul(s25, s17, s15);
    543   __ fmulx(d4, d17, d1);
    544   __ fmulx(s14, s25, s4);
    545   __ fneg(d15, d0);
    546   __ fneg(s14, s15);
    547   __ fnmadd(d0, d16, d22, d31);
    548   __ fnmadd(s0, s18, s26, s18);
    549   __ fnmsub(d19, d12, d15, d21);
    550   __ fnmsub(s29, s0, s11, s26);
    551   __ fnmul(d31, d19, d1);
    552   __ fnmul(s18, s3, s17);
    553   __ frecpe(d7, d21);
    554   __ frecpe(s29, s17);
    555   __ frecps(d11, d26, d17);
    556   __ frecps(s18, s27, s1);
    557   __ frecpx(d15, d18);
    558   __ frecpx(s5, s10);
    559   __ frinta(d16, d30);
    560   __ frinta(s1, s22);
    561   __ frinti(d19, d29);
    562   __ frinti(s14, s21);
    563   __ frintm(d20, d30);
    564   __ frintm(s1, s16);
    565   __ frintn(d30, d1);
    566   __ frintn(s24, s10);
    567   __ frintp(d4, d20);
    568   __ frintp(s13, s3);
    569   __ frintx(d13, d20);
    570   __ frintx(s17, s7);
    571   __ frintz(d0, d8);
    572   __ frintz(s15, s29);
    573   __ frsqrte(d21, d10);
    574   __ frsqrte(s17, s25);
    575   __ frsqrts(d4, d29, d17);
    576   __ frsqrts(s14, s3, s24);
    577   __ fsqrt(d14, d17);
    578   __ fsqrt(s4, s14);
    579   __ fsub(d13, d19, d7);
    580   __ fsub(s3, s21, s27);
          // Integer to floating-point conversions (scvtf, ucvtf), from FP and
          // general-purpose sources, without and with a trailing integer argument
          // (the fixed-point fractional-bit count in the A64 encoding).
    581   __ scvtf(d31, d16);
    582   __ scvtf(d26, d31, 24);
    583   __ scvtf(d6, w16);
    584   __ scvtf(d5, w20, 6);
    585   __ scvtf(d16, x8);
    586   __ scvtf(d15, x8, 10);
    587   __ scvtf(s7, s4);
    588   __ scvtf(s8, s15, 14);
    589   __ scvtf(s29, w10);
    590   __ scvtf(s15, w21, 11);
    591   __ scvtf(s27, x26);
    592   __ scvtf(s26, x12, 38);
    593   __ ucvtf(d0, d9);
    594   __ ucvtf(d5, d22, 47);
    595   __ ucvtf(d30, w27);
    596   __ ucvtf(d3, w19, 1);
    597   __ ucvtf(d28, x21);
    598   __ ucvtf(d27, x30, 35);
    599   __ ucvtf(s11, s5);
    600   __ ucvtf(s0, s23, 14);
    601   __ ucvtf(s20, w19);
    602   __ ucvtf(s21, w22, 18);
    603   __ ucvtf(s6, x13);
    604   __ ucvtf(s7, x2, 21);
    605 }
    606 
    607 
    608 static void GenerateTestSequenceNEON(MacroAssembler* masm) {
    609   ExactAssemblyScope guard(masm,
    610                            masm->GetBuffer()->GetRemainingBytes(),
    611                            ExactAssemblyScope::kMaximumSize);
    612 
    613   // NEON integer instructions.
    614   __ abs(d19, d0);
    615   __ abs(v16.V16B(), v11.V16B());
    616   __ abs(v0.V2D(), v31.V2D());
    617   __ abs(v27.V2S(), v25.V2S());
    618   __ abs(v21.V4H(), v27.V4H());
    619   __ abs(v16.V4S(), v1.V4S());
    620   __ abs(v31.V8B(), v5.V8B());
    621   __ abs(v29.V8H(), v13.V8H());
    622   __ add(d10, d5, d17);
    623   __ add(v31.V16B(), v15.V16B(), v23.V16B());
    624   __ add(v10.V2D(), v31.V2D(), v14.V2D());
    625   __ add(v15.V2S(), v14.V2S(), v19.V2S());
    626   __ add(v27.V4H(), v23.V4H(), v17.V4H());
    627   __ add(v25.V4S(), v28.V4S(), v29.V4S());
    628   __ add(v13.V8B(), v7.V8B(), v18.V8B());
    629   __ add(v4.V8H(), v2.V8H(), v1.V8H());
    630   __ addhn(v10.V2S(), v14.V2D(), v15.V2D());
    631   __ addhn(v10.V4H(), v30.V4S(), v26.V4S());
    632   __ addhn(v31.V8B(), v12.V8H(), v22.V8H());
    633   __ addhn2(v16.V16B(), v21.V8H(), v20.V8H());
    634   __ addhn2(v0.V4S(), v2.V2D(), v17.V2D());
    635   __ addhn2(v31.V8H(), v7.V4S(), v17.V4S());
    636   __ addp(d14, v19.V2D());
    637   __ addp(v3.V16B(), v8.V16B(), v28.V16B());
    638   __ addp(v8.V2D(), v5.V2D(), v17.V2D());
    639   __ addp(v22.V2S(), v30.V2S(), v26.V2S());
    640   __ addp(v29.V4H(), v24.V4H(), v14.V4H());
    641   __ addp(v30.V4S(), v26.V4S(), v24.V4S());
    642   __ addp(v12.V8B(), v26.V8B(), v7.V8B());
    643   __ addp(v17.V8H(), v8.V8H(), v12.V8H());
    644   __ addv(b27, v23.V16B());
    645   __ addv(b12, v20.V8B());
    646   __ addv(h27, v30.V4H());
    647   __ addv(h19, v14.V8H());
    648   __ addv(s14, v27.V4S());
    649   __ and_(v10.V16B(), v8.V16B(), v27.V16B());
    650   __ and_(v5.V8B(), v1.V8B(), v16.V8B());
    651   __ bic(v26.V16B(), v3.V16B(), v24.V16B());
    652   __ bic(v7.V2S(), 0xe4, 16);
    653   __ bic(v28.V4H(), 0x23, 8);
    654   __ bic(v29.V4S(), 0xac);
    655   __ bic(v12.V8B(), v31.V8B(), v21.V8B());
    656   __ bic(v18.V8H(), 0x98);
    657   __ bif(v12.V16B(), v26.V16B(), v8.V16B());
    658   __ bif(v2.V8B(), v23.V8B(), v27.V8B());
    659   __ bit(v8.V16B(), v3.V16B(), v13.V16B());
    660   __ bit(v5.V8B(), v5.V8B(), v23.V8B());
    661   __ bsl(v9.V16B(), v31.V16B(), v23.V16B());
    662   __ bsl(v14.V8B(), v7.V8B(), v3.V8B());
    663   __ cls(v29.V16B(), v5.V16B());
    664   __ cls(v21.V2S(), v0.V2S());
    665   __ cls(v1.V4H(), v12.V4H());
    666   __ cls(v27.V4S(), v10.V4S());
    667   __ cls(v19.V8B(), v4.V8B());
    668   __ cls(v15.V8H(), v14.V8H());
    669   __ clz(v1.V16B(), v4.V16B());
    670   __ clz(v27.V2S(), v17.V2S());
    671   __ clz(v9.V4H(), v9.V4H());
    672   __ clz(v31.V4S(), v15.V4S());
    673   __ clz(v14.V8B(), v19.V8B());
    674   __ clz(v6.V8H(), v11.V8H());
    675   __ cmeq(d18, d5, d29);
    676   __ cmeq(d14, d31, 0);
    677   __ cmeq(v19.V16B(), v3.V16B(), v22.V16B());
    678   __ cmeq(v15.V16B(), v9.V16B(), 0);
    679   __ cmeq(v12.V2D(), v16.V2D(), v10.V2D());
    680   __ cmeq(v8.V2D(), v22.V2D(), 0);
    681   __ cmeq(v2.V2S(), v3.V2S(), v9.V2S());
    682   __ cmeq(v16.V2S(), v25.V2S(), 0);
    683   __ cmeq(v6.V4H(), v23.V4H(), v20.V4H());
    684   __ cmeq(v16.V4H(), v13.V4H(), 0);
    685   __ cmeq(v21.V4S(), v17.V4S(), v2.V4S());
    686   __ cmeq(v6.V4S(), v25.V4S(), 0);
    687   __ cmeq(v16.V8B(), v13.V8B(), v2.V8B());
    688   __ cmeq(v21.V8B(), v16.V8B(), 0);
    689   __ cmeq(v20.V8H(), v7.V8H(), v25.V8H());
    690   __ cmeq(v26.V8H(), v8.V8H(), 0);
    691   __ cmge(d16, d13, d31);
    692   __ cmge(d25, d24, 0);
    693   __ cmge(v17.V16B(), v19.V16B(), v17.V16B());
    694   __ cmge(v22.V16B(), v30.V16B(), 0);
    695   __ cmge(v28.V2D(), v20.V2D(), v26.V2D());
    696   __ cmge(v6.V2D(), v23.V2D(), 0);
    697   __ cmge(v25.V2S(), v22.V2S(), v3.V2S());
    698   __ cmge(v21.V2S(), v11.V2S(), 0);
    699   __ cmge(v16.V4H(), v3.V4H(), v12.V4H());
    700   __ cmge(v23.V4H(), v9.V4H(), 0);
    701   __ cmge(v7.V4S(), v2.V4S(), v11.V4S());
    702   __ cmge(v0.V4S(), v22.V4S(), 0);
    703   __ cmge(v10.V8B(), v30.V8B(), v9.V8B());
    704   __ cmge(v21.V8B(), v8.V8B(), 0);
    705   __ cmge(v2.V8H(), v7.V8H(), v26.V8H());
    706   __ cmge(v19.V8H(), v10.V8H(), 0);
    707   __ cmgt(d6, d13, d1);
    708   __ cmgt(d30, d24, 0);
    709   __ cmgt(v20.V16B(), v25.V16B(), v27.V16B());
    710   __ cmgt(v0.V16B(), v25.V16B(), 0);
    711   __ cmgt(v22.V2D(), v25.V2D(), v1.V2D());
    712   __ cmgt(v16.V2D(), v16.V2D(), 0);
    713   __ cmgt(v5.V2S(), v9.V2S(), v15.V2S());
    714   __ cmgt(v12.V2S(), v18.V2S(), 0);
    715   __ cmgt(v28.V4H(), v18.V4H(), v11.V4H());
    716   __ cmgt(v22.V4H(), v3.V4H(), 0);
    717   __ cmgt(v5.V4S(), v11.V4S(), v27.V4S());
    718   __ cmgt(v13.V4S(), v20.V4S(), 0);
    719   __ cmgt(v27.V8B(), v31.V8B(), v7.V8B());
    720   __ cmgt(v5.V8B(), v0.V8B(), 0);
    721   __ cmgt(v22.V8H(), v28.V8H(), v13.V8H());
    722   __ cmgt(v6.V8H(), v2.V8H(), 0);
    723   __ cmhi(d21, d8, d22);
    724   __ cmhi(v18.V16B(), v19.V16B(), v19.V16B());
    725   __ cmhi(v7.V2D(), v0.V2D(), v21.V2D());
    726   __ cmhi(v15.V2S(), v19.V2S(), v0.V2S());
    727   __ cmhi(v31.V4H(), v7.V4H(), v12.V4H());
    728   __ cmhi(v9.V4S(), v16.V4S(), v22.V4S());
    729   __ cmhi(v7.V8B(), v24.V8B(), v28.V8B());
    730   __ cmhi(v11.V8H(), v10.V8H(), v25.V8H());
    731   __ cmhs(d1, d12, d17);
    732   __ cmhs(v21.V16B(), v25.V16B(), v30.V16B());
    733   __ cmhs(v8.V2D(), v2.V2D(), v26.V2D());
    734   __ cmhs(v1.V2S(), v22.V2S(), v29.V2S());
    735   __ cmhs(v26.V4H(), v30.V4H(), v30.V4H());
    736   __ cmhs(v19.V4S(), v20.V4S(), v16.V4S());
    737   __ cmhs(v1.V8B(), v3.V8B(), v26.V8B());
    738   __ cmhs(v20.V8H(), v28.V8H(), v8.V8H());
    739   __ cmle(d30, d24, 0);
    740   __ cmle(v0.V16B(), v3.V16B(), 0);
    741   __ cmle(v2.V2D(), v30.V2D(), 0);
    742   __ cmle(v7.V2S(), v10.V2S(), 0);
    743   __ cmle(v9.V4H(), v31.V4H(), 0);
    744   __ cmle(v9.V4S(), v18.V4S(), 0);
    745   __ cmle(v21.V8B(), v31.V8B(), 0);
    746   __ cmle(v29.V8H(), v21.V8H(), 0);
    747   __ cmlt(d25, d23, 0);
    748   __ cmlt(v7.V16B(), v21.V16B(), 0);
    749   __ cmlt(v7.V2D(), v30.V2D(), 0);
    750   __ cmlt(v25.V2S(), v28.V2S(), 0);
    751   __ cmlt(v0.V4H(), v11.V4H(), 0);
    752   __ cmlt(v24.V4S(), v5.V4S(), 0);
    753   __ cmlt(v26.V8B(), v11.V8B(), 0);
    754   __ cmlt(v1.V8H(), v21.V8H(), 0);
    755   __ cmtst(d28, d23, d30);
    756   __ cmtst(v26.V16B(), v6.V16B(), v31.V16B());
    757   __ cmtst(v1.V2D(), v21.V2D(), v4.V2D());
    758   __ cmtst(v27.V2S(), v26.V2S(), v20.V2S());
    759   __ cmtst(v26.V4H(), v0.V4H(), v18.V4H());
    760   __ cmtst(v25.V4S(), v16.V4S(), v4.V4S());
    761   __ cmtst(v11.V8B(), v10.V8B(), v9.V8B());
    762   __ cmtst(v0.V8H(), v2.V8H(), v1.V8H());
    763   __ cnt(v25.V16B(), v15.V16B());
    764   __ cnt(v28.V8B(), v6.V8B());
    765   __ dup(v6.V16B(), v7.B(), 7);
    766   __ dup(v9.V16B(), w20);
    767   __ dup(v12.V2D(), v13.D(), 1);
    768   __ dup(v9.V2D(), xzr);
    769   __ dup(v4.V2S(), v26.S(), 2);
    770   __ dup(v3.V2S(), w12);
    771   __ dup(v22.V4H(), v5.H(), 7);
    772   __ dup(v16.V4H(), w25);
    773   __ dup(v20.V4S(), v10.S(), 2);
    774   __ dup(v10.V4S(), w7);
    775   __ dup(v30.V8B(), v30.B(), 2);
    776   __ dup(v31.V8B(), w15);
    777   __ dup(v28.V8H(), v17.H(), 4);
    778   __ dup(v2.V8H(), w3);
    779   __ eor(v29.V16B(), v25.V16B(), v3.V16B());
    780   __ eor(v3.V8B(), v16.V8B(), v28.V8B());
    781   __ ext(v1.V16B(), v26.V16B(), v6.V16B(), 1);
    782   __ ext(v2.V8B(), v30.V8B(), v1.V8B(), 1);
    783   __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
    784   __ ld1(v23.V16B(),
    785          v24.V16B(),
    786          v25.V16B(),
    787          v26.V16B(),
    788          MemOperand(x1, x2, PostIndex));
    789   __ ld1(v5.V16B(),
    790          v6.V16B(),
    791          v7.V16B(),
    792          v8.V16B(),
    793          MemOperand(x1, 64, PostIndex));
    794   __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), MemOperand(x0));
    795   __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), MemOperand(x1, x2, PostIndex));
    796   __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x1, 48, PostIndex));
    797   __ ld1(v17.V16B(), v18.V16B(), MemOperand(x0));
    798   __ ld1(v20.V16B(), v21.V16B(), MemOperand(x1, x2, PostIndex));
    799   __ ld1(v28.V16B(), v29.V16B(), MemOperand(x1, 32, PostIndex));
    800   __ ld1(v29.V16B(), MemOperand(x0));
    801   __ ld1(v21.V16B(), MemOperand(x1, x2, PostIndex));
    802   __ ld1(v4.V16B(), MemOperand(x1, 16, PostIndex));
    803   __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), MemOperand(x0));
    804   __ ld1(v17.V1D(),
    805          v18.V1D(),
    806          v19.V1D(),
    807          v20.V1D(),
    808          MemOperand(x1, x2, PostIndex));
    809   __ ld1(v28.V1D(),
    810          v29.V1D(),
    811          v30.V1D(),
    812          v31.V1D(),
    813          MemOperand(x1, 32, PostIndex));
    814   __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), MemOperand(x0));
    815   __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), MemOperand(x1, x2, PostIndex));
    816   __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), MemOperand(x1, 24, PostIndex));
    817   __ ld1(v29.V1D(), v30.V1D(), MemOperand(x0));
    818   __ ld1(v31.V1D(), v0.V1D(), MemOperand(x1, x2, PostIndex));
    819   __ ld1(v3.V1D(), v4.V1D(), MemOperand(x1, 16, PostIndex));
    820   __ ld1(v28.V1D(), MemOperand(x0));
    821   __ ld1(v11.V1D(), MemOperand(x1, x2, PostIndex));
    822   __ ld1(v29.V1D(), MemOperand(x1, 8, PostIndex));
    823   __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x0));
    824   __ ld1(v8.V2D(),
    825          v9.V2D(),
    826          v10.V2D(),
    827          v11.V2D(),
    828          MemOperand(x1, x2, PostIndex));
    829   __ ld1(v14.V2D(),
    830          v15.V2D(),
    831          v16.V2D(),
    832          v17.V2D(),
    833          MemOperand(x1, 64, PostIndex));
    834   __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x0));
    835   __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
    836   __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x1, 48, PostIndex));
    837   __ ld1(v18.V2D(), v19.V2D(), MemOperand(x0));
    838   __ ld1(v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
    839   __ ld1(v17.V2D(), v18.V2D(), MemOperand(x1, 32, PostIndex));
    840   __ ld1(v5.V2D(), MemOperand(x0));
    841   __ ld1(v6.V2D(), MemOperand(x1, x2, PostIndex));
    842   __ ld1(v15.V2D(), MemOperand(x1, 16, PostIndex));
    843   __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x0));
    844   __ ld1(v24.V2S(),
    845          v25.V2S(),
    846          v26.V2S(),
    847          v27.V2S(),
    848          MemOperand(x1, x2, PostIndex));
    849   __ ld1(v27.V2S(),
    850          v28.V2S(),
    851          v29.V2S(),
    852          v30.V2S(),
    853          MemOperand(x1, 32, PostIndex));
    854   __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), MemOperand(x0));
    855   __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), MemOperand(x1, x2, PostIndex));
    856   __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x1, 24, PostIndex));
    857   __ ld1(v0.V2S(), v1.V2S(), MemOperand(x0));
    858   __ ld1(v13.V2S(), v14.V2S(), MemOperand(x1, x2, PostIndex));
    859   __ ld1(v3.V2S(), v4.V2S(), MemOperand(x1, 16, PostIndex));
    860   __ ld1(v26.V2S(), MemOperand(x0));
    861   __ ld1(v0.V2S(), MemOperand(x1, x2, PostIndex));
    862   __ ld1(v11.V2S(), MemOperand(x1, 8, PostIndex));
    863   __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
    864   __ ld1(v24.V4H(),
    865          v25.V4H(),
    866          v26.V4H(),
    867          v27.V4H(),
    868          MemOperand(x1, x2, PostIndex));
    869   __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
    870   __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), MemOperand(x0));
    871   __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), MemOperand(x1, x2, PostIndex));
    872   __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 24, PostIndex));
    873   __ ld1(v3.V4H(), v4.V4H(), MemOperand(x0));
    874   __ ld1(v3.V4H(), v4.V4H(), MemOperand(x1, x2, PostIndex));
    875   __ ld1(v23.V4H(), v24.V4H(), MemOperand(x1, 16, PostIndex));
    876   __ ld1(v26.V4H(), MemOperand(x0));
    877   __ ld1(v1.V4H(), MemOperand(x1, x2, PostIndex));
    878   __ ld1(v14.V4H(), MemOperand(x1, 8, PostIndex));
    879   __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), MemOperand(x0));
    880   __ ld1(v28.V4S(),
    881          v29.V4S(),
    882          v30.V4S(),
    883          v31.V4S(),
    884          MemOperand(x1, x2, PostIndex));
    885   __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1, 64, PostIndex));
    886   __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
    887   __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), MemOperand(x1, x2, PostIndex));
    888   __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), MemOperand(x1, 48, PostIndex));
    889   __ ld1(v20.V4S(), v21.V4S(), MemOperand(x0));
    890   __ ld1(v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex));
    891   __ ld1(v11.V4S(), v12.V4S(), MemOperand(x1, 32, PostIndex));
    892   __ ld1(v15.V4S(), MemOperand(x0));
    893   __ ld1(v12.V4S(), MemOperand(x1, x2, PostIndex));
    894   __ ld1(v0.V4S(), MemOperand(x1, 16, PostIndex));
    895   __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), MemOperand(x0));
    896   __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, x2, PostIndex));
    897   __ ld1(v9.V8B(),
    898          v10.V8B(),
    899          v11.V8B(),
    900          v12.V8B(),
    901          MemOperand(x1, 32, PostIndex));
    902   __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), MemOperand(x0));
    903   __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x1, x2, PostIndex));
    904   __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
    905   __ ld1(v10.V8B(), v11.V8B(), MemOperand(x0));
    906   __ ld1(v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
    907   __ ld1(v27.V8B(), v28.V8B(), MemOperand(x1, 16, PostIndex));
    908   __ ld1(v31.V8B(), MemOperand(x0));
    909   __ ld1(v10.V8B(), MemOperand(x1, x2, PostIndex));
    910   __ ld1(v28.V8B(), MemOperand(x1, 8, PostIndex));
    911   __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
    912   __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
    913   __ ld1(v10.V8H(),
    914          v11.V8H(),
    915          v12.V8H(),
    916          v13.V8H(),
    917          MemOperand(x1, 64, PostIndex));
    918   __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
    919   __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
    920   __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), MemOperand(x1, 48, PostIndex));
    921   __ ld1(v4.V8H(), v5.V8H(), MemOperand(x0));
    922   __ ld1(v21.V8H(), v22.V8H(), MemOperand(x1, x2, PostIndex));
    923   __ ld1(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
    924   __ ld1(v9.V8H(), MemOperand(x0));
    925   __ ld1(v27.V8H(), MemOperand(x1, x2, PostIndex));
    926   __ ld1(v26.V8H(), MemOperand(x1, 16, PostIndex));
    927   __ ld1(v19.B(), 1, MemOperand(x0));
    928   __ ld1(v12.B(), 3, MemOperand(x1, x2, PostIndex));
    929   __ ld1(v27.B(), 12, MemOperand(x1, 1, PostIndex));
    930   __ ld1(v10.D(), 1, MemOperand(x0));
    931   __ ld1(v26.D(), 1, MemOperand(x1, x2, PostIndex));
    932   __ ld1(v7.D(), 1, MemOperand(x1, 8, PostIndex));
    933   __ ld1(v19.H(), 5, MemOperand(x0));
    934   __ ld1(v10.H(), 1, MemOperand(x1, x2, PostIndex));
    935   __ ld1(v5.H(), 4, MemOperand(x1, 2, PostIndex));
    936   __ ld1(v21.S(), 2, MemOperand(x0));
    937   __ ld1(v13.S(), 2, MemOperand(x1, x2, PostIndex));
    938   __ ld1(v1.S(), 2, MemOperand(x1, 4, PostIndex));
    939   __ ld1r(v2.V16B(), MemOperand(x0));
    940   __ ld1r(v2.V16B(), MemOperand(x1, x2, PostIndex));
    941   __ ld1r(v22.V16B(), MemOperand(x1, 1, PostIndex));
    942   __ ld1r(v25.V1D(), MemOperand(x0));
    943   __ ld1r(v9.V1D(), MemOperand(x1, x2, PostIndex));
    944   __ ld1r(v23.V1D(), MemOperand(x1, 8, PostIndex));
    945   __ ld1r(v19.V2D(), MemOperand(x0));
    946   __ ld1r(v21.V2D(), MemOperand(x1, x2, PostIndex));
    947   __ ld1r(v30.V2D(), MemOperand(x1, 8, PostIndex));
    948   __ ld1r(v24.V2S(), MemOperand(x0));
    949   __ ld1r(v26.V2S(), MemOperand(x1, x2, PostIndex));
    950   __ ld1r(v28.V2S(), MemOperand(x1, 4, PostIndex));
    951   __ ld1r(v19.V4H(), MemOperand(x0));
    952   __ ld1r(v1.V4H(), MemOperand(x1, x2, PostIndex));
    953   __ ld1r(v21.V4H(), MemOperand(x1, 2, PostIndex));
    954   __ ld1r(v15.V4S(), MemOperand(x0));
    955   __ ld1r(v21.V4S(), MemOperand(x1, x2, PostIndex));
    956   __ ld1r(v23.V4S(), MemOperand(x1, 4, PostIndex));
    957   __ ld1r(v26.V8B(), MemOperand(x0));
    958   __ ld1r(v14.V8B(), MemOperand(x1, x2, PostIndex));
    959   __ ld1r(v19.V8B(), MemOperand(x1, 1, PostIndex));
    960   __ ld1r(v13.V8H(), MemOperand(x0));
    961   __ ld1r(v30.V8H(), MemOperand(x1, x2, PostIndex));
    962   __ ld1r(v27.V8H(), MemOperand(x1, 2, PostIndex));
    963   __ ld2(v21.V16B(), v22.V16B(), MemOperand(x0));
    964   __ ld2(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
    965   __ ld2(v12.V16B(), v13.V16B(), MemOperand(x1, 32, PostIndex));
    966   __ ld2(v14.V2D(), v15.V2D(), MemOperand(x0));
    967   __ ld2(v0.V2D(), v1.V2D(), MemOperand(x1, x2, PostIndex));
    968   __ ld2(v12.V2D(), v13.V2D(), MemOperand(x1, 32, PostIndex));
    969   __ ld2(v27.V2S(), v28.V2S(), MemOperand(x0));
    970   __ ld2(v2.V2S(), v3.V2S(), MemOperand(x1, x2, PostIndex));
    971   __ ld2(v12.V2S(), v13.V2S(), MemOperand(x1, 16, PostIndex));
    972   __ ld2(v9.V4H(), v10.V4H(), MemOperand(x0));
    973   __ ld2(v23.V4H(), v24.V4H(), MemOperand(x1, x2, PostIndex));
    974   __ ld2(v1.V4H(), v2.V4H(), MemOperand(x1, 16, PostIndex));
    975   __ ld2(v20.V4S(), v21.V4S(), MemOperand(x0));
    976   __ ld2(v10.V4S(), v11.V4S(), MemOperand(x1, x2, PostIndex));
    977   __ ld2(v24.V4S(), v25.V4S(), MemOperand(x1, 32, PostIndex));
    978   __ ld2(v17.V8B(), v18.V8B(), MemOperand(x0));
    979   __ ld2(v13.V8B(), v14.V8B(), MemOperand(x1, x2, PostIndex));
    980   __ ld2(v7.V8B(), v8.V8B(), MemOperand(x1, 16, PostIndex));
    981   __ ld2(v30.V8H(), v31.V8H(), MemOperand(x0));
    982   __ ld2(v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
    983   __ ld2(v13.V8H(), v14.V8H(), MemOperand(x1, 32, PostIndex));
    984   __ ld2(v5.B(), v6.B(), 12, MemOperand(x0));
    985   __ ld2(v16.B(), v17.B(), 7, MemOperand(x1, x2, PostIndex));
    986   __ ld2(v29.B(), v30.B(), 2, MemOperand(x1, 2, PostIndex));
    987   __ ld2(v11.D(), v12.D(), 1, MemOperand(x0));
    988   __ ld2(v26.D(), v27.D(), 0, MemOperand(x1, x2, PostIndex));
    989   __ ld2(v25.D(), v26.D(), 0, MemOperand(x1, 16, PostIndex));
    990   __ ld2(v18.H(), v19.H(), 7, MemOperand(x0));
    991   __ ld2(v17.H(), v18.H(), 5, MemOperand(x1, x2, PostIndex));
    992   __ ld2(v30.H(), v31.H(), 2, MemOperand(x1, 4, PostIndex));
    993   __ ld2(v29.S(), v30.S(), 3, MemOperand(x0));
    994   __ ld2(v28.S(), v29.S(), 0, MemOperand(x1, x2, PostIndex));
    995   __ ld2(v6.S(), v7.S(), 1, MemOperand(x1, 8, PostIndex));
    996   __ ld2r(v26.V16B(), v27.V16B(), MemOperand(x0));
    997   __ ld2r(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
    998   __ ld2r(v5.V16B(), v6.V16B(), MemOperand(x1, 2, PostIndex));
    999   __ ld2r(v26.V1D(), v27.V1D(), MemOperand(x0));
   1000   __ ld2r(v14.V1D(), v15.V1D(), MemOperand(x1, x2, PostIndex));
   1001   __ ld2r(v23.V1D(), v24.V1D(), MemOperand(x1, 16, PostIndex));
   1002   __ ld2r(v11.V2D(), v12.V2D(), MemOperand(x0));
   1003   __ ld2r(v29.V2D(), v30.V2D(), MemOperand(x1, x2, PostIndex));
   1004   __ ld2r(v15.V2D(), v16.V2D(), MemOperand(x1, 16, PostIndex));
   1005   __ ld2r(v26.V2S(), v27.V2S(), MemOperand(x0));
   1006   __ ld2r(v22.V2S(), v23.V2S(), MemOperand(x1, x2, PostIndex));
   1007   __ ld2r(v2.V2S(), v3.V2S(), MemOperand(x1, 8, PostIndex));
   1008   __ ld2r(v2.V4H(), v3.V4H(), MemOperand(x0));
   1009   __ ld2r(v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
   1010   __ ld2r(v6.V4H(), v7.V4H(), MemOperand(x1, 4, PostIndex));
   1011   __ ld2r(v7.V4S(), v8.V4S(), MemOperand(x0));
   1012   __ ld2r(v19.V4S(), v20.V4S(), MemOperand(x1, x2, PostIndex));
   1013   __ ld2r(v21.V4S(), v22.V4S(), MemOperand(x1, 8, PostIndex));
   1014   __ ld2r(v26.V8B(), v27.V8B(), MemOperand(x0));
   1015   __ ld2r(v20.V8B(), v21.V8B(), MemOperand(x1, x2, PostIndex));
   1016   __ ld2r(v11.V8B(), v12.V8B(), MemOperand(x1, 2, PostIndex));
   1017   __ ld2r(v12.V8H(), v13.V8H(), MemOperand(x0));
   1018   __ ld2r(v6.V8H(), v7.V8H(), MemOperand(x1, x2, PostIndex));
   1019   __ ld2r(v25.V8H(), v26.V8H(), MemOperand(x1, 4, PostIndex));
   1020   __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x0));
   1021   __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), MemOperand(x1, x2, PostIndex));
   1022   __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x1, 48, PostIndex));
   1023   __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), MemOperand(x0));
   1024   __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), MemOperand(x1, x2, PostIndex));
   1025   __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), MemOperand(x1, 48, PostIndex));
   1026   __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x0));
   1027   __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), MemOperand(x1, x2, PostIndex));
   1028   __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), MemOperand(x1, 24, PostIndex));
   1029   __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), MemOperand(x0));
   1030   __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), MemOperand(x1, x2, PostIndex));
   1031   __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 24, PostIndex));
   1032   __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
   1033   __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), MemOperand(x1, x2, PostIndex));
   1034   __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), MemOperand(x1, 48, PostIndex));
   1035   __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x0));
   1036   __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
   1037   __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
   1038   __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), MemOperand(x0));
   1039   __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), MemOperand(x1, x2, PostIndex));
   1040   __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), MemOperand(x1, 48, PostIndex));
   1041   __ ld3(v21.B(), v22.B(), v23.B(), 11, MemOperand(x0));
   1042   __ ld3(v5.B(), v6.B(), v7.B(), 9, MemOperand(x1, x2, PostIndex));
   1043   __ ld3(v23.B(), v24.B(), v25.B(), 0, MemOperand(x1, 3, PostIndex));
   1044   __ ld3(v16.D(), v17.D(), v18.D(), 0, MemOperand(x0));
   1045   __ ld3(v30.D(), v31.D(), v0.D(), 0, MemOperand(x1, x2, PostIndex));
   1046   __ ld3(v28.D(), v29.D(), v30.D(), 1, MemOperand(x1, 24, PostIndex));
   1047   __ ld3(v13.H(), v14.H(), v15.H(), 2, MemOperand(x0));
   1048   __ ld3(v22.H(), v23.H(), v24.H(), 7, MemOperand(x1, x2, PostIndex));
   1049   __ ld3(v14.H(), v15.H(), v16.H(), 3, MemOperand(x1, 6, PostIndex));
   1050   __ ld3(v22.S(), v23.S(), v24.S(), 3, MemOperand(x0));
   1051   __ ld3(v30.S(), v31.S(), v0.S(), 2, MemOperand(x1, x2, PostIndex));
   1052   __ ld3(v12.S(), v13.S(), v14.S(), 1, MemOperand(x1, 12, PostIndex));
   1053   __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x0));
   1054   __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, x2, PostIndex));
   1055   __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), MemOperand(x1, 3, PostIndex));
   1056   __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), MemOperand(x0));
   1057   __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), MemOperand(x1, x2, PostIndex));
   1058   __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), MemOperand(x1, 24, PostIndex));
   1059   __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x0));
   1060   __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
   1061   __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), MemOperand(x1, 24, PostIndex));
   1062   __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), MemOperand(x0));
   1063   __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x1, x2, PostIndex));
   1064   __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, 12, PostIndex));
   1065   __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), MemOperand(x0));
   1066   __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), MemOperand(x1, x2, PostIndex));
   1067   __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 6, PostIndex));
   1068   __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x0));
   1069   __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), MemOperand(x1, x2, PostIndex));
   1070   __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, 12, PostIndex));
   1071   __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x0));
   1072   __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
   1073   __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, 3, PostIndex));
   1074   __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
   1075   __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x1, x2, PostIndex));
   1076   __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), MemOperand(x1, 6, PostIndex));
   1077   __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), MemOperand(x0));
   1078   __ ld4(v2.V16B(),
   1079          v3.V16B(),
   1080          v4.V16B(),
   1081          v5.V16B(),
   1082          MemOperand(x1, x2, PostIndex));
   1083   __ ld4(v5.V16B(),
   1084          v6.V16B(),
   1085          v7.V16B(),
   1086          v8.V16B(),
   1087          MemOperand(x1, 64, PostIndex));
   1088   __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), MemOperand(x0));
   1089   __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
   1090   __ ld4(v29.V2D(),
   1091          v30.V2D(),
   1092          v31.V2D(),
   1093          v0.V2D(),
   1094          MemOperand(x1, 64, PostIndex));
   1095   __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), MemOperand(x0));
   1096   __ ld4(v24.V2S(),
   1097          v25.V2S(),
   1098          v26.V2S(),
   1099          v27.V2S(),
   1100          MemOperand(x1, x2, PostIndex));
   1101   __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), MemOperand(x1, 32, PostIndex));
   1102   __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
   1103   __ ld4(v23.V4H(),
   1104          v24.V4H(),
   1105          v25.V4H(),
   1106          v26.V4H(),
   1107          MemOperand(x1, x2, PostIndex));
   1108   __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 32, PostIndex));
   1109   __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), MemOperand(x0));
   1110   __ ld4(v28.V4S(),
   1111          v29.V4S(),
   1112          v30.V4S(),
   1113          v31.V4S(),
   1114          MemOperand(x1, x2, PostIndex));
   1115   __ ld4(v29.V4S(),
   1116          v30.V4S(),
   1117          v31.V4S(),
   1118          v0.V4S(),
   1119          MemOperand(x1, 64, PostIndex));
   1120   __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), MemOperand(x0));
   1121   __ ld4(v27.V8B(),
   1122          v28.V8B(),
   1123          v29.V8B(),
   1124          v30.V8B(),
   1125          MemOperand(x1, x2, PostIndex));
   1126   __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, 32, PostIndex));
   1127   __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
   1128   __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
   1129   __ ld4(v20.V8H(),
   1130          v21.V8H(),
   1131          v22.V8H(),
   1132          v23.V8H(),
   1133          MemOperand(x1, 64, PostIndex));
   1134   __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, MemOperand(x0));
   1135   __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, MemOperand(x1, x2, PostIndex));
   1136   __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, MemOperand(x1, 4, PostIndex));
   1137   __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, MemOperand(x0));
   1138   __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
   1139   __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, MemOperand(x1, 32, PostIndex));
   1140   __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, MemOperand(x0));
   1141   __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, MemOperand(x1, x2, PostIndex));
   1142   __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, MemOperand(x1, 8, PostIndex));
   1143   __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, MemOperand(x0));
   1144   __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, MemOperand(x1, x2, PostIndex));
   1145   __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, MemOperand(x1, 16, PostIndex));
   1146   __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), MemOperand(x0));
   1147   __ ld4r(v13.V16B(),
   1148           v14.V16B(),
   1149           v15.V16B(),
   1150           v16.V16B(),
   1151           MemOperand(x1, x2, PostIndex));
   1152   __ ld4r(v9.V16B(),
   1153           v10.V16B(),
   1154           v11.V16B(),
   1155           v12.V16B(),
   1156           MemOperand(x1, 4, PostIndex));
   1157   __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), MemOperand(x0));
   1158   __ ld4r(v4.V1D(),
   1159           v5.V1D(),
   1160           v6.V1D(),
   1161           v7.V1D(),
   1162           MemOperand(x1, x2, PostIndex));
   1163   __ ld4r(v26.V1D(),
   1164           v27.V1D(),
   1165           v28.V1D(),
   1166           v29.V1D(),
   1167           MemOperand(x1, 32, PostIndex));
   1168   __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x0));
   1169   __ ld4r(v28.V2D(),
   1170           v29.V2D(),
   1171           v30.V2D(),
   1172           v31.V2D(),
   1173           MemOperand(x1, x2, PostIndex));
   1174   __ ld4r(v15.V2D(),
   1175           v16.V2D(),
   1176           v17.V2D(),
   1177           v18.V2D(),
   1178           MemOperand(x1, 32, PostIndex));
   1179   __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x0));
   1180   __ ld4r(v28.V2S(),
   1181           v29.V2S(),
   1182           v30.V2S(),
   1183           v31.V2S(),
   1184           MemOperand(x1, x2, PostIndex));
   1185   __ ld4r(v11.V2S(),
   1186           v12.V2S(),
   1187           v13.V2S(),
   1188           v14.V2S(),
   1189           MemOperand(x1, 16, PostIndex));
   1190   __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), MemOperand(x0));
   1191   __ ld4r(v22.V4H(),
   1192           v23.V4H(),
   1193           v24.V4H(),
   1194           v25.V4H(),
   1195           MemOperand(x1, x2, PostIndex));
   1196   __ ld4r(v20.V4H(),
   1197           v21.V4H(),
   1198           v22.V4H(),
   1199           v23.V4H(),
   1200           MemOperand(x1, 8, PostIndex));
   1201   __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x0));
   1202   __ ld4r(v25.V4S(),
   1203           v26.V4S(),
   1204           v27.V4S(),
   1205           v28.V4S(),
   1206           MemOperand(x1, x2, PostIndex));
   1207   __ ld4r(v23.V4S(),
   1208           v24.V4S(),
   1209           v25.V4S(),
   1210           v26.V4S(),
   1211           MemOperand(x1, 16, PostIndex));
   1212   __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), MemOperand(x0));
   1213   __ ld4r(v27.V8B(),
   1214           v28.V8B(),
   1215           v29.V8B(),
   1216           v30.V8B(),
   1217           MemOperand(x1, x2, PostIndex));
   1218   __ ld4r(v29.V8B(),
   1219           v30.V8B(),
   1220           v31.V8B(),
   1221           v0.V8B(),
   1222           MemOperand(x1, 4, PostIndex));
   1223   __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x0));
   1224   __ ld4r(v25.V8H(),
   1225           v26.V8H(),
   1226           v27.V8H(),
   1227           v28.V8H(),
   1228           MemOperand(x1, x2, PostIndex));
   1229   __ ld4r(v22.V8H(),
   1230           v23.V8H(),
   1231           v24.V8H(),
   1232           v25.V8H(),
   1233           MemOperand(x1, 8, PostIndex));
   1234   __ mla(v29.V16B(), v7.V16B(), v26.V16B());
   1235   __ mla(v6.V2S(), v4.V2S(), v14.V2S());
   1236   __ mla(v9.V2S(), v11.V2S(), v0.S(), 2);
   1237   __ mla(v5.V4H(), v17.V4H(), v25.V4H());
   1238   __ mla(v24.V4H(), v7.V4H(), v11.H(), 3);
   1239   __ mla(v12.V4S(), v3.V4S(), v4.V4S());
   1240   __ mla(v10.V4S(), v7.V4S(), v7.S(), 3);
   1241   __ mla(v3.V8B(), v16.V8B(), v9.V8B());
   1242   __ mla(v19.V8H(), v22.V8H(), v18.V8H());
   1243   __ mla(v6.V8H(), v2.V8H(), v0.H(), 0);
   1244   __ mls(v23.V16B(), v10.V16B(), v11.V16B());
   1245   __ mls(v14.V2S(), v31.V2S(), v22.V2S());
   1246   __ mls(v28.V2S(), v13.V2S(), v1.S(), 3);
   1247   __ mls(v2.V4H(), v19.V4H(), v13.V4H());
   1248   __ mls(v18.V4H(), v15.V4H(), v12.H(), 6);
   1249   __ mls(v6.V4S(), v11.V4S(), v16.V4S());
   1250   __ mls(v23.V4S(), v16.V4S(), v10.S(), 2);
   1251   __ mls(v26.V8B(), v13.V8B(), v23.V8B());
   1252   __ mls(v10.V8H(), v10.V8H(), v12.V8H());
   1253   __ mls(v14.V8H(), v0.V8H(), v14.H(), 7);
   1254   __ mov(b22, v1.B(), 3);
   1255   __ mov(d7, v13.D(), 1);
   1256   __ mov(h26, v21.H(), 2);
   1257   __ mov(s26, v19.S(), 0);
   1258   __ mov(v26.V16B(), v11.V16B());
   1259   __ mov(v20.V8B(), v0.V8B());
   1260   __ mov(v19.B(), 13, v6.B(), 4);
   1261   __ mov(v4.B(), 13, w19);
   1262   __ mov(v11.D(), 1, v8.D(), 0);
   1263   __ mov(v3.D(), 0, x30);
   1264   __ mov(v29.H(), 4, v11.H(), 7);
   1265   __ mov(v2.H(), 6, w6);
   1266   __ mov(v22.S(), 0, v5.S(), 2);
   1267   __ mov(v24.S(), 3, w8);
   1268   __ mov(w18, v1.S(), 3);
   1269   __ mov(x28, v21.D(), 0);
   1270   __ movi(d24, 0xffff0000ffffff);
   1271   __ movi(v29.V16B(), 0x80);
   1272   __ movi(v12.V2D(), 0xffff00ff00ffff00);
   1273   __ movi(v12.V2S(), 0xec, LSL, 24);
   1274   __ movi(v10.V2S(), 0x4c, MSL, 16);
   1275   __ movi(v26.V4H(), 0xc0, LSL);
   1276   __ movi(v24.V4S(), 0x98, LSL, 16);
   1277   __ movi(v1.V4S(), 0xde, MSL, 16);
   1278   __ movi(v21.V8B(), 0x4d);
   1279   __ movi(v29.V8H(), 0x69, LSL);
   1280   __ mul(v1.V16B(), v15.V16B(), v17.V16B());
   1281   __ mul(v21.V2S(), v19.V2S(), v29.V2S());
   1282   __ mul(v19.V2S(), v5.V2S(), v3.S(), 0);
   1283   __ mul(v29.V4H(), v11.V4H(), v2.V4H());
   1284   __ mul(v2.V4H(), v7.V4H(), v0.H(), 0);
   1285   __ mul(v25.V4S(), v26.V4S(), v16.V4S());
   1286   __ mul(v26.V4S(), v6.V4S(), v15.S(), 2);
   1287   __ mul(v11.V8B(), v15.V8B(), v31.V8B());
   1288   __ mul(v20.V8H(), v31.V8H(), v15.V8H());
   1289   __ mul(v29.V8H(), v5.V8H(), v9.H(), 4);
   1290   __ mvn(v13.V16B(), v21.V16B());
   1291   __ mvn(v28.V8B(), v19.V8B());
   1292   __ mvni(v25.V2S(), 0xb8, LSL, 8);
   1293   __ mvni(v17.V2S(), 0x6c, MSL, 16);
   1294   __ mvni(v29.V4H(), 0x48, LSL);
   1295   __ mvni(v20.V4S(), 0x7a, LSL, 16);
   1296   __ mvni(v0.V4S(), 0x1e, MSL, 8);
   1297   __ mvni(v31.V8H(), 0x3e, LSL);
   1298   __ neg(d25, d11);
   1299   __ neg(v4.V16B(), v9.V16B());
   1300   __ neg(v11.V2D(), v25.V2D());
   1301   __ neg(v7.V2S(), v18.V2S());
   1302   __ neg(v7.V4H(), v15.V4H());
   1303   __ neg(v17.V4S(), v18.V4S());
   1304   __ neg(v20.V8B(), v17.V8B());
   1305   __ neg(v0.V8H(), v11.V8H());
   1306   __ orn(v13.V16B(), v11.V16B(), v31.V16B());
   1307   __ orn(v22.V8B(), v16.V8B(), v22.V8B());
   1308   __ orr(v17.V16B(), v17.V16B(), v23.V16B());
   1309   __ orr(v8.V2S(), 0xe3);
   1310   __ orr(v11.V4H(), 0x97, 8);
   1311   __ orr(v7.V4S(), 0xab);
   1312   __ orr(v8.V8B(), v4.V8B(), v3.V8B());
   1313   __ orr(v31.V8H(), 0xb0, 8);
   1314   __ pmul(v11.V16B(), v18.V16B(), v23.V16B());
   1315   __ pmul(v8.V8B(), v24.V8B(), v5.V8B());
   1316   __ pmull(v24.V8H(), v18.V8B(), v22.V8B());
   1317   __ pmull2(v13.V8H(), v3.V16B(), v21.V16B());
   1318   __ raddhn(v22.V2S(), v10.V2D(), v21.V2D());
   1319   __ raddhn(v5.V4H(), v13.V4S(), v13.V4S());
   1320   __ raddhn(v10.V8B(), v17.V8H(), v26.V8H());
   1321   __ raddhn2(v9.V16B(), v29.V8H(), v13.V8H());
   1322   __ raddhn2(v27.V4S(), v23.V2D(), v26.V2D());
   1323   __ raddhn2(v0.V8H(), v29.V4S(), v7.V4S());
   1324   __ rbit(v22.V16B(), v15.V16B());
   1325   __ rbit(v30.V8B(), v3.V8B());
   1326   __ rev16(v31.V16B(), v27.V16B());
   1327   __ rev16(v12.V8B(), v26.V8B());
   1328   __ rev32(v5.V16B(), v4.V16B());
   1329   __ rev32(v16.V4H(), v26.V4H());
   1330   __ rev32(v20.V8B(), v3.V8B());
   1331   __ rev32(v20.V8H(), v28.V8H());
   1332   __ rev64(v9.V16B(), v19.V16B());
   1333   __ rev64(v5.V2S(), v16.V2S());
   1334   __ rev64(v7.V4H(), v31.V4H());
   1335   __ rev64(v15.V4S(), v26.V4S());
   1336   __ rev64(v25.V8B(), v9.V8B());
   1337   __ rev64(v11.V8H(), v5.V8H());
   1338   __ rshrn(v18.V2S(), v13.V2D(), 1);
   1339   __ rshrn(v25.V4H(), v30.V4S(), 2);
   1340   __ rshrn(v13.V8B(), v9.V8H(), 8);
   1341   __ rshrn2(v3.V16B(), v6.V8H(), 8);
   1342   __ rshrn2(v0.V4S(), v29.V2D(), 25);
   1343   __ rshrn2(v27.V8H(), v26.V4S(), 15);
   1344   __ rsubhn(v15.V2S(), v25.V2D(), v4.V2D());
   1345   __ rsubhn(v23.V4H(), v9.V4S(), v3.V4S());
   1346   __ rsubhn(v6.V8B(), v30.V8H(), v24.V8H());
   1347   __ rsubhn2(v4.V16B(), v24.V8H(), v20.V8H());
   1348   __ rsubhn2(v1.V4S(), v23.V2D(), v22.V2D());
   1349   __ rsubhn2(v19.V8H(), v2.V4S(), v20.V4S());
   1350   __ saba(v28.V16B(), v9.V16B(), v25.V16B());
   1351   __ saba(v9.V2S(), v28.V2S(), v20.V2S());
   1352   __ saba(v17.V4H(), v22.V4H(), v22.V4H());
   1353   __ saba(v29.V4S(), v5.V4S(), v27.V4S());
   1354   __ saba(v20.V8B(), v21.V8B(), v18.V8B());
   1355   __ saba(v27.V8H(), v17.V8H(), v30.V8H());
   1356   __ sabal(v20.V2D(), v13.V2S(), v7.V2S());
   1357   __ sabal(v4.V4S(), v12.V4H(), v4.V4H());
   1358   __ sabal(v23.V8H(), v24.V8B(), v20.V8B());
   1359   __ sabal2(v26.V2D(), v21.V4S(), v18.V4S());
   1360   __ sabal2(v27.V4S(), v28.V8H(), v8.V8H());
   1361   __ sabal2(v12.V8H(), v16.V16B(), v21.V16B());
   1362   __ sabd(v0.V16B(), v15.V16B(), v13.V16B());
   1363   __ sabd(v15.V2S(), v7.V2S(), v30.V2S());
   1364   __ sabd(v17.V4H(), v17.V4H(), v12.V4H());
   1365   __ sabd(v7.V4S(), v4.V4S(), v22.V4S());
   1366   __ sabd(v23.V8B(), v3.V8B(), v26.V8B());
   1367   __ sabd(v20.V8H(), v28.V8H(), v5.V8H());
   1368   __ sabdl(v27.V2D(), v22.V2S(), v20.V2S());
   1369   __ sabdl(v31.V4S(), v20.V4H(), v23.V4H());
   1370   __ sabdl(v0.V8H(), v20.V8B(), v27.V8B());
   1371   __ sabdl2(v31.V2D(), v11.V4S(), v3.V4S());
   1372   __ sabdl2(v26.V4S(), v11.V8H(), v27.V8H());
   1373   __ sabdl2(v6.V8H(), v8.V16B(), v18.V16B());
   1374   __ sadalp(v8.V1D(), v26.V2S());
   1375   __ sadalp(v12.V2D(), v26.V4S());
   1376   __ sadalp(v12.V2S(), v26.V4H());
   1377   __ sadalp(v4.V4H(), v1.V8B());
   1378   __ sadalp(v15.V4S(), v17.V8H());
   1379   __ sadalp(v21.V8H(), v25.V16B());
   1380   __ saddl(v5.V2D(), v10.V2S(), v14.V2S());
   1381   __ saddl(v18.V4S(), v3.V4H(), v15.V4H());
   1382   __ saddl(v15.V8H(), v2.V8B(), v23.V8B());
   1383   __ saddl2(v16.V2D(), v16.V4S(), v27.V4S());
   1384   __ saddl2(v6.V4S(), v24.V8H(), v0.V8H());
   1385   __ saddl2(v7.V8H(), v20.V16B(), v28.V16B());
   1386   __ saddlp(v10.V1D(), v25.V2S());
   1387   __ saddlp(v15.V2D(), v16.V4S());
   1388   __ saddlp(v18.V2S(), v10.V4H());
   1389   __ saddlp(v29.V4H(), v26.V8B());
   1390   __ saddlp(v10.V4S(), v1.V8H());
   1391   __ saddlp(v0.V8H(), v21.V16B());
   1392   __ saddlv(d12, v7.V4S());
   1393   __ saddlv(h14, v28.V16B());
   1394   __ saddlv(h30, v30.V8B());
   1395   __ saddlv(s27, v3.V4H());
   1396   __ saddlv(s16, v16.V8H());
   1397   __ saddw(v24.V2D(), v11.V2D(), v18.V2S());
   1398   __ saddw(v13.V4S(), v12.V4S(), v6.V4H());
   1399   __ saddw(v19.V8H(), v19.V8H(), v7.V8B());
   1400   __ saddw2(v27.V2D(), v9.V2D(), v26.V4S());
   1401   __ saddw2(v19.V4S(), v23.V4S(), v21.V8H());
   1402   __ saddw2(v15.V8H(), v25.V8H(), v30.V16B());
   1403   __ shadd(v7.V16B(), v4.V16B(), v9.V16B());
   1404   __ shadd(v29.V2S(), v25.V2S(), v24.V2S());
   1405   __ shadd(v31.V4H(), v10.V4H(), v13.V4H());
   1406   __ shadd(v21.V4S(), v16.V4S(), v8.V4S());
   1407   __ shadd(v14.V8B(), v29.V8B(), v22.V8B());
   1408   __ shadd(v19.V8H(), v24.V8H(), v20.V8H());
   1409   __ shl(d22, d25, 23);
   1410   __ shl(v5.V16B(), v17.V16B(), 7);
   1411   __ shl(v2.V2D(), v4.V2D(), 21);
   1412   __ shl(v4.V2S(), v3.V2S(), 26);
   1413   __ shl(v3.V4H(), v28.V4H(), 8);
   1414   __ shl(v4.V4S(), v31.V4S(), 24);
   1415   __ shl(v18.V8B(), v16.V8B(), 2);
   1416   __ shl(v0.V8H(), v11.V8H(), 3);
   1417   __ shll(v5.V2D(), v24.V2S(), 32);
   1418   __ shll(v26.V4S(), v20.V4H(), 16);
   1419   __ shll(v5.V8H(), v9.V8B(), 8);
   1420   __ shll2(v21.V2D(), v28.V4S(), 32);
   1421   __ shll2(v22.V4S(), v1.V8H(), 16);
   1422   __ shll2(v30.V8H(), v25.V16B(), 8);
   1423   __ shrn(v5.V2S(), v1.V2D(), 28);
   1424   __ shrn(v29.V4H(), v18.V4S(), 7);
   1425   __ shrn(v17.V8B(), v29.V8H(), 2);
   1426   __ shrn2(v5.V16B(), v30.V8H(), 3);
   1427   __ shrn2(v24.V4S(), v1.V2D(), 1);
   1428   __ shrn2(v5.V8H(), v14.V4S(), 16);
   1429   __ shsub(v30.V16B(), v22.V16B(), v23.V16B());
   1430   __ shsub(v22.V2S(), v27.V2S(), v25.V2S());
   1431   __ shsub(v13.V4H(), v22.V4H(), v1.V4H());
   1432   __ shsub(v10.V4S(), v8.V4S(), v23.V4S());
   1433   __ shsub(v6.V8B(), v9.V8B(), v31.V8B());
   1434   __ shsub(v8.V8H(), v31.V8H(), v8.V8H());
   1435   __ sli(d19, d29, 20);
   1436   __ sli(v9.V16B(), v24.V16B(), 0);
   1437   __ sli(v22.V2D(), v9.V2D(), 10);
   1438   __ sli(v11.V2S(), v27.V2S(), 20);
   1439   __ sli(v16.V4H(), v15.V4H(), 5);
   1440   __ sli(v8.V4S(), v8.V4S(), 25);
   1441   __ sli(v10.V8B(), v30.V8B(), 0);
   1442   __ sli(v7.V8H(), v28.V8H(), 6);
   1443   __ smax(v18.V16B(), v8.V16B(), v1.V16B());
   1444   __ smax(v30.V2S(), v5.V2S(), v1.V2S());
   1445   __ smax(v17.V4H(), v25.V4H(), v19.V4H());
   1446   __ smax(v1.V4S(), v24.V4S(), v31.V4S());
   1447   __ smax(v17.V8B(), v24.V8B(), v24.V8B());
   1448   __ smax(v11.V8H(), v26.V8H(), v10.V8H());
   1449   __ smaxp(v12.V16B(), v14.V16B(), v7.V16B());
   1450   __ smaxp(v31.V2S(), v24.V2S(), v6.V2S());
   1451   __ smaxp(v10.V4H(), v29.V4H(), v10.V4H());
   1452   __ smaxp(v18.V4S(), v11.V4S(), v7.V4S());
   1453   __ smaxp(v21.V8B(), v0.V8B(), v18.V8B());
   1454   __ smaxp(v26.V8H(), v8.V8H(), v15.V8H());
   1455   __ smaxv(b4, v5.V16B());
   1456   __ smaxv(b23, v0.V8B());
   1457   __ smaxv(h6, v0.V4H());
   1458   __ smaxv(h24, v8.V8H());
   1459   __ smaxv(s3, v16.V4S());
   1460   __ smin(v24.V16B(), v8.V16B(), v18.V16B());
   1461   __ smin(v29.V2S(), v8.V2S(), v23.V2S());
   1462   __ smin(v6.V4H(), v11.V4H(), v21.V4H());
   1463   __ smin(v24.V4S(), v23.V4S(), v15.V4S());
   1464   __ smin(v8.V8B(), v16.V8B(), v4.V8B());
   1465   __ smin(v12.V8H(), v1.V8H(), v10.V8H());
   1466   __ sminp(v13.V16B(), v18.V16B(), v28.V16B());
   1467   __ sminp(v22.V2S(), v28.V2S(), v16.V2S());
   1468   __ sminp(v15.V4H(), v12.V4H(), v5.V4H());
   1469   __ sminp(v15.V4S(), v17.V4S(), v8.V4S());
   1470   __ sminp(v21.V8B(), v2.V8B(), v6.V8B());
   1471   __ sminp(v21.V8H(), v12.V8H(), v6.V8H());
   1472   __ sminv(b8, v6.V16B());
   1473   __ sminv(b6, v18.V8B());
   1474   __ sminv(h20, v1.V4H());
   1475   __ sminv(h7, v17.V8H());
   1476   __ sminv(s21, v4.V4S());
   1477   __ smlal(v24.V2D(), v14.V2S(), v21.V2S());
   1478   __ smlal(v31.V2D(), v3.V2S(), v14.S(), 2);
   1479   __ smlal(v7.V4S(), v20.V4H(), v21.V4H());
   1480   __ smlal(v19.V4S(), v16.V4H(), v9.H(), 3);
   1481   __ smlal(v29.V8H(), v14.V8B(), v1.V8B());
   1482   __ smlal2(v30.V2D(), v26.V4S(), v16.V4S());
   1483   __ smlal2(v31.V2D(), v30.V4S(), v1.S(), 0);
   1484   __ smlal2(v17.V4S(), v6.V8H(), v3.V8H());
   1485   __ smlal2(v11.V4S(), v31.V8H(), v5.H(), 7);
   1486   __ smlal2(v30.V8H(), v16.V16B(), v29.V16B());
   1487   __ smlsl(v1.V2D(), v20.V2S(), v17.V2S());
   1488   __ smlsl(v29.V2D(), v12.V2S(), v5.S(), 3);
   1489   __ smlsl(v0.V4S(), v26.V4H(), v1.V4H());
   1490   __ smlsl(v3.V4S(), v5.V4H(), v6.H(), 5);
   1491   __ smlsl(v4.V8H(), v0.V8B(), v26.V8B());
   1492   __ smlsl2(v14.V2D(), v14.V4S(), v5.V4S());
   1493   __ smlsl2(v15.V2D(), v5.V4S(), v0.S(), 1);
   1494   __ smlsl2(v29.V4S(), v17.V8H(), v31.V8H());
   1495   __ smlsl2(v6.V4S(), v15.V8H(), v9.H(), 6);
   1496   __ smlsl2(v30.V8H(), v15.V16B(), v15.V16B());
   1497   __ smov(w21, v6.B(), 3);
   1498   __ smov(w13, v26.H(), 7);
   1499   __ smov(x24, v16.B(), 7);
   1500   __ smov(x7, v4.H(), 3);
   1501   __ smov(x29, v7.S(), 1);
   1502   __ smull(v4.V2D(), v29.V2S(), v17.V2S());
   1503   __ smull(v30.V2D(), v21.V2S(), v6.S(), 2);
   1504   __ smull(v23.V4S(), v5.V4H(), v23.V4H());
   1505   __ smull(v8.V4S(), v9.V4H(), v2.H(), 1);
   1506   __ smull(v31.V8H(), v17.V8B(), v1.V8B());
   1507   __ smull2(v3.V2D(), v3.V4S(), v23.V4S());
   1508   __ smull2(v15.V2D(), v29.V4S(), v6.S(), 1);
   1509   __ smull2(v19.V4S(), v20.V8H(), v30.V8H());
   1510   __ smull2(v6.V4S(), v10.V8H(), v7.H(), 4);
   1511   __ smull2(v25.V8H(), v8.V16B(), v27.V16B());
   1512   __ sqabs(b3, b15);
   1513   __ sqabs(d14, d9);
   1514   __ sqabs(h31, h28);
   1515   __ sqabs(s8, s0);
   1516   __ sqabs(v14.V16B(), v7.V16B());
   1517   __ sqabs(v23.V2D(), v19.V2D());
   1518   __ sqabs(v10.V2S(), v24.V2S());
   1519   __ sqabs(v31.V4H(), v19.V4H());
   1520   __ sqabs(v23.V4S(), v0.V4S());
   1521   __ sqabs(v29.V8B(), v23.V8B());
   1522   __ sqabs(v17.V8H(), v21.V8H());
   1523   __ sqadd(b9, b23, b13);
   1524   __ sqadd(d2, d25, d26);
   1525   __ sqadd(h7, h29, h25);
   1526   __ sqadd(s11, s7, s24);
   1527   __ sqadd(v20.V16B(), v16.V16B(), v29.V16B());
   1528   __ sqadd(v23.V2D(), v30.V2D(), v28.V2D());
   1529   __ sqadd(v8.V2S(), v19.V2S(), v2.V2S());
   1530   __ sqadd(v20.V4H(), v12.V4H(), v31.V4H());
   1531   __ sqadd(v14.V4S(), v15.V4S(), v17.V4S());
   1532   __ sqadd(v2.V8B(), v29.V8B(), v13.V8B());
   1533   __ sqadd(v7.V8H(), v19.V8H(), v14.V8H());
   1534   __ sqdmlal(d15, s5, s30);
   1535   __ sqdmlal(d24, s10, v2.S(), 3);
   1536   __ sqdmlal(s9, h19, h8);
   1537   __ sqdmlal(s14, h1, v12.H(), 3);
   1538   __ sqdmlal(v30.V2D(), v5.V2S(), v31.V2S());
   1539   __ sqdmlal(v25.V2D(), v14.V2S(), v10.S(), 1);
   1540   __ sqdmlal(v19.V4S(), v17.V4H(), v16.V4H());
   1541   __ sqdmlal(v8.V4S(), v5.V4H(), v8.H(), 1);
   1542   __ sqdmlal2(v1.V2D(), v23.V4S(), v3.V4S());
   1543   __ sqdmlal2(v19.V2D(), v0.V4S(), v9.S(), 0);
   1544   __ sqdmlal2(v26.V4S(), v22.V8H(), v11.V8H());
   1545   __ sqdmlal2(v6.V4S(), v28.V8H(), v13.H(), 4);
   1546   __ sqdmlsl(d10, s29, s20);
   1547   __ sqdmlsl(d10, s9, v10.S(), 1);
   1548   __ sqdmlsl(s30, h9, h24);
   1549   __ sqdmlsl(s13, h24, v6.H(), 1);
   1550   __ sqdmlsl(v27.V2D(), v10.V2S(), v20.V2S());
   1551   __ sqdmlsl(v23.V2D(), v23.V2S(), v3.S(), 3);
   1552   __ sqdmlsl(v7.V4S(), v17.V4H(), v29.V4H());
   1553   __ sqdmlsl(v22.V4S(), v21.V4H(), v3.H(), 4);
   1554   __ sqdmlsl2(v12.V2D(), v7.V4S(), v22.V4S());
   1555   __ sqdmlsl2(v20.V2D(), v25.V4S(), v8.S(), 0);
   1556   __ sqdmlsl2(v25.V4S(), v26.V8H(), v18.V8H());
   1557   __ sqdmlsl2(v25.V4S(), v19.V8H(), v5.H(), 0);
   1558   __ sqdmulh(h17, h27, h12);
   1559   __ sqdmulh(h16, h5, v11.H(), 0);
   1560   __ sqdmulh(s1, s19, s16);
   1561   __ sqdmulh(s1, s16, v2.S(), 0);
   1562   __ sqdmulh(v28.V2S(), v1.V2S(), v8.V2S());
   1563   __ sqdmulh(v28.V2S(), v8.V2S(), v3.S(), 0);
   1564   __ sqdmulh(v11.V4H(), v25.V4H(), v5.V4H());
   1565   __ sqdmulh(v30.V4H(), v14.V4H(), v8.H(), 5);
   1566   __ sqdmulh(v25.V4S(), v21.V4S(), v13.V4S());
   1567   __ sqdmulh(v23.V4S(), v2.V4S(), v10.S(), 3);
   1568   __ sqdmulh(v26.V8H(), v5.V8H(), v23.V8H());
   1569   __ sqdmulh(v4.V8H(), v22.V8H(), v4.H(), 3);
   1570   __ sqdmull(d25, s2, s26);
   1571   __ sqdmull(d30, s14, v5.S(), 1);
   1572   __ sqdmull(s29, h18, h11);
   1573   __ sqdmull(s11, h13, v7.H(), 6);
   1574   __ sqdmull(v23.V2D(), v9.V2S(), v8.V2S());
   1575   __ sqdmull(v18.V2D(), v29.V2S(), v4.S(), 1);
   1576   __ sqdmull(v17.V4S(), v24.V4H(), v7.V4H());
   1577   __ sqdmull(v8.V4S(), v15.V4H(), v5.H(), 1);
   1578   __ sqdmull2(v28.V2D(), v14.V4S(), v2.V4S());
   1579   __ sqdmull2(v1.V2D(), v24.V4S(), v13.S(), 2);
   1580   __ sqdmull2(v11.V4S(), v17.V8H(), v31.V8H());
   1581   __ sqdmull2(v1.V4S(), v20.V8H(), v11.H(), 3);
   1582   __ sqneg(b2, b0);
   1583   __ sqneg(d24, d2);
   1584   __ sqneg(h29, h3);
   1585   __ sqneg(s4, s9);
   1586   __ sqneg(v14.V16B(), v29.V16B());
   1587   __ sqneg(v30.V2D(), v12.V2D());
   1588   __ sqneg(v28.V2S(), v26.V2S());
   1589   __ sqneg(v4.V4H(), v4.V4H());
   1590   __ sqneg(v9.V4S(), v8.V4S());
   1591   __ sqneg(v20.V8B(), v20.V8B());
   1592   __ sqneg(v27.V8H(), v10.V8H());
   1593   __ sqrdmulh(h7, h24, h0);
   1594   __ sqrdmulh(h14, h3, v4.H(), 6);
   1595   __ sqrdmulh(s27, s19, s24);
   1596   __ sqrdmulh(s31, s21, v4.S(), 0);
   1597   __ sqrdmulh(v18.V2S(), v25.V2S(), v1.V2S());
   1598   __ sqrdmulh(v22.V2S(), v5.V2S(), v13.S(), 0);
   1599   __ sqrdmulh(v22.V4H(), v24.V4H(), v9.V4H());
   1600   __ sqrdmulh(v13.V4H(), v2.V4H(), v12.H(), 6);
   1601   __ sqrdmulh(v9.V4S(), v27.V4S(), v2.V4S());
   1602   __ sqrdmulh(v3.V4S(), v23.V4S(), v7.S(), 1);
   1603   __ sqrdmulh(v2.V8H(), v0.V8H(), v7.V8H());
   1604   __ sqrdmulh(v16.V8H(), v9.V8H(), v8.H(), 2);
   1605   __ sqrshl(b8, b21, b13);
   1606   __ sqrshl(d29, d7, d20);
   1607   __ sqrshl(h28, h14, h10);
   1608   __ sqrshl(s26, s18, s2);
   1609   __ sqrshl(v18.V16B(), v31.V16B(), v26.V16B());
   1610   __ sqrshl(v28.V2D(), v4.V2D(), v0.V2D());
   1611   __ sqrshl(v3.V2S(), v6.V2S(), v0.V2S());
   1612   __ sqrshl(v1.V4H(), v18.V4H(), v22.V4H());
   1613   __ sqrshl(v16.V4S(), v25.V4S(), v7.V4S());
   1614   __ sqrshl(v0.V8B(), v21.V8B(), v5.V8B());
   1615   __ sqrshl(v30.V8H(), v19.V8H(), v8.V8H());
   1616   __ sqrshrn(b6, h21, 4);
   1617   __ sqrshrn(h14, s17, 11);
   1618   __ sqrshrn(s25, d27, 10);
   1619   __ sqrshrn(v6.V2S(), v13.V2D(), 18);
   1620   __ sqrshrn(v5.V4H(), v9.V4S(), 15);
   1621   __ sqrshrn(v19.V8B(), v12.V8H(), 1);
   1622   __ sqrshrn2(v19.V16B(), v21.V8H(), 7);
   1623   __ sqrshrn2(v29.V4S(), v24.V2D(), 13);
   1624   __ sqrshrn2(v12.V8H(), v2.V4S(), 10);
   1625   __ sqrshrun(b16, h9, 5);
   1626   __ sqrshrun(h3, s24, 15);
   1627   __ sqrshrun(s16, d18, 8);
   1628   __ sqrshrun(v28.V2S(), v23.V2D(), 8);
   1629   __ sqrshrun(v31.V4H(), v25.V4S(), 10);
   1630   __ sqrshrun(v19.V8B(), v23.V8H(), 2);
   1631   __ sqrshrun2(v24.V16B(), v0.V8H(), 8);
   1632   __ sqrshrun2(v22.V4S(), v1.V2D(), 23);
   1633   __ sqrshrun2(v28.V8H(), v21.V4S(), 13);
   1634   __ sqshl(b6, b21, b8);
   1635   __ sqshl(b11, b26, 2);
   1636   __ sqshl(d29, d0, d4);
   1637   __ sqshl(d21, d7, 35);
   1638   __ sqshl(h20, h25, h17);
   1639   __ sqshl(h20, h0, 8);
   1640   __ sqshl(s29, s13, s4);
   1641   __ sqshl(s10, s11, 20);
   1642   __ sqshl(v8.V16B(), v18.V16B(), v28.V16B());
   1643   __ sqshl(v29.V16B(), v29.V16B(), 2);
   1644   __ sqshl(v8.V2D(), v31.V2D(), v16.V2D());
   1645   __ sqshl(v7.V2D(), v14.V2D(), 37);
   1646   __ sqshl(v0.V2S(), v26.V2S(), v7.V2S());
   1647   __ sqshl(v5.V2S(), v11.V2S(), 19);
   1648   __ sqshl(v11.V4H(), v30.V4H(), v0.V4H());
   1649   __ sqshl(v1.V4H(), v18.V4H(), 7);
   1650   __ sqshl(v22.V4S(), v3.V4S(), v30.V4S());
   1651   __ sqshl(v16.V4S(), v15.V4S(), 28);
   1652   __ sqshl(v6.V8B(), v28.V8B(), v25.V8B());
   1653   __ sqshl(v0.V8B(), v15.V8B(), 0);
   1654   __ sqshl(v6.V8H(), v16.V8H(), v30.V8H());
   1655   __ sqshl(v3.V8H(), v20.V8H(), 14);
   1656   __ sqshlu(b13, b14, 6);
   1657   __ sqshlu(d0, d16, 44);
   1658   __ sqshlu(h5, h29, 15);
   1659   __ sqshlu(s29, s8, 13);
   1660   __ sqshlu(v27.V16B(), v20.V16B(), 2);
   1661   __ sqshlu(v24.V2D(), v12.V2D(), 11);
   1662   __ sqshlu(v12.V2S(), v19.V2S(), 22);
   1663   __ sqshlu(v8.V4H(), v12.V4H(), 11);
   1664   __ sqshlu(v18.V4S(), v3.V4S(), 8);
   1665   __ sqshlu(v3.V8B(), v10.V8B(), 1);
   1666   __ sqshlu(v30.V8H(), v24.V8H(), 4);
   1667   __ sqshrn(b1, h28, 1);
   1668   __ sqshrn(h31, s7, 10);
   1669   __ sqshrn(s4, d10, 24);
   1670   __ sqshrn(v10.V2S(), v1.V2D(), 29);
   1671   __ sqshrn(v3.V4H(), v13.V4S(), 14);
   1672   __ sqshrn(v27.V8B(), v6.V8H(), 7);
   1673   __ sqshrn2(v14.V16B(), v23.V8H(), 1);
   1674   __ sqshrn2(v25.V4S(), v22.V2D(), 27);
   1675   __ sqshrn2(v31.V8H(), v12.V4S(), 10);
   1676   __ sqshrun(b9, h0, 1);
   1677   __ sqshrun(h11, s6, 7);
   1678   __ sqshrun(s13, d12, 13);
   1679   __ sqshrun(v10.V2S(), v30.V2D(), 1);
   1680   __ sqshrun(v31.V4H(), v3.V4S(), 11);
   1681   __ sqshrun(v28.V8B(), v30.V8H(), 8);
   1682   __ sqshrun2(v16.V16B(), v27.V8H(), 3);
   1683   __ sqshrun2(v27.V4S(), v14.V2D(), 18);
   1684   __ sqshrun2(v23.V8H(), v14.V4S(), 1);
   1685   __ sqsub(b19, b29, b11);
   1686   __ sqsub(d21, d31, d6);
   1687   __ sqsub(h18, h10, h19);
   1688   __ sqsub(s6, s5, s0);
   1689   __ sqsub(v21.V16B(), v22.V16B(), v0.V16B());
   1690   __ sqsub(v22.V2D(), v10.V2D(), v17.V2D());
   1691   __ sqsub(v8.V2S(), v21.V2S(), v2.V2S());
   1692   __ sqsub(v18.V4H(), v25.V4H(), v27.V4H());
   1693   __ sqsub(v13.V4S(), v3.V4S(), v6.V4S());
   1694   __ sqsub(v28.V8B(), v29.V8B(), v16.V8B());
   1695   __ sqsub(v17.V8H(), v6.V8H(), v10.V8H());
   1696   __ sqxtn(b27, h26);
   1697   __ sqxtn(h17, s11);
   1698   __ sqxtn(s22, d31);
   1699   __ sqxtn(v26.V2S(), v5.V2D());
   1700   __ sqxtn(v13.V4H(), v7.V4S());
   1701   __ sqxtn(v19.V8B(), v19.V8H());
   1702   __ sqxtn2(v19.V16B(), v3.V8H());
   1703   __ sqxtn2(v23.V4S(), v1.V2D());
   1704   __ sqxtn2(v13.V8H(), v3.V4S());
   1705   __ sqxtun(b26, h9);
   1706   __ sqxtun(h19, s12);
   1707   __ sqxtun(s3, d6);
   1708   __ sqxtun(v29.V2S(), v26.V2D());
   1709   __ sqxtun(v26.V4H(), v10.V4S());
   1710   __ sqxtun(v7.V8B(), v29.V8H());
   1711   __ sqxtun2(v21.V16B(), v14.V8H());
   1712   __ sqxtun2(v24.V4S(), v15.V2D());
   1713   __ sqxtun2(v30.V8H(), v1.V4S());
   1714   __ srhadd(v21.V16B(), v17.V16B(), v15.V16B());
   1715   __ srhadd(v28.V2S(), v21.V2S(), v29.V2S());
   1716   __ srhadd(v9.V4H(), v1.V4H(), v30.V4H());
   1717   __ srhadd(v24.V4S(), v0.V4S(), v2.V4S());
   1718   __ srhadd(v6.V8B(), v17.V8B(), v15.V8B());
   1719   __ srhadd(v5.V8H(), v7.V8H(), v21.V8H());
   1720   __ sri(d14, d14, 49);
   1721   __ sri(v23.V16B(), v8.V16B(), 4);
   1722   __ sri(v20.V2D(), v13.V2D(), 20);
   1723   __ sri(v16.V2S(), v2.V2S(), 24);
   1724   __ sri(v5.V4H(), v23.V4H(), 11);
   1725   __ sri(v27.V4S(), v15.V4S(), 23);
   1726   __ sri(v19.V8B(), v29.V8B(), 4);
   1727   __ sri(v7.V8H(), v29.V8H(), 3);
   1728   __ srshl(d2, d9, d26);
   1729   __ srshl(v29.V16B(), v17.V16B(), v11.V16B());
   1730   __ srshl(v8.V2D(), v15.V2D(), v4.V2D());
   1731   __ srshl(v25.V2S(), v17.V2S(), v8.V2S());
   1732   __ srshl(v19.V4H(), v7.V4H(), v7.V4H());
   1733   __ srshl(v13.V4S(), v2.V4S(), v17.V4S());
   1734   __ srshl(v22.V8B(), v6.V8B(), v21.V8B());
   1735   __ srshl(v10.V8H(), v17.V8H(), v4.V8H());
   1736   __ srshr(d21, d18, 45);
   1737   __ srshr(v3.V16B(), v11.V16B(), 7);
   1738   __ srshr(v21.V2D(), v26.V2D(), 53);
   1739   __ srshr(v11.V2S(), v5.V2S(), 28);
   1740   __ srshr(v7.V4H(), v18.V4H(), 12);
   1741   __ srshr(v7.V4S(), v3.V4S(), 30);
   1742   __ srshr(v14.V8B(), v2.V8B(), 6);
   1743   __ srshr(v21.V8H(), v20.V8H(), 3);
   1744   __ srsra(d21, d30, 63);
   1745   __ srsra(v27.V16B(), v30.V16B(), 6);
   1746   __ srsra(v20.V2D(), v12.V2D(), 27);
   1747   __ srsra(v0.V2S(), v17.V2S(), 5);
   1748   __ srsra(v14.V4H(), v16.V4H(), 15);
   1749   __ srsra(v18.V4S(), v3.V4S(), 20);
   1750   __ srsra(v21.V8B(), v1.V8B(), 1);
   1751   __ srsra(v31.V8H(), v25.V8H(), 2);
   1752   __ sshl(d1, d13, d9);
   1753   __ sshl(v17.V16B(), v31.V16B(), v15.V16B());
   1754   __ sshl(v13.V2D(), v16.V2D(), v0.V2D());
   1755   __ sshl(v0.V2S(), v7.V2S(), v22.V2S());
   1756   __ sshl(v23.V4H(), v19.V4H(), v4.V4H());
   1757   __ sshl(v5.V4S(), v5.V4S(), v11.V4S());
   1758   __ sshl(v23.V8B(), v27.V8B(), v7.V8B());
   1759   __ sshl(v29.V8H(), v10.V8H(), v5.V8H());
   1760   __ sshll(v0.V2D(), v2.V2S(), 23);
   1761   __ sshll(v11.V4S(), v8.V4H(), 8);
   1762   __ sshll(v4.V8H(), v29.V8B(), 1);
   1763   __ sshll2(v10.V2D(), v4.V4S(), 14);
   1764   __ sshll2(v26.V4S(), v31.V8H(), 6);
   1765   __ sshll2(v3.V8H(), v26.V16B(), 4);
   1766   __ sshr(d19, d21, 20);
   1767   __ sshr(v15.V16B(), v23.V16B(), 5);
   1768   __ sshr(v17.V2D(), v14.V2D(), 38);
   1769   __ sshr(v3.V2S(), v29.V2S(), 23);
   1770   __ sshr(v23.V4H(), v27.V4H(), 4);
   1771   __ sshr(v28.V4S(), v3.V4S(), 4);
   1772   __ sshr(v14.V8B(), v2.V8B(), 6);
   1773   __ sshr(v3.V8H(), v8.V8H(), 6);
   1774   __ ssra(d12, d28, 44);
   1775   __ ssra(v29.V16B(), v31.V16B(), 4);
   1776   __ ssra(v3.V2D(), v0.V2D(), 24);
   1777   __ ssra(v14.V2S(), v28.V2S(), 6);
   1778   __ ssra(v18.V4H(), v8.V4H(), 7);
   1779   __ ssra(v31.V4S(), v14.V4S(), 24);
   1780   __ ssra(v28.V8B(), v26.V8B(), 5);
   1781   __ ssra(v9.V8H(), v9.V8H(), 14);
   1782   __ ssubl(v13.V2D(), v14.V2S(), v3.V2S());
   1783   __ ssubl(v5.V4S(), v16.V4H(), v8.V4H());
   1784   __ ssubl(v0.V8H(), v28.V8B(), v6.V8B());
   1785   __ ssubl2(v5.V2D(), v13.V4S(), v25.V4S());
   1786   __ ssubl2(v3.V4S(), v15.V8H(), v17.V8H());
   1787   __ ssubl2(v15.V8H(), v15.V16B(), v14.V16B());
   1788   __ ssubw(v25.V2D(), v23.V2D(), v26.V2S());
   1789   __ ssubw(v21.V4S(), v18.V4S(), v24.V4H());
   1790   __ ssubw(v30.V8H(), v22.V8H(), v3.V8B());
   1791   __ ssubw2(v16.V2D(), v24.V2D(), v28.V4S());
   1792   __ ssubw2(v31.V4S(), v11.V4S(), v15.V8H());
   1793   __ ssubw2(v4.V8H(), v8.V8H(), v16.V16B());
   1794   __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
   1795   __ st1(v10.V16B(),
   1796          v11.V16B(),
   1797          v12.V16B(),
   1798          v13.V16B(),
   1799          MemOperand(x1, x2, PostIndex));
   1800   __ st1(v27.V16B(),
   1801          v28.V16B(),
   1802          v29.V16B(),
   1803          v30.V16B(),
   1804          MemOperand(x1, 64, PostIndex));
   1805   __ st1(v16.V16B(), v17.V16B(), v18.V16B(), MemOperand(x0));
   1806   __ st1(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
   1807   __ st1(v9.V16B(), v10.V16B(), v11.V16B(), MemOperand(x1, 48, PostIndex));
   1808   __ st1(v7.V16B(), v8.V16B(), MemOperand(x0));
   1809   __ st1(v26.V16B(), v27.V16B(), MemOperand(x1, x2, PostIndex));
   1810   __ st1(v22.V16B(), v23.V16B(), MemOperand(x1, 32, PostIndex));
   1811   __ st1(v23.V16B(), MemOperand(x0));
   1812   __ st1(v28.V16B(), MemOperand(x1, x2, PostIndex));
   1813   __ st1(v2.V16B(), MemOperand(x1, 16, PostIndex));
   1814   __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), MemOperand(x0));
   1815   __ st1(v12.V1D(),
   1816          v13.V1D(),
   1817          v14.V1D(),
   1818          v15.V1D(),
   1819          MemOperand(x1, x2, PostIndex));
   1820   __ st1(v30.V1D(),
   1821          v31.V1D(),
   1822          v0.V1D(),
   1823          v1.V1D(),
   1824          MemOperand(x1, 32, PostIndex));
   1825   __ st1(v16.V1D(), v17.V1D(), v18.V1D(), MemOperand(x0));
   1826   __ st1(v3.V1D(), v4.V1D(), v5.V1D(), MemOperand(x1, x2, PostIndex));
   1827   __ st1(v14.V1D(), v15.V1D(), v16.V1D(), MemOperand(x1, 24, PostIndex));
   1828   __ st1(v18.V1D(), v19.V1D(), MemOperand(x0));
   1829   __ st1(v5.V1D(), v6.V1D(), MemOperand(x1, x2, PostIndex));
   1830   __ st1(v2.V1D(), v3.V1D(), MemOperand(x1, 16, PostIndex));
   1831   __ st1(v4.V1D(), MemOperand(x0));
   1832   __ st1(v27.V1D(), MemOperand(x1, x2, PostIndex));
   1833   __ st1(v23.V1D(), MemOperand(x1, 8, PostIndex));
   1834   __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), MemOperand(x0));
   1835   __ st1(v22.V2D(),
   1836          v23.V2D(),
   1837          v24.V2D(),
   1838          v25.V2D(),
   1839          MemOperand(x1, x2, PostIndex));
   1840   __ st1(v28.V2D(),
   1841          v29.V2D(),
   1842          v30.V2D(),
   1843          v31.V2D(),
   1844          MemOperand(x1, 64, PostIndex));
   1845   __ st1(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
   1846   __ st1(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x1, x2, PostIndex));
   1847   __ st1(v22.V2D(), v23.V2D(), v24.V2D(), MemOperand(x1, 48, PostIndex));
   1848   __ st1(v21.V2D(), v22.V2D(), MemOperand(x0));
   1849   __ st1(v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
   1850   __ st1(v27.V2D(), v28.V2D(), MemOperand(x1, 32, PostIndex));
   1851   __ st1(v21.V2D(), MemOperand(x0));
   1852   __ st1(v29.V2D(), MemOperand(x1, x2, PostIndex));
   1853   __ st1(v20.V2D(), MemOperand(x1, 16, PostIndex));
   1854   __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x0));
   1855   __ st1(v8.V2S(),
   1856          v9.V2S(),
   1857          v10.V2S(),
   1858          v11.V2S(),
   1859          MemOperand(x1, x2, PostIndex));
   1860   __ st1(v15.V2S(),
   1861          v16.V2S(),
   1862          v17.V2S(),
   1863          v18.V2S(),
   1864          MemOperand(x1, 32, PostIndex));
   1865   __ st1(v2.V2S(), v3.V2S(), v4.V2S(), MemOperand(x0));
   1866   __ st1(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, x2, PostIndex));
   1867   __ st1(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x1, 24, PostIndex));
   1868   __ st1(v28.V2S(), v29.V2S(), MemOperand(x0));
   1869   __ st1(v29.V2S(), v30.V2S(), MemOperand(x1, x2, PostIndex));
   1870   __ st1(v23.V2S(), v24.V2S(), MemOperand(x1, 16, PostIndex));
   1871   __ st1(v6.V2S(), MemOperand(x0));
   1872   __ st1(v11.V2S(), MemOperand(x1, x2, PostIndex));
   1873   __ st1(v17.V2S(), MemOperand(x1, 8, PostIndex));
   1874   __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x0));
   1875   __ st1(v9.V4H(),
   1876          v10.V4H(),
   1877          v11.V4H(),
   1878          v12.V4H(),
   1879          MemOperand(x1, x2, PostIndex));
   1880   __ st1(v25.V4H(),
   1881          v26.V4H(),
   1882          v27.V4H(),
   1883          v28.V4H(),
   1884          MemOperand(x1, 32, PostIndex));
   1885   __ st1(v11.V4H(), v12.V4H(), v13.V4H(), MemOperand(x0));
   1886   __ st1(v10.V4H(), v11.V4H(), v12.V4H(), MemOperand(x1, x2, PostIndex));
   1887   __ st1(v12.V4H(), v13.V4H(), v14.V4H(), MemOperand(x1, 24, PostIndex));
   1888   __ st1(v13.V4H(), v14.V4H(), MemOperand(x0));
   1889   __ st1(v15.V4H(), v16.V4H(), MemOperand(x1, x2, PostIndex));
   1890   __ st1(v21.V4H(), v22.V4H(), MemOperand(x1, 16, PostIndex));
   1891   __ st1(v16.V4H(), MemOperand(x0));
   1892   __ st1(v8.V4H(), MemOperand(x1, x2, PostIndex));
   1893   __ st1(v30.V4H(), MemOperand(x1, 8, PostIndex));
   1894   __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), MemOperand(x0));
   1895   __ st1(v25.V4S(),
   1896          v26.V4S(),
   1897          v27.V4S(),
   1898          v28.V4S(),
   1899          MemOperand(x1, x2, PostIndex));
   1900   __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 64, PostIndex));
   1901   __ st1(v31.V4S(), v0.V4S(), v1.V4S(), MemOperand(x0));
   1902   __ st1(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
   1903   __ st1(v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 48, PostIndex));
   1904   __ st1(v17.V4S(), v18.V4S(), MemOperand(x0));
   1905   __ st1(v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
   1906   __ st1(v1.V4S(), v2.V4S(), MemOperand(x1, 32, PostIndex));
   1907   __ st1(v26.V4S(), MemOperand(x0));
   1908   __ st1(v15.V4S(), MemOperand(x1, x2, PostIndex));
   1909   __ st1(v13.V4S(), MemOperand(x1, 16, PostIndex));
   1910   __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
   1911   __ st1(v10.V8B(),
   1912          v11.V8B(),
   1913          v12.V8B(),
   1914          v13.V8B(),
   1915          MemOperand(x1, x2, PostIndex));
   1916   __ st1(v15.V8B(),
   1917          v16.V8B(),
   1918          v17.V8B(),
   1919          v18.V8B(),
   1920          MemOperand(x1, 32, PostIndex));
   1921   __ st1(v19.V8B(), v20.V8B(), v21.V8B(), MemOperand(x0));
   1922   __ st1(v31.V8B(), v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
   1923   __ st1(v9.V8B(), v10.V8B(), v11.V8B(), MemOperand(x1, 24, PostIndex));
   1924   __ st1(v12.V8B(), v13.V8B(), MemOperand(x0));
   1925   __ st1(v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
   1926   __ st1(v0.V8B(), v1.V8B(), MemOperand(x1, 16, PostIndex));
   1927   __ st1(v16.V8B(), MemOperand(x0));
   1928   __ st1(v25.V8B(), MemOperand(x1, x2, PostIndex));
   1929   __ st1(v31.V8B(), MemOperand(x1, 8, PostIndex));
   1930   __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), MemOperand(x0));
   1931   __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), MemOperand(x1, x2, PostIndex));
   1932   __ st1(v26.V8H(),
   1933          v27.V8H(),
   1934          v28.V8H(),
   1935          v29.V8H(),
   1936          MemOperand(x1, 64, PostIndex));
   1937   __ st1(v10.V8H(), v11.V8H(), v12.V8H(), MemOperand(x0));
   1938   __ st1(v21.V8H(), v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
   1939   __ st1(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
   1940   __ st1(v26.V8H(), v27.V8H(), MemOperand(x0));
   1941   __ st1(v24.V8H(), v25.V8H(), MemOperand(x1, x2, PostIndex));
   1942   __ st1(v17.V8H(), v18.V8H(), MemOperand(x1, 32, PostIndex));
   1943   __ st1(v29.V8H(), MemOperand(x0));
   1944   __ st1(v19.V8H(), MemOperand(x1, x2, PostIndex));
   1945   __ st1(v23.V8H(), MemOperand(x1, 16, PostIndex));
   1946   __ st1(v19.B(), 15, MemOperand(x0));
   1947   __ st1(v25.B(), 9, MemOperand(x1, x2, PostIndex));
   1948   __ st1(v4.B(), 8, MemOperand(x1, 1, PostIndex));
   1949   __ st1(v13.D(), 0, MemOperand(x0));
   1950   __ st1(v30.D(), 0, MemOperand(x1, x2, PostIndex));
   1951   __ st1(v3.D(), 0, MemOperand(x1, 8, PostIndex));
   1952   __ st1(v22.H(), 0, MemOperand(x0));
   1953   __ st1(v31.H(), 7, MemOperand(x1, x2, PostIndex));
   1954   __ st1(v23.H(), 3, MemOperand(x1, 2, PostIndex));
   1955   __ st1(v0.S(), 0, MemOperand(x0));
   1956   __ st1(v11.S(), 3, MemOperand(x1, x2, PostIndex));
   1957   __ st1(v24.S(), 3, MemOperand(x1, 4, PostIndex));
   1958   __ st2(v7.V16B(), v8.V16B(), MemOperand(x0));
   1959   __ st2(v5.V16B(), v6.V16B(), MemOperand(x1, x2, PostIndex));
   1960   __ st2(v18.V16B(), v19.V16B(), MemOperand(x1, 32, PostIndex));
   1961   __ st2(v14.V2D(), v15.V2D(), MemOperand(x0));
   1962   __ st2(v7.V2D(), v8.V2D(), MemOperand(x1, x2, PostIndex));
   1963   __ st2(v24.V2D(), v25.V2D(), MemOperand(x1, 32, PostIndex));
   1964   __ st2(v22.V2S(), v23.V2S(), MemOperand(x0));
   1965   __ st2(v4.V2S(), v5.V2S(), MemOperand(x1, x2, PostIndex));
   1966   __ st2(v2.V2S(), v3.V2S(), MemOperand(x1, 16, PostIndex));
   1967   __ st2(v23.V4H(), v24.V4H(), MemOperand(x0));
   1968   __ st2(v8.V4H(), v9.V4H(), MemOperand(x1, x2, PostIndex));
   1969   __ st2(v7.V4H(), v8.V4H(), MemOperand(x1, 16, PostIndex));
   1970   __ st2(v17.V4S(), v18.V4S(), MemOperand(x0));
   1971   __ st2(v6.V4S(), v7.V4S(), MemOperand(x1, x2, PostIndex));
   1972   __ st2(v26.V4S(), v27.V4S(), MemOperand(x1, 32, PostIndex));
   1973   __ st2(v31.V8B(), v0.V8B(), MemOperand(x0));
   1974   __ st2(v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
   1975   __ st2(v21.V8B(), v22.V8B(), MemOperand(x1, 16, PostIndex));
   1976   __ st2(v7.V8H(), v8.V8H(), MemOperand(x0));
   1977   __ st2(v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
   1978   __ st2(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
   1979   __ st2(v8.B(), v9.B(), 15, MemOperand(x0));
   1980   __ st2(v8.B(), v9.B(), 15, MemOperand(x1, x2, PostIndex));
   1981   __ st2(v7.B(), v8.B(), 4, MemOperand(x1, 2, PostIndex));
   1982   __ st2(v25.D(), v26.D(), 0, MemOperand(x0));
   1983   __ st2(v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
   1984   __ st2(v3.D(), v4.D(), 1, MemOperand(x1, 16, PostIndex));
   1985   __ st2(v4.H(), v5.H(), 3, MemOperand(x0));
   1986   __ st2(v0.H(), v1.H(), 5, MemOperand(x1, x2, PostIndex));
   1987   __ st2(v22.H(), v23.H(), 2, MemOperand(x1, 4, PostIndex));
   1988   __ st2(v14.S(), v15.S(), 3, MemOperand(x0));
   1989   __ st2(v23.S(), v24.S(), 3, MemOperand(x1, x2, PostIndex));
   1990   __ st2(v0.S(), v1.S(), 2, MemOperand(x1, 8, PostIndex));
   1991   __ st3(v26.V16B(), v27.V16B(), v28.V16B(), MemOperand(x0));
   1992   __ st3(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
   1993   __ st3(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, 48, PostIndex));
   1994   __ st3(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
   1995   __ st3(v23.V2D(), v24.V2D(), v25.V2D(), MemOperand(x1, x2, PostIndex));
   1996   __ st3(v10.V2D(), v11.V2D(), v12.V2D(), MemOperand(x1, 48, PostIndex));
   1997   __ st3(v9.V2S(), v10.V2S(), v11.V2S(), MemOperand(x0));
   1998   __ st3(v13.V2S(), v14.V2S(), v15.V2S(), MemOperand(x1, x2, PostIndex));
   1999   __ st3(v22.V2S(), v23.V2S(), v24.V2S(), MemOperand(x1, 24, PostIndex));
   2000   __ st3(v31.V4H(), v0.V4H(), v1.V4H(), MemOperand(x0));
   2001   __ st3(v8.V4H(), v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
   2002   __ st3(v19.V4H(), v20.V4H(), v21.V4H(), MemOperand(x1, 24, PostIndex));
   2003   __ st3(v18.V4S(), v19.V4S(), v20.V4S(), MemOperand(x0));
   2004   __ st3(v25.V4S(), v26.V4S(), v27.V4S(), MemOperand(x1, x2, PostIndex));
   2005   __ st3(v16.V4S(), v17.V4S(), v18.V4S(), MemOperand(x1, 48, PostIndex));
   2006   __ st3(v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
   2007   __ st3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x1, x2, PostIndex));
   2008   __ st3(v30.V8B(), v31.V8B(), v0.V8B(), MemOperand(x1, 24, PostIndex));
   2009   __ st3(v8.V8H(), v9.V8H(), v10.V8H(), MemOperand(x0));
   2010   __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, x2, PostIndex));
   2011   __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
   2012   __ st3(v31.B(), v0.B(), v1.B(), 10, MemOperand(x0));
   2013   __ st3(v4.B(), v5.B(), v6.B(), 5, MemOperand(x1, x2, PostIndex));
   2014   __ st3(v5.B(), v6.B(), v7.B(), 1, MemOperand(x1, 3, PostIndex));
   2015   __ st3(v5.D(), v6.D(), v7.D(), 0, MemOperand(x0));
   2016   __ st3(v6.D(), v7.D(), v8.D(), 0, MemOperand(x1, x2, PostIndex));
   2017   __ st3(v0.D(), v1.D(), v2.D(), 0, MemOperand(x1, 24, PostIndex));
   2018   __ st3(v31.H(), v0.H(), v1.H(), 2, MemOperand(x0));
   2019   __ st3(v14.H(), v15.H(), v16.H(), 5, MemOperand(x1, x2, PostIndex));
   2020   __ st3(v21.H(), v22.H(), v23.H(), 6, MemOperand(x1, 6, PostIndex));
   2021   __ st3(v21.S(), v22.S(), v23.S(), 0, MemOperand(x0));
   2022   __ st3(v11.S(), v12.S(), v13.S(), 1, MemOperand(x1, x2, PostIndex));
   2023   __ st3(v15.S(), v16.S(), v17.S(), 0, MemOperand(x1, 12, PostIndex));
   2024   __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), MemOperand(x0));
   2025   __ st4(v24.V16B(),
   2026          v25.V16B(),
   2027          v26.V16B(),
   2028          v27.V16B(),
   2029          MemOperand(x1, x2, PostIndex));
   2030   __ st4(v15.V16B(),
   2031          v16.V16B(),
   2032          v17.V16B(),
   2033          v18.V16B(),
   2034          MemOperand(x1, 64, PostIndex));
   2035   __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
   2036   __ st4(v17.V2D(),
   2037          v18.V2D(),
   2038          v19.V2D(),
   2039          v20.V2D(),
   2040          MemOperand(x1, x2, PostIndex));
   2041   __ st4(v9.V2D(),
   2042          v10.V2D(),
   2043          v11.V2D(),
   2044          v12.V2D(),
   2045          MemOperand(x1, 64, PostIndex));
   2046   __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), MemOperand(x0));
   2047   __ st4(v15.V2S(),
   2048          v16.V2S(),
   2049          v17.V2S(),
   2050          v18.V2S(),
   2051          MemOperand(x1, x2, PostIndex));
   2052   __ st4(v24.V2S(),
   2053          v25.V2S(),
   2054          v26.V2S(),
   2055          v27.V2S(),
   2056          MemOperand(x1, 32, PostIndex));
   2057   __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), MemOperand(x0));
   2058   __ st4(v18.V4H(),
   2059          v19.V4H(),
   2060          v20.V4H(),
   2061          v21.V4H(),
   2062          MemOperand(x1, x2, PostIndex));
   2063   __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
   2064   __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), MemOperand(x0));
   2065   __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), MemOperand(x1, x2, PostIndex));
   2066   __ st4(v15.V4S(),
   2067          v16.V4S(),
   2068          v17.V4S(),
   2069          v18.V4S(),
   2070          MemOperand(x1, 64, PostIndex));
   2071   __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
   2072   __ st4(v25.V8B(),
   2073          v26.V8B(),
   2074          v27.V8B(),
   2075          v28.V8B(),
   2076          MemOperand(x1, x2, PostIndex));
   2077   __ st4(v19.V8B(),
   2078          v20.V8B(),
   2079          v21.V8B(),
   2080          v22.V8B(),
   2081          MemOperand(x1, 32, PostIndex));
   2082   __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), MemOperand(x0));
   2083   __ st4(v15.V8H(),
   2084          v16.V8H(),
   2085          v17.V8H(),
   2086          v18.V8H(),
   2087          MemOperand(x1, x2, PostIndex));
   2088   __ st4(v31.V8H(),
   2089          v0.V8H(),
   2090          v1.V8H(),
   2091          v2.V8H(),
   2092          MemOperand(x1, 64, PostIndex));
   2093   __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, MemOperand(x0));
   2094   __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, MemOperand(x1, x2, PostIndex));
   2095   __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, MemOperand(x1, 4, PostIndex));
   2096   __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, MemOperand(x0));
   2097   __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, MemOperand(x1, x2, PostIndex));
   2098   __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, MemOperand(x1, 32, PostIndex));
   2099   __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, MemOperand(x0));
   2100   __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, MemOperand(x1, x2, PostIndex));
   2101   __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, MemOperand(x1, 8, PostIndex));
   2102   __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, MemOperand(x0));
   2103   __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, MemOperand(x1, x2, PostIndex));
   2104   __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, MemOperand(x1, 16, PostIndex));
   2105   __ sub(d12, d17, d2);
   2106   __ sub(v20.V16B(), v24.V16B(), v8.V16B());
   2107   __ sub(v8.V2D(), v29.V2D(), v5.V2D());
   2108   __ sub(v2.V2S(), v28.V2S(), v24.V2S());
   2109   __ sub(v24.V4H(), v10.V4H(), v4.V4H());
   2110   __ sub(v28.V4S(), v4.V4S(), v17.V4S());
   2111   __ sub(v16.V8B(), v27.V8B(), v2.V8B());
   2112   __ sub(v20.V8H(), v10.V8H(), v13.V8H());
   2113   __ subhn(v5.V2S(), v14.V2D(), v13.V2D());
   2114   __ subhn(v10.V4H(), v5.V4S(), v8.V4S());
   2115   __ subhn(v6.V8B(), v10.V8H(), v22.V8H());
   2116   __ subhn2(v11.V16B(), v6.V8H(), v9.V8H());
   2117   __ subhn2(v25.V4S(), v18.V2D(), v24.V2D());
   2118   __ subhn2(v20.V8H(), v21.V4S(), v1.V4S());
   2119   __ suqadd(b25, b11);
   2120   __ suqadd(d13, d1);
   2121   __ suqadd(h0, h9);
   2122   __ suqadd(s22, s8);
   2123   __ suqadd(v24.V16B(), v27.V16B());
   2124   __ suqadd(v26.V2D(), v14.V2D());
   2125   __ suqadd(v7.V2S(), v10.V2S());
   2126   __ suqadd(v25.V4H(), v12.V4H());
   2127   __ suqadd(v4.V4S(), v3.V4S());
   2128   __ suqadd(v14.V8B(), v18.V8B());
   2129   __ suqadd(v31.V8H(), v8.V8H());
   2130   __ sxtl(v16.V2D(), v20.V2S());
   2131   __ sxtl(v27.V4S(), v28.V4H());
   2132   __ sxtl(v0.V8H(), v22.V8B());
   2133   __ sxtl2(v6.V2D(), v7.V4S());
   2134   __ sxtl2(v9.V4S(), v27.V8H());
   2135   __ sxtl2(v16.V8H(), v16.V16B());
   2136   __ tbl(v25.V16B(),
   2137          v17.V16B(),
   2138          v18.V16B(),
   2139          v19.V16B(),
   2140          v20.V16B(),
   2141          v22.V16B());
   2142   __ tbl(v28.V16B(), v13.V16B(), v14.V16B(), v15.V16B(), v4.V16B());
   2143   __ tbl(v3.V16B(), v0.V16B(), v1.V16B(), v2.V16B());
   2144   __ tbl(v20.V16B(), v15.V16B(), v4.V16B());
   2145   __ tbl(v7.V8B(), v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), v20.V8B());
   2146   __ tbl(v8.V8B(), v1.V16B(), v2.V16B(), v3.V16B(), v31.V8B());
   2147   __ tbl(v8.V8B(), v25.V16B(), v26.V16B(), v16.V8B());
   2148   __ tbl(v11.V8B(), v19.V16B(), v30.V8B());
   2149   __ tbx(v25.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), v28.V16B(), v5.V16B());
   2150   __ tbx(v21.V16B(), v29.V16B(), v30.V16B(), v31.V16B(), v24.V16B());
   2151   __ tbx(v6.V16B(), v16.V16B(), v17.V16B(), v1.V16B());
   2152   __ tbx(v13.V16B(), v3.V16B(), v20.V16B());
   2153   __ tbx(v24.V8B(), v29.V16B(), v30.V16B(), v31.V16B(), v0.V16B(), v9.V8B());
   2154   __ tbx(v17.V8B(), v9.V16B(), v10.V16B(), v11.V16B(), v26.V8B());
   2155   __ tbx(v5.V8B(), v3.V16B(), v4.V16B(), v21.V8B());
   2156   __ tbx(v16.V8B(), v11.V16B(), v29.V8B());
   2157   __ trn1(v19.V16B(), v24.V16B(), v12.V16B());
   2158   __ trn1(v2.V2D(), v7.V2D(), v10.V2D());
   2159   __ trn1(v22.V2S(), v0.V2S(), v21.V2S());
   2160   __ trn1(v12.V4H(), v15.V4H(), v20.V4H());
   2161   __ trn1(v30.V4S(), v17.V4S(), v9.V4S());
   2162   __ trn1(v12.V8B(), v19.V8B(), v29.V8B());
   2163   __ trn1(v23.V8H(), v8.V8H(), v9.V8H());
   2164   __ trn2(v28.V16B(), v30.V16B(), v25.V16B());
   2165   __ trn2(v7.V2D(), v27.V2D(), v7.V2D());
   2166   __ trn2(v30.V2S(), v16.V2S(), v19.V2S());
   2167   __ trn2(v24.V4H(), v6.V4H(), v25.V4H());
   2168   __ trn2(v2.V4S(), v19.V4S(), v11.V4S());
   2169   __ trn2(v25.V8B(), v27.V8B(), v18.V8B());
   2170   __ trn2(v12.V8H(), v4.V8H(), v15.V8H());
   2171   __ uaba(v31.V16B(), v12.V16B(), v28.V16B());
   2172   __ uaba(v18.V2S(), v5.V2S(), v14.V2S());
   2173   __ uaba(v9.V4H(), v20.V4H(), v21.V4H());
   2174   __ uaba(v6.V4S(), v20.V4S(), v2.V4S());
   2175   __ uaba(v16.V8B(), v12.V8B(), v5.V8B());
   2176   __ uaba(v15.V8H(), v26.V8H(), v30.V8H());
   2177   __ uabal(v10.V2D(), v18.V2S(), v15.V2S());
   2178   __ uabal(v30.V4S(), v19.V4H(), v7.V4H());
   2179   __ uabal(v4.V8H(), v27.V8B(), v0.V8B());
   2180   __ uabal2(v19.V2D(), v12.V4S(), v2.V4S());
   2181   __ uabal2(v26.V4S(), v5.V8H(), v12.V8H());
   2182   __ uabal2(v19.V8H(), v20.V16B(), v28.V16B());
   2183   __ uabd(v18.V16B(), v4.V16B(), v21.V16B());
   2184   __ uabd(v30.V2S(), v21.V2S(), v16.V2S());
   2185   __ uabd(v8.V4H(), v28.V4H(), v25.V4H());
   2186   __ uabd(v28.V4S(), v12.V4S(), v21.V4S());
   2187   __ uabd(v19.V8B(), v16.V8B(), v28.V8B());
   2188   __ uabd(v9.V8H(), v12.V8H(), v29.V8H());
   2189   __ uabdl(v26.V2D(), v0.V2S(), v8.V2S());
   2190   __ uabdl(v29.V4S(), v31.V4H(), v25.V4H());
   2191   __ uabdl(v27.V8H(), v29.V8B(), v14.V8B());
   2192   __ uabdl2(v20.V2D(), v20.V4S(), v8.V4S());
   2193   __ uabdl2(v22.V4S(), v15.V8H(), v18.V8H());
   2194   __ uabdl2(v9.V8H(), v18.V16B(), v23.V16B());
   2195   __ uadalp(v9.V1D(), v15.V2S());
   2196   __ uadalp(v14.V2D(), v12.V4S());
   2197   __ uadalp(v28.V2S(), v12.V4H());
   2198   __ uadalp(v0.V4H(), v17.V8B());
   2199   __ uadalp(v1.V4S(), v29.V8H());
   2200   __ uadalp(v15.V8H(), v22.V16B());
   2201   __ uaddl(v1.V2D(), v20.V2S(), v27.V2S());
   2202   __ uaddl(v31.V4S(), v25.V4H(), v5.V4H());
   2203   __ uaddl(v12.V8H(), v3.V8B(), v3.V8B());
   2204   __ uaddl2(v5.V2D(), v23.V4S(), v6.V4S());
   2205   __ uaddl2(v1.V4S(), v5.V8H(), v25.V8H());
   2206   __ uaddl2(v22.V8H(), v30.V16B(), v28.V16B());
   2207   __ uaddlp(v7.V1D(), v9.V2S());
   2208   __ uaddlp(v26.V2D(), v4.V4S());
   2209   __ uaddlp(v28.V2S(), v1.V4H());
   2210   __ uaddlp(v20.V4H(), v31.V8B());
   2211   __ uaddlp(v16.V4S(), v17.V8H());
   2212   __ uaddlp(v6.V8H(), v2.V16B());
   2213   __ uaddlv(d28, v22.V4S());
   2214   __ uaddlv(h0, v19.V16B());
   2215   __ uaddlv(h30, v30.V8B());
   2216   __ uaddlv(s24, v18.V4H());
   2217   __ uaddlv(s10, v0.V8H());
   2218   __ uaddw(v9.V2D(), v17.V2D(), v14.V2S());
   2219   __ uaddw(v9.V4S(), v25.V4S(), v3.V4H());
   2220   __ uaddw(v18.V8H(), v1.V8H(), v0.V8B());
   2221   __ uaddw2(v18.V2D(), v5.V2D(), v6.V4S());
   2222   __ uaddw2(v17.V4S(), v15.V4S(), v11.V8H());
   2223   __ uaddw2(v29.V8H(), v11.V8H(), v7.V16B());
   2224   __ uhadd(v13.V16B(), v9.V16B(), v3.V16B());
   2225   __ uhadd(v17.V2S(), v25.V2S(), v24.V2S());
   2226   __ uhadd(v25.V4H(), v23.V4H(), v13.V4H());
   2227   __ uhadd(v0.V4S(), v20.V4S(), v16.V4S());
   2228   __ uhadd(v5.V8B(), v5.V8B(), v25.V8B());
   2229   __ uhadd(v3.V8H(), v29.V8H(), v18.V8H());
   2230   __ uhsub(v1.V16B(), v22.V16B(), v13.V16B());
   2231   __ uhsub(v14.V2S(), v30.V2S(), v30.V2S());
   2232   __ uhsub(v29.V4H(), v14.V4H(), v17.V4H());
   2233   __ uhsub(v26.V4S(), v5.V4S(), v18.V4S());
   2234   __ uhsub(v3.V8B(), v7.V8B(), v12.V8B());
   2235   __ uhsub(v25.V8H(), v21.V8H(), v5.V8H());
   2236   __ umax(v28.V16B(), v12.V16B(), v6.V16B());
   2237   __ umax(v20.V2S(), v19.V2S(), v26.V2S());
   2238   __ umax(v0.V4H(), v31.V4H(), v18.V4H());
   2239   __ umax(v6.V4S(), v21.V4S(), v28.V4S());
   2240   __ umax(v0.V8B(), v2.V8B(), v20.V8B());
   2241   __ umax(v4.V8H(), v11.V8H(), v22.V8H());
   2242   __ umaxp(v1.V16B(), v6.V16B(), v29.V16B());
   2243   __ umaxp(v19.V2S(), v17.V2S(), v27.V2S());
   2244   __ umaxp(v21.V4H(), v16.V4H(), v7.V4H());
   2245   __ umaxp(v9.V4S(), v20.V4S(), v29.V4S());
   2246   __ umaxp(v13.V8B(), v1.V8B(), v16.V8B());
   2247   __ umaxp(v19.V8H(), v23.V8H(), v26.V8H());
   2248   __ umaxv(b17, v30.V16B());
   2249   __ umaxv(b23, v12.V8B());
   2250   __ umaxv(h31, v15.V4H());
   2251   __ umaxv(h15, v25.V8H());
   2252   __ umaxv(s18, v21.V4S());
   2253   __ umin(v22.V16B(), v0.V16B(), v18.V16B());
   2254   __ umin(v1.V2S(), v21.V2S(), v16.V2S());
   2255   __ umin(v17.V4H(), v4.V4H(), v25.V4H());
   2256   __ umin(v24.V4S(), v26.V4S(), v13.V4S());
   2257   __ umin(v20.V8B(), v1.V8B(), v5.V8B());
   2258   __ umin(v26.V8H(), v25.V8H(), v23.V8H());
   2259   __ uminp(v5.V16B(), v1.V16B(), v23.V16B());
   2260   __ uminp(v7.V2S(), v26.V2S(), v30.V2S());
   2261   __ uminp(v9.V4H(), v5.V4H(), v25.V4H());
   2262   __ uminp(v23.V4S(), v10.V4S(), v1.V4S());
   2263   __ uminp(v4.V8B(), v29.V8B(), v14.V8B());
   2264   __ uminp(v21.V8H(), v0.V8H(), v14.V8H());
   2265   __ uminv(b0, v17.V16B());
   2266   __ uminv(b0, v31.V8B());
   2267   __ uminv(h24, v0.V4H());
   2268   __ uminv(h29, v14.V8H());
   2269   __ uminv(s30, v3.V4S());
   2270   __ umlal(v11.V2D(), v11.V2S(), v24.V2S());
   2271   __ umlal(v30.V2D(), v16.V2S(), v11.S(), 3);
   2272   __ umlal(v0.V4S(), v9.V4H(), v26.V4H());
   2273   __ umlal(v20.V4S(), v24.V4H(), v12.H(), 4);
   2274   __ umlal(v16.V8H(), v21.V8B(), v6.V8B());
   2275   __ umlal2(v17.V2D(), v19.V4S(), v23.V4S());
   2276   __ umlal2(v5.V2D(), v30.V4S(), v8.S(), 0);
   2277   __ umlal2(v16.V4S(), v8.V8H(), v15.V8H());
   2278   __ umlal2(v15.V4S(), v26.V8H(), v1.H(), 5);
   2279   __ umlal2(v30.V8H(), v1.V16B(), v17.V16B());
   2280   __ umlsl(v18.V2D(), v19.V2S(), v28.V2S());
   2281   __ umlsl(v7.V2D(), v7.V2S(), v8.S(), 0);
   2282   __ umlsl(v24.V4S(), v8.V4H(), v4.V4H());
   2283   __ umlsl(v18.V4S(), v22.V4H(), v12.H(), 4);
   2284   __ umlsl(v28.V8H(), v14.V8B(), v20.V8B());
   2285   __ umlsl2(v11.V2D(), v0.V4S(), v9.V4S());
   2286   __ umlsl2(v26.V2D(), v16.V4S(), v9.S(), 2);
   2287   __ umlsl2(v3.V4S(), v11.V8H(), v9.V8H());
   2288   __ umlsl2(v10.V4S(), v25.V8H(), v9.H(), 4);
   2289   __ umlsl2(v24.V8H(), v16.V16B(), v28.V16B());
   2290   __ umov(x30, v25.D(), 1);
   2291   __ umull(v12.V2D(), v10.V2S(), v29.V2S());
   2292   __ umull(v22.V2D(), v30.V2S(), v5.S(), 3);
   2293   __ umull(v7.V4S(), v0.V4H(), v25.V4H());
   2294   __ umull(v11.V4S(), v13.V4H(), v3.H(), 2);
   2295   __ umull(v25.V8H(), v16.V8B(), v10.V8B());
   2296   __ umull2(v17.V2D(), v3.V4S(), v26.V4S());
   2297   __ umull2(v26.V2D(), v11.V4S(), v2.S(), 3);
   2298   __ umull2(v12.V4S(), v17.V8H(), v23.V8H());
   2299   __ umull2(v4.V4S(), v31.V8H(), v1.H(), 2);
   2300   __ umull2(v5.V8H(), v12.V16B(), v17.V16B());
   2301   __ uqadd(b30, b4, b28);
   2302   __ uqadd(d27, d20, d16);
   2303   __ uqadd(h7, h14, h28);
   2304   __ uqadd(s28, s17, s4);
   2305   __ uqadd(v19.V16B(), v22.V16B(), v21.V16B());
   2306   __ uqadd(v16.V2D(), v4.V2D(), v11.V2D());
   2307   __ uqadd(v20.V2S(), v14.V2S(), v4.V2S());
   2308   __ uqadd(v5.V4H(), v0.V4H(), v16.V4H());
   2309   __ uqadd(v21.V4S(), v31.V4S(), v9.V4S());
   2310   __ uqadd(v23.V8B(), v24.V8B(), v3.V8B());
   2311   __ uqadd(v17.V8H(), v27.V8H(), v11.V8H());
   2312   __ uqrshl(b10, b22, b10);
   2313   __ uqrshl(d29, d5, d11);
   2314   __ uqrshl(h27, h24, h30);
   2315   __ uqrshl(s10, s13, s8);
   2316   __ uqrshl(v9.V16B(), v18.V16B(), v14.V16B());
   2317   __ uqrshl(v24.V2D(), v15.V2D(), v17.V2D());
   2318   __ uqrshl(v4.V2S(), v14.V2S(), v27.V2S());
   2319   __ uqrshl(v15.V4H(), v5.V4H(), v8.V4H());
   2320   __ uqrshl(v21.V4S(), v29.V4S(), v0.V4S());
   2321   __ uqrshl(v16.V8B(), v24.V8B(), v9.V8B());
   2322   __ uqrshl(v2.V8H(), v0.V8H(), v15.V8H());
   2323   __ uqrshrn(b11, h26, 4);
   2324   __ uqrshrn(h7, s30, 5);
   2325   __ uqrshrn(s10, d8, 21);
   2326   __ uqrshrn(v15.V2S(), v6.V2D(), 11);
   2327   __ uqrshrn(v5.V4H(), v26.V4S(), 12);
   2328   __ uqrshrn(v28.V8B(), v25.V8H(), 5);
   2329   __ uqrshrn2(v25.V16B(), v30.V8H(), 2);
   2330   __ uqrshrn2(v21.V4S(), v14.V2D(), 32);
   2331   __ uqrshrn2(v13.V8H(), v7.V4S(), 2);
   2332   __ uqshl(b13, b0, b23);
   2333   __ uqshl(b9, b17, 4);
   2334   __ uqshl(d23, d6, d4);
   2335   __ uqshl(d8, d11, 44);
   2336   __ uqshl(h19, h13, h15);
   2337   __ uqshl(h25, h26, 6);
   2338   __ uqshl(s4, s24, s10);
   2339   __ uqshl(s19, s14, 1);
   2340   __ uqshl(v14.V16B(), v30.V16B(), v25.V16B());
   2341   __ uqshl(v6.V16B(), v10.V16B(), 5);
   2342   __ uqshl(v18.V2D(), v8.V2D(), v7.V2D());
   2343   __ uqshl(v25.V2D(), v14.V2D(), 18);
   2344   __ uqshl(v25.V2S(), v16.V2S(), v23.V2S());
   2345   __ uqshl(v13.V2S(), v15.V2S(), 31);
   2346   __ uqshl(v28.V4H(), v24.V4H(), v15.V4H());
   2347   __ uqshl(v4.V4H(), v17.V4H(), 1);
   2348   __ uqshl(v9.V4S(), v31.V4S(), v23.V4S());
   2349   __ uqshl(v18.V4S(), v28.V4S(), 31);
   2350   __ uqshl(v31.V8B(), v21.V8B(), v15.V8B());
   2351   __ uqshl(v6.V8B(), v21.V8B(), 1);
   2352   __ uqshl(v28.V8H(), v2.V8H(), v17.V8H());
   2353   __ uqshl(v24.V8H(), v8.V8H(), 14);
   2354   __ uqshrn(b21, h27, 7);
   2355   __ uqshrn(h28, s26, 11);
   2356   __ uqshrn(s13, d31, 17);
   2357   __ uqshrn(v21.V2S(), v16.V2D(), 8);
   2358   __ uqshrn(v24.V4H(), v24.V4S(), 2);
   2359   __ uqshrn(v5.V8B(), v1.V8H(), 8);
   2360   __ uqshrn2(v16.V16B(), v29.V8H(), 6);
   2361   __ uqshrn2(v2.V4S(), v6.V2D(), 1);
   2362   __ uqshrn2(v16.V8H(), v10.V4S(), 14);
   2363   __ uqsub(b28, b20, b26);
   2364   __ uqsub(d0, d7, d10);
   2365   __ uqsub(h26, h24, h7);
   2366   __ uqsub(s23, s23, s16);
   2367   __ uqsub(v14.V16B(), v16.V16B(), v24.V16B());
   2368   __ uqsub(v11.V2D(), v17.V2D(), v6.V2D());
   2369   __ uqsub(v10.V2S(), v10.V2S(), v8.V2S());
   2370   __ uqsub(v9.V4H(), v15.V4H(), v12.V4H());
   2371   __ uqsub(v23.V4S(), v18.V4S(), v7.V4S());
   2372   __ uqsub(v9.V8B(), v19.V8B(), v17.V8B());
   2373   __ uqsub(v20.V8H(), v2.V8H(), v6.V8H());
   2374   __ uqxtn(b29, h19);
   2375   __ uqxtn(h0, s13);
   2376   __ uqxtn(s26, d22);
   2377   __ uqxtn(v5.V2S(), v31.V2D());
   2378   __ uqxtn(v30.V4H(), v19.V4S());
   2379   __ uqxtn(v15.V8B(), v2.V8H());
   2380   __ uqxtn2(v29.V16B(), v3.V8H());
   2381   __ uqxtn2(v13.V4S(), v17.V2D());
   2382   __ uqxtn2(v28.V8H(), v11.V4S());
   2383   __ urecpe(v23.V2S(), v15.V2S());
   2384   __ urecpe(v27.V4S(), v7.V4S());
   2385   __ urhadd(v2.V16B(), v15.V16B(), v27.V16B());
   2386   __ urhadd(v15.V2S(), v1.V2S(), v18.V2S());
   2387   __ urhadd(v17.V4H(), v4.V4H(), v26.V4H());
   2388   __ urhadd(v2.V4S(), v27.V4S(), v14.V4S());
   2389   __ urhadd(v5.V8B(), v17.V8B(), v14.V8B());
   2390   __ urhadd(v30.V8H(), v2.V8H(), v25.V8H());
   2391   __ urshl(d4, d28, d30);
   2392   __ urshl(v13.V16B(), v31.V16B(), v19.V16B());
   2393   __ urshl(v14.V2D(), v23.V2D(), v21.V2D());
   2394   __ urshl(v10.V2S(), v7.V2S(), v8.V2S());
   2395   __ urshl(v15.V4H(), v21.V4H(), v28.V4H());
   2396   __ urshl(v30.V4S(), v8.V4S(), v23.V4S());
   2397   __ urshl(v31.V8B(), v20.V8B(), v5.V8B());
   2398   __ urshl(v30.V8H(), v27.V8H(), v30.V8H());
   2399   __ urshr(d4, d13, 49);
   2400   __ urshr(v2.V16B(), v20.V16B(), 1);
   2401   __ urshr(v13.V2D(), v11.V2D(), 51);
   2402   __ urshr(v21.V2S(), v31.V2S(), 10);
   2403   __ urshr(v21.V4H(), v17.V4H(), 11);
   2404   __ urshr(v4.V4S(), v22.V4S(), 1);
   2405   __ urshr(v0.V8B(), v1.V8B(), 7);
   2406   __ urshr(v13.V8H(), v20.V8H(), 1);
   2407   __ ursqrte(v20.V2S(), v16.V2S());
   2408   __ ursqrte(v28.V4S(), v8.V4S());
   2409   __ ursra(d27, d16, 45);
   2410   __ ursra(v18.V16B(), v17.V16B(), 3);
   2411   __ ursra(v26.V2D(), v28.V2D(), 58);
   2412   __ ursra(v8.V2S(), v22.V2S(), 31);
   2413   __ ursra(v31.V4H(), v4.V4H(), 7);
   2414   __ ursra(v31.V4S(), v15.V4S(), 2);
   2415   __ ursra(v3.V8B(), v1.V8B(), 5);
   2416   __ ursra(v18.V8H(), v14.V8H(), 13);
   2417   __ ushl(d31, d0, d16);
   2418   __ ushl(v0.V16B(), v6.V16B(), v2.V16B());
   2419   __ ushl(v18.V2D(), v1.V2D(), v18.V2D());
   2420   __ ushl(v27.V2S(), v7.V2S(), v29.V2S());
   2421   __ ushl(v14.V4H(), v14.V4H(), v13.V4H());
   2422   __ ushl(v22.V4S(), v4.V4S(), v9.V4S());
   2423   __ ushl(v23.V8B(), v22.V8B(), v27.V8B());
   2424   __ ushl(v21.V8H(), v25.V8H(), v8.V8H());
   2425   __ ushll(v11.V2D(), v0.V2S(), 21);
   2426   __ ushll(v2.V4S(), v17.V4H(), 8);
   2427   __ ushll(v11.V8H(), v14.V8B(), 1);
   2428   __ ushll2(v8.V2D(), v29.V4S(), 7);
   2429   __ ushll2(v29.V4S(), v9.V8H(), 2);
   2430   __ ushll2(v5.V8H(), v24.V16B(), 6);
   2431   __ ushr(d28, d27, 53);
   2432   __ ushr(v1.V16B(), v9.V16B(), 7);
   2433   __ ushr(v2.V2D(), v24.V2D(), 43);
   2434   __ ushr(v30.V2S(), v25.V2S(), 11);
   2435   __ ushr(v10.V4H(), v26.V4H(), 12);
   2436   __ ushr(v4.V4S(), v5.V4S(), 30);
   2437   __ ushr(v30.V8B(), v2.V8B(), 1);
   2438   __ ushr(v6.V8H(), v12.V8H(), 2);
   2439   __ usqadd(b19, b5);
   2440   __ usqadd(d9, d2);
   2441   __ usqadd(h2, h16);
   2442   __ usqadd(s16, s3);
   2443   __ usqadd(v31.V16B(), v29.V16B());
   2444   __ usqadd(v8.V2D(), v10.V2D());
   2445   __ usqadd(v18.V2S(), v9.V2S());
   2446   __ usqadd(v24.V4H(), v14.V4H());
   2447   __ usqadd(v10.V4S(), v30.V4S());
   2448   __ usqadd(v16.V8B(), v20.V8B());
   2449   __ usqadd(v12.V8H(), v16.V8H());
   2450   __ usra(d28, d27, 37);
   2451   __ usra(v5.V16B(), v22.V16B(), 5);
   2452   __ usra(v2.V2D(), v19.V2D(), 33);
   2453   __ usra(v0.V2S(), v0.V2S(), 21);
   2454   __ usra(v7.V4H(), v6.V4H(), 12);
   2455   __ usra(v4.V4S(), v17.V4S(), 9);
   2456   __ usra(v9.V8B(), v12.V8B(), 7);
   2457   __ usra(v3.V8H(), v27.V8H(), 14);
   2458   __ usubl(v29.V2D(), v12.V2S(), v30.V2S());
   2459   __ usubl(v29.V4S(), v28.V4H(), v6.V4H());
   2460   __ usubl(v12.V8H(), v4.V8B(), v14.V8B());
   2461   __ usubl2(v1.V2D(), v24.V4S(), v17.V4S());
   2462   __ usubl2(v4.V4S(), v1.V8H(), v3.V8H());
   2463   __ usubl2(v23.V8H(), v4.V16B(), v7.V16B());
   2464   __ usubw(v9.V2D(), v20.V2D(), v30.V2S());
   2465   __ usubw(v20.V4S(), v16.V4S(), v23.V4H());
   2466   __ usubw(v25.V8H(), v8.V8H(), v29.V8B());
   2467   __ usubw2(v18.V2D(), v29.V2D(), v6.V4S());
   2468   __ usubw2(v6.V4S(), v6.V4S(), v20.V8H());
   2469   __ usubw2(v18.V8H(), v4.V8H(), v16.V16B());
   2470   __ uxtl(v27.V2D(), v21.V2S());
   2471   __ uxtl(v0.V4S(), v31.V4H());
   2472   __ uxtl(v27.V8H(), v10.V8B());
   2473   __ uxtl2(v6.V2D(), v16.V4S());
   2474   __ uxtl2(v22.V4S(), v20.V8H());
   2475   __ uxtl2(v20.V8H(), v21.V16B());
   2476   __ uzp1(v30.V16B(), v9.V16B(), v17.V16B());
   2477   __ uzp1(v7.V2D(), v26.V2D(), v28.V2D());
   2478   __ uzp1(v26.V2S(), v16.V2S(), v22.V2S());
   2479   __ uzp1(v14.V4H(), v19.V4H(), v6.V4H());
   2480   __ uzp1(v17.V4S(), v23.V4S(), v30.V4S());
   2481   __ uzp1(v28.V8B(), v27.V8B(), v13.V8B());
   2482   __ uzp1(v17.V8H(), v1.V8H(), v12.V8H());
   2483   __ uzp2(v8.V16B(), v18.V16B(), v26.V16B());
   2484   __ uzp2(v21.V2D(), v22.V2D(), v24.V2D());
   2485   __ uzp2(v20.V2S(), v21.V2S(), v2.V2S());
   2486   __ uzp2(v16.V4H(), v31.V4H(), v6.V4H());
   2487   __ uzp2(v25.V4S(), v11.V4S(), v8.V4S());
   2488   __ uzp2(v31.V8B(), v31.V8B(), v13.V8B());
   2489   __ uzp2(v8.V8H(), v17.V8H(), v1.V8H());
   2490   __ xtn(v17.V2S(), v26.V2D());
   2491   __ xtn(v3.V4H(), v0.V4S());
   2492   __ xtn(v18.V8B(), v8.V8H());
   2493   __ xtn2(v0.V16B(), v0.V8H());
   2494   __ xtn2(v15.V4S(), v4.V2D());
   2495   __ xtn2(v31.V8H(), v18.V4S());
   2496   __ zip1(v22.V16B(), v9.V16B(), v6.V16B());
   2497   __ zip1(v23.V2D(), v11.V2D(), v2.V2D());
   2498   __ zip1(v26.V2S(), v16.V2S(), v9.V2S());
   2499   __ zip1(v1.V4H(), v9.V4H(), v7.V4H());
   2500   __ zip1(v0.V4S(), v30.V4S(), v20.V4S());
   2501   __ zip1(v30.V8B(), v17.V8B(), v15.V8B());
   2502   __ zip1(v17.V8H(), v8.V8H(), v2.V8H());
   2503   __ zip2(v23.V16B(), v10.V16B(), v11.V16B());
   2504   __ zip2(v30.V2D(), v6.V2D(), v14.V2D());
   2505   __ zip2(v9.V2S(), v10.V2S(), v21.V2S());
   2506   __ zip2(v8.V4H(), v24.V4H(), v29.V4H());
   2507   __ zip2(v0.V4S(), v21.V4S(), v23.V4S());
   2508   __ zip2(v25.V8B(), v23.V8B(), v30.V8B());
   2509   __ zip2(v7.V8H(), v10.V8H(), v30.V8H());
   2510 }  // NOLINT(readability/fn_size)
   2511 
   2512 
   2513 static void GenerateTestSequenceNEONFP(MacroAssembler* masm) {
          // Emits a fixed, straight-line sequence of NEON floating-point
          // instructions: one `__ <mnemonic>(...)` call per instruction, each with
          // hard-coded register, lane and immediate operands.
          //
          // masm: the assembler handed to the ExactAssemblyScope below.
          //       NOTE(review): `__` is defined outside this function; presumably it
          //       expands to `masm->` -- confirm.
          //
          // NOTE(review): TraceTestHelper (later in this file) takes a reference
          // file argument, so the order and operands of these calls presumably
          // have to stay in sync with stored reference traces -- confirm before
          // reordering or editing any of them.
          //
          // The scope is opened over every byte still free in the assembler's
          // buffer and is tagged kMaximumSize (presumably "at most this many bytes"
          // rather than an exact count -- confirm against ExactAssemblyScope).
   2514   ExactAssemblyScope guard(masm,
   2515                            masm->GetBuffer()->GetRemainingBytes(),
   2516                            ExactAssemblyScope::kMaximumSize);
   2517 
   2518   // NEON floating point instructions.
   2519   __ fabd(v3.V2D(), v25.V2D(), v8.V2D());
   2520   __ fabd(v14.V2S(), v27.V2S(), v11.V2S());
   2521   __ fabd(v9.V4S(), v22.V4S(), v18.V4S());
   2522   __ fabs(v1.V2D(), v29.V2D());
   2523   __ fabs(v6.V2S(), v21.V2S());
   2524   __ fabs(v12.V4S(), v25.V4S());
   2525   __ facge(v18.V2D(), v5.V2D(), v0.V2D());
   2526   __ facge(v15.V2S(), v11.V2S(), v6.V2S());
   2527   __ facge(v30.V4S(), v10.V4S(), v25.V4S());
   2528   __ facgt(v28.V2D(), v16.V2D(), v31.V2D());
   2529   __ facgt(v15.V2S(), v1.V2S(), v4.V2S());
   2530   __ facgt(v22.V4S(), v3.V4S(), v10.V4S());
   2531   __ fadd(v7.V2D(), v10.V2D(), v24.V2D());
   2532   __ fadd(v10.V2S(), v23.V2S(), v7.V2S());
   2533   __ fadd(v16.V4S(), v22.V4S(), v11.V4S());
   2534   __ faddp(d27, v28.V2D());
   2535   __ faddp(s20, v23.V2S());
   2536   __ faddp(v21.V2D(), v4.V2D(), v11.V2D());
   2537   __ faddp(v31.V2S(), v26.V2S(), v1.V2S());
   2538   __ faddp(v13.V4S(), v27.V4S(), v28.V4S());
          // fcmeq/fcmge/fcmgt are each emitted both with a third vector operand
          // and with the literal 0.0; fcmle/fcmlt appear only in the 0.0 form.
   2539   __ fcmeq(v17.V2D(), v13.V2D(), v20.V2D());
   2540   __ fcmeq(v24.V2D(), v16.V2D(), 0.0);
   2541   __ fcmeq(v26.V2S(), v17.V2S(), v10.V2S());
   2542   __ fcmeq(v24.V2S(), v4.V2S(), 0.0);
   2543   __ fcmeq(v8.V4S(), v4.V4S(), v14.V4S());
   2544   __ fcmeq(v26.V4S(), v25.V4S(), 0.0);
   2545   __ fcmge(v27.V2D(), v0.V2D(), v0.V2D());
   2546   __ fcmge(v22.V2D(), v30.V2D(), 0.0);
   2547   __ fcmge(v7.V2S(), v21.V2S(), v25.V2S());
   2548   __ fcmge(v15.V2S(), v15.V2S(), 0.0);
   2549   __ fcmge(v29.V4S(), v4.V4S(), v27.V4S());
   2550   __ fcmge(v22.V4S(), v21.V4S(), 0.0);
   2551   __ fcmgt(v1.V2D(), v26.V2D(), v15.V2D());
   2552   __ fcmgt(v15.V2D(), v23.V2D(), 0.0);
   2553   __ fcmgt(v21.V2S(), v16.V2S(), v6.V2S());
   2554   __ fcmgt(v1.V2S(), v13.V2S(), 0.0);
   2555   __ fcmgt(v14.V4S(), v0.V4S(), v25.V4S());
   2556   __ fcmgt(v13.V4S(), v8.V4S(), 0.0);
   2557   __ fcmle(v4.V2D(), v6.V2D(), 0.0);
   2558   __ fcmle(v24.V2S(), v31.V2S(), 0.0);
   2559   __ fcmle(v8.V4S(), v23.V4S(), 0.0);
   2560   __ fcmlt(v7.V2D(), v3.V2D(), 0.0);
   2561   __ fcmlt(v15.V2S(), v21.V2S(), 0.0);
   2562   __ fcmlt(v1.V4S(), v2.V4S(), 0.0);
   2563   __ fcvtas(v6.V2D(), v8.V2D());
   2564   __ fcvtas(v1.V2S(), v9.V2S());
   2565   __ fcvtas(v8.V4S(), v19.V4S());
   2566   __ fcvtau(v5.V2D(), v31.V2D());
   2567   __ fcvtau(v28.V2S(), v29.V2S());
   2568   __ fcvtau(v11.V4S(), v26.V4S());
   2569   __ fcvtl(v8.V2D(), v25.V2S());
   2570   __ fcvtl(v27.V4S(), v14.V4H());
   2571   __ fcvtl2(v1.V2D(), v6.V4S());
   2572   __ fcvtl2(v24.V4S(), v9.V8H());
   2573   __ fcvtms(v9.V2D(), v24.V2D());
   2574   __ fcvtms(v7.V2S(), v11.V2S());
   2575   __ fcvtms(v23.V4S(), v21.V4S());
   2576   __ fcvtmu(v13.V2D(), v1.V2D());
   2577   __ fcvtmu(v26.V2S(), v12.V2S());
   2578   __ fcvtmu(v21.V4S(), v21.V4S());
   2579   __ fcvtn(v11.V2S(), v1.V2D());
   2580   __ fcvtn(v8.V4H(), v2.V4S());
   2581   __ fcvtn2(v24.V4S(), v29.V2D());
   2582   __ fcvtn2(v4.V8H(), v10.V4S());
   2583   __ fcvtns(v25.V2D(), v10.V2D());
   2584   __ fcvtns(v4.V2S(), v8.V2S());
   2585   __ fcvtns(v29.V4S(), v27.V4S());
   2586   __ fcvtnu(v18.V2D(), v27.V2D());
   2587   __ fcvtnu(v11.V2S(), v14.V2S());
   2588   __ fcvtnu(v27.V4S(), v21.V4S());
   2589   __ fcvtps(v23.V2D(), v5.V2D());
   2590   __ fcvtps(v24.V2S(), v15.V2S());
   2591   __ fcvtps(v5.V4S(), v19.V4S());
   2592   __ fcvtpu(v3.V2D(), v21.V2D());
   2593   __ fcvtpu(v3.V2S(), v21.V2S());
   2594   __ fcvtpu(v0.V4S(), v7.V4S());
   2595   __ fcvtxn(v29.V2S(), v11.V2D());
   2596   __ fcvtxn2(v31.V4S(), v25.V2D());
          // fcvtzs/fcvtzu are emitted both without and with a trailing integer
          // operand (presumably the number of fractional bits of a fixed-point
          // result -- confirm against the assembler's declaration).
   2597   __ fcvtzs(v19.V2D(), v17.V2D());
   2598   __ fcvtzs(v12.V2D(), v24.V2D(), 64);
   2599   __ fcvtzs(v9.V2S(), v2.V2S());
   2600   __ fcvtzs(v5.V2S(), v20.V2S(), 29);
   2601   __ fcvtzs(v21.V4S(), v25.V4S());
   2602   __ fcvtzs(v26.V4S(), v1.V4S(), 6);
   2603   __ fcvtzu(v13.V2D(), v25.V2D());
   2604   __ fcvtzu(v28.V2D(), v13.V2D(), 32);
   2605   __ fcvtzu(v26.V2S(), v6.V2S());
   2606   __ fcvtzu(v9.V2S(), v10.V2S(), 15);
   2607   __ fcvtzu(v30.V4S(), v6.V4S());
   2608   __ fcvtzu(v19.V4S(), v22.V4S(), 18);
   2609   __ fdiv(v15.V2D(), v8.V2D(), v15.V2D());
   2610   __ fdiv(v12.V2S(), v9.V2S(), v26.V2S());
   2611   __ fdiv(v19.V4S(), v22.V4S(), v19.V4S());
   2612   __ fmax(v19.V2D(), v7.V2D(), v8.V2D());
   2613   __ fmax(v25.V2S(), v12.V2S(), v29.V2S());
   2614   __ fmax(v6.V4S(), v15.V4S(), v5.V4S());
   2615   __ fmaxnm(v16.V2D(), v8.V2D(), v20.V2D());
   2616   __ fmaxnm(v15.V2S(), v26.V2S(), v25.V2S());
   2617   __ fmaxnm(v23.V4S(), v14.V4S(), v16.V4S());
   2618   __ fmaxnmp(d6, v19.V2D());
   2619   __ fmaxnmp(s27, v26.V2S());
   2620   __ fmaxnmp(v8.V2D(), v12.V2D(), v23.V2D());
   2621   __ fmaxnmp(v13.V2S(), v25.V2S(), v22.V2S());
   2622   __ fmaxnmp(v15.V4S(), v11.V4S(), v17.V4S());
   2623   __ fmaxnmv(s27, v19.V4S());
   2624   __ fmaxp(d20, v14.V2D());
   2625   __ fmaxp(s18, v2.V2S());
   2626   __ fmaxp(v9.V2D(), v23.V2D(), v31.V2D());
   2627   __ fmaxp(v7.V2S(), v22.V2S(), v31.V2S());
   2628   __ fmaxp(v18.V4S(), v7.V4S(), v29.V4S());
   2629   __ fmaxv(s31, v29.V4S());
   2630   __ fmin(v2.V2D(), v5.V2D(), v2.V2D());
   2631   __ fmin(v31.V2S(), v17.V2S(), v10.V2S());
   2632   __ fmin(v10.V4S(), v4.V4S(), v16.V4S());
   2633   __ fminnm(v21.V2D(), v6.V2D(), v5.V2D());
   2634   __ fminnm(v22.V2S(), v18.V2S(), v14.V2S());
   2635   __ fminnm(v25.V4S(), v31.V4S(), v3.V4S());
   2636   __ fminnmp(d9, v1.V2D());
   2637   __ fminnmp(s21, v20.V2S());
   2638   __ fminnmp(v16.V2D(), v21.V2D(), v19.V2D());
   2639   __ fminnmp(v16.V2S(), v31.V2S(), v25.V2S());
   2640   __ fminnmp(v26.V4S(), v16.V4S(), v15.V4S());
   2641   __ fminnmv(s3, v4.V4S());
   2642   __ fminp(d24, v26.V2D());
   2643   __ fminp(s7, v17.V2S());
   2644   __ fminp(v23.V2D(), v19.V2D(), v3.V2D());
   2645   __ fminp(v29.V2S(), v21.V2S(), v9.V2S());
   2646   __ fminp(v0.V4S(), v24.V4S(), v21.V4S());
   2647   __ fminv(s25, v8.V4S());
          // fmla/fmls (and fmul/fmulx further down) mix plain three-register forms
          // with forms whose last operands are `vN.D()`/`vN.S()` plus an integer
          // (presumably the lane index of that register -- confirm).
   2648   __ fmla(d23, d0, v9.D(), 1);
   2649   __ fmla(s23, s15, v7.S(), 0);
   2650   __ fmla(v17.V2D(), v11.V2D(), v6.V2D());
   2651   __ fmla(v30.V2D(), v30.V2D(), v11.D(), 0);
   2652   __ fmla(v19.V2S(), v12.V2S(), v6.V2S());
   2653   __ fmla(v24.V2S(), v17.V2S(), v9.S(), 0);
   2654   __ fmla(v16.V4S(), v11.V4S(), v11.V4S());
   2655   __ fmla(v27.V4S(), v23.V4S(), v9.S(), 2);
   2656   __ fmls(d27, d30, v6.D(), 0);
   2657   __ fmls(s21, s16, v2.S(), 0);
   2658   __ fmls(v5.V2D(), v19.V2D(), v21.V2D());
   2659   __ fmls(v18.V2D(), v30.V2D(), v12.D(), 0);
   2660   __ fmls(v5.V2S(), v16.V2S(), v7.V2S());
   2661   __ fmls(v3.V2S(), v18.V2S(), v11.S(), 1);
   2662   __ fmls(v27.V4S(), v5.V4S(), v30.V4S());
   2663   __ fmls(v26.V4S(), v20.V4S(), v4.S(), 3);
          // fmov: three vector forms taking a floating-point literal, then two
          // forms pairing an X register with `vN.D(), 1`.
   2664   __ fmov(v14.V2D(), -0.34375);
   2665   __ fmov(v26.V2S(), 0.90625f);
   2666   __ fmov(v31.V4S(), -5.0000f);
   2667   __ fmov(v28.D(), 1, x25);
   2668   __ fmov(x18, v2.D(), 1);
   2669   __ fmul(d12, d4, v1.D(), 1);
   2670   __ fmul(s30, s1, v15.S(), 3);
   2671   __ fmul(v25.V2D(), v0.V2D(), v21.V2D());
   2672   __ fmul(v10.V2D(), v24.V2D(), v10.D(), 1);
   2673   __ fmul(v7.V2S(), v24.V2S(), v16.V2S());
   2674   __ fmul(v1.V2S(), v16.V2S(), v4.S(), 2);
   2675   __ fmul(v5.V4S(), v28.V4S(), v25.V4S());
   2676   __ fmul(v11.V4S(), v3.V4S(), v8.S(), 0);
   2677   __ fmulx(d28, d9, v3.D(), 1);
   2678   __ fmulx(s25, s21, v15.S(), 1);
   2679   __ fmulx(v31.V2D(), v28.V2D(), v8.V2D());
   2680   __ fmulx(v3.V2D(), v21.V2D(), v6.D(), 0);
   2681   __ fmulx(v9.V2S(), v1.V2S(), v0.V2S());
   2682   __ fmulx(v16.V2S(), v27.V2S(), v6.S(), 0);
   2683   __ fmulx(v2.V4S(), v4.V4S(), v5.V4S());
   2684   __ fmulx(v18.V4S(), v7.V4S(), v4.S(), 0);
   2685   __ fneg(v1.V2D(), v25.V2D());
   2686   __ fneg(v14.V2S(), v31.V2S());
   2687   __ fneg(v5.V4S(), v4.V4S());
   2688   __ frecpe(v18.V2D(), v12.V2D());
   2689   __ frecpe(v10.V2S(), v22.V2S());
   2690   __ frecpe(v5.V4S(), v6.V4S());
   2691   __ frecps(v22.V2D(), v7.V2D(), v26.V2D());
   2692   __ frecps(v31.V2S(), v27.V2S(), v2.V2S());
   2693   __ frecps(v18.V4S(), v6.V4S(), v27.V4S());
   2694   __ frinta(v26.V2D(), v13.V2D());
   2695   __ frinta(v15.V2S(), v26.V2S());
   2696   __ frinta(v13.V4S(), v16.V4S());
   2697   __ frinti(v9.V2D(), v12.V2D());
   2698   __ frinti(v5.V2S(), v19.V2S());
   2699   __ frinti(v15.V4S(), v11.V4S());
   2700   __ frintm(v17.V2D(), v29.V2D());
   2701   __ frintm(v30.V2S(), v11.V2S());
   2702   __ frintm(v1.V4S(), v20.V4S());
   2703   __ frintn(v24.V2D(), v6.V2D());
   2704   __ frintn(v12.V2S(), v17.V2S());
   2705   __ frintn(v29.V4S(), v11.V4S());
   2706   __ frintp(v10.V2D(), v7.V2D());
   2707   __ frintp(v12.V2S(), v18.V2S());
   2708   __ frintp(v26.V4S(), v31.V4S());
   2709   __ frintx(v24.V2D(), v13.V2D());
   2710   __ frintx(v7.V2S(), v9.V2S());
   2711   __ frintx(v18.V4S(), v21.V4S());
   2712   __ frintz(v19.V2D(), v25.V2D());
   2713   __ frintz(v15.V2S(), v8.V2S());
   2714   __ frintz(v20.V4S(), v3.V4S());
   2715   __ frsqrte(v23.V2D(), v5.V2D());
   2716   __ frsqrte(v9.V2S(), v7.V2S());
   2717   __ frsqrte(v3.V4S(), v9.V4S());
   2718   __ frsqrts(v25.V2D(), v28.V2D(), v15.V2D());
   2719   __ frsqrts(v9.V2S(), v26.V2S(), v10.V2S());
   2720   __ frsqrts(v5.V4S(), v1.V4S(), v10.V4S());
   2721   __ fsqrt(v6.V2D(), v18.V2D());
   2722   __ fsqrt(v6.V2S(), v18.V2S());
   2723   __ fsqrt(v0.V4S(), v31.V4S());
   2724   __ fsub(v31.V2D(), v30.V2D(), v31.V2D());
   2725   __ fsub(v11.V2S(), v8.V2S(), v6.V2S());
   2726   __ fsub(v16.V4S(), v0.V4S(), v31.V4S());
          // scvtf/ucvtf follow the same pattern as fcvtzs/fcvtzu above: each
          // arrangement is emitted without and with a trailing integer operand.
   2727   __ scvtf(v25.V2D(), v31.V2D());
   2728   __ scvtf(v10.V2D(), v13.V2D(), 45);
   2729   __ scvtf(v10.V2S(), v15.V2S());
   2730   __ scvtf(v18.V2S(), v4.V2S(), 27);
   2731   __ scvtf(v17.V4S(), v5.V4S());
   2732   __ scvtf(v11.V4S(), v25.V4S(), 24);
   2733   __ ucvtf(v9.V2D(), v3.V2D());
   2734   __ ucvtf(v26.V2D(), v30.V2D(), 46);
   2735   __ ucvtf(v11.V2S(), v4.V2S());
   2736   __ ucvtf(v29.V2S(), v3.V2S(), 25);
   2737   __ ucvtf(v22.V4S(), v23.V4S());
   2738   __ ucvtf(v18.V4S(), v9.V4S(), 25);
   2739 }
   2740 
   2741 
   2742 static void MaskAddresses(const char* trace) {
          // Rewrites the file named by `trace` in place, replacing hexadecimal
          // addresses with runs of `~`. It does so by running `sed` through
          // system() once per entry of `patterns`.
          //
          // trace: path of the file to edit. It is pasted between single quotes in
          //        a shell command line, so it must not itself contain a `'`.
          //
          // Both a command that does not fit in `command` and a non-zero result
          // from system() are reported through VIXL_CHECK (presumably fatal --
          // confirm against the macro's definition).
   2743 // Hexadecimal expressions of the form `\xab` do not work out of the box with
   2744 // BSD `sed`. So we use ANSI-C quoting to have the regular expressions below
   2745 // work both on Linux and BSD (and macOS).
        //
        // Per-platform helpers used to spell the sed script:
        //  - MAYBE_ANSI_C_QUOTE is pasted directly before the opening `'` of the
        //    script, giving `$'...'` on Apple platforms and plain `'...'` elsewhere.
        //  - HEX(val) yields a backslash, `x`, then `val` (same in both branches).
        //  - ESCAPE(c) yields `c` preceded by two backslashes on Apple platforms
        //    and by a single backslash elsewhere.
        // NOTE: none of these macros (nor COLOUR below) is #undef'd in this
        // function, so they stay defined for the code that follows it.
   2746 #ifdef __APPLE__
   2747 #define MAYBE_ANSI_C_QUOTE "$"
   2748 #define HEX(val) "\\x" #val
   2749 #define ESCAPE(c) "\\\\" #c
   2750   const char* sed_options = "-i \"\" -E";
   2751 #else
   2752 #define MAYBE_ANSI_C_QUOTE
   2753 #define HEX(val) "\\x" #val
   2754 #define ESCAPE(c) "\\" #c
   2755   const char* sed_options = "--in-place --regexp-extended";
   2756 #endif
        // COLOUR is an optional group: an escape for byte 0x1b, a literal `[`, `0`
        // or `1`, `;`, an optional pair of digits, then `m` (by its name and shape,
        // a terminal colour sequence). It adds two capture groups wherever it is
        // used, but the replacements below only ever refer back to group 1.
   2757 #define COLOUR "(" HEX(1b) ESCAPE([) "[01];([0-9][0-9])?m)?"
          // Each entry is one `s/search/replace/` substitution. Where the search
          // starts with a parenthesised prefix, ESCAPE(1) in the replacement is a
          // back-reference that keeps that prefix and only the hex digits after it
          // are replaced.
   2758   struct {
   2759     const char* search;
   2760     const char* replace;
   2761   } patterns[] =
   2762       {// Mask registers that hold addresses that change from run to run.
   2763        {"((x0|x1|x2|sp): " COLOUR "0x)[0-9a-f]{16}",
   2764         ESCAPE(1) "~~~~~~~~~~~~~~~~"},
   2765        // Mask accessed memory addresses.
   2766        {"((<-|->) " COLOUR "0x)[0-9a-f]{16}", ESCAPE(1) "~~~~~~~~~~~~~~~~"},
   2767        // Mask instruction addresses.
   2768        {"^0x[0-9a-f]{16}", "0x~~~~~~~~~~~~~~~~"},
   2769        // Mask branch targets.
   2770        {"(Branch" COLOUR " to 0x)[0-9a-f]{16}", ESCAPE(1) "~~~~~~~~~~~~~~~~"},
               // Mask the hex value after "addr 0x". Unlike the entries above it
               // accepts any number of digits, but still writes exactly 16 tildes.
   2771        {"addr 0x[0-9a-f]+", "addr 0x~~~~~~~~~~~~~~~~"}};
   2772   const size_t patterns_length = sizeof(patterns) / sizeof(patterns[0]);
   2773   // Rewrite `trace`, masking addresses and other values that legitimately vary
   2774   // from run to run.
   2775   char command[1024];
   2776   for (size_t i = 0; i < patterns_length; i++) {
            // The script has no `g` flag, so sed replaces only the first match on
            // each line for this pattern.
            //
            // snprintf returns an int that is stored in a size_t here: a negative
            // (error) result becomes a very large value and therefore fails the
            // same check that catches truncation.
   2777     size_t length = snprintf(command,
   2778                              sizeof(command),
   2779                              "sed %s " MAYBE_ANSI_C_QUOTE "'s/%s/%s/' '%s'",
   2780                              sed_options,
   2781                              patterns[i].search,
   2782                              patterns[i].replace,
   2783                              trace);
   2784     VIXL_CHECK(length < sizeof(command));
   2785     VIXL_CHECK(system(command) == 0);
   2786   }
   2787 }
   2788 
   2789 
   2790 static void TraceTestHelper(bool coloured_trace,
   2791                             TraceParameters trace_parameters,
   2792                             const char* ref_file) {
   2793   MacroAssembler masm(12 * KBytes);
   2794 
   2795   char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
   2796   FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
   2797 
   2798   Decoder decoder;
   2799   Simulator simulator(&decoder, trace_stream);
   2800   simulator.SetColouredTrace(coloured_trace);
   2801   simulator.SetTraceParameters(trace_parameters);
   2802   simulator.SilenceExclusiveAccessWarning();
   2803 
   2804   // Set up a scratch buffer so we can test loads and stores.
   2805   const int kScratchSize = 64 * KBytes;
   2806   const int kScratchGuardSize = 128;
   2807   char scratch_buffer[kScratchSize + kScratchGuardSize];
   2808   for (size_t i = 0; i < (sizeof(scratch_buffer) / sizeof(scratch_buffer[0]));
   2809        i++) {
   2810     scratch_buffer[i] = i & 0xff;
   2811   }
   2812   // Used for offset addressing.
   2813   simulator.WriteRegister(0, scratch_buffer);
   2814   // Used for pre-/post-index addressing.
   2815   simulator.WriteRegister(1, scratch_buffer);
   2816 
   2817   const int kPostIndexRegisterStep = 13;  // Arbitrary interesting value.
   2818   // Used for post-index offsets.
   2819   simulator.WriteRegister(2, kPostIndexRegisterStep);
   2820 
   2821   // Initialize the other registers with unique values.
   2822   uint64_t initial_base_u64 = 0x0100001000100101;
   2823   for (unsigned i = 3; i < kNumberOfRegisters; i++) {
   2824     if (i == kLinkRegCode) continue;
   2825     if (i == kZeroRegCode) continue;
   2826     // NoRegLog suppresses the log now, but the registers will still be logged
   2827     // before the first instruction is executed since they have been written but
   2828     // not printed.
   2829     simulator.WriteRegister(i, initial_base_u64 * i, Simulator::NoRegLog);
   2830   }
   2831   float initial_base_f32 = 1.2345f;
   2832   double initial_base_f64 = 1.3456f;
   2833   for (unsigned i = 0; i < kNumberOfVRegisters; i++) {
   2834     // Try to initialise V registers with reasonable FP values.
   2835     uint64_t low = (DoubleToRawbits(initial_base_f64 * i) & ~kSRegMask) |
   2836                    FloatToRawbits(initial_base_f32 * i);
   2837     uint64_t high = low ^ 0x0005555500555555;
   2838     LogicVRegister reg(simulator.ReadVRegister(i));
   2839     reg.SetUint(kFormat2D, 0, low);
   2840     reg.SetUint(kFormat2D, 1, high);
   2841   }
   2842 
   2843   GenerateTestSequenceBase(&masm);
   2844   GenerateTestSequenceFP(&masm);
   2845   GenerateTestSequenceNEON(&masm);
   2846   GenerateTestSequenceNEONFP(&masm);
   2847   masm.Ret();
   2848   masm.FinalizeCode();
   2849 
   2850   simulator.RunFrom(masm.GetBuffer()->GetStartAddress<Instruction*>());
   2851 
   2852   fclose(trace_stream);
   2853   MaskAddresses(trace_stream_filename);
   2854 
   2855   bool trace_matched_reference;
   2856   if (Test::generate_test_trace()) {
   2857     // Copy trace_stream to stdout.
   2858     trace_stream = fopen(trace_stream_filename, "r");
   2859     VIXL_ASSERT(trace_stream != NULL);
   2860     fseek(trace_stream, 0, SEEK_SET);
   2861     int c;
   2862     while (1) {
   2863       c = getc(trace_stream);
   2864       if (c == EOF) break;
   2865       putc(c, stdout);
   2866     }
   2867     fclose(trace_stream);
   2868     trace_matched_reference = true;
   2869   } else {
   2870     // Check trace_stream against ref_file.
   2871     char command[1024];
   2872     size_t length = snprintf(command,
   2873                              sizeof(command),
   2874                              "diff -u %s %s",
   2875                              ref_file,
   2876                              trace_stream_filename);
   2877     VIXL_CHECK(length < sizeof(command));
   2878     trace_matched_reference = (system(command) == 0);
   2879   }
   2880 
   2881   uint64_t offset_base = simulator.ReadRegister<uint64_t>(0);
   2882   uint64_t index_base = simulator.ReadRegister<uint64_t>(1);
   2883 
   2884   // Clean up before checking the result; VIXL_CHECK aborts.
   2885   remove(trace_stream_filename);
   2886 
   2887   VIXL_CHECK(trace_matched_reference);
   2888   VIXL_CHECK(index_base >= offset_base);
   2889   VIXL_CHECK((index_base - offset_base) <= kScratchSize);
   2890 }
   2891 
   2892 
   2893 #define REF(name) "test/test-trace-reference/" name
   2894 
   2895 // Test individual options.
   2896 TEST(disasm) { TraceTestHelper(false, LOG_DISASM, REF("log-disasm")); }
   2897 TEST(regs) { TraceTestHelper(false, LOG_REGS, REF("log-regs")); }
   2898 TEST(vregs) { TraceTestHelper(false, LOG_VREGS, REF("log-vregs")); }
   2899 TEST(sysregs) { TraceTestHelper(false, LOG_SYSREGS, REF("log-sysregs")); }
   2900 TEST(write) { TraceTestHelper(false, LOG_WRITE, REF("log-write")); }
   2901 TEST(branch) { TraceTestHelper(false, LOG_WRITE, REF("log-branch")); }
   2902 
   2903 // Test standard combinations.
   2904 TEST(none) { TraceTestHelper(false, LOG_NONE, REF("log-none")); }
   2905 TEST(state) { TraceTestHelper(false, LOG_STATE, REF("log-state")); }
   2906 TEST(all) { TraceTestHelper(false, LOG_ALL, REF("log-all")); }
   2907 
   2908 
   2909 // Test individual options (with colour).
   2910 TEST(disasm_colour) {
   2911   TraceTestHelper(true, LOG_DISASM, REF("log-disasm-colour"));
   2912 }
   2913 TEST(regs_colour) { TraceTestHelper(true, LOG_REGS, REF("log-regs-colour")); }
   2914 TEST(vregs_colour) {
   2915   TraceTestHelper(true, LOG_VREGS, REF("log-vregs-colour"));
   2916 }
   2917 TEST(sysregs_colour) {
   2918   TraceTestHelper(true, LOG_SYSREGS, REF("log-sysregs-colour"));
   2919 }
   2920 TEST(write_colour) {
   2921   TraceTestHelper(true, LOG_WRITE, REF("log-write-colour"));
   2922 }
   2923 TEST(branch_colour) {
   2924   TraceTestHelper(true, LOG_WRITE, REF("log-branch-colour"));
   2925 }
   2926 
   2927 // Test standard combinations (with colour).
   2928 TEST(none_colour) { TraceTestHelper(true, LOG_NONE, REF("log-none-colour")); }
   2929 TEST(state_colour) {
   2930   TraceTestHelper(true, LOG_STATE, REF("log-state-colour"));
   2931 }
   2932 TEST(all_colour) { TraceTestHelper(true, LOG_ALL, REF("log-all-colour")); }
   2933 
   2934 
   2935 #endif  // VIXL_INCLUDE_SIMULATOR_AARCH64
   2936 }  // namespace aarch64
   2937 }  // namespace vixl
   2938