Home | History | Annotate | Download | only in priv
      1 /* -*- mode: C; c-basic-offset: 3; -*- */
      2 
      3 /*--------------------------------------------------------------------*/
      4 /*--- begin                                     guest_arm64_toIR.c ---*/
      5 /*--------------------------------------------------------------------*/
      6 
      7 /*
      8    This file is part of Valgrind, a dynamic binary instrumentation
      9    framework.
     10 
     11    Copyright (C) 2013-2015 OpenWorks
     12       info (at) open-works.net
     13 
     14    This program is free software; you can redistribute it and/or
     15    modify it under the terms of the GNU General Public License as
     16    published by the Free Software Foundation; either version 2 of the
     17    License, or (at your option) any later version.
     18 
     19    This program is distributed in the hope that it will be useful, but
     20    WITHOUT ANY WARRANTY; without even the implied warranty of
     21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     22    General Public License for more details.
     23 
     24    You should have received a copy of the GNU General Public License
     25    along with this program; if not, write to the Free Software
     26    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     27    02110-1301, USA.
     28 
     29    The GNU General Public License is contained in the file COPYING.
     30 */
     31 
     32 /* KNOWN LIMITATIONS 2014-Nov-16
     33 
     34    * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.
     35 
     36      Also FP comparison "unordered" .. is implemented as normal FP
     37      comparison.
     38 
     39      Both should be fixed.  They behave incorrectly in the presence of
     40      NaNs.
     41 
     42      FMULX is treated the same as FMUL.  That's also not correct.
     43 
     44    * Floating multiply-add (etc) insns.  Are split into a multiply and
     45      an add, and so suffer double rounding and hence sometimes the
     46      least significant mantissa bit is incorrect.  Fix: use the IR
     47      multiply-add IROps instead.
     48 
     49    * FRINTA, FRINTN are kludged .. they just round to nearest.  No special
     50      handling for the "ties" case.  FRINTX might be dubious too.
     51 
     52    * Ditto FCVTXN.  No idea what "round to odd" means.  This implementation
     53      just rounds to nearest.
     54 */
     55 
     56 /* "Special" instructions.
     57 
     58    This instruction decoder can decode four special instructions
     59    which mean nothing natively (are no-ops as far as regs/mem are
     60    concerned) but have meaning for supporting Valgrind.  A special
     61    instruction is flagged by a 16-byte preamble:
     62 
     63       93CC0D8C 93CC358C 93CCCD8C 93CCF58C
     64       (ror x12, x12, #3;   ror x12, x12, #13
     65        ror x12, x12, #51;  ror x12, x12, #61)
     66 
   Following that, one of the following 4 is allowed
     68    (standard interpretation in parentheses):
     69 
     70       AA0A014A (orr x10,x10,x10)   X3 = client_request ( X4 )
     71       AA0B016B (orr x11,x11,x11)   X3 = guest_NRADDR
     72       AA0C018C (orr x12,x12,x12)   branch-and-link-to-noredir X8
     73       AA090129 (orr x9,x9,x9)      IR injection
     74 
     75    Any other bytes following the 16-byte preamble are illegal and
     76    constitute a failure in instruction decoding.  This all assumes
     77    that the preamble will never occur except in specific code
     78    fragments designed for Valgrind to catch.
     79 */
     80 
     81 /* Translates ARM64 code to IR. */
     82 
     83 #include "libvex_basictypes.h"
     84 #include "libvex_ir.h"
     85 #include "libvex.h"
     86 #include "libvex_guest_arm64.h"
     87 
     88 #include "main_util.h"
     89 #include "main_globals.h"
     90 #include "guest_generic_bb_to_IR.h"
     91 #include "guest_arm64_defs.h"
     92 
     93 
     94 /*------------------------------------------------------------*/
     95 /*--- Globals                                              ---*/
     96 /*------------------------------------------------------------*/
     97 
/* These are set at the start of the translation of an instruction, so
     99    that we don't have to pass them around endlessly.  CONST means does
    100    not change during translation of the instruction.
    101 */
    102 
    103 /* CONST: what is the host's endianness?  We need to know this in
    104    order to do sub-register accesses to the SIMD/FP registers
    105    correctly. */
    106 static VexEndness host_endness;
    107 
    108 /* CONST: The guest address for the instruction currently being
    109    translated.  */
    110 static Addr64 guest_PC_curr_instr;
    111 
    112 /* MOD: The IRSB* into which we're generating code. */
    113 static IRSB* irsb;
    114 
    115 
    116 /*------------------------------------------------------------*/
    117 /*--- Debugging output                                     ---*/
    118 /*------------------------------------------------------------*/
    119 
    120 #define DIP(format, args...)           \
    121    if (vex_traceflags & VEX_TRACE_FE)  \
    122       vex_printf(format, ## args)
    123 
    124 #define DIS(buf, format, args...)      \
    125    if (vex_traceflags & VEX_TRACE_FE)  \
    126       vex_sprintf(buf, format, ## args)
    127 
    128 
    129 /*------------------------------------------------------------*/
    130 /*--- Helper bits and pieces for deconstructing the        ---*/
    131 /*--- arm insn stream.                                     ---*/
    132 /*------------------------------------------------------------*/
    133 
    134 /* Do a little-endian load of a 32-bit word, regardless of the
    135    endianness of the underlying host. */
    136 static inline UInt getUIntLittleEndianly ( const UChar* p )
    137 {
    138    UInt w = 0;
    139    w = (w << 8) | p[3];
    140    w = (w << 8) | p[2];
    141    w = (w << 8) | p[1];
    142    w = (w << 8) | p[0];
    143    return w;
    144 }
    145 
    146 /* Sign extend a N-bit value up to 64 bits, by copying
    147    bit N-1 into all higher positions. */
    148 static ULong sx_to_64 ( ULong x, UInt n )
    149 {
    150    vassert(n > 1 && n < 64);
    151    Long r = (Long)x;
    152    r = (r << (64-n)) >> (64-n);
    153    return (ULong)r;
    154 }
    155 
    156 //ZZ /* Do a little-endian load of a 16-bit word, regardless of the
    157 //ZZ    endianness of the underlying host. */
    158 //ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
    159 //ZZ {
    160 //ZZ    UShort w = 0;
    161 //ZZ    w = (w << 8) | p[1];
    162 //ZZ    w = (w << 8) | p[0];
    163 //ZZ    return w;
    164 //ZZ }
    165 //ZZ
    166 //ZZ static UInt ROR32 ( UInt x, UInt sh ) {
    167 //ZZ    vassert(sh >= 0 && sh < 32);
    168 //ZZ    if (sh == 0)
    169 //ZZ       return x;
    170 //ZZ    else
    171 //ZZ       return (x << (32-sh)) | (x >> sh);
    172 //ZZ }
    173 //ZZ
    174 //ZZ static Int popcount32 ( UInt x )
    175 //ZZ {
    176 //ZZ    Int res = 0, i;
    177 //ZZ    for (i = 0; i < 32; i++) {
    178 //ZZ       res += (x & 1);
    179 //ZZ       x >>= 1;
    180 //ZZ    }
    181 //ZZ    return res;
    182 //ZZ }
    183 //ZZ
    184 //ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
    185 //ZZ {
    186 //ZZ    UInt mask = 1 << ix;
    187 //ZZ    x &= ~mask;
    188 //ZZ    x |= ((b << ix) & mask);
    189 //ZZ    return x;
    190 //ZZ }
    191 
    192 #define BITS2(_b1,_b0)  \
    193    (((_b1) << 1) | (_b0))
    194 
    195 #define BITS3(_b2,_b1,_b0)  \
    196   (((_b2) << 2) | ((_b1) << 1) | (_b0))
    197 
    198 #define BITS4(_b3,_b2,_b1,_b0)  \
    199    (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))
    200 
    201 #define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
    202    ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4)  \
    203     | BITS4((_b3),(_b2),(_b1),(_b0)))
    204 
    205 #define BITS5(_b4,_b3,_b2,_b1,_b0)  \
    206    (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
    207 #define BITS6(_b5,_b4,_b3,_b2,_b1,_b0)  \
    208    (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
    209 #define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
    210    (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
    211 
    212 #define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
    213    (((_b8) << 8)  \
    214     | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
    215 
    216 #define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
    217    (((_b9) << 9) | ((_b8) << 8)  \
    218     | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
    219 
    220 #define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
    221    (((_b10) << 10)  \
    222     | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
    223 
    224 #define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
    225    (((_b11) << 11)  \
    226     | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
    227 
    228 #define X00 BITS2(0,0)
    229 #define X01 BITS2(0,1)
    230 #define X10 BITS2(1,0)
    231 #define X11 BITS2(1,1)
    232 
    233 // produces _uint[_bMax:_bMin]
    234 #define SLICE_UInt(_uint,_bMax,_bMin)  \
    235    (( ((UInt)(_uint)) >> (_bMin))  \
    236     & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
    237 
    238 
    239 /*------------------------------------------------------------*/
    240 /*--- Helper bits and pieces for creating IR fragments.    ---*/
    241 /*------------------------------------------------------------*/
    242 
    243 static IRExpr* mkV128 ( UShort w )
    244 {
    245    return IRExpr_Const(IRConst_V128(w));
    246 }
    247 
    248 static IRExpr* mkU64 ( ULong i )
    249 {
    250    return IRExpr_Const(IRConst_U64(i));
    251 }
    252 
    253 static IRExpr* mkU32 ( UInt i )
    254 {
    255    return IRExpr_Const(IRConst_U32(i));
    256 }
    257 
    258 static IRExpr* mkU16 ( UInt i )
    259 {
    260    vassert(i < 65536);
    261    return IRExpr_Const(IRConst_U16(i));
    262 }
    263 
    264 static IRExpr* mkU8 ( UInt i )
    265 {
    266    vassert(i < 256);
    267    return IRExpr_Const(IRConst_U8( (UChar)i ));
    268 }
    269 
    270 static IRExpr* mkexpr ( IRTemp tmp )
    271 {
    272    return IRExpr_RdTmp(tmp);
    273 }
    274 
    275 static IRExpr* unop ( IROp op, IRExpr* a )
    276 {
    277    return IRExpr_Unop(op, a);
    278 }
    279 
    280 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
    281 {
    282    return IRExpr_Binop(op, a1, a2);
    283 }
    284 
    285 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
    286 {
    287    return IRExpr_Triop(op, a1, a2, a3);
    288 }
    289 
    290 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
    291 {
    292    return IRExpr_Load(Iend_LE, ty, addr);
    293 }
    294 
    295 /* Add a statement to the list held by "irbb". */
    296 static void stmt ( IRStmt* st )
    297 {
    298    addStmtToIRSB( irsb, st );
    299 }
    300 
    301 static void assign ( IRTemp dst, IRExpr* e )
    302 {
    303    stmt( IRStmt_WrTmp(dst, e) );
    304 }
    305 
    306 static void storeLE ( IRExpr* addr, IRExpr* data )
    307 {
    308    stmt( IRStmt_Store(Iend_LE, addr, data) );
    309 }
    310 
    311 //ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
    312 //ZZ {
    313 //ZZ    if (guardT == IRTemp_INVALID) {
    314 //ZZ       /* unconditional */
    315 //ZZ       storeLE(addr, data);
    316 //ZZ    } else {
    317 //ZZ       stmt( IRStmt_StoreG(Iend_LE, addr, data,
    318 //ZZ                           binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
    319 //ZZ    }
    320 //ZZ }
    321 //ZZ
    322 //ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
    323 //ZZ                             IRExpr* addr, IRExpr* alt,
    324 //ZZ                             IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
    325 //ZZ {
    326 //ZZ    if (guardT == IRTemp_INVALID) {
    327 //ZZ       /* unconditional */
    328 //ZZ       IRExpr* loaded = NULL;
    329 //ZZ       switch (cvt) {
    330 //ZZ          case ILGop_Ident32:
    331 //ZZ             loaded = loadLE(Ity_I32, addr); break;
    332 //ZZ          case ILGop_8Uto32:
    333 //ZZ             loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
    334 //ZZ          case ILGop_8Sto32:
    335 //ZZ             loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
    336 //ZZ          case ILGop_16Uto32:
    337 //ZZ             loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
    338 //ZZ          case ILGop_16Sto32:
    339 //ZZ             loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
    340 //ZZ          default:
    341 //ZZ             vassert(0);
    342 //ZZ       }
    343 //ZZ       vassert(loaded != NULL);
    344 //ZZ       assign(dst, loaded);
    345 //ZZ    } else {
    346 //ZZ       /* Generate a guarded load into 'dst', but apply 'cvt' to the
    347 //ZZ          loaded data before putting the data in 'dst'.  If the load
    348 //ZZ          does not take place, 'alt' is placed directly in 'dst'. */
    349 //ZZ       stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
    350 //ZZ                          binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
    351 //ZZ    }
    352 //ZZ }
    353 
    354 /* Generate a new temporary of the given type. */
    355 static IRTemp newTemp ( IRType ty )
    356 {
    357    vassert(isPlausibleIRType(ty));
    358    return newIRTemp( irsb->tyenv, ty );
    359 }
    360 
    361 /* This is used in many places, so the brevity is an advantage. */
    362 static IRTemp newTempV128(void)
    363 {
    364    return newTemp(Ity_V128);
    365 }
    366 
    367 /* Initialise V128 temporaries en masse. */
    368 static
    369 void newTempsV128_2(IRTemp* t1, IRTemp* t2)
    370 {
    371    vassert(t1 && *t1 == IRTemp_INVALID);
    372    vassert(t2 && *t2 == IRTemp_INVALID);
    373    *t1 = newTempV128();
    374    *t2 = newTempV128();
    375 }
    376 
    377 static
    378 void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
    379 {
    380    vassert(t1 && *t1 == IRTemp_INVALID);
    381    vassert(t2 && *t2 == IRTemp_INVALID);
    382    vassert(t3 && *t3 == IRTemp_INVALID);
    383    *t1 = newTempV128();
    384    *t2 = newTempV128();
    385    *t3 = newTempV128();
    386 }
    387 
    388 static
    389 void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
    390 {
    391    vassert(t1 && *t1 == IRTemp_INVALID);
    392    vassert(t2 && *t2 == IRTemp_INVALID);
    393    vassert(t3 && *t3 == IRTemp_INVALID);
    394    vassert(t4 && *t4 == IRTemp_INVALID);
    395    *t1 = newTempV128();
    396    *t2 = newTempV128();
    397    *t3 = newTempV128();
    398    *t4 = newTempV128();
    399 }
    400 
    401 static
    402 void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
    403                     IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
    404 {
    405    vassert(t1 && *t1 == IRTemp_INVALID);
    406    vassert(t2 && *t2 == IRTemp_INVALID);
    407    vassert(t3 && *t3 == IRTemp_INVALID);
    408    vassert(t4 && *t4 == IRTemp_INVALID);
    409    vassert(t5 && *t5 == IRTemp_INVALID);
    410    vassert(t6 && *t6 == IRTemp_INVALID);
    411    vassert(t7 && *t7 == IRTemp_INVALID);
    412    *t1 = newTempV128();
    413    *t2 = newTempV128();
    414    *t3 = newTempV128();
    415    *t4 = newTempV128();
    416    *t5 = newTempV128();
    417    *t6 = newTempV128();
    418    *t7 = newTempV128();
    419 }
    420 
    421 //ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
    422 //ZZ    IRRoundingMode. */
    423 //ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
    424 //ZZ {
    425 //ZZ    return mkU32(Irrm_NEAREST);
    426 //ZZ }
    427 //ZZ
    428 //ZZ /* Generate an expression for SRC rotated right by ROT. */
    429 //ZZ static IRExpr* genROR32( IRTemp src, Int rot )
    430 //ZZ {
    431 //ZZ    vassert(rot >= 0 && rot < 32);
    432 //ZZ    if (rot == 0)
    433 //ZZ       return mkexpr(src);
    434 //ZZ    return
    435 //ZZ       binop(Iop_Or32,
    436 //ZZ             binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
    437 //ZZ             binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
    438 //ZZ }
    439 //ZZ
    440 //ZZ static IRExpr* mkU128 ( ULong i )
    441 //ZZ {
    442 //ZZ    return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
    443 //ZZ }
    444 //ZZ
    445 //ZZ /* Generate a 4-aligned version of the given expression if
    446 //ZZ    the given condition is true.  Else return it unchanged. */
    447 //ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
    448 //ZZ {
    449 //ZZ    if (b)
    450 //ZZ       return binop(Iop_And32, e, mkU32(~3));
    451 //ZZ    else
    452 //ZZ       return e;
    453 //ZZ }
    454 
    455 /* Other IR construction helpers. */
    456 static IROp mkAND ( IRType ty ) {
    457    switch (ty) {
    458       case Ity_I32: return Iop_And32;
    459       case Ity_I64: return Iop_And64;
    460       default: vpanic("mkAND");
    461    }
    462 }
    463 
    464 static IROp mkOR ( IRType ty ) {
    465    switch (ty) {
    466       case Ity_I32: return Iop_Or32;
    467       case Ity_I64: return Iop_Or64;
    468       default: vpanic("mkOR");
    469    }
    470 }
    471 
    472 static IROp mkXOR ( IRType ty ) {
    473    switch (ty) {
    474       case Ity_I32: return Iop_Xor32;
    475       case Ity_I64: return Iop_Xor64;
    476       default: vpanic("mkXOR");
    477    }
    478 }
    479 
    480 static IROp mkSHL ( IRType ty ) {
    481    switch (ty) {
    482       case Ity_I32: return Iop_Shl32;
    483       case Ity_I64: return Iop_Shl64;
    484       default: vpanic("mkSHL");
    485    }
    486 }
    487 
    488 static IROp mkSHR ( IRType ty ) {
    489    switch (ty) {
    490       case Ity_I32: return Iop_Shr32;
    491       case Ity_I64: return Iop_Shr64;
    492       default: vpanic("mkSHR");
    493    }
    494 }
    495 
    496 static IROp mkSAR ( IRType ty ) {
    497    switch (ty) {
    498       case Ity_I32: return Iop_Sar32;
    499       case Ity_I64: return Iop_Sar64;
    500       default: vpanic("mkSAR");
    501    }
    502 }
    503 
    504 static IROp mkNOT ( IRType ty ) {
    505    switch (ty) {
    506       case Ity_I32: return Iop_Not32;
    507       case Ity_I64: return Iop_Not64;
    508       default: vpanic("mkNOT");
    509    }
    510 }
    511 
    512 static IROp mkADD ( IRType ty ) {
    513    switch (ty) {
    514       case Ity_I32: return Iop_Add32;
    515       case Ity_I64: return Iop_Add64;
    516       default: vpanic("mkADD");
    517    }
    518 }
    519 
    520 static IROp mkSUB ( IRType ty ) {
    521    switch (ty) {
    522       case Ity_I32: return Iop_Sub32;
    523       case Ity_I64: return Iop_Sub64;
    524       default: vpanic("mkSUB");
    525    }
    526 }
    527 
    528 static IROp mkADDF ( IRType ty ) {
    529    switch (ty) {
    530       case Ity_F32: return Iop_AddF32;
    531       case Ity_F64: return Iop_AddF64;
    532       default: vpanic("mkADDF");
    533    }
    534 }
    535 
    536 static IROp mkSUBF ( IRType ty ) {
    537    switch (ty) {
    538       case Ity_F32: return Iop_SubF32;
    539       case Ity_F64: return Iop_SubF64;
    540       default: vpanic("mkSUBF");
    541    }
    542 }
    543 
    544 static IROp mkMULF ( IRType ty ) {
    545    switch (ty) {
    546       case Ity_F32: return Iop_MulF32;
    547       case Ity_F64: return Iop_MulF64;
    548       default: vpanic("mkMULF");
    549    }
    550 }
    551 
    552 static IROp mkDIVF ( IRType ty ) {
    553    switch (ty) {
    554       case Ity_F32: return Iop_DivF32;
    555       case Ity_F64: return Iop_DivF64;
    556       default: vpanic("mkMULF");
    557    }
    558 }
    559 
    560 static IROp mkNEGF ( IRType ty ) {
    561    switch (ty) {
    562       case Ity_F32: return Iop_NegF32;
    563       case Ity_F64: return Iop_NegF64;
    564       default: vpanic("mkNEGF");
    565    }
    566 }
    567 
    568 static IROp mkABSF ( IRType ty ) {
    569    switch (ty) {
    570       case Ity_F32: return Iop_AbsF32;
    571       case Ity_F64: return Iop_AbsF64;
    572       default: vpanic("mkNEGF");
    573    }
    574 }
    575 
    576 static IROp mkSQRTF ( IRType ty ) {
    577    switch (ty) {
    578       case Ity_F32: return Iop_SqrtF32;
    579       case Ity_F64: return Iop_SqrtF64;
    580       default: vpanic("mkNEGF");
    581    }
    582 }
    583 
    584 static IROp mkVecADD ( UInt size ) {
    585    const IROp ops[4]
    586       = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
    587    vassert(size < 4);
    588    return ops[size];
    589 }
    590 
    591 static IROp mkVecQADDU ( UInt size ) {
    592    const IROp ops[4]
    593       = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
    594    vassert(size < 4);
    595    return ops[size];
    596 }
    597 
    598 static IROp mkVecQADDS ( UInt size ) {
    599    const IROp ops[4]
    600       = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
    601    vassert(size < 4);
    602    return ops[size];
    603 }
    604 
    605 static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
    606    const IROp ops[4]
    607       = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
    608           Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
    609    vassert(size < 4);
    610    return ops[size];
    611 }
    612 
    613 static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
    614    const IROp ops[4]
    615       = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
    616           Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
    617    vassert(size < 4);
    618    return ops[size];
    619 }
    620 
    621 static IROp mkVecSUB ( UInt size ) {
    622    const IROp ops[4]
    623       = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
    624    vassert(size < 4);
    625    return ops[size];
    626 }
    627 
    628 static IROp mkVecQSUBU ( UInt size ) {
    629    const IROp ops[4]
    630       = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
    631    vassert(size < 4);
    632    return ops[size];
    633 }
    634 
    635 static IROp mkVecQSUBS ( UInt size ) {
    636    const IROp ops[4]
    637       = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
    638    vassert(size < 4);
    639    return ops[size];
    640 }
    641 
    642 static IROp mkVecSARN ( UInt size ) {
    643    const IROp ops[4]
    644       = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
    645    vassert(size < 4);
    646    return ops[size];
    647 }
    648 
    649 static IROp mkVecSHRN ( UInt size ) {
    650    const IROp ops[4]
    651       = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
    652    vassert(size < 4);
    653    return ops[size];
    654 }
    655 
    656 static IROp mkVecSHLN ( UInt size ) {
    657    const IROp ops[4]
    658       = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
    659    vassert(size < 4);
    660    return ops[size];
    661 }
    662 
    663 static IROp mkVecCATEVENLANES ( UInt size ) {
    664    const IROp ops[4]
    665       = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
    666           Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
    667    vassert(size < 4);
    668    return ops[size];
    669 }
    670 
    671 static IROp mkVecCATODDLANES ( UInt size ) {
    672    const IROp ops[4]
    673       = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
    674           Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
    675    vassert(size < 4);
    676    return ops[size];
    677 }
    678 
    679 static IROp mkVecINTERLEAVELO ( UInt size ) {
    680    const IROp ops[4]
    681       = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
    682           Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
    683    vassert(size < 4);
    684    return ops[size];
    685 }
    686 
    687 static IROp mkVecINTERLEAVEHI ( UInt size ) {
    688    const IROp ops[4]
    689       = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
    690           Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
    691    vassert(size < 4);
    692    return ops[size];
    693 }
    694 
    695 static IROp mkVecMAXU ( UInt size ) {
    696    const IROp ops[4]
    697       = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
    698    vassert(size < 4);
    699    return ops[size];
    700 }
    701 
    702 static IROp mkVecMAXS ( UInt size ) {
    703    const IROp ops[4]
    704       = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
    705    vassert(size < 4);
    706    return ops[size];
    707 }
    708 
    709 static IROp mkVecMINU ( UInt size ) {
    710    const IROp ops[4]
    711       = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
    712    vassert(size < 4);
    713    return ops[size];
    714 }
    715 
    716 static IROp mkVecMINS ( UInt size ) {
    717    const IROp ops[4]
    718       = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
    719    vassert(size < 4);
    720    return ops[size];
    721 }
    722 
    723 static IROp mkVecMUL ( UInt size ) {
    724    const IROp ops[4]
    725       = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
    726    vassert(size < 3);
    727    return ops[size];
    728 }
    729 
    730 static IROp mkVecMULLU ( UInt sizeNarrow ) {
    731    const IROp ops[4]
    732       = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
    733    vassert(sizeNarrow < 3);
    734    return ops[sizeNarrow];
    735 }
    736 
    737 static IROp mkVecMULLS ( UInt sizeNarrow ) {
    738    const IROp ops[4]
    739       = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
    740    vassert(sizeNarrow < 3);
    741    return ops[sizeNarrow];
    742 }
    743 
    744 static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
    745    const IROp ops[4]
    746       = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
    747    vassert(sizeNarrow < 3);
    748    return ops[sizeNarrow];
    749 }
    750 
    751 static IROp mkVecCMPEQ ( UInt size ) {
    752    const IROp ops[4]
    753       = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
    754    vassert(size < 4);
    755    return ops[size];
    756 }
    757 
    758 static IROp mkVecCMPGTU ( UInt size ) {
    759    const IROp ops[4]
    760       = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
    761    vassert(size < 4);
    762    return ops[size];
    763 }
    764 
    765 static IROp mkVecCMPGTS ( UInt size ) {
    766    const IROp ops[4]
    767       = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
    768    vassert(size < 4);
    769    return ops[size];
    770 }
    771 
    772 static IROp mkVecABS ( UInt size ) {
    773    const IROp ops[4]
    774       = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
    775    vassert(size < 4);
    776    return ops[size];
    777 }
    778 
    779 static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
    780    const IROp ops[4]
    781       = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
    782           Iop_ZeroHI96ofV128,  Iop_ZeroHI64ofV128 };
    783    vassert(size < 4);
    784    return ops[size];
    785 }
    786 
    787 static IRExpr* mkU ( IRType ty, ULong imm ) {
    788    switch (ty) {
    789       case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
    790       case Ity_I64: return mkU64(imm);
    791       default: vpanic("mkU");
    792    }
    793 }
    794 
    795 static IROp mkVecQDMULHIS ( UInt size ) {
    796    const IROp ops[4]
    797       = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
    798    vassert(size < 4);
    799    return ops[size];
    800 }
    801 
    802 static IROp mkVecQRDMULHIS ( UInt size ) {
    803    const IROp ops[4]
    804       = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
    805    vassert(size < 4);
    806    return ops[size];
    807 }
    808 
    809 static IROp mkVecQANDUQSH ( UInt size ) {
    810    const IROp ops[4]
    811       = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
    812           Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
    813    vassert(size < 4);
    814    return ops[size];
    815 }
    816 
    817 static IROp mkVecQANDSQSH ( UInt size ) {
    818    const IROp ops[4]
    819       = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
    820           Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
    821    vassert(size < 4);
    822    return ops[size];
    823 }
    824 
    825 static IROp mkVecQANDUQRSH ( UInt size ) {
    826    const IROp ops[4]
    827       = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
    828           Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
    829    vassert(size < 4);
    830    return ops[size];
    831 }
    832 
    833 static IROp mkVecQANDSQRSH ( UInt size ) {
    834    const IROp ops[4]
    835       = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
    836           Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
    837    vassert(size < 4);
    838    return ops[size];
    839 }
    840 
    841 static IROp mkVecSHU ( UInt size ) {
    842    const IROp ops[4]
    843       = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
    844    vassert(size < 4);
    845    return ops[size];
    846 }
    847 
    848 static IROp mkVecSHS ( UInt size ) {
    849    const IROp ops[4]
    850       = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
    851    vassert(size < 4);
    852    return ops[size];
    853 }
    854 
    855 static IROp mkVecRSHU ( UInt size ) {
    856    const IROp ops[4]
    857       = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
    858    vassert(size < 4);
    859    return ops[size];
    860 }
    861 
    862 static IROp mkVecRSHS ( UInt size ) {
    863    const IROp ops[4]
    864       = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
    865    vassert(size < 4);
    866    return ops[size];
    867 }
    868 
    869 static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
    870    const IROp ops[4]
    871       = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
    872           Iop_NarrowUn64to32x2, Iop_INVALID };
    873    vassert(sizeNarrow < 4);
    874    return ops[sizeNarrow];
    875 }
    876 
    877 static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
    878    const IROp ops[4]
    879       = { Iop_QNarrowUn16Sto8Ux8,  Iop_QNarrowUn32Sto16Ux4,
    880           Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
    881    vassert(sizeNarrow < 4);
    882    return ops[sizeNarrow];
    883 }
    884 
    885 static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
    886    const IROp ops[4]
    887       = { Iop_QNarrowUn16Sto8Sx8,  Iop_QNarrowUn32Sto16Sx4,
    888           Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
    889    vassert(sizeNarrow < 4);
    890    return ops[sizeNarrow];
    891 }
    892 
    893 static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
    894    const IROp ops[4]
    895       = { Iop_QNarrowUn16Uto8Ux8,  Iop_QNarrowUn32Uto16Ux4,
    896           Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
    897    vassert(sizeNarrow < 4);
    898    return ops[sizeNarrow];
    899 }
    900 
    901 static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
    902    const IROp ops[4]
    903       = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4,
    904           Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
    905    vassert(sizeNarrow < 4);
    906    return ops[sizeNarrow];
    907 }
    908 
    909 static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
    910    const IROp ops[4]
    911       = { Iop_QandQSarNnarrow16Sto8Sx8,  Iop_QandQSarNnarrow32Sto16Sx4,
    912           Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
    913    vassert(sizeNarrow < 4);
    914    return ops[sizeNarrow];
    915 }
    916 
    917 static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
    918    const IROp ops[4]
    919       = { Iop_QandQSarNnarrow16Sto8Ux8,  Iop_QandQSarNnarrow32Sto16Ux4,
    920           Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
    921    vassert(sizeNarrow < 4);
    922    return ops[sizeNarrow];
    923 }
    924 
    925 static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
    926    const IROp ops[4]
    927       = { Iop_QandQRShrNnarrow16Uto8Ux8,  Iop_QandQRShrNnarrow32Uto16Ux4,
    928           Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
    929    vassert(sizeNarrow < 4);
    930    return ops[sizeNarrow];
    931 }
    932 
    933 static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
    934    const IROp ops[4]
    935       = { Iop_QandQRSarNnarrow16Sto8Sx8,  Iop_QandQRSarNnarrow32Sto16Sx4,
    936           Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
    937    vassert(sizeNarrow < 4);
    938    return ops[sizeNarrow];
    939 }
    940 
    941 static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
    942    const IROp ops[4]
    943       = { Iop_QandQRSarNnarrow16Sto8Ux8,  Iop_QandQRSarNnarrow32Sto16Ux4,
    944           Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
    945    vassert(sizeNarrow < 4);
    946    return ops[sizeNarrow];
    947 }
    948 
    949 static IROp mkVecQSHLNSATUU ( UInt size ) {
    950    const IROp ops[4]
    951       = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
    952           Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
    953    vassert(size < 4);
    954    return ops[size];
    955 }
    956 
    957 static IROp mkVecQSHLNSATSS ( UInt size ) {
    958    const IROp ops[4]
    959       = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
    960           Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
    961    vassert(size < 4);
    962    return ops[size];
    963 }
    964 
    965 static IROp mkVecQSHLNSATSU ( UInt size ) {
    966    const IROp ops[4]
    967       = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
    968           Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
    969    vassert(size < 4);
    970    return ops[size];
    971 }
    972 
    973 static IROp mkVecADDF ( UInt size ) {
    974    const IROp ops[4]
    975       = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 };
    976    vassert(size < 4);
    977    return ops[size];
    978 }
    979 
    980 static IROp mkVecMAXF ( UInt size ) {
    981    const IROp ops[4]
    982       = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
    983    vassert(size < 4);
    984    return ops[size];
    985 }
    986 
    987 static IROp mkVecMINF ( UInt size ) {
    988    const IROp ops[4]
    989       = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
    990    vassert(size < 4);
    991    return ops[size];
    992 }
    993 
    994 /* Generate IR to create 'arg rotated right by imm', for sane values
    995    of 'ty' and 'imm'. */
    996 static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
    997 {
    998    UInt w = 0;
    999    if (ty == Ity_I64) {
   1000       w = 64;
   1001    } else {
   1002       vassert(ty == Ity_I32);
   1003       w = 32;
   1004    }
   1005    vassert(w != 0);
   1006    vassert(imm < w);
   1007    if (imm == 0) {
   1008       return arg;
   1009    }
   1010    IRTemp res = newTemp(ty);
   1011    assign(res, binop(mkOR(ty),
   1012                      binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
   1013                      binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
   1014    return res;
   1015 }
   1016 
   1017 /* Generate IR to set the returned temp to either all-zeroes or
   1018    all ones, as a copy of arg<imm>. */
   1019 static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
   1020 {
   1021    UInt w = 0;
   1022    if (ty == Ity_I64) {
   1023       w = 64;
   1024    } else {
   1025       vassert(ty == Ity_I32);
   1026       w = 32;
   1027    }
   1028    vassert(w != 0);
   1029    vassert(imm < w);
   1030    IRTemp res = newTemp(ty);
   1031    assign(res, binop(mkSAR(ty),
   1032                      binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
   1033                      mkU8(w - 1)));
   1034    return res;
   1035 }
   1036 
   1037 /* U-widen 8/16/32/64 bit int expr to 64. */
   1038 static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
   1039 {
   1040    switch (srcTy) {
   1041       case Ity_I64: return e;
   1042       case Ity_I32: return unop(Iop_32Uto64, e);
   1043       case Ity_I16: return unop(Iop_16Uto64, e);
   1044       case Ity_I8:  return unop(Iop_8Uto64, e);
   1045       default: vpanic("widenUto64(arm64)");
   1046    }
   1047 }
   1048 
   1049 /* Narrow 64 bit int expr to 8/16/32/64.  Clearly only some
   1050    of these combinations make sense. */
   1051 static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
   1052 {
   1053    switch (dstTy) {
   1054       case Ity_I64: return e;
   1055       case Ity_I32: return unop(Iop_64to32, e);
   1056       case Ity_I16: return unop(Iop_64to16, e);
   1057       case Ity_I8:  return unop(Iop_64to8, e);
   1058       default: vpanic("narrowFrom64(arm64)");
   1059    }
   1060 }
   1061 
   1062 
   1063 /*------------------------------------------------------------*/
   1064 /*--- Helpers for accessing guest registers.               ---*/
   1065 /*------------------------------------------------------------*/
   1066 
/* Each OFFB_<name> macro is the offset, as given by offsetof, of the
   field guest_<name> inside VexGuestARM64State.  These are the
   offsets handed to IRExpr_Get and IRStmt_Put by the helpers below. */

/* Integer registers x0 .. x30. */
#define OFFB_X0       offsetof(VexGuestARM64State,guest_X0)
#define OFFB_X1       offsetof(VexGuestARM64State,guest_X1)
#define OFFB_X2       offsetof(VexGuestARM64State,guest_X2)
#define OFFB_X3       offsetof(VexGuestARM64State,guest_X3)
#define OFFB_X4       offsetof(VexGuestARM64State,guest_X4)
#define OFFB_X5       offsetof(VexGuestARM64State,guest_X5)
#define OFFB_X6       offsetof(VexGuestARM64State,guest_X6)
#define OFFB_X7       offsetof(VexGuestARM64State,guest_X7)
#define OFFB_X8       offsetof(VexGuestARM64State,guest_X8)
#define OFFB_X9       offsetof(VexGuestARM64State,guest_X9)
#define OFFB_X10      offsetof(VexGuestARM64State,guest_X10)
#define OFFB_X11      offsetof(VexGuestARM64State,guest_X11)
#define OFFB_X12      offsetof(VexGuestARM64State,guest_X12)
#define OFFB_X13      offsetof(VexGuestARM64State,guest_X13)
#define OFFB_X14      offsetof(VexGuestARM64State,guest_X14)
#define OFFB_X15      offsetof(VexGuestARM64State,guest_X15)
#define OFFB_X16      offsetof(VexGuestARM64State,guest_X16)
#define OFFB_X17      offsetof(VexGuestARM64State,guest_X17)
#define OFFB_X18      offsetof(VexGuestARM64State,guest_X18)
#define OFFB_X19      offsetof(VexGuestARM64State,guest_X19)
#define OFFB_X20      offsetof(VexGuestARM64State,guest_X20)
#define OFFB_X21      offsetof(VexGuestARM64State,guest_X21)
#define OFFB_X22      offsetof(VexGuestARM64State,guest_X22)
#define OFFB_X23      offsetof(VexGuestARM64State,guest_X23)
#define OFFB_X24      offsetof(VexGuestARM64State,guest_X24)
#define OFFB_X25      offsetof(VexGuestARM64State,guest_X25)
#define OFFB_X26      offsetof(VexGuestARM64State,guest_X26)
#define OFFB_X27      offsetof(VexGuestARM64State,guest_X27)
#define OFFB_X28      offsetof(VexGuestARM64State,guest_X28)
#define OFFB_X29      offsetof(VexGuestARM64State,guest_X29)
#define OFFB_X30      offsetof(VexGuestARM64State,guest_X30)

/* Stack pointer (used for register number 31 by the ...orSP helpers)
   and program counter. */
#define OFFB_XSP      offsetof(VexGuestARM64State,guest_XSP)
#define OFFB_PC       offsetof(VexGuestARM64State,guest_PC)

/* Stored condition-code fields CC_OP/CC_DEP1/CC_DEP2/CC_NDEP, from
   which conditions are calculated later in this file. */
#define OFFB_CC_OP    offsetof(VexGuestARM64State,guest_CC_OP)
#define OFFB_CC_DEP1  offsetof(VexGuestARM64State,guest_CC_DEP1)
#define OFFB_CC_DEP2  offsetof(VexGuestARM64State,guest_CC_DEP2)
#define OFFB_CC_NDEP  offsetof(VexGuestARM64State,guest_CC_NDEP)

#define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
#define OFFB_NRADDR   offsetof(VexGuestARM64State,guest_NRADDR)

/* Vector registers q0 .. q31. */
#define OFFB_Q0       offsetof(VexGuestARM64State,guest_Q0)
#define OFFB_Q1       offsetof(VexGuestARM64State,guest_Q1)
#define OFFB_Q2       offsetof(VexGuestARM64State,guest_Q2)
#define OFFB_Q3       offsetof(VexGuestARM64State,guest_Q3)
#define OFFB_Q4       offsetof(VexGuestARM64State,guest_Q4)
#define OFFB_Q5       offsetof(VexGuestARM64State,guest_Q5)
#define OFFB_Q6       offsetof(VexGuestARM64State,guest_Q6)
#define OFFB_Q7       offsetof(VexGuestARM64State,guest_Q7)
#define OFFB_Q8       offsetof(VexGuestARM64State,guest_Q8)
#define OFFB_Q9       offsetof(VexGuestARM64State,guest_Q9)
#define OFFB_Q10      offsetof(VexGuestARM64State,guest_Q10)
#define OFFB_Q11      offsetof(VexGuestARM64State,guest_Q11)
#define OFFB_Q12      offsetof(VexGuestARM64State,guest_Q12)
#define OFFB_Q13      offsetof(VexGuestARM64State,guest_Q13)
#define OFFB_Q14      offsetof(VexGuestARM64State,guest_Q14)
#define OFFB_Q15      offsetof(VexGuestARM64State,guest_Q15)
#define OFFB_Q16      offsetof(VexGuestARM64State,guest_Q16)
#define OFFB_Q17      offsetof(VexGuestARM64State,guest_Q17)
#define OFFB_Q18      offsetof(VexGuestARM64State,guest_Q18)
#define OFFB_Q19      offsetof(VexGuestARM64State,guest_Q19)
#define OFFB_Q20      offsetof(VexGuestARM64State,guest_Q20)
#define OFFB_Q21      offsetof(VexGuestARM64State,guest_Q21)
#define OFFB_Q22      offsetof(VexGuestARM64State,guest_Q22)
#define OFFB_Q23      offsetof(VexGuestARM64State,guest_Q23)
#define OFFB_Q24      offsetof(VexGuestARM64State,guest_Q24)
#define OFFB_Q25      offsetof(VexGuestARM64State,guest_Q25)
#define OFFB_Q26      offsetof(VexGuestARM64State,guest_Q26)
#define OFFB_Q27      offsetof(VexGuestARM64State,guest_Q27)
#define OFFB_Q28      offsetof(VexGuestARM64State,guest_Q28)
#define OFFB_Q29      offsetof(VexGuestARM64State,guest_Q29)
#define OFFB_Q30      offsetof(VexGuestARM64State,guest_Q30)
#define OFFB_Q31      offsetof(VexGuestARM64State,guest_Q31)

/* FPCR holds the FP rounding-mode bits read by
   mk_get_IR_rounding_mode. */
#define OFFB_FPCR     offsetof(VexGuestARM64State,guest_FPCR)
#define OFFB_QCFLAG   offsetof(VexGuestARM64State,guest_QCFLAG)

#define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
#define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)
   1148 
   1149 
   1150 /* ---------------- Integer registers ---------------- */
   1151 
   1152 static Int offsetIReg64 ( UInt iregNo )
   1153 {
   1154    /* Do we care about endianness here?  We do if sub-parts of integer
   1155       registers are accessed. */
   1156    switch (iregNo) {
   1157       case 0:  return OFFB_X0;
   1158       case 1:  return OFFB_X1;
   1159       case 2:  return OFFB_X2;
   1160       case 3:  return OFFB_X3;
   1161       case 4:  return OFFB_X4;
   1162       case 5:  return OFFB_X5;
   1163       case 6:  return OFFB_X6;
   1164       case 7:  return OFFB_X7;
   1165       case 8:  return OFFB_X8;
   1166       case 9:  return OFFB_X9;
   1167       case 10: return OFFB_X10;
   1168       case 11: return OFFB_X11;
   1169       case 12: return OFFB_X12;
   1170       case 13: return OFFB_X13;
   1171       case 14: return OFFB_X14;
   1172       case 15: return OFFB_X15;
   1173       case 16: return OFFB_X16;
   1174       case 17: return OFFB_X17;
   1175       case 18: return OFFB_X18;
   1176       case 19: return OFFB_X19;
   1177       case 20: return OFFB_X20;
   1178       case 21: return OFFB_X21;
   1179       case 22: return OFFB_X22;
   1180       case 23: return OFFB_X23;
   1181       case 24: return OFFB_X24;
   1182       case 25: return OFFB_X25;
   1183       case 26: return OFFB_X26;
   1184       case 27: return OFFB_X27;
   1185       case 28: return OFFB_X28;
   1186       case 29: return OFFB_X29;
   1187       case 30: return OFFB_X30;
   1188       /* but not 31 */
   1189       default: vassert(0);
   1190    }
   1191 }
   1192 
   1193 static Int offsetIReg64orSP ( UInt iregNo )
   1194 {
   1195    return iregNo == 31  ? OFFB_XSP  : offsetIReg64(iregNo);
   1196 }
   1197 
   1198 static const HChar* nameIReg64orZR ( UInt iregNo )
   1199 {
   1200    vassert(iregNo < 32);
   1201    static const HChar* names[32]
   1202       = { "x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",
   1203           "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
   1204           "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
   1205           "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
   1206    return names[iregNo];
   1207 }
   1208 
   1209 static const HChar* nameIReg64orSP ( UInt iregNo )
   1210 {
   1211    if (iregNo == 31) {
   1212       return "sp";
   1213    }
   1214    vassert(iregNo < 31);
   1215    return nameIReg64orZR(iregNo);
   1216 }
   1217 
   1218 static IRExpr* getIReg64orSP ( UInt iregNo )
   1219 {
   1220    vassert(iregNo < 32);
   1221    return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
   1222 }
   1223 
   1224 static IRExpr* getIReg64orZR ( UInt iregNo )
   1225 {
   1226    if (iregNo == 31) {
   1227       return mkU64(0);
   1228    }
   1229    vassert(iregNo < 31);
   1230    return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
   1231 }
   1232 
   1233 static void putIReg64orSP ( UInt iregNo, IRExpr* e )
   1234 {
   1235    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   1236    stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
   1237 }
   1238 
   1239 static void putIReg64orZR ( UInt iregNo, IRExpr* e )
   1240 {
   1241    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   1242    if (iregNo == 31) {
   1243       return;
   1244    }
   1245    vassert(iregNo < 31);
   1246    stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
   1247 }
   1248 
   1249 static const HChar* nameIReg32orZR ( UInt iregNo )
   1250 {
   1251    vassert(iregNo < 32);
   1252    static const HChar* names[32]
   1253       = { "w0",  "w1",  "w2",  "w3",  "w4",  "w5",  "w6",  "w7",
   1254           "w8",  "w9",  "w10", "w11", "w12", "w13", "w14", "w15",
   1255           "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
   1256           "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
   1257    return names[iregNo];
   1258 }
   1259 
   1260 static const HChar* nameIReg32orSP ( UInt iregNo )
   1261 {
   1262    if (iregNo == 31) {
   1263       return "wsp";
   1264    }
   1265    vassert(iregNo < 31);
   1266    return nameIReg32orZR(iregNo);
   1267 }
   1268 
   1269 static IRExpr* getIReg32orSP ( UInt iregNo )
   1270 {
   1271    vassert(iregNo < 32);
   1272    return unop(Iop_64to32,
   1273                IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
   1274 }
   1275 
   1276 static IRExpr* getIReg32orZR ( UInt iregNo )
   1277 {
   1278    if (iregNo == 31) {
   1279       return mkU32(0);
   1280    }
   1281    vassert(iregNo < 31);
   1282    return unop(Iop_64to32,
   1283                IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
   1284 }
   1285 
   1286 static void putIReg32orSP ( UInt iregNo, IRExpr* e )
   1287 {
   1288    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   1289    stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
   1290 }
   1291 
   1292 static void putIReg32orZR ( UInt iregNo, IRExpr* e )
   1293 {
   1294    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   1295    if (iregNo == 31) {
   1296       return;
   1297    }
   1298    vassert(iregNo < 31);
   1299    stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
   1300 }
   1301 
   1302 static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
   1303 {
   1304    vassert(is64 == True || is64 == False);
   1305    return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
   1306 }
   1307 
   1308 static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
   1309 {
   1310    vassert(is64 == True || is64 == False);
   1311    return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
   1312 }
   1313 
   1314 static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
   1315 {
   1316    vassert(is64 == True || is64 == False);
   1317    return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
   1318 }
   1319 
   1320 static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
   1321 {
   1322    vassert(is64 == True || is64 == False);
   1323    if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
   1324 }
   1325 
   1326 static void putPC ( IRExpr* e )
   1327 {
   1328    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   1329    stmt( IRStmt_Put(OFFB_PC, e) );
   1330 }
   1331 
   1332 
   1333 /* ---------------- Vector (Q) registers ---------------- */
   1334 
   1335 static Int offsetQReg128 ( UInt qregNo )
   1336 {
   1337    /* We don't care about endianness at this point.  It only becomes
   1338       relevant when dealing with sections of these registers.*/
   1339    switch (qregNo) {
   1340       case 0:  return OFFB_Q0;
   1341       case 1:  return OFFB_Q1;
   1342       case 2:  return OFFB_Q2;
   1343       case 3:  return OFFB_Q3;
   1344       case 4:  return OFFB_Q4;
   1345       case 5:  return OFFB_Q5;
   1346       case 6:  return OFFB_Q6;
   1347       case 7:  return OFFB_Q7;
   1348       case 8:  return OFFB_Q8;
   1349       case 9:  return OFFB_Q9;
   1350       case 10: return OFFB_Q10;
   1351       case 11: return OFFB_Q11;
   1352       case 12: return OFFB_Q12;
   1353       case 13: return OFFB_Q13;
   1354       case 14: return OFFB_Q14;
   1355       case 15: return OFFB_Q15;
   1356       case 16: return OFFB_Q16;
   1357       case 17: return OFFB_Q17;
   1358       case 18: return OFFB_Q18;
   1359       case 19: return OFFB_Q19;
   1360       case 20: return OFFB_Q20;
   1361       case 21: return OFFB_Q21;
   1362       case 22: return OFFB_Q22;
   1363       case 23: return OFFB_Q23;
   1364       case 24: return OFFB_Q24;
   1365       case 25: return OFFB_Q25;
   1366       case 26: return OFFB_Q26;
   1367       case 27: return OFFB_Q27;
   1368       case 28: return OFFB_Q28;
   1369       case 29: return OFFB_Q29;
   1370       case 30: return OFFB_Q30;
   1371       case 31: return OFFB_Q31;
   1372       default: vassert(0);
   1373    }
   1374 }
   1375 
   1376 /* Write to a complete Qreg. */
   1377 static void putQReg128 ( UInt qregNo, IRExpr* e )
   1378 {
   1379    vassert(qregNo < 32);
   1380    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
   1381    stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
   1382 }
   1383 
   1384 /* Read a complete Qreg. */
   1385 static IRExpr* getQReg128 ( UInt qregNo )
   1386 {
   1387    vassert(qregNo < 32);
   1388    return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
   1389 }
   1390 
   1391 /* Produce the IR type for some sub-part of a vector.  For 32- and 64-
   1392    bit sub-parts we can choose either integer or float types, and
   1393    choose float on the basis that that is the common use case and so
   1394    will give least interference with Put-to-Get forwarding later
   1395    on. */
   1396 static IRType preferredVectorSubTypeFromSize ( UInt szB )
   1397 {
   1398    switch (szB) {
   1399       case 1:  return Ity_I8;
   1400       case 2:  return Ity_I16;
   1401       case 4:  return Ity_I32; //Ity_F32;
   1402       case 8:  return Ity_F64;
   1403       case 16: return Ity_V128;
   1404       default: vassert(0);
   1405    }
   1406 }
   1407 
   1408 /* Find the offset of the laneNo'th lane of type laneTy in the given
   1409    Qreg.  Since the host is little-endian, the least significant lane
   1410    has the lowest offset. */
   1411 static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
   1412 {
   1413    vassert(host_endness == VexEndnessLE);
   1414    Int base = offsetQReg128(qregNo);
   1415    /* Since the host is little-endian, the least significant lane
   1416       will be at the lowest address. */
   1417    /* Restrict this to known types, so as to avoid silently accepting
   1418       stupid types. */
   1419    UInt laneSzB = 0;
   1420    switch (laneTy) {
   1421       case Ity_I8:                 laneSzB = 1;  break;
   1422       case Ity_F16: case Ity_I16:  laneSzB = 2;  break;
   1423       case Ity_F32: case Ity_I32:  laneSzB = 4;  break;
   1424       case Ity_F64: case Ity_I64:  laneSzB = 8;  break;
   1425       case Ity_V128:               laneSzB = 16; break;
   1426       default: break;
   1427    }
   1428    vassert(laneSzB > 0);
   1429    UInt minOff = laneNo * laneSzB;
   1430    UInt maxOff = minOff + laneSzB - 1;
   1431    vassert(maxOff < 16);
   1432    return base + minOff;
   1433 }
   1434 
   1435 /* Put to the least significant lane of a Qreg. */
   1436 static void putQRegLO ( UInt qregNo, IRExpr* e )
   1437 {
   1438    IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   1439    Int    off = offsetQRegLane(qregNo, ty, 0);
   1440    switch (ty) {
   1441       case Ity_I8:  case Ity_I16: case Ity_I32: case Ity_I64:
   1442       case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
   1443          break;
   1444       default:
   1445          vassert(0); // Other cases are probably invalid
   1446    }
   1447    stmt(IRStmt_Put(off, e));
   1448 }
   1449 
   1450 /* Get from the least significant lane of a Qreg. */
   1451 static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
   1452 {
   1453    Int off = offsetQRegLane(qregNo, ty, 0);
   1454    switch (ty) {
   1455       case Ity_I8:
   1456       case Ity_F16: case Ity_I16:
   1457       case Ity_I32: case Ity_I64:
   1458       case Ity_F32: case Ity_F64: case Ity_V128:
   1459          break;
   1460       default:
   1461          vassert(0); // Other cases are ATC
   1462    }
   1463    return IRExpr_Get(off, ty);
   1464 }
   1465 
   1466 static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
   1467 {
   1468    static const HChar* namesQ[32]
   1469       = { "q0",  "q1",  "q2",  "q3",  "q4",  "q5",  "q6",  "q7",
   1470           "q8",  "q9",  "q10", "q11", "q12", "q13", "q14", "q15",
   1471           "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
   1472           "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
   1473    static const HChar* namesD[32]
   1474       = { "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
   1475           "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
   1476           "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
   1477           "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
   1478    static const HChar* namesS[32]
   1479       = { "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",
   1480           "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15",
   1481           "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
   1482           "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
   1483    static const HChar* namesH[32]
   1484       = { "h0",  "h1",  "h2",  "h3",  "h4",  "h5",  "h6",  "h7",
   1485           "h8",  "h9",  "h10", "h11", "h12", "h13", "h14", "h15",
   1486           "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
   1487           "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
   1488    static const HChar* namesB[32]
   1489       = { "b0",  "b1",  "b2",  "b3",  "b4",  "b5",  "b6",  "b7",
   1490           "b8",  "b9",  "b10", "b11", "b12", "b13", "b14", "b15",
   1491           "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
   1492           "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
   1493    vassert(qregNo < 32);
   1494    switch (sizeofIRType(laneTy)) {
   1495       case 1:  return namesB[qregNo];
   1496       case 2:  return namesH[qregNo];
   1497       case 4:  return namesS[qregNo];
   1498       case 8:  return namesD[qregNo];
   1499       case 16: return namesQ[qregNo];
   1500       default: vassert(0);
   1501    }
   1502    /*NOTREACHED*/
   1503 }
   1504 
   1505 static const HChar* nameQReg128 ( UInt qregNo )
   1506 {
   1507    return nameQRegLO(qregNo, Ity_V128);
   1508 }
   1509 
   1510 /* Find the offset of the most significant half (8 bytes) of the given
   1511    Qreg.  This requires knowing the endianness of the host. */
   1512 static Int offsetQRegHI64 ( UInt qregNo )
   1513 {
   1514    return offsetQRegLane(qregNo, Ity_I64, 1);
   1515 }
   1516 
   1517 static IRExpr* getQRegHI64 ( UInt qregNo )
   1518 {
   1519    return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
   1520 }
   1521 
   1522 static void putQRegHI64 ( UInt qregNo, IRExpr* e )
   1523 {
   1524    IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   1525    Int    off = offsetQRegHI64(qregNo);
   1526    switch (ty) {
   1527       case Ity_I64: case Ity_F64:
   1528          break;
   1529       default:
   1530          vassert(0); // Other cases are plain wrong
   1531    }
   1532    stmt(IRStmt_Put(off, e));
   1533 }
   1534 
   1535 /* Put to a specified lane of a Qreg. */
   1536 static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
   1537 {
   1538    IRType laneTy  = typeOfIRExpr(irsb->tyenv, e);
   1539    Int    off     = offsetQRegLane(qregNo, laneTy, laneNo);
   1540    switch (laneTy) {
   1541       case Ity_F64: case Ity_I64:
   1542       case Ity_I32: case Ity_F32:
   1543       case Ity_I16: case Ity_F16:
   1544       case Ity_I8:
   1545          break;
   1546       default:
   1547          vassert(0); // Other cases are ATC
   1548    }
   1549    stmt(IRStmt_Put(off, e));
   1550 }
   1551 
   1552 /* Get from a specified lane of a Qreg. */
   1553 static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
   1554 {
   1555    Int off = offsetQRegLane(qregNo, laneTy, laneNo);
   1556    switch (laneTy) {
   1557       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
   1558       case Ity_F64: case Ity_F32: case Ity_F16:
   1559          break;
   1560       default:
   1561          vassert(0); // Other cases are ATC
   1562    }
   1563    return IRExpr_Get(off, laneTy);
   1564 }
   1565 
   1566 
   1567 //ZZ /* ---------------- Misc registers ---------------- */
   1568 //ZZ
   1569 //ZZ static void putMiscReg32 ( UInt    gsoffset,
   1570 //ZZ                            IRExpr* e, /* :: Ity_I32 */
   1571 //ZZ                            IRTemp  guardT /* :: Ity_I32, 0 or 1 */)
   1572 //ZZ {
   1573 //ZZ    switch (gsoffset) {
   1574 //ZZ       case OFFB_FPSCR:   break;
   1575 //ZZ       case OFFB_QFLAG32: break;
   1576 //ZZ       case OFFB_GEFLAG0: break;
   1577 //ZZ       case OFFB_GEFLAG1: break;
   1578 //ZZ       case OFFB_GEFLAG2: break;
   1579 //ZZ       case OFFB_GEFLAG3: break;
   1580 //ZZ       default: vassert(0); /* awaiting more cases */
   1581 //ZZ    }
   1582 //ZZ    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   1583 //ZZ
   1584 //ZZ    if (guardT == IRTemp_INVALID) {
   1585 //ZZ       /* unconditional write */
   1586 //ZZ       stmt(IRStmt_Put(gsoffset, e));
   1587 //ZZ    } else {
   1588 //ZZ       stmt(IRStmt_Put(
   1589 //ZZ          gsoffset,
   1590 //ZZ          IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
   1591 //ZZ                      e, IRExpr_Get(gsoffset, Ity_I32) )
   1592 //ZZ       ));
   1593 //ZZ    }
   1594 //ZZ }
   1595 //ZZ
   1596 //ZZ static IRTemp get_ITSTATE ( void )
   1597 //ZZ {
   1598 //ZZ    ASSERT_IS_THUMB;
   1599 //ZZ    IRTemp t = newTemp(Ity_I32);
   1600 //ZZ    assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
   1601 //ZZ    return t;
   1602 //ZZ }
   1603 //ZZ
   1604 //ZZ static void put_ITSTATE ( IRTemp t )
   1605 //ZZ {
   1606 //ZZ    ASSERT_IS_THUMB;
   1607 //ZZ    stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
   1608 //ZZ }
   1609 //ZZ
   1610 //ZZ static IRTemp get_QFLAG32 ( void )
   1611 //ZZ {
   1612 //ZZ    IRTemp t = newTemp(Ity_I32);
   1613 //ZZ    assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
   1614 //ZZ    return t;
   1615 //ZZ }
   1616 //ZZ
   1617 //ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
   1618 //ZZ {
   1619 //ZZ    putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
   1620 //ZZ }
   1621 //ZZ
   1622 //ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
   1623 //ZZ    Status Register) to indicate that overflow or saturation occurred.
   1624 //ZZ    Nb: t must be zero to denote no saturation, and any nonzero
   1625 //ZZ    value to indicate saturation. */
   1626 //ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
   1627 //ZZ {
   1628 //ZZ    IRTemp old = get_QFLAG32();
   1629 //ZZ    IRTemp nyu = newTemp(Ity_I32);
   1630 //ZZ    assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
   1631 //ZZ    put_QFLAG32(nyu, condT);
   1632 //ZZ }
   1633 
   1634 
   1635 /* ---------------- FPCR stuff ---------------- */
   1636 
   1637 /* Generate IR to get hold of the rounding mode bits in FPCR, and
   1638    convert them to IR format.  Bind the final result to the
   1639    returned temp. */
   1640 static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
   1641 {
   1642    /* The ARMvfp encoding for rounding mode bits is:
   1643          00  to nearest
   1644          01  to +infinity
   1645          10  to -infinity
   1646          11  to zero
   1647       We need to convert that to the IR encoding:
   1648          00  to nearest (the default)
   1649          10  to +infinity
   1650          01  to -infinity
   1651          11  to zero
   1652       Which can be done by swapping bits 0 and 1.
   1653       The rmode bits are at 23:22 in FPSCR.
   1654    */
   1655    IRTemp armEncd = newTemp(Ity_I32);
   1656    IRTemp swapped = newTemp(Ity_I32);
   1657    /* Fish FPCR[23:22] out, and slide to bottom.  Doesn't matter that
   1658       we don't zero out bits 24 and above, since the assignment to
   1659       'swapped' will mask them out anyway. */
   1660    assign(armEncd,
   1661           binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
   1662    /* Now swap them. */
   1663    assign(swapped,
   1664           binop(Iop_Or32,
   1665                 binop(Iop_And32,
   1666                       binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
   1667                       mkU32(2)),
   1668                 binop(Iop_And32,
   1669                       binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
   1670                       mkU32(1))
   1671          ));
   1672    return swapped;
   1673 }
   1674 
   1675 
   1676 /*------------------------------------------------------------*/
   1677 /*--- Helpers for flag handling and conditional insns      ---*/
   1678 /*------------------------------------------------------------*/
   1679 
   1680 static const HChar* nameARM64Condcode ( ARM64Condcode cond )
   1681 {
   1682    switch (cond) {
   1683       case ARM64CondEQ:  return "eq";
   1684       case ARM64CondNE:  return "ne";
   1685       case ARM64CondCS:  return "cs";  // or 'hs'
   1686       case ARM64CondCC:  return "cc";  // or 'lo'
   1687       case ARM64CondMI:  return "mi";
   1688       case ARM64CondPL:  return "pl";
   1689       case ARM64CondVS:  return "vs";
   1690       case ARM64CondVC:  return "vc";
   1691       case ARM64CondHI:  return "hi";
   1692       case ARM64CondLS:  return "ls";
   1693       case ARM64CondGE:  return "ge";
   1694       case ARM64CondLT:  return "lt";
   1695       case ARM64CondGT:  return "gt";
   1696       case ARM64CondLE:  return "le";
   1697       case ARM64CondAL:  return "al";
   1698       case ARM64CondNV:  return "nv";
   1699       default: vpanic("name_ARM64Condcode");
   1700    }
   1701 }
   1702 
   1703 /* and a handy shorthand for it */
   1704 static const HChar* nameCC ( ARM64Condcode cond ) {
   1705    return nameARM64Condcode(cond);
   1706 }
   1707 
   1708 
   1709 /* Build IR to calculate some particular condition from stored
   1710    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
   1711    Ity_I64, suitable for narrowing.  Although the return type is
   1712    Ity_I64, the returned value is either 0 or 1.  'cond' must be
   1713    :: Ity_I64 and must denote the condition to compute in
   1714    bits 7:4, and be zero everywhere else.
   1715 */
   1716 static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
   1717 {
   1718    vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
   1719    /* And 'cond' had better produce a value in which only bits 7:4 are
   1720       nonzero.  However, obviously we can't assert for that. */
   1721 
   1722    /* So what we're constructing for the first argument is
   1723       "(cond << 4) | stored-operation".
   1724       However, as per comments above, 'cond' must be supplied
   1725       pre-shifted to this function.
   1726 
   1727       This pairing scheme requires that the ARM64_CC_OP_ values all fit
   1728       in 4 bits.  Hence we are passing a (COND, OP) pair in the lowest
   1729       8 bits of the first argument. */
   1730    IRExpr** args
   1731       = mkIRExprVec_4(
   1732            binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
   1733            IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1734            IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1735            IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
   1736         );
   1737    IRExpr* call
   1738       = mkIRExprCCall(
   1739            Ity_I64,
   1740            0/*regparm*/,
   1741            "arm64g_calculate_condition", &arm64g_calculate_condition,
   1742            args
   1743         );
   1744 
   1745    /* Exclude the requested condition, OP and NDEP from definedness
   1746       checking.  We're only interested in DEP1 and DEP2. */
   1747    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1748    return call;
   1749 }
   1750 
   1751 
   1752 /* Build IR to calculate some particular condition from stored
   1753    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
   1754    Ity_I64, suitable for narrowing.  Although the return type is
   1755    Ity_I64, the returned value is either 0 or 1.
   1756 */
   1757 static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
   1758 {
   1759   /* First arg is "(cond << 4) | condition".  This requires that the
   1760      ARM64_CC_OP_ values all fit in 4 bits.  Hence we are passing a
   1761      (COND, OP) pair in the lowest 8 bits of the first argument. */
   1762    vassert(cond >= 0 && cond <= 15);
   1763    return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
   1764 }
   1765 
   1766 
   1767 /* Build IR to calculate just the carry flag from stored
   1768    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1769    Ity_I64. */
   1770 static IRExpr* mk_arm64g_calculate_flag_c ( void )
   1771 {
   1772    IRExpr** args
   1773       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1774                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1775                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1776                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1777    IRExpr* call
   1778       = mkIRExprCCall(
   1779            Ity_I64,
   1780            0/*regparm*/,
   1781            "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
   1782            args
   1783         );
   1784    /* Exclude OP and NDEP from definedness checking.  We're only
   1785       interested in DEP1 and DEP2. */
   1786    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1787    return call;
   1788 }
   1789 
   1790 
   1791 //ZZ /* Build IR to calculate just the overflow flag from stored
   1792 //ZZ    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1793 //ZZ    Ity_I32. */
   1794 //ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
   1795 //ZZ {
   1796 //ZZ    IRExpr** args
   1797 //ZZ       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
   1798 //ZZ                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
   1799 //ZZ                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
   1800 //ZZ                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
   1801 //ZZ    IRExpr* call
   1802 //ZZ       = mkIRExprCCall(
   1803 //ZZ            Ity_I32,
   1804 //ZZ            0/*regparm*/,
   1805 //ZZ            "armg_calculate_flag_v", &armg_calculate_flag_v,
   1806 //ZZ            args
   1807 //ZZ         );
   1808 //ZZ    /* Exclude OP and NDEP from definedness checking.  We're only
   1809 //ZZ       interested in DEP1 and DEP2. */
   1810 //ZZ    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1811 //ZZ    return call;
   1812 //ZZ }
   1813 
   1814 
   1815 /* Build IR to calculate N Z C V in bits 31:28 of the
   1816    returned word. */
   1817 static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
   1818 {
   1819    IRExpr** args
   1820       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1821                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1822                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1823                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1824    IRExpr* call
   1825       = mkIRExprCCall(
   1826            Ity_I64,
   1827            0/*regparm*/,
   1828            "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
   1829            args
   1830         );
   1831    /* Exclude OP and NDEP from definedness checking.  We're only
   1832       interested in DEP1 and DEP2. */
   1833    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1834    return call;
   1835 }
   1836 
   1837 
   1838 /* Build IR to set the flags thunk, in the most general case. */
   1839 static
   1840 void setFlags_D1_D2_ND ( UInt cc_op,
   1841                          IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
   1842 {
   1843    vassert(typeOfIRTemp(irsb->tyenv, t_dep1 == Ity_I64));
   1844    vassert(typeOfIRTemp(irsb->tyenv, t_dep2 == Ity_I64));
   1845    vassert(typeOfIRTemp(irsb->tyenv, t_ndep == Ity_I64));
   1846    vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
   1847    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(cc_op) ));
   1848    stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
   1849    stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
   1850    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
   1851 }
   1852 
   1853 /* Build IR to set the flags thunk after ADD or SUB. */
   1854 static
   1855 void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
   1856 {
   1857    IRTemp argL64 = IRTemp_INVALID;
   1858    IRTemp argR64 = IRTemp_INVALID;
   1859    IRTemp z64    = newTemp(Ity_I64);
   1860    if (is64) {
   1861       argL64 = argL;
   1862       argR64 = argR;
   1863    } else {
   1864       argL64 = newTemp(Ity_I64);
   1865       argR64 = newTemp(Ity_I64);
   1866       assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
   1867       assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
   1868    }
   1869    assign(z64, mkU64(0));
   1870    UInt cc_op = ARM64G_CC_OP_NUMBER;
   1871    /**/ if ( isSUB &&  is64) { cc_op = ARM64G_CC_OP_SUB64; }
   1872    else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
   1873    else if (!isSUB &&  is64) { cc_op = ARM64G_CC_OP_ADD64; }
   1874    else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
   1875    else                      { vassert(0); }
   1876    setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
   1877 }
   1878 
   1879 /* Build IR to set the flags thunk after ADC or SBC. */
   1880 static
   1881 void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
   1882                         IRTemp argL, IRTemp argR, IRTemp oldC )
   1883 {
   1884    IRTemp argL64 = IRTemp_INVALID;
   1885    IRTemp argR64 = IRTemp_INVALID;
   1886    IRTemp oldC64 = IRTemp_INVALID;
   1887    if (is64) {
   1888       argL64 = argL;
   1889       argR64 = argR;
   1890       oldC64 = oldC;
   1891    } else {
   1892       argL64 = newTemp(Ity_I64);
   1893       argR64 = newTemp(Ity_I64);
   1894       oldC64 = newTemp(Ity_I64);
   1895       assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
   1896       assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
   1897       assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
   1898    }
   1899    UInt cc_op = ARM64G_CC_OP_NUMBER;
   1900    /**/ if ( isSBC &&  is64) { cc_op = ARM64G_CC_OP_SBC64; }
   1901    else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
   1902    else if (!isSBC &&  is64) { cc_op = ARM64G_CC_OP_ADC64; }
   1903    else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
   1904    else                      { vassert(0); }
   1905    setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
   1906 }
   1907 
   1908 /* Build IR to set the flags thunk after ADD or SUB, if the given
   1909    condition evaluates to True at run time.  If not, the flags are set
   1910    to the specified NZCV value. */
   1911 static
   1912 void setFlags_ADD_SUB_conditionally (
   1913         Bool is64, Bool isSUB,
   1914         IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
   1915      )
   1916 {
   1917    /* Generate IR as follows:
   1918         CC_OP   = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
   1919         CC_DEP1 = ITE(cond, argL64, nzcv << 28)
   1920         CC_DEP2 = ITE(cond, argR64, 0)
   1921         CC_NDEP = 0
   1922    */
   1923 
   1924    IRTemp z64 = newTemp(Ity_I64);
   1925    assign(z64, mkU64(0));
   1926 
   1927    /* Establish the operation and operands for the True case. */
   1928    IRTemp t_dep1 = IRTemp_INVALID;
   1929    IRTemp t_dep2 = IRTemp_INVALID;
   1930    UInt   t_op   = ARM64G_CC_OP_NUMBER;
   1931    /**/ if ( isSUB &&  is64) { t_op = ARM64G_CC_OP_SUB64; }
   1932    else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
   1933    else if (!isSUB &&  is64) { t_op = ARM64G_CC_OP_ADD64; }
   1934    else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
   1935    else                      { vassert(0); }
   1936    /* */
   1937    if (is64) {
   1938       t_dep1 = argL;
   1939       t_dep2 = argR;
   1940    } else {
   1941       t_dep1 = newTemp(Ity_I64);
   1942       t_dep2 = newTemp(Ity_I64);
   1943       assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
   1944       assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
   1945    }
   1946 
   1947    /* Establish the operation and operands for the False case. */
   1948    IRTemp f_dep1 = newTemp(Ity_I64);
   1949    IRTemp f_dep2 = z64;
   1950    UInt   f_op   = ARM64G_CC_OP_COPY;
   1951    assign(f_dep1, mkU64(nzcv << 28));
   1952 
   1953    /* Final thunk values */
   1954    IRTemp dep1 = newTemp(Ity_I64);
   1955    IRTemp dep2 = newTemp(Ity_I64);
   1956    IRTemp op   = newTemp(Ity_I64);
   1957 
   1958    assign(op,   IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
   1959    assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
   1960    assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));
   1961 
   1962    /* finally .. */
   1963    stmt( IRStmt_Put( OFFB_CC_OP,   mkexpr(op) ));
   1964    stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
   1965    stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
   1966    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) ));
   1967 }
   1968 
   1969 /* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
   1970 static
   1971 void setFlags_LOGIC ( Bool is64, IRTemp res )
   1972 {
   1973    IRTemp res64 = IRTemp_INVALID;
   1974    IRTemp z64   = newTemp(Ity_I64);
   1975    UInt   cc_op = ARM64G_CC_OP_NUMBER;
   1976    if (is64) {
   1977       res64 = res;
   1978       cc_op = ARM64G_CC_OP_LOGIC64;
   1979    } else {
   1980       res64 = newTemp(Ity_I64);
   1981       assign(res64, unop(Iop_32Uto64, mkexpr(res)));
   1982       cc_op = ARM64G_CC_OP_LOGIC32;
   1983    }
   1984    assign(z64, mkU64(0));
   1985    setFlags_D1_D2_ND(cc_op, res64, z64, z64);
   1986 }
   1987 
   1988 /* Build IR to set the flags thunk to a given NZCV value.  NZCV is
   1989    located in bits 31:28 of the supplied value. */
   1990 static
   1991 void setFlags_COPY ( IRTemp nzcv_28x0 )
   1992 {
   1993    IRTemp z64 = newTemp(Ity_I64);
   1994    assign(z64, mkU64(0));
   1995    setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
   1996 }
   1997 
   1998 
   1999 //ZZ /* Minor variant of the above that sets NDEP to zero (if it
   2000 //ZZ    sets it at all) */
   2001 //ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
   2002 //ZZ                              IRTemp t_dep2,
   2003 //ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
   2004 //ZZ {
   2005 //ZZ    IRTemp z32 = newTemp(Ity_I32);
   2006 //ZZ    assign( z32, mkU32(0) );
   2007 //ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
   2008 //ZZ }
   2009 //ZZ
   2010 //ZZ
   2011 //ZZ /* Minor variant of the above that sets DEP2 to zero (if it
   2012 //ZZ    sets it at all) */
   2013 //ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
   2014 //ZZ                              IRTemp t_ndep,
   2015 //ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
   2016 //ZZ {
   2017 //ZZ    IRTemp z32 = newTemp(Ity_I32);
   2018 //ZZ    assign( z32, mkU32(0) );
   2019 //ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
   2020 //ZZ }
   2021 //ZZ
   2022 //ZZ
   2023 //ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
   2024 //ZZ    sets them at all) */
   2025 //ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
   2026 //ZZ                           IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
   2027 //ZZ {
   2028 //ZZ    IRTemp z32 = newTemp(Ity_I32);
   2029 //ZZ    assign( z32, mkU32(0) );
   2030 //ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
   2031 //ZZ }
   2032 
   2033 
   2034 /*------------------------------------------------------------*/
   2035 /*--- Misc math helpers                                    ---*/
   2036 /*------------------------------------------------------------*/
   2037 
   2038 /* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
   2039 static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
   2040 {
   2041    IRTemp maskT = newTemp(Ity_I64);
   2042    IRTemp res   = newTemp(Ity_I64);
   2043    vassert(sh >= 1 && sh <= 63);
   2044    assign(maskT, mkU64(mask));
   2045    assign( res,
   2046            binop(Iop_Or64,
   2047                  binop(Iop_Shr64,
   2048                        binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
   2049                        mkU8(sh)),
   2050                  binop(Iop_And64,
   2051                        binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
   2052                        mkexpr(maskT))
   2053                  )
   2054            );
   2055    return res;
   2056 }
   2057 
   2058 /* Generates byte swaps within 32-bit lanes. */
   2059 static IRTemp math_UINTSWAP64 ( IRTemp src )
   2060 {
   2061    IRTemp res;
   2062    res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   2063    res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
   2064    return res;
   2065 }
   2066 
   2067 /* Generates byte swaps within 16-bit lanes. */
   2068 static IRTemp math_USHORTSWAP64 ( IRTemp src )
   2069 {
   2070    IRTemp res;
   2071    res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   2072    return res;
   2073 }
   2074 
   2075 /* Generates a 64-bit byte swap. */
   2076 static IRTemp math_BYTESWAP64 ( IRTemp src )
   2077 {
   2078    IRTemp res;
   2079    res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   2080    res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
   2081    res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
   2082    return res;
   2083 }
   2084 
   2085 /* Generates a 64-bit bit swap. */
   2086 static IRTemp math_BITSWAP64 ( IRTemp src )
   2087 {
   2088    IRTemp res;
   2089    res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
   2090    res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
   2091    res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
   2092    return math_BYTESWAP64(res);
   2093 }
   2094 
   2095 /* Duplicates the bits at the bottom of the given word to fill the
   2096    whole word.  src :: Ity_I64 is assumed to have zeroes everywhere
   2097    except for the bottom bits. */
   2098 static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
   2099 {
   2100    if (srcTy == Ity_I8) {
   2101       IRTemp t16 = newTemp(Ity_I64);
   2102       assign(t16, binop(Iop_Or64, mkexpr(src),
   2103                                   binop(Iop_Shl64, mkexpr(src), mkU8(8))));
   2104       IRTemp t32 = newTemp(Ity_I64);
   2105       assign(t32, binop(Iop_Or64, mkexpr(t16),
   2106                                   binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
   2107       IRTemp t64 = newTemp(Ity_I64);
   2108       assign(t64, binop(Iop_Or64, mkexpr(t32),
   2109                                   binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
   2110       return t64;
   2111    }
   2112    if (srcTy == Ity_I16) {
   2113       IRTemp t32 = newTemp(Ity_I64);
   2114       assign(t32, binop(Iop_Or64, mkexpr(src),
   2115                                   binop(Iop_Shl64, mkexpr(src), mkU8(16))));
   2116       IRTemp t64 = newTemp(Ity_I64);
   2117       assign(t64, binop(Iop_Or64, mkexpr(t32),
   2118                                   binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
   2119       return t64;
   2120    }
   2121    if (srcTy == Ity_I32) {
   2122       IRTemp t64 = newTemp(Ity_I64);
   2123       assign(t64, binop(Iop_Or64, mkexpr(src),
   2124                                   binop(Iop_Shl64, mkexpr(src), mkU8(32))));
   2125       return t64;
   2126    }
   2127    if (srcTy == Ity_I64) {
   2128       return src;
   2129    }
   2130    vassert(0);
   2131 }
   2132 
   2133 
   2134 /* Duplicates the src element exactly so as to fill a V128 value. */
   2135 static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
   2136 {
   2137    IRTemp res = newTempV128();
   2138    if (srcTy == Ity_F64) {
   2139       IRTemp i64 = newTemp(Ity_I64);
   2140       assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
   2141       assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
   2142       return res;
   2143    }
   2144    if (srcTy == Ity_F32) {
   2145       IRTemp i64a = newTemp(Ity_I64);
   2146       assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
   2147       IRTemp i64b = newTemp(Ity_I64);
   2148       assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
   2149                                    mkexpr(i64a)));
   2150       assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
   2151       return res;
   2152    }
   2153    if (srcTy == Ity_I64) {
   2154       assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
   2155       return res;
   2156    }
   2157    if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
   2158       IRTemp t1 = newTemp(Ity_I64);
   2159       assign(t1, widenUto64(srcTy, mkexpr(src)));
   2160       IRTemp t2 = math_DUP_TO_64(t1, srcTy);
   2161       assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
   2162       return res;
   2163    }
   2164    vassert(0);
   2165 }
   2166 
   2167 
   2168 /* |fullWidth| is a full V128 width result.  Depending on bitQ,
   2169    zero out the upper half. */
   2170 static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
   2171 {
   2172    if (bitQ == 1) return mkexpr(fullWidth);
   2173    if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
   2174    vassert(0);
   2175 }
   2176 
   2177 /* The same, but from an expression instead. */
   2178 static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
   2179 {
   2180    IRTemp fullWidthT = newTempV128();
   2181    assign(fullWidthT, fullWidth);
   2182    return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
   2183 }
   2184 
   2185 
   2186 /*------------------------------------------------------------*/
   2187 /*--- FP comparison helpers                                ---*/
   2188 /*------------------------------------------------------------*/
   2189 
   2190 /* irRes :: Ity_I32 holds a floating point comparison result encoded
   2191    as an IRCmpF64Result.  Generate code to convert it to an
   2192    ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
   2193    Assign a new temp to hold that value, and return the temp. */
   2194 static
   2195 IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
   2196 {
   2197    IRTemp ix       = newTemp(Ity_I64);
   2198    IRTemp termL    = newTemp(Ity_I64);
   2199    IRTemp termR    = newTemp(Ity_I64);
   2200    IRTemp nzcv     = newTemp(Ity_I64);
   2201    IRTemp irRes    = newTemp(Ity_I64);
   2202 
   2203    /* This is where the fun starts.  We have to convert 'irRes' from
   2204       an IR-convention return result (IRCmpF64Result) to an
   2205       ARM-encoded (N,Z,C,V) group.  The final result is in the bottom
   2206       4 bits of 'nzcv'. */
   2207    /* Map compare result from IR to ARM(nzcv) */
   2208    /*
   2209       FP cmp result | IR   | ARM(nzcv)
   2210       --------------------------------
   2211       UN              0x45   0011
   2212       LT              0x01   1000
   2213       GT              0x00   0010
   2214       EQ              0x40   0110
   2215    */
   2216    /* Now since you're probably wondering WTF ..
   2217 
   2218       ix fishes the useful bits out of the IR value, bits 6 and 0, and
   2219       places them side by side, giving a number which is 0, 1, 2 or 3.
   2220 
   2221       termL is a sequence cooked up by GNU superopt.  It converts ix
   2222          into an almost correct value NZCV value (incredibly), except
   2223          for the case of UN, where it produces 0100 instead of the
   2224          required 0011.
   2225 
   2226       termR is therefore a correction term, also computed from ix.  It
   2227          is 1 in the UN case and 0 for LT, GT and UN.  Hence, to get
   2228          the final correct value, we subtract termR from termL.
   2229 
   2230       Don't take my word for it.  There's a test program at the bottom
   2231       of guest_arm_toIR.c, to try this out with.
   2232    */
   2233    assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));
   2234 
   2235    assign(
   2236       ix,
   2237       binop(Iop_Or64,
   2238             binop(Iop_And64,
   2239                   binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
   2240                   mkU64(3)),
   2241             binop(Iop_And64, mkexpr(irRes), mkU64(1))));
   2242 
   2243    assign(
   2244       termL,
   2245       binop(Iop_Add64,
   2246             binop(Iop_Shr64,
   2247                   binop(Iop_Sub64,
   2248                         binop(Iop_Shl64,
   2249                               binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
   2250                               mkU8(62)),
   2251                         mkU64(1)),
   2252                   mkU8(61)),
   2253             mkU64(1)));
   2254 
   2255    assign(
   2256       termR,
   2257       binop(Iop_And64,
   2258             binop(Iop_And64,
   2259                   mkexpr(ix),
   2260                   binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
   2261             mkU64(1)));
   2262 
   2263    assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
   2264    return nzcv;
   2265 }
   2266 
   2267 
   2268 /*------------------------------------------------------------*/
   2269 /*--- Data processing (immediate)                          ---*/
   2270 /*------------------------------------------------------------*/
   2271 
   2272 /* Helper functions for supporting "DecodeBitMasks" */
   2273 
   2274 static ULong dbm_ROR ( Int width, ULong x, Int rot )
   2275 {
   2276    vassert(width > 0 && width <= 64);
   2277    vassert(rot >= 0 && rot < width);
   2278    if (rot == 0) return x;
   2279    ULong res = x >> rot;
   2280    res |= (x << (width - rot));
   2281    if (width < 64)
   2282      res &= ((1ULL << width) - 1);
   2283    return res;
   2284 }
   2285 
   2286 static ULong dbm_RepTo64( Int esize, ULong x )
   2287 {
   2288    switch (esize) {
   2289       case 64:
   2290          return x;
   2291       case 32:
   2292          x &= 0xFFFFFFFF; x |= (x << 32);
   2293          return x;
   2294       case 16:
   2295          x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
   2296          return x;
   2297       case 8:
   2298          x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
   2299          return x;
   2300       case 4:
   2301          x &= 0xF; x |= (x << 4); x |= (x << 8);
   2302          x |= (x << 16); x |= (x << 32);
   2303          return x;
   2304       case 2:
   2305          x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
   2306          x |= (x << 16); x |= (x << 32);
   2307          return x;
   2308       default:
   2309          break;
   2310    }
   2311    vpanic("dbm_RepTo64");
   2312    /*NOTREACHED*/
   2313    return 0;
   2314 }
   2315 
   2316 static Int dbm_highestSetBit ( ULong x )
   2317 {
   2318    Int i;
   2319    for (i = 63; i >= 0; i--) {
   2320       if (x & (1ULL << i))
   2321          return i;
   2322    }
   2323    vassert(x == 0);
   2324    return -1;
   2325 }
   2326 
   2327 static
   2328 Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
   2329                           ULong immN, ULong imms, ULong immr, Bool immediate,
   2330                           UInt M /*32 or 64*/)
   2331 {
   2332    vassert(immN < (1ULL << 1));
   2333    vassert(imms < (1ULL << 6));
   2334    vassert(immr < (1ULL << 6));
   2335    vassert(immediate == False || immediate == True);
   2336    vassert(M == 32 || M == 64);
   2337 
   2338    Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
   2339    if (len < 1) { /* printf("fail1\n"); */ return False; }
   2340    vassert(len <= 6);
   2341    vassert(M >= (1 << len));
   2342 
   2343    vassert(len >= 1 && len <= 6);
   2344    ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
   2345                   (1 << len) - 1;
   2346    vassert(levels >= 1 && levels <= 63);
   2347 
   2348    if (immediate && ((imms & levels) == levels)) {
   2349       /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
   2350       return False;
   2351    }
   2352 
   2353    ULong S = imms & levels;
   2354    ULong R = immr & levels;
   2355    Int   diff = S - R;
   2356    diff &= 63;
   2357    Int esize = 1 << len;
   2358    vassert(2 <= esize && esize <= 64);
   2359 
   2360    /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
   2361       same below with d.  S can be 63 in which case we have an out of
   2362       range and hence undefined shift. */
   2363    vassert(S >= 0 && S <= 63);
   2364    vassert(esize >= (S+1));
   2365    ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
   2366                   //(1ULL << (S+1)) - 1;
   2367                   ((1ULL << S) - 1) + (1ULL << S);
   2368 
   2369    Int d = // diff<len-1:0>
   2370            diff & ((1 << len)-1);
   2371    vassert(esize >= (d+1));
   2372    vassert(d >= 0 && d <= 63);
   2373 
   2374    ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
   2375                   //(1ULL << (d+1)) - 1;
   2376                   ((1ULL << d) - 1) + (1ULL << d);
   2377 
   2378    if (esize != 64) vassert(elem_s < (1ULL << esize));
   2379    if (esize != 64) vassert(elem_d < (1ULL << esize));
   2380 
   2381    if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
   2382    if (tmask) *tmask = dbm_RepTo64(esize, elem_d);
   2383 
   2384    return True;
   2385 }
   2386 
   2387 
   2388 static
   2389 Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
   2390                                          UInt insn)
   2391 {
   2392 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   2393 
   2394    /* insn[28:23]
   2395       10000x PC-rel addressing
   2396       10001x Add/subtract (immediate)
   2397       100100 Logical (immediate)
   2398       100101 Move Wide (immediate)
   2399       100110 Bitfield
   2400       100111 Extract
   2401    */
   2402 
   2403    /* ------------------ ADD/SUB{,S} imm12 ------------------ */
   2404    if (INSN(28,24) == BITS5(1,0,0,0,1)) {
   2405       Bool is64   = INSN(31,31) == 1;
   2406       Bool isSub  = INSN(30,30) == 1;
   2407       Bool setCC  = INSN(29,29) == 1;
   2408       UInt sh     = INSN(23,22);
   2409       UInt uimm12 = INSN(21,10);
   2410       UInt nn     = INSN(9,5);
   2411       UInt dd     = INSN(4,0);
   2412       const HChar* nm = isSub ? "sub" : "add";
   2413       if (sh >= 2) {
   2414          /* Invalid; fall through */
   2415       } else {
   2416          vassert(sh <= 1);
   2417          uimm12 <<= (12 * sh);
   2418          if (is64) {
   2419             IRTemp argL  = newTemp(Ity_I64);
   2420             IRTemp argR  = newTemp(Ity_I64);
   2421             IRTemp res   = newTemp(Ity_I64);
   2422             assign(argL, getIReg64orSP(nn));
   2423             assign(argR, mkU64(uimm12));
   2424             assign(res,  binop(isSub ? Iop_Sub64 : Iop_Add64,
   2425                                mkexpr(argL), mkexpr(argR)));
   2426             if (setCC) {
   2427                putIReg64orZR(dd, mkexpr(res));
   2428                setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
   2429                DIP("%ss %s, %s, 0x%x\n",
   2430                    nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
   2431             } else {
   2432                putIReg64orSP(dd, mkexpr(res));
   2433                DIP("%s %s, %s, 0x%x\n",
   2434                    nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
   2435             }
   2436          } else {
   2437             IRTemp argL  = newTemp(Ity_I32);
   2438             IRTemp argR  = newTemp(Ity_I32);
   2439             IRTemp res   = newTemp(Ity_I32);
   2440             assign(argL, getIReg32orSP(nn));
   2441             assign(argR, mkU32(uimm12));
   2442             assign(res,  binop(isSub ? Iop_Sub32 : Iop_Add32,
   2443                                mkexpr(argL), mkexpr(argR)));
   2444             if (setCC) {
   2445                putIReg32orZR(dd, mkexpr(res));
   2446                setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
   2447                DIP("%ss %s, %s, 0x%x\n",
   2448                    nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
   2449             } else {
   2450                putIReg32orSP(dd, mkexpr(res));
   2451                DIP("%s %s, %s, 0x%x\n",
   2452                    nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
   2453             }
   2454          }
   2455          return True;
   2456       }
   2457    }
   2458 
   2459    /* -------------------- ADR/ADRP -------------------- */
   2460    if (INSN(28,24) == BITS5(1,0,0,0,0)) {
   2461       UInt  bP    = INSN(31,31);
   2462       UInt  immLo = INSN(30,29);
   2463       UInt  immHi = INSN(23,5);
   2464       UInt  rD    = INSN(4,0);
   2465       ULong uimm  = (immHi << 2) | immLo;
   2466       ULong simm  = sx_to_64(uimm, 21);
   2467       ULong val;
   2468       if (bP) {
   2469          val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
   2470       } else {
   2471          val = guest_PC_curr_instr + simm;
   2472       }
   2473       putIReg64orZR(rD, mkU64(val));
   2474       DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
   2475       return True;
   2476    }
   2477 
   2478    /* -------------------- LOGIC(imm) -------------------- */
   2479    if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
   2480       /* 31 30 28     22 21   15   9  4
   2481          sf op 100100 N  immr imms Rn Rd
   2482            op=00: AND  Rd|SP, Rn, #imm
   2483            op=01: ORR  Rd|SP, Rn, #imm
   2484            op=10: EOR  Rd|SP, Rn, #imm
   2485            op=11: ANDS Rd|ZR, Rn, #imm
   2486       */
   2487       Bool  is64 = INSN(31,31) == 1;
   2488       UInt  op   = INSN(30,29);
   2489       UInt  N    = INSN(22,22);
   2490       UInt  immR = INSN(21,16);
   2491       UInt  immS = INSN(15,10);
   2492       UInt  nn   = INSN(9,5);
   2493       UInt  dd   = INSN(4,0);
   2494       ULong imm  = 0;
   2495       Bool  ok;
   2496       if (N == 1 && !is64)
   2497          goto after_logic_imm; /* not allowed; fall through */
   2498       ok = dbm_DecodeBitMasks(&imm, NULL,
   2499                               N, immS, immR, True, is64 ? 64 : 32);
   2500       if (!ok)
   2501          goto after_logic_imm;
   2502 
   2503       const HChar* names[4] = { "and", "orr", "eor", "ands" };
   2504       const IROp   ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
   2505       const IROp   ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };
   2506 
   2507       vassert(op < 4);
   2508       if (is64) {
   2509          IRExpr* argL = getIReg64orZR(nn);
   2510          IRExpr* argR = mkU64(imm);
   2511          IRTemp  res  = newTemp(Ity_I64);
   2512          assign(res, binop(ops64[op], argL, argR));
   2513          if (op < 3) {
   2514             putIReg64orSP(dd, mkexpr(res));
   2515             DIP("%s %s, %s, 0x%llx\n", names[op],
   2516                 nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
   2517          } else {
   2518             putIReg64orZR(dd, mkexpr(res));
   2519             setFlags_LOGIC(True/*is64*/, res);
   2520             DIP("%s %s, %s, 0x%llx\n", names[op],
   2521                 nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
   2522          }
   2523       } else {
   2524          IRExpr* argL = getIReg32orZR(nn);
   2525          IRExpr* argR = mkU32((UInt)imm);
   2526          IRTemp  res  = newTemp(Ity_I32);
   2527          assign(res, binop(ops32[op], argL, argR));
   2528          if (op < 3) {
   2529             putIReg32orSP(dd, mkexpr(res));
   2530             DIP("%s %s, %s, 0x%x\n", names[op],
   2531                 nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
   2532          } else {
   2533             putIReg32orZR(dd, mkexpr(res));
   2534             setFlags_LOGIC(False/*!is64*/, res);
   2535             DIP("%s %s, %s, 0x%x\n", names[op],
   2536                 nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
   2537          }
   2538       }
   2539       return True;
   2540    }
   2541    after_logic_imm:
   2542 
   2543    /* -------------------- MOV{Z,N,K} -------------------- */
   2544    if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
   2545       /* 31 30 28      22 20    4
   2546          |  |  |       |  |     |
   2547          sf 10 100 101 hw imm16 Rd   MOV(Z) Rd, (imm16 << (16*hw))
   2548          sf 00 100 101 hw imm16 Rd   MOV(N) Rd, ~(imm16 << (16*hw))
   2549          sf 11 100 101 hw imm16 Rd   MOV(K) Rd, (imm16 << (16*hw))
   2550       */
   2551       Bool is64   = INSN(31,31) == 1;
   2552       UInt subopc = INSN(30,29);
   2553       UInt hw     = INSN(22,21);
   2554       UInt imm16  = INSN(20,5);
   2555       UInt dd     = INSN(4,0);
   2556       if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
   2557          /* invalid; fall through */
   2558       } else {
   2559          ULong imm64 = ((ULong)imm16) << (16 * hw);
   2560          if (!is64)
   2561             vassert(imm64 < 0x100000000ULL);
   2562          switch (subopc) {
   2563             case BITS2(1,0): // MOVZ
   2564                putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
   2565                DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
   2566                break;
   2567             case BITS2(0,0): // MOVN
   2568                imm64 = ~imm64;
   2569                if (!is64)
   2570                   imm64 &= 0xFFFFFFFFULL;
   2571                putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
   2572                DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
   2573                break;
   2574             case BITS2(1,1): // MOVK
   2575                /* This is more complex.  We are inserting a slice into
   2576                   the destination register, so we need to have the old
   2577                   value of it. */
   2578                if (is64) {
   2579                   IRTemp old = newTemp(Ity_I64);
   2580                   assign(old, getIReg64orZR(dd));
   2581                   ULong mask = 0xFFFFULL << (16 * hw);
   2582                   IRExpr* res
   2583                      = binop(Iop_Or64,
   2584                              binop(Iop_And64, mkexpr(old), mkU64(~mask)),
   2585                              mkU64(imm64));
   2586                   putIReg64orZR(dd, res);
   2587                   DIP("movk %s, 0x%x, lsl %u\n",
   2588                       nameIReg64orZR(dd), imm16, 16*hw);
   2589                } else {
   2590                   IRTemp old = newTemp(Ity_I32);
   2591                   assign(old, getIReg32orZR(dd));
   2592                   vassert(hw <= 1);
   2593                   UInt mask = 0xFFFF << (16 * hw);
   2594                   IRExpr* res
   2595                      = binop(Iop_Or32,
   2596                              binop(Iop_And32, mkexpr(old), mkU32(~mask)),
   2597                              mkU32((UInt)imm64));
   2598                   putIReg32orZR(dd, res);
   2599                   DIP("movk %s, 0x%x, lsl %u\n",
   2600                       nameIReg32orZR(dd), imm16, 16*hw);
   2601                }
   2602                break;
   2603             default:
   2604                vassert(0);
   2605          }
   2606          return True;
   2607       }
   2608    }
   2609 
   2610    /* -------------------- {U,S,}BFM -------------------- */
   2611    /*    30 28     22 21   15   9  4
   2612 
   2613       sf 10 100110 N  immr imms nn dd
   2614          UBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
   2615          UBFM Xd, Xn, #immr, #imms   when sf=1, N=1
   2616 
   2617       sf 00 100110 N  immr imms nn dd
   2618          SBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
   2619          SBFM Xd, Xn, #immr, #imms   when sf=1, N=1
   2620 
   2621       sf 01 100110 N  immr imms nn dd
   2622          BFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
   2623          BFM Xd, Xn, #immr, #imms   when sf=1, N=1
   2624    */
   2625    if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
   2626       UInt sf     = INSN(31,31);
   2627       UInt opc    = INSN(30,29);
   2628       UInt N      = INSN(22,22);
   2629       UInt immR   = INSN(21,16);
   2630       UInt immS   = INSN(15,10);
   2631       UInt nn     = INSN(9,5);
   2632       UInt dd     = INSN(4,0);
   2633       Bool inZero = False;
   2634       Bool extend = False;
   2635       const HChar* nm = "???";
   2636       /* skip invalid combinations */
   2637       switch (opc) {
   2638          case BITS2(0,0):
   2639             inZero = True; extend = True; nm = "sbfm"; break;
   2640          case BITS2(0,1):
   2641             inZero = False; extend = False; nm = "bfm"; break;
   2642          case BITS2(1,0):
   2643             inZero = True; extend = False; nm = "ubfm"; break;
   2644          case BITS2(1,1):
   2645             goto after_bfm; /* invalid */
   2646          default:
   2647             vassert(0);
   2648       }
   2649       if (sf == 1 && N != 1) goto after_bfm;
   2650       if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
   2651                              || ((immS >> 5) & 1) != 0)) goto after_bfm;
   2652       ULong wmask = 0, tmask = 0;
   2653       Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
   2654                                    N, immS, immR, False, sf == 1 ? 64 : 32);
   2655       if (!ok) goto after_bfm; /* hmmm */
   2656 
   2657       Bool   is64 = sf == 1;
   2658       IRType ty   = is64 ? Ity_I64 : Ity_I32;
   2659 
   2660       IRTemp dst = newTemp(ty);
   2661       IRTemp src = newTemp(ty);
   2662       IRTemp bot = newTemp(ty);
   2663       IRTemp top = newTemp(ty);
   2664       IRTemp res = newTemp(ty);
   2665       assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
   2666       assign(src, getIRegOrZR(is64, nn));
   2667       /* perform bitfield move on low bits */
   2668       assign(bot, binop(mkOR(ty),
   2669                         binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
   2670                         binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
   2671                                          mkU(ty, wmask))));
   2672       /* determine extension bits (sign, zero or dest register) */
   2673       assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
   2674       /* combine extension bits and result bits */
   2675       assign(res, binop(mkOR(ty),
   2676                         binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
   2677                         binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
   2678       putIRegOrZR(is64, dd, mkexpr(res));
   2679       DIP("%s %s, %s, immR=%u, immS=%u\n",
   2680           nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
   2681       return True;
   2682    }
   2683    after_bfm:
   2684 
   2685    /* ---------------------- EXTR ---------------------- */
   2686    /*   30 28     22 20 15   9 4
   2687       1 00 100111 10 m  imm6 n d  EXTR Xd, Xn, Xm, #imm6
   2688       0 00 100111 00 m  imm6 n d  EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
   2689    */
   2690    if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
   2691       Bool is64  = INSN(31,31) == 1;
   2692       UInt mm    = INSN(20,16);
   2693       UInt imm6  = INSN(15,10);
   2694       UInt nn    = INSN(9,5);
   2695       UInt dd    = INSN(4,0);
   2696       Bool valid = True;
   2697       if (INSN(31,31) != INSN(22,22))
   2698         valid = False;
   2699       if (!is64 && imm6 >= 32)
   2700         valid = False;
   2701       if (!valid) goto after_extr;
   2702       IRType ty    = is64 ? Ity_I64 : Ity_I32;
   2703       IRTemp srcHi = newTemp(ty);
   2704       IRTemp srcLo = newTemp(ty);
   2705       IRTemp res   = newTemp(ty);
   2706       assign(srcHi, getIRegOrZR(is64, nn));
   2707       assign(srcLo, getIRegOrZR(is64, mm));
   2708       if (imm6 == 0) {
   2709         assign(res, mkexpr(srcLo));
   2710       } else {
   2711         UInt szBits = 8 * sizeofIRType(ty);
   2712         vassert(imm6 > 0 && imm6 < szBits);
   2713         assign(res, binop(mkOR(ty),
   2714                           binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
   2715                           binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
   2716       }
   2717       putIRegOrZR(is64, dd, mkexpr(res));
   2718       DIP("extr %s, %s, %s, #%u\n",
   2719           nameIRegOrZR(is64,dd),
   2720           nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
   2721       return True;
   2722    }
   2723   after_extr:
   2724 
   2725    vex_printf("ARM64 front end: data_processing_immediate\n");
   2726    return False;
   2727 #  undef INSN
   2728 }
   2729 
   2730 
   2731 /*------------------------------------------------------------*/
   2732 /*--- Data processing (register) instructions              ---*/
   2733 /*------------------------------------------------------------*/
   2734 
   2735 static const HChar* nameSH ( UInt sh ) {
   2736    switch (sh) {
   2737       case 0: return "lsl";
   2738       case 1: return "lsr";
   2739       case 2: return "asr";
   2740       case 3: return "ror";
   2741       default: vassert(0);
   2742    }
   2743 }
   2744 
   2745 /* Generate IR to get a register value, possibly shifted by an
   2746    immediate.  Returns either a 32- or 64-bit temporary holding the
   2747    result.  After the shift, the value can optionally be NOT-ed
   2748    too.
   2749 
   2750    sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR.  sh_amt may only be
   2751    in the range 0 to (is64 ? 64 : 32)-1.  For some instructions, ROR
   2752    isn't allowed, but it's the job of the caller to check that.
   2753 */
   2754 static IRTemp getShiftedIRegOrZR ( Bool is64,
   2755                                    UInt sh_how, UInt sh_amt, UInt regNo,
   2756                                    Bool invert )
   2757 {
   2758    vassert(sh_how < 4);
   2759    vassert(sh_amt < (is64 ? 64 : 32));
   2760    IRType ty = is64 ? Ity_I64 : Ity_I32;
   2761    IRTemp t0 = newTemp(ty);
   2762    assign(t0, getIRegOrZR(is64, regNo));
   2763    IRTemp t1 = newTemp(ty);
   2764    switch (sh_how) {
   2765       case BITS2(0,0):
   2766          assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
   2767          break;
   2768       case BITS2(0,1):
   2769          assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
   2770          break;
   2771       case BITS2(1,0):
   2772          assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
   2773          break;
   2774       case BITS2(1,1):
   2775          assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
   2776          break;
   2777       default:
   2778          vassert(0);
   2779    }
   2780    if (invert) {
   2781       IRTemp t2 = newTemp(ty);
   2782       assign(t2, unop(mkNOT(ty), mkexpr(t1)));
   2783       return t2;
   2784    } else {
   2785       return t1;
   2786    }
   2787 }
   2788 
   2789 
   2790 static
   2791 Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
   2792                                         UInt insn)
   2793 {
   2794 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   2795 
   2796    /* ------------------- ADD/SUB(reg) ------------------- */
   2797    /* x==0 => 32 bit op      x==1 => 64 bit op
   2798       sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)
   2799 
   2800       31 30 29 28    23 21 20 15   9  4
   2801       |  |  |  |     |  |  |  |    |  |
   2802       x  0  0  01011 sh 0  Rm imm6 Rn Rd   ADD  Rd,Rn, sh(Rm,imm6)
   2803       x  0  1  01011 sh 0  Rm imm6 Rn Rd   ADDS Rd,Rn, sh(Rm,imm6)
   2804       x  1  0  01011 sh 0  Rm imm6 Rn Rd   SUB  Rd,Rn, sh(Rm,imm6)
   2805       x  1  1  01011 sh 0  Rm imm6 Rn Rd   SUBS Rd,Rn, sh(Rm,imm6)
   2806    */
   2807    if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
   2808       UInt   bX    = INSN(31,31);
   2809       UInt   bOP   = INSN(30,30); /* 0: ADD, 1: SUB */
   2810       UInt   bS    = INSN(29, 29); /* set flags? */
   2811       UInt   sh    = INSN(23,22);
   2812       UInt   rM    = INSN(20,16);
   2813       UInt   imm6  = INSN(15,10);
   2814       UInt   rN    = INSN(9,5);
   2815       UInt   rD    = INSN(4,0);
   2816       Bool   isSUB = bOP == 1;
   2817       Bool   is64  = bX == 1;
   2818       IRType ty    = is64 ? Ity_I64 : Ity_I32;
   2819       if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
   2820          /* invalid; fall through */
   2821       } else {
   2822          IRTemp argL = newTemp(ty);
   2823          assign(argL, getIRegOrZR(is64, rN));
   2824          IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
   2825          IROp   op   = isSUB ? mkSUB(ty) : mkADD(ty);
   2826          IRTemp res  = newTemp(ty);
   2827          assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
   2828          if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
   2829          if (bS) {
   2830             setFlags_ADD_SUB(is64, isSUB, argL, argR);
   2831          }
   2832          DIP("%s%s %s, %s, %s, %s #%u\n",
   2833              bOP ? "sub" : "add", bS ? "s" : "",
   2834              nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
   2835              nameIRegOrZR(is64, rM), nameSH(sh), imm6);
   2836          return True;
   2837       }
   2838    }
   2839 
   2840    /* ------------------- ADC/SBC(reg) ------------------- */
   2841    /* x==0 => 32 bit op      x==1 => 64 bit op
   2842 
   2843       31 30 29 28    23 21 20 15     9  4
   2844       |  |  |  |     |  |  |  |      |  |
   2845       x  0  0  11010 00 0  Rm 000000 Rn Rd   ADC  Rd,Rn,Rm
   2846       x  0  1  11010 00 0  Rm 000000 Rn Rd   ADCS Rd,Rn,Rm
   2847       x  1  0  11010 00 0  Rm 000000 Rn Rd   SBC  Rd,Rn,Rm
   2848       x  1  1  11010 00 0  Rm 000000 Rn Rd   SBCS Rd,Rn,Rm
   2849    */
   2850 
   2851    if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
   2852       UInt   bX    = INSN(31,31);
   2853       UInt   bOP   = INSN(30,30); /* 0: ADC, 1: SBC */
   2854       UInt   bS    = INSN(29,29); /* set flags */
   2855       UInt   rM    = INSN(20,16);
   2856       UInt   rN    = INSN(9,5);
   2857       UInt   rD    = INSN(4,0);
   2858 
   2859       Bool   isSUB = bOP == 1;
   2860       Bool   is64  = bX == 1;
   2861       IRType ty    = is64 ? Ity_I64 : Ity_I32;
   2862 
   2863       IRTemp oldC = newTemp(ty);
   2864       assign(oldC,
   2865              is64 ? mk_arm64g_calculate_flag_c()
   2866                   : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );
   2867 
   2868       IRTemp argL = newTemp(ty);
   2869       assign(argL, getIRegOrZR(is64, rN));
   2870       IRTemp argR = newTemp(ty);
   2871       assign(argR, getIRegOrZR(is64, rM));
   2872 
   2873       IROp   op   = isSUB ? mkSUB(ty) : mkADD(ty);
   2874       IRTemp res  = newTemp(ty);
   2875       if (isSUB) {
   2876          IRExpr* one = is64 ? mkU64(1) : mkU32(1);
   2877          IROp xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
   2878          assign(res,
   2879                 binop(op,
   2880                       binop(op, mkexpr(argL), mkexpr(argR)),
   2881                       binop(xorOp, mkexpr(oldC), one)));
   2882       } else {
   2883          assign(res,
   2884                 binop(op,
   2885                       binop(op, mkexpr(argL), mkexpr(argR)),
   2886                       mkexpr(oldC)));
   2887       }
   2888 
   2889       if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
   2890 
   2891       if (bS) {
   2892          setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
   2893       }
   2894 
   2895       DIP("%s%s %s, %s, %s\n",
   2896           bOP ? "sbc" : "adc", bS ? "s" : "",
   2897           nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
   2898           nameIRegOrZR(is64, rM));
   2899       return True;
   2900    }
   2901 
   2902    /* -------------------- LOGIC(reg) -------------------- */
   2903    /* x==0 => 32 bit op      x==1 => 64 bit op
   2904       N==0 => inv? is no-op (no inversion)
   2905       N==1 => inv? is NOT
   2906       sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR
   2907 
   2908       31 30 28    23 21 20 15   9  4
   2909       |  |  |     |  |  |  |    |  |
   2910       x  00 01010 sh N  Rm imm6 Rn Rd  AND  Rd,Rn, inv?(sh(Rm,imm6))
   2911       x  01 01010 sh N  Rm imm6 Rn Rd  ORR  Rd,Rn, inv?(sh(Rm,imm6))
   2912       x  10 01010 sh N  Rm imm6 Rn Rd  EOR  Rd,Rn, inv?(sh(Rm,imm6))
   2913       x  11 01010 sh N  Rm imm6 Rn Rd  ANDS Rd,Rn, inv?(sh(Rm,imm6))
   2914       With N=1, the names are: BIC ORN EON BICS
   2915    */
   2916    if (INSN(28,24) == BITS5(0,1,0,1,0)) {
   2917       UInt   bX   = INSN(31,31);
   2918       UInt   sh   = INSN(23,22);
   2919       UInt   bN   = INSN(21,21);
   2920       UInt   rM   = INSN(20,16);
   2921       UInt   imm6 = INSN(15,10);
   2922       UInt   rN   = INSN(9,5);
   2923       UInt   rD   = INSN(4,0);
   2924       Bool   is64 = bX == 1;
   2925       IRType ty   = is64 ? Ity_I64 : Ity_I32;
   2926       if (!is64 && imm6 > 31) {
   2927          /* invalid; fall through */
   2928       } else {
   2929          IRTemp argL = newTemp(ty);
   2930          assign(argL, getIRegOrZR(is64, rN));
   2931          IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
   2932          IROp   op   = Iop_INVALID;
   2933          switch (INSN(30,29)) {
   2934             case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
   2935             case BITS2(0,1):                  op = mkOR(ty);  break;
   2936             case BITS2(1,0):                  op = mkXOR(ty); break;
   2937             default: vassert(0);
   2938          }
   2939          IRTemp res = newTemp(ty);
   2940          assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
   2941          if (INSN(30,29) == BITS2(1,1)) {
   2942             setFlags_LOGIC(is64, res);
   2943          }
   2944          putIRegOrZR(is64, rD, mkexpr(res));
   2945 
   2946          static const HChar* names_op[8]
   2947             = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
   2948          vassert(((bN << 2) | INSN(30,29)) < 8);
   2949          const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
   2950          /* Special-case the printing of "MOV" */
   2951          if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
   2952             DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
   2953                                 nameIRegOrZR(is64, rM));
   2954          } else {
   2955             DIP("%s %s, %s, %s, %s #%u\n", nm_op,
   2956                 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
   2957                 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
   2958          }
   2959          return True;
   2960       }
   2961    }
   2962 
   2963    /* -------------------- {U,S}MULH -------------------- */
   2964    /* 31       23 22 20 15     9   4
   2965       10011011 1  10 Rm 011111 Rn Rd   UMULH Xd,Xn,Xm
   2966       10011011 0  10 Rm 011111 Rn Rd   SMULH Xd,Xn,Xm
   2967    */
   2968    if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
   2969        && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
   2970       Bool isU = INSN(23,23) == 1;
   2971       UInt mm  = INSN(20,16);
   2972       UInt nn  = INSN(9,5);
   2973       UInt dd  = INSN(4,0);
   2974       putIReg64orZR(dd, unop(Iop_128HIto64,
   2975                              binop(isU ? Iop_MullU64 : Iop_MullS64,
   2976                                    getIReg64orZR(nn), getIReg64orZR(mm))));
   2977       DIP("%cmulh %s, %s, %s\n",
   2978           isU ? 'u' : 's',
   2979           nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
   2980       return True;
   2981    }
   2982 
   2983    /* -------------------- M{ADD,SUB} -------------------- */
   2984    /* 31 30           20 15 14 9 4
   2985       sf 00 11011 000 m  0  a  n r   MADD Rd,Rn,Rm,Ra  d = a+m*n
   2986       sf 00 11011 000 m  1  a  n r   MSUB Rd,Rn,Rm,Ra  d = a-m*n
   2987    */
   2988    if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
   2989       Bool is64  = INSN(31,31) == 1;
   2990       UInt mm    = INSN(20,16);
   2991       Bool isAdd = INSN(15,15) == 0;
   2992       UInt aa    = INSN(14,10);
   2993       UInt nn    = INSN(9,5);
   2994       UInt dd    = INSN(4,0);
   2995       if (is64) {
   2996          putIReg64orZR(
   2997             dd,
   2998             binop(isAdd ? Iop_Add64 : Iop_Sub64,
   2999                   getIReg64orZR(aa),
   3000                   binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
   3001       } else {
   3002          putIReg32orZR(
   3003             dd,
   3004             binop(isAdd ? Iop_Add32 : Iop_Sub32,
   3005                   getIReg32orZR(aa),
   3006                   binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
   3007       }
   3008       DIP("%s %s, %s, %s, %s\n",
   3009           isAdd ? "madd" : "msub",
   3010           nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
   3011           nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
   3012       return True;
   3013    }
   3014 
   3015    /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
   3016    /* 31 30 28        20 15   11 9  4
   3017       sf 00 1101 0100 mm cond 00 nn dd   CSEL  Rd,Rn,Rm
   3018       sf 00 1101 0100 mm cond 01 nn dd   CSINC Rd,Rn,Rm
   3019       sf 10 1101 0100 mm cond 00 nn dd   CSINV Rd,Rn,Rm
   3020       sf 10 1101 0100 mm cond 01 nn dd   CSNEG Rd,Rn,Rm
   3021       In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
   3022    */
   3023    if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
   3024       Bool    is64 = INSN(31,31) == 1;
   3025       UInt    b30  = INSN(30,30);
   3026       UInt    mm   = INSN(20,16);
   3027       UInt    cond = INSN(15,12);
   3028       UInt    b10  = INSN(10,10);
   3029       UInt    nn   = INSN(9,5);
   3030       UInt    dd   = INSN(4,0);
   3031       UInt    op   = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
   3032       IRType  ty   = is64 ? Ity_I64 : Ity_I32;
   3033       IRExpr* argL = getIRegOrZR(is64, nn);
   3034       IRExpr* argR = getIRegOrZR(is64, mm);
   3035       switch (op) {
   3036          case BITS2(0,0):
   3037             break;
   3038          case BITS2(0,1):
   3039             argR = binop(mkADD(ty), argR, mkU(ty,1));
   3040             break;
   3041          case BITS2(1,0):
   3042             argR = unop(mkNOT(ty), argR);
   3043             break;
   3044          case BITS2(1,1):
   3045             argR = binop(mkSUB(ty), mkU(ty,0), argR);
   3046             break;
   3047          default:
   3048             vassert(0);
   3049       }
   3050       putIRegOrZR(
   3051          is64, dd,
   3052          IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
   3053                     argL, argR)
   3054       );
   3055       const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
   3056       DIP("%s %s, %s, %s, %s\n", op_nm[op],
   3057           nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
   3058           nameIRegOrZR(is64, mm), nameCC(cond));
   3059       return True;
   3060    }
   3061 
   3062    /* -------------- ADD/SUB(extended reg) -------------- */
   3063    /*     28         20 15  12   9 4
   3064       000 01011 00 1 m  opt imm3 n d   ADD  Wd|SP, Wn|SP, Wm ext&lsld
   3065       100 01011 00 1 m  opt imm3 n d   ADD  Xd|SP, Xn|SP, Rm ext&lsld
   3066 
   3067       001 01011 00 1 m  opt imm3 n d   ADDS Wd,    Wn|SP, Wm ext&lsld
   3068       101 01011 00 1 m  opt imm3 n d   ADDS Xd,    Xn|SP, Rm ext&lsld
   3069 
   3070       010 01011 00 1 m  opt imm3 n d   SUB  Wd|SP, Wn|SP, Wm ext&lsld
   3071       110 01011 00 1 m  opt imm3 n d   SUB  Xd|SP, Xn|SP, Rm ext&lsld
   3072 
   3073       011 01011 00 1 m  opt imm3 n d   SUBS Wd,    Wn|SP, Wm ext&lsld
   3074       111 01011 00 1 m  opt imm3 n d   SUBS Xd,    Xn|SP, Rm ext&lsld
   3075 
   3076       The 'm' operand is extended per opt, thusly:
   3077 
   3078         000   Xm & 0xFF           UXTB
   3079         001   Xm & 0xFFFF         UXTH
   3080         010   Xm & (2^32)-1       UXTW
   3081         011   Xm                  UXTX
   3082 
   3083         100   Xm sx from bit 7    SXTB
   3084         101   Xm sx from bit 15   SXTH
   3085         110   Xm sx from bit 31   SXTW
   3086         111   Xm                  SXTX
   3087 
   3088       In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
   3089       operation on Xm.  In the 32 bit case, UXTW, UXTX, SXTW and SXTX
   3090       are the identity operation on Wm.
   3091 
   3092       After extension, the value is shifted left by imm3 bits, which
   3093       may only be in the range 0 .. 4 inclusive.
   3094    */
   3095    if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
   3096       Bool is64  = INSN(31,31) == 1;
   3097       Bool isSub = INSN(30,30) == 1;
   3098       Bool setCC = INSN(29,29) == 1;
   3099       UInt mm    = INSN(20,16);
   3100       UInt opt   = INSN(15,13);
   3101       UInt imm3  = INSN(12,10);
   3102       UInt nn    = INSN(9,5);
   3103       UInt dd    = INSN(4,0);
   3104       const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
   3105                                   "sxtb", "sxth", "sxtw", "sxtx" };
   3106       /* Do almost the same thing in the 32- and 64-bit cases. */
   3107       IRTemp xN = newTemp(Ity_I64);
   3108       IRTemp xM = newTemp(Ity_I64);
   3109       assign(xN, getIReg64orSP(nn));
   3110       assign(xM, getIReg64orZR(mm));
   3111       IRExpr* xMw  = mkexpr(xM); /* "xM widened" */
   3112       Int     shSX = 0;
   3113       /* widen Xm .. */
   3114       switch (opt) {
   3115          case BITS3(0,0,0): // UXTB
   3116             xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
   3117          case BITS3(0,0,1): // UXTH
   3118             xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
   3119          case BITS3(0,1,0): // UXTW -- noop for the 32bit case
   3120             if (is64) {
   3121                xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
   3122             }
   3123             break;
   3124          case BITS3(0,1,1): // UXTX -- always a noop
   3125             break;
   3126          case BITS3(1,0,0): // SXTB
   3127             shSX = 56; goto sxTo64;
   3128          case BITS3(1,0,1): // SXTH
   3129             shSX = 48; goto sxTo64;
   3130          case BITS3(1,1,0): // SXTW -- noop for the 32bit case
   3131             if (is64) {
   3132                shSX = 32; goto sxTo64;
   3133             }
   3134             break;
   3135          case BITS3(1,1,1): // SXTX -- always a noop
   3136             break;
   3137          sxTo64:
   3138             vassert(shSX >= 32);
   3139             xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
   3140                         mkU8(shSX));
   3141             break;
   3142          default:
   3143             vassert(0);
   3144       }
   3145       /* and now shift */
   3146       IRTemp argL = xN;
   3147       IRTemp argR = newTemp(Ity_I64);
   3148       assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
   3149       IRTemp res = newTemp(Ity_I64);
   3150       assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
   3151                         mkexpr(argL), mkexpr(argR)));
   3152       if (is64) {
   3153          if (setCC) {
   3154             putIReg64orZR(dd, mkexpr(res));
   3155             setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
   3156          } else {
   3157             putIReg64orSP(dd, mkexpr(res));
   3158          }
   3159       } else {
   3160          if (setCC) {
   3161             IRTemp argL32 = newTemp(Ity_I32);
   3162             IRTemp argR32 = newTemp(Ity_I32);
   3163             putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
   3164             assign(argL32, unop(Iop_64to32, mkexpr(argL)));
   3165             assign(argR32, unop(Iop_64to32, mkexpr(argR)));
   3166             setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
   3167          } else {
   3168             putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
   3169          }
   3170       }
   3171       DIP("%s%s %s, %s, %s %s lsl %u\n",
   3172           isSub ? "sub" : "add", setCC ? "s" : "",
   3173           setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
   3174           nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm),
   3175           nameExt[opt], imm3);
   3176       return True;
   3177    }
   3178 
   3179    /* ---------------- CCMP/CCMN(imm) ---------------- */
   3180    /* Bizarrely, these appear in the "data processing register"
   3181       category, even though they are operations against an
   3182       immediate. */
   3183    /* 31   29        20   15   11 9    3
   3184       sf 1 111010010 imm5 cond 10 Rn 0 nzcv   CCMP Rn, #imm5, #nzcv, cond
   3185       sf 0 111010010 imm5 cond 10 Rn 0 nzcv   CCMN Rn, #imm5, #nzcv, cond
   3186 
   3187       Operation is:
   3188          (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
   3189          (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
   3190    */
   3191    if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
   3192        && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
   3193       Bool is64  = INSN(31,31) == 1;
   3194       Bool isSUB = INSN(30,30) == 1;
   3195       UInt imm5  = INSN(20,16);
   3196       UInt cond  = INSN(15,12);
   3197       UInt nn    = INSN(9,5);
   3198       UInt nzcv  = INSN(3,0);
   3199 
   3200       IRTemp condT = newTemp(Ity_I1);
   3201       assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
   3202 
   3203       IRType ty   = is64 ? Ity_I64 : Ity_I32;
   3204       IRTemp argL = newTemp(ty);
   3205       IRTemp argR = newTemp(ty);
   3206 
   3207       if (is64) {
   3208          assign(argL, getIReg64orZR(nn));
   3209          assign(argR, mkU64(imm5));
   3210       } else {
   3211          assign(argL, getIReg32orZR(nn));
   3212          assign(argR, mkU32(imm5));
   3213       }
   3214       setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
   3215 
   3216       DIP("ccm%c %s, #%u, #%u, %s\n",
   3217           isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
   3218           imm5, nzcv, nameCC(cond));
   3219       return True;
   3220    }
   3221 
   3222    /* ---------------- CCMP/CCMN(reg) ---------------- */
   3223    /* 31   29        20 15   11 9    3
   3224       sf 1 111010010 Rm cond 00 Rn 0 nzcv   CCMP Rn, Rm, #nzcv, cond
   3225       sf 0 111010010 Rm cond 00 Rn 0 nzcv   CCMN Rn, Rm, #nzcv, cond
   3226       Operation is:
   3227          (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
   3228          (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
   3229    */
   3230    if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
   3231        && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
   3232       Bool is64  = INSN(31,31) == 1;
   3233       Bool isSUB = INSN(30,30) == 1;
   3234       UInt mm    = INSN(20,16);
   3235       UInt cond  = INSN(15,12);
   3236       UInt nn    = INSN(9,5);
   3237       UInt nzcv  = INSN(3,0);
   3238 
   3239       IRTemp condT = newTemp(Ity_I1);
   3240       assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
   3241 
   3242       IRType ty   = is64 ? Ity_I64 : Ity_I32;
   3243       IRTemp argL = newTemp(ty);
   3244       IRTemp argR = newTemp(ty);
   3245 
   3246       if (is64) {
   3247          assign(argL, getIReg64orZR(nn));
   3248          assign(argR, getIReg64orZR(mm));
   3249       } else {
   3250          assign(argL, getIReg32orZR(nn));
   3251          assign(argR, getIReg32orZR(mm));
   3252       }
   3253       setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
   3254 
   3255       DIP("ccm%c %s, %s, #%u, %s\n",
   3256           isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
   3257           nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
   3258       return True;
   3259    }
   3260 
   3261 
   3262    /* -------------- REV/REV16/REV32/RBIT -------------- */
   3263    /* 31 30 28       20    15   11 9 4
   3264 
   3265       1  10 11010110 00000 0000 11 n d    (1) REV   Xd, Xn
   3266       0  10 11010110 00000 0000 10 n d    (2) REV   Wd, Wn
   3267 
   3268       1  10 11010110 00000 0000 00 n d    (3) RBIT  Xd, Xn
   3269       0  10 11010110 00000 0000 00 n d    (4) RBIT  Wd, Wn
   3270 
   3271       1  10 11010110 00000 0000 01 n d    (5) REV16 Xd, Xn
   3272       0  10 11010110 00000 0000 01 n d    (6) REV16 Wd, Wn
   3273 
   3274       1  10 11010110 00000 0000 10 n d    (7) REV32 Xd, Xn
   3275    */
   3276    if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
   3277        && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
   3278       UInt b31 = INSN(31,31);
   3279       UInt opc = INSN(11,10);
   3280 
   3281       UInt ix = 0;
   3282       /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1;
   3283       else if (b31 == 0 && opc == BITS2(1,0)) ix = 2;
   3284       else if (b31 == 1 && opc == BITS2(0,0)) ix = 3;
   3285       else if (b31 == 0 && opc == BITS2(0,0)) ix = 4;
   3286       else if (b31 == 1 && opc == BITS2(0,1)) ix = 5;
   3287       else if (b31 == 0 && opc == BITS2(0,1)) ix = 6;
   3288       else if (b31 == 1 && opc == BITS2(1,0)) ix = 7;
   3289       if (ix >= 1 && ix <= 7) {
   3290          Bool   is64  = ix == 1 || ix == 3 || ix == 5 || ix == 7;
   3291          UInt   nn    = INSN(9,5);
   3292          UInt   dd    = INSN(4,0);
   3293          IRTemp src   = newTemp(Ity_I64);
   3294          IRTemp dst   = IRTemp_INVALID;
   3295          IRTemp (*math)(IRTemp) = NULL;
   3296          switch (ix) {
   3297             case 1: case 2: math = math_BYTESWAP64;   break;
   3298             case 3: case 4: math = math_BITSWAP64;    break;
   3299             case 5: case 6: math = math_USHORTSWAP64; break;
   3300             case 7:         math = math_UINTSWAP64;   break;
   3301             default: vassert(0);
   3302          }
   3303          const HChar* names[7]
   3304            = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
   3305          const HChar* nm = names[ix-1];
   3306          vassert(math);
   3307          if (ix == 6) {
   3308             /* This has to be special cased, since the logic below doesn't
   3309                handle it correctly. */
   3310             assign(src, getIReg64orZR(nn));
   3311             dst = math(src);
   3312             putIReg64orZR(dd,
   3313                           unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
   3314          } else if (is64) {
   3315             assign(src, getIReg64orZR(nn));
   3316             dst = math(src);
   3317             putIReg64orZR(dd, mkexpr(dst));
   3318          } else {
   3319             assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
   3320             dst = math(src);
   3321             putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
   3322          }
   3323          DIP("%s %s, %s\n", nm,
   3324              nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
   3325          return True;
   3326       }
   3327       /* else fall through */
   3328    }
   3329 
   3330    /* -------------------- CLZ/CLS -------------------- */
   3331    /*    30 28   24   20    15      9 4
   3332       sf 10 1101 0110 00000 00010 0 n d    CLZ Rd, Rn
   3333       sf 10 1101 0110 00000 00010 1 n d    CLS Rd, Rn
   3334    */
   3335    if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
   3336        && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
   3337       Bool   is64  = INSN(31,31) == 1;
   3338       Bool   isCLS = INSN(10,10) == 1;
   3339       UInt   nn    = INSN(9,5);
   3340       UInt   dd    = INSN(4,0);
   3341       IRTemp src   = newTemp(Ity_I64);
   3342       IRTemp srcZ  = newTemp(Ity_I64);
   3343       IRTemp dst   = newTemp(Ity_I64);
   3344       /* Get the argument, widened out to 64 bit */
   3345       if (is64) {
   3346          assign(src, getIReg64orZR(nn));
   3347       } else {
   3348          assign(src, binop(Iop_Shl64,
   3349                            unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
   3350       }
   3351       /* If this is CLS, mash the arg around accordingly */
   3352       if (isCLS) {
   3353          IRExpr* one = mkU8(1);
   3354          assign(srcZ,
   3355          binop(Iop_Xor64,
   3356                binop(Iop_Shl64, mkexpr(src), one),
   3357                binop(Iop_Shl64, binop(Iop_Shr64, mkexpr(src), one), one)));
   3358       } else {
   3359          assign(srcZ, mkexpr(src));
   3360       }
   3361       /* And compute CLZ. */
   3362       if (is64) {
   3363          assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
   3364                                 mkU64(isCLS ? 63 : 64),
   3365                                 unop(Iop_Clz64, mkexpr(srcZ))));
   3366          putIReg64orZR(dd, mkexpr(dst));
   3367       } else {
   3368          assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
   3369                                 mkU64(isCLS ? 31 : 32),
   3370                                 unop(Iop_Clz64, mkexpr(srcZ))));
   3371          putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
   3372       }
   3373       DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
   3374           nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
   3375       return True;
   3376    }
   3377 
   3378    /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
   3379    /*    30 28        20 15   11 9 4
   3380       sf 00 1101 0110 m  0010 00 n d   LSLV Rd,Rn,Rm
   3381       sf 00 1101 0110 m  0010 01 n d   LSRV Rd,Rn,Rm
   3382       sf 00 1101 0110 m  0010 10 n d   ASRV Rd,Rn,Rm
   3383       sf 00 1101 0110 m  0010 11 n d   RORV Rd,Rn,Rm
   3384    */
   3385    if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
   3386        && INSN(15,12) == BITS4(0,0,1,0)) {
   3387       Bool   is64 = INSN(31,31) == 1;
   3388       UInt   mm   = INSN(20,16);
   3389       UInt   op   = INSN(11,10);
   3390       UInt   nn   = INSN(9,5);
   3391       UInt   dd   = INSN(4,0);
   3392       IRType ty   = is64 ? Ity_I64 : Ity_I32;
   3393       IRTemp srcL = newTemp(ty);
   3394       IRTemp srcR = newTemp(Ity_I64);
   3395       IRTemp res  = newTemp(ty);
   3396       IROp   iop  = Iop_INVALID;
   3397       assign(srcL, getIRegOrZR(is64, nn));
   3398       assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
   3399                                     mkU64(is64 ? 63 : 31)));
   3400       if (op < 3) {
   3401          // LSLV, LSRV, ASRV
   3402          switch (op) {
   3403             case BITS2(0,0): iop = mkSHL(ty); break;
   3404             case BITS2(0,1): iop = mkSHR(ty); break;
   3405             case BITS2(1,0): iop = mkSAR(ty); break;
   3406             default: vassert(0);
   3407          }
   3408          assign(res, binop(iop, mkexpr(srcL),
   3409                                 unop(Iop_64to8, mkexpr(srcR))));
   3410       } else {
   3411          // RORV
   3412          IROp opSHL = mkSHL(ty);
   3413          IROp opSHR = mkSHR(ty);
   3414          IROp opOR  = mkOR(ty);
   3415          IRExpr* width = mkU64(is64 ? 64: 32);
   3416          assign(
   3417             res,
   3418             IRExpr_ITE(
   3419                binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
   3420                mkexpr(srcL),
   3421                binop(opOR,
   3422                      binop(opSHL,
   3423                            mkexpr(srcL),
   3424                            unop(Iop_64to8, binop(Iop_Sub64, width,
   3425                                                             mkexpr(srcR)))),
   3426                      binop(opSHR,
   3427                            mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
   3428          ));
   3429       }
   3430       putIRegOrZR(is64, dd, mkexpr(res));
   3431       vassert(op < 4);
   3432       const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
   3433       DIP("%s %s, %s, %s\n",
   3434           names[op], nameIRegOrZR(is64,dd),
   3435                      nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
   3436       return True;
   3437    }
   3438 
   3439    /* -------------------- SDIV/UDIV -------------------- */
   3440    /*    30 28        20 15    10 9 4
   3441       sf 00 1101 0110 m  00001  1 n d  SDIV Rd,Rn,Rm
   3442       sf 00 1101 0110 m  00001  0 n d  UDIV Rd,Rn,Rm
   3443    */
   3444    if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
   3445        && INSN(15,11) == BITS5(0,0,0,0,1)) {
   3446       Bool is64 = INSN(31,31) == 1;
   3447       UInt mm   = INSN(20,16);
   3448       Bool isS  = INSN(10,10) == 1;
   3449       UInt nn   = INSN(9,5);
   3450       UInt dd   = INSN(4,0);
   3451       if (isS) {
   3452          putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
   3453                                      getIRegOrZR(is64, nn),
   3454                                      getIRegOrZR(is64, mm)));
   3455       } else {
   3456          putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
   3457                                      getIRegOrZR(is64, nn),
   3458                                      getIRegOrZR(is64, mm)));
   3459       }
   3460       DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
   3461           nameIRegOrZR(is64, dd),
   3462           nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
   3463       return True;
   3464    }
   3465 
   3466    /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
   3467    /* 31        23  20 15 14 9 4
   3468       1001 1011 101 m  0  a  n d   UMADDL Xd,Wn,Wm,Xa
   3469       1001 1011 001 m  0  a  n d   SMADDL Xd,Wn,Wm,Xa
   3470       1001 1011 101 m  1  a  n d   UMSUBL Xd,Wn,Wm,Xa
   3471       1001 1011 001 m  1  a  n d   SMSUBL Xd,Wn,Wm,Xa
   3472       with operation
   3473          Xd = Xa +/- (Wn *u/s Wm)
   3474    */
   3475    if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
   3476       Bool   isU   = INSN(23,23) == 1;
   3477       UInt   mm    = INSN(20,16);
   3478       Bool   isAdd = INSN(15,15) == 0;
   3479       UInt   aa    = INSN(14,10);
   3480       UInt   nn    = INSN(9,5);
   3481       UInt   dd    = INSN(4,0);
   3482       IRTemp wN    = newTemp(Ity_I32);
   3483       IRTemp wM    = newTemp(Ity_I32);
   3484       IRTemp xA    = newTemp(Ity_I64);
   3485       IRTemp muld  = newTemp(Ity_I64);
   3486       IRTemp res   = newTemp(Ity_I64);
   3487       assign(wN, getIReg32orZR(nn));
   3488       assign(wM, getIReg32orZR(mm));
   3489       assign(xA, getIReg64orZR(aa));
   3490       assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
   3491                          mkexpr(wN), mkexpr(wM)));
   3492       assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
   3493                         mkexpr(xA), mkexpr(muld)));
   3494       putIReg64orZR(dd, mkexpr(res));
   3495       DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
   3496           nameIReg64orZR(dd), nameIReg32orZR(nn),
   3497           nameIReg32orZR(mm), nameIReg64orZR(aa));
   3498       return True;
   3499    }
   3500    vex_printf("ARM64 front end: data_processing_register\n");
   3501    return False;
   3502 #  undef INSN
   3503 }
   3504 
   3505 
   3506 /*------------------------------------------------------------*/
   3507 /*--- Math helpers for vector interleave/deinterleave      ---*/
   3508 /*------------------------------------------------------------*/
   3509 
        /* Shorthand constructors for the V128 IR expressions built by the
           interleave/deinterleave helpers in this section.  Each macro
           expands to a single expression, so they nest freely. */

        /* EX: the expression that reads the IRTemp _tmp. */
   3510 #define EX(_tmp) \
   3511            mkexpr(_tmp)
        /* SL: Iop_SliceV128 applied to (_hi128, _lo128) with byte count
           _nbytes.  A count of zero bypasses the triop and yields _lo128
           itself. */
   3512 #define SL(_hi128,_lo128,_nbytes) \
   3513            ( (_nbytes) == 0 \
   3514                 ? (_lo128) \
   3515                 : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
        /* ROR, ROL: SL of a vector paired with itself, by _nbytes and by
           16-_nbytes respectively.  NOTE(review): presumably a byte-wise
           rotate right / rotate left of the 128-bit value -- confirm
           against the definition of Iop_SliceV128. */
   3516 #define ROR(_v128,_nbytes) \
   3517            SL((_v128),(_v128),(_nbytes))
   3518 #define ROL(_v128,_nbytes) \
   3519            SL((_v128),(_v128),16-(_nbytes))
        /* SHR, SHL: Iop_ShrV128 / Iop_ShlV128 of the whole vector.  The
           amount is given here in bytes and passed on as 8*_nbytes. */
   3520 #define SHR(_v128,_nbytes) \
   3521            binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
   3522 #define SHL(_v128,_nbytes) \
   3523            binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
        /* ILOnnxm, IHInnxm: Iop_InterleaveLO / Iop_InterleaveHI at the
           named lane arrangement. */
   3524 #define ILO64x2(_argL,_argR) \
   3525            binop(Iop_InterleaveLO64x2,(_argL),(_argR))
   3526 #define IHI64x2(_argL,_argR) \
   3527            binop(Iop_InterleaveHI64x2,(_argL),(_argR))
   3528 #define ILO32x4(_argL,_argR) \
   3529            binop(Iop_InterleaveLO32x4,(_argL),(_argR))
   3530 #define IHI32x4(_argL,_argR) \
   3531            binop(Iop_InterleaveHI32x4,(_argL),(_argR))
   3532 #define ILO16x8(_argL,_argR) \
   3533            binop(Iop_InterleaveLO16x8,(_argL),(_argR))
   3534 #define IHI16x8(_argL,_argR) \
   3535            binop(Iop_InterleaveHI16x8,(_argL),(_argR))
   3536 #define ILO8x16(_argL,_argR) \
   3537            binop(Iop_InterleaveLO8x16,(_argL),(_argR))
   3538 #define IHI8x16(_argL,_argR) \
   3539            binop(Iop_InterleaveHI8x16,(_argL),(_argR))
        /* CEVnnxm, CODnnxm: Iop_CatEvenLanes / Iop_CatOddLanes at the
           named lane arrangement. */
   3540 #define CEV32x4(_argL,_argR) \
   3541            binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
   3542 #define COD32x4(_argL,_argR) \
   3543            binop(Iop_CatOddLanes32x4,(_argL),(_argR))
   3544 #define COD16x8(_argL,_argR) \
   3545            binop(Iop_CatOddLanes16x8,(_argL),(_argR))
   3546 #define COD8x16(_argL,_argR) \
   3547            binop(Iop_CatOddLanes8x16,(_argL),(_argR))
   3548 #define CEV8x16(_argL,_argR) \
   3549            binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
        /* AND, OR2, OR3, OR4: bitwise AND of two, and bitwise OR of two,
           three or four, V128 expressions. */
   3550 #define AND(_arg1,_arg2) \
   3551            binop(Iop_AndV128,(_arg1),(_arg2))
   3552 #define OR2(_arg1,_arg2) \
   3553            binop(Iop_OrV128,(_arg1),(_arg2))
   3554 #define OR3(_arg1,_arg2,_arg3) \
   3555            binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
   3556 #define OR4(_arg1,_arg2,_arg3,_arg4) \
   3557            binop(Iop_OrV128, \
   3558                  binop(Iop_OrV128,(_arg1),(_arg2)), \
   3559                  binop(Iop_OrV128,(_arg3),(_arg4)))
   3560 
   3561 
   3562 /* Do interleaving for 1 128 bit vector, for ST1 insns.  With a
           single vector there is nothing to interleave: the output temp
           *i0 is simply assigned the value of u0.  laneSzBlg2 is not
           used here (presumably it is kept so that the signature
           parallels the 2/3/4-vector variants). */
   3563 static
   3564 void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
   3565                            UInt laneSzBlg2, IRTemp u0 )
   3566 {
   3567    assign(*i0, mkexpr(u0));
   3568 }
   3569 
   3570 
   3571 /* Do interleaving for 2 128 bit vectors, for ST2 insns.
           u0 and u1 are the uninterleaved inputs; the interleaved
           results are assigned to the temps *i0 and *i1 (*i0 receives
           the interleave of the low halves, *i1 that of the high
           halves).  laneSzBlg2 selects the lane arrangement: 3 for
           64x2, 2 for 32x4, 1 for 16x8, 0 for 8x16.  Any other value
           asserts. */
   3572 static
   3573 void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
   3574                            UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
   3575 {
   3576    /* This is pretty easy, since we have primitives directly to
   3577       hand. */
   3578    if (laneSzBlg2 == 3) {
   3579       // 64x2
   3580       // u1 == B1 B0, u0 == A1 A0
   3581       // i1 == B1 A1, i0 == B0 A0
   3582       assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
   3583       assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
   3584       return;
   3585    }
   3586    if (laneSzBlg2 == 2) {
   3587       // 32x4
   3588       // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
   3589       // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
   3590       assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
   3591       assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
   3592       return;
   3593    }
   3594    if (laneSzBlg2 == 1) {
   3595       // 16x8
   3596       // u1 == B{7..0}, u0 == A{7..0}
   3597       // i0 == B3 A3 B2 A2 B1 A1 B0 A0
   3598       // i1 == B7 A7 B6 A6 B5 A5 B4 A4
   3599       assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
   3600       assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
   3601       return;
   3602    }
   3603    if (laneSzBlg2 == 0) {
   3604       // 8x16
   3605       // u1 == B{f..0}, u0 == A{f..0}
   3606       // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
   3607       // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
   3608       assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
   3609       assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
   3610       return;
   3611    }
   3612    /*NOTREACHED*/
   3613    vassert(0);
   3614 }
   3615 
   3616 
   3617 /* Do interleaving for 3 128 bit vectors, for ST3 insns.
           u0, u1 and u2 are the uninterleaved inputs (written A, B and C
           in the lane diagrams below); the interleaved results are
           assigned to the temps *i0, *i1 and *i2.  laneSzBlg2 selects
           the lane arrangement: 3 for 64x2, 2 for 32x4, 1 for 16x8,
           0 for 8x16.  Any other value asserts.  The 32x4 and 16x8
           cases first call this function recursively at the next larger
           lane size and then refine that partial result. */
   3618 static
   3619 void math_INTERLEAVE3_128(
   3620         /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
   3621         UInt laneSzBlg2,
   3622         IRTemp u0, IRTemp u1, IRTemp u2 )
   3623 {
   3624    if (laneSzBlg2 == 3) {
   3625       // 64x2
   3626       // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
   3627       // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
   3628       assign(*i2, IHI64x2( EX(u2), EX(u1) ));
   3629       assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
   3630       assign(*i0, ILO64x2( EX(u1), EX(u0) ));
   3631       return;
   3632    }
   3633 
   3634    if (laneSzBlg2 == 2) {
   3635       // 32x4
   3636       // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
   3637       // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
   3638       // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
   3639       IRTemp p0    = newTempV128();
   3640       IRTemp p1    = newTempV128();
   3641       IRTemp p2    = newTempV128();
              // Masks, named after the 32-bit lanes they keep (highest lane
              // first, 1 == keep).  NOTE(review): presumably each bit of the
              // mkV128 immediate covers one byte of the vector -- confirm.
   3642       IRTemp c1100 = newTempV128();
   3643       IRTemp c0011 = newTempV128();
   3644       IRTemp c0110 = newTempV128();
   3645       assign(c1100, mkV128(0xFF00));
   3646       assign(c0011, mkV128(0x00FF));
   3647       assign(c0110, mkV128(0x0FF0));
   3648       // First interleave them at 64x2 granularity,
   3649       // generating partial ("p") values.
   3650       math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
   3651       // And more shuffling around for the final answer
   3652       assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
   3653                        AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
   3654       assign(*i1, OR3( SHL(EX(p2),12),
   3655                        AND(EX(p1),EX(c0110)),
   3656                        SHR(EX(p0),12) ));
   3657       assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
   3658                        AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
   3659       return;
   3660    }
   3661 
   3662    if (laneSzBlg2 == 1) {
   3663       // 16x8
   3664       // u2 == C7 C6 C5 C4 C3 C2 C1 C0
   3665       // u1 == B7 B6 B5 B4 B3 B2 B1 B0
   3666       // u0 == A7 A6 A5 A4 A3 A2 A1 A0
   3667       //
   3668       // p2 == C7 C6 B7 B6 A7 A6 C5 C4
   3669       // p1 == B5 B4 A5 A4 C3 C2 B3 B2
   3670       // p0 == A3 A2 C1 C0 B1 B0 A1 A0
   3671       //
   3672       // i2 == C7 B7 A7 C6 B6 A6 C5 B5
   3673       // i1 == A5 C4 B4 A4 C3 B3 A3 C2
   3674       // i0 == B2 A2 C1 B1 A1 C0 B0 A0
   3675       IRTemp p0    = newTempV128();
   3676       IRTemp p1    = newTempV128();
   3677       IRTemp p2    = newTempV128();
              // Masks, one per 32-bit quarter of the vector (highest
              // quarter first, 1 == keep); each selects the pair of 16-bit
              // lanes that the matching AND term below contributes.
   3678       IRTemp c1000 = newTempV128();
   3679       IRTemp c0100 = newTempV128();
   3680       IRTemp c0010 = newTempV128();
   3681       IRTemp c0001 = newTempV128();
   3682       assign(c1000, mkV128(0xF000));
   3683       assign(c0100, mkV128(0x0F00));
   3684       assign(c0010, mkV128(0x00F0));
   3685       assign(c0001, mkV128(0x000F));
   3686       // First interleave them at 32x4 granularity,
   3687       // generating partial ("p") values.
   3688       math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
   3689       // And more shuffling around for the final answer
   3690       assign(*i2,
   3691              OR4( AND( IHI16x8( EX(p2),        ROL(EX(p2),4) ), EX(c1000) ),
   3692                   AND( IHI16x8( ROL(EX(p2),6), EX(p2)        ), EX(c0100) ),
   3693                   AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
   3694                   AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
   3695       ));
   3696       assign(*i1,
   3697              OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
   3698                   AND( IHI16x8( EX(p1),        ROL(EX(p1),4) ), EX(c0100) ),
   3699                   AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
   3700                   AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
   3701       ));
   3702       assign(*i0,
   3703              OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
   3704                   AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
   3705                   AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
   3706                   AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
   3707       ));
   3708       return;
   3709    }
   3710 
   3711    if (laneSzBlg2 == 0) {
   3712       // 8x16.  It doesn't seem worth the hassle of first doing a
   3713       // 16x8 interleave, so just generate all 24 partial results
   3714       // directly :-(
   3715       // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
   3716       // i2 == Cf Bf Af Ce .. Bb Ab Ca
   3717       // i1 == Ba Aa C9 B9 .. A6 C5 B5
   3718       // i0 == A5 C4 B4 A4 .. C0 B0 A0
   3719 
              // Naming: iN_FEDC / iN_BA98 / iN_7654 / iN_3210 each carry, in
              // their low 32 bits, the four result bytes of *iN at those
              // byte indices; iN_hi64 / iN_lo64 carry, in their low 64 bits,
              // the upper / lower half of *iN.
   3720       IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
   3721       IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
   3722       IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
   3723       IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
   3724       IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
   3725       IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
   3726       IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
   3727       IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
   3728       IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();
   3729 
   3730       // eg XXXX(qqq, CC, 0xF, BB, 0xA) declares a new temp t_qqq and
   3731       // sets it to a vector of the form 14 bytes junk : CC[0xF] : BB[0xA]
   3732       //
   3733 #     define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
   3734          IRTemp t_##_tempName = newTempV128(); \
   3735          assign(t_##_tempName, \
   3736                 ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
   3737                          ROR(EX(_srcVec2),(_srcShift2)) ) )
   3738 
   3739       // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
   3740       IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;
   3741 
   3742       // The slicing and reassembly are done as interleavedly as possible,
   3743       // so as to minimise the demand for registers in the back end, which
   3744       // was observed to be a problem in testing.
   3745 
   3746       XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
   3747       XXXX(AfCe, AA, 0xf, CC, 0xe);
   3748       assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));
   3749 
   3750       XXXX(BeAe, BB, 0xe, AA, 0xe);
   3751       XXXX(CdBd, CC, 0xd, BB, 0xd);
   3752       assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
   3753       assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));
   3754 
   3755       XXXX(AdCc, AA, 0xd, CC, 0xc);
   3756       XXXX(BcAc, BB, 0xc, AA, 0xc);
   3757       assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));
   3758 
   3759       XXXX(CbBb, CC, 0xb, BB, 0xb);
   3760       XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
   3761       assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
   3762       assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
   3763       assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));
   3764 
   3765       XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
   3766       XXXX(C9B9, CC, 0x9, BB, 0x9);
   3767       assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));
   3768 
   3769       XXXX(A9C8, AA, 0x9, CC, 0x8);
   3770       XXXX(B8A8, BB, 0x8, AA, 0x8);
   3771       assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
   3772       assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));
   3773 
   3774       XXXX(C7B7, CC, 0x7, BB, 0x7);
   3775       XXXX(A7C6, AA, 0x7, CC, 0x6);
   3776       assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));
   3777 
   3778       XXXX(B6A6, BB, 0x6, AA, 0x6);
   3779       XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
   3780       assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
   3781       assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
   3782       assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));
   3783 
   3784       XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
   3785       XXXX(B4A4, BB, 0x4, AA, 0x4);
   3786       assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));
   3787 
   3788       XXXX(C3B3, CC, 0x3, BB, 0x3);
   3789       XXXX(A3C2, AA, 0x3, CC, 0x2);
   3790       assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
   3791       assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));
   3792 
   3793       XXXX(B2A2, BB, 0x2, AA, 0x2);
   3794       XXXX(C1B1, CC, 0x1, BB, 0x1);
   3795       assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));
   3796 
   3797       XXXX(A1C0, AA, 0x1, CC, 0x0);
   3798       XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
   3799       assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
   3800       assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
   3801       assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));
   3802 
   3803 #     undef XXXX
   3804       return;
   3805    }
   3806 
   3807    /*NOTREACHED*/
   3808    vassert(0);
   3809 }
   3810 
   3811 
   3812 /* Do interleaving for 4 128 bit vectors, for ST4 insns.
           u0 .. u3 are the uninterleaved inputs; the interleaved results
           are assigned to the temps *i0 .. *i3.  laneSzBlg2 selects the
           lane arrangement: 3 for 64x2, 2 for 32x4, 1 for 16x8, 0 for
           8x16.  Any other value asserts.  Every case other than 64x2
           first calls this function recursively at the next larger lane
           size and then rearranges that partial result. */
   3813 static
   3814 void math_INTERLEAVE4_128(
   3815         /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
   3816         UInt laneSzBlg2,
   3817         IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
   3818 {
   3819    if (laneSzBlg2 == 3) {
   3820       // 64x2
              // u3 == D1 D0, u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
              // i3 == D1 C1, i2 == B1 A1, i1 == D0 C0, i0 == B0 A0
   3821       assign(*i0, ILO64x2(EX(u1), EX(u0)));
   3822       assign(*i1, ILO64x2(EX(u3), EX(u2)));
   3823       assign(*i2, IHI64x2(EX(u1), EX(u0)));
   3824       assign(*i3, IHI64x2(EX(u3), EX(u2)));
   3825       return;
   3826    }
   3827    if (laneSzBlg2 == 2) {
   3828       // 32x4
   3829       // First, interleave at the 64-bit lane size.
   3830       IRTemp p0 = newTempV128();
   3831       IRTemp p1 = newTempV128();
   3832       IRTemp p2 = newTempV128();
   3833       IRTemp p3 = newTempV128();
   3834       math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
   3835       // And interleave (cat) at the 32 bit size.
   3836       assign(*i0, CEV32x4(EX(p1), EX(p0)));
   3837       assign(*i1, COD32x4(EX(p1), EX(p0)));
   3838       assign(*i2, CEV32x4(EX(p3), EX(p2)));
   3839       assign(*i3, COD32x4(EX(p3), EX(p2)));
   3840       return;
   3841    }
   3842    if (laneSzBlg2 == 1) {
   3843       // 16x8
   3844       // First, interleave at the 32-bit lane size.
   3845       IRTemp p0 = newTempV128();
   3846       IRTemp p1 = newTempV128();
   3847       IRTemp p2 = newTempV128();
   3848       IRTemp p3 = newTempV128();
   3849       math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
   3850       // And rearrange within each vector, to get the right 16 bit lanes.
   3851       assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
   3852       assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
   3853       assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
   3854       assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
   3855       return;
   3856    }
   3857    if (laneSzBlg2 == 0) {
   3858       // 8x16
   3859       // First, interleave at the 16-bit lane size.
   3860       IRTemp p0 = newTempV128();
   3861       IRTemp p1 = newTempV128();
   3862       IRTemp p2 = newTempV128();
   3863       IRTemp p3 = newTempV128();
   3864       math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
   3865       // And rearrange within each vector, to get the right 8 bit lanes.
   3866       assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
   3867       assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
   3868       assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
   3869       assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
   3870       return;
   3871    }
   3872    /*NOTREACHED*/
   3873    vassert(0);
   3874 }
   3875 
   3876 
   3877 /* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
   3878 static
   3879 void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
   3880                              UInt laneSzBlg2, IRTemp i0 )
   3881 {
   3882    assign(*u0, mkexpr(i0));
   3883 }
   3884 
   3885 
   3886 /* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
   3887 static
   3888 void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
   3889                              UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
   3890 {
   3891    /* This is pretty easy, since we have primitives directly to
   3892       hand. */
   3893    if (laneSzBlg2 == 3) {
   3894       // 64x2
   3895       // i1 == B1 A1, i0 == B0 A0
   3896       // u1 == B1 B0, u0 == A1 A0
   3897       assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
   3898       assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
   3899       return;
   3900    }
   3901    if (laneSzBlg2 == 2) {
   3902       // 32x4
   3903       // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
   3904       // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
   3905       assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
   3906       assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
   3907       return;
   3908    }
   3909    if (laneSzBlg2 == 1) {
   3910       // 16x8
   3911       // i0 == B3 A3 B2 A2 B1 A1 B0 A0
   3912       // i1 == B7 A7 B6 A6 B5 A5 B4 A4
   3913       // u1 == B{7..0}, u0 == A{7..0}
   3914       assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
   3915       assign(*u1, binop(Iop_CatOddLanes16x8,  mkexpr(i1), mkexpr(i0)));
   3916       return;
   3917    }
   3918    if (laneSzBlg2 == 0) {
   3919       // 8x16
   3920       // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
   3921       // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
   3922       // u1 == B{f..0}, u0 == A{f..0}
   3923       assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
   3924       assign(*u1, binop(Iop_CatOddLanes8x16,  mkexpr(i1), mkexpr(i0)));
   3925       return;
   3926    }
   3927    /*NOTREACHED*/
   3928    vassert(0);
   3929 }
   3930 
   3931 
   3932 /* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
   3933 static
   3934 void math_DEINTERLEAVE3_128(
   3935         /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
   3936         UInt laneSzBlg2,
   3937         IRTemp i0, IRTemp i1, IRTemp i2 )
   3938 {
   3939    if (laneSzBlg2 == 3) {
   3940       // 64x2
   3941       // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
   3942       // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
   3943       assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1)        ));
   3944       assign(*u1, ILO64x2( EX(i2),        ROL(EX(i0),8) ));
   3945       assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0)        ));
   3946       return;
   3947    }
   3948 
   3949    if (laneSzBlg2 == 2) {
   3950       // 32x4
   3951       // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
   3952       // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
   3953       // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
   3954       IRTemp t_a1c0b0a0 = newTempV128();
   3955       IRTemp t_a2c1b1a1 = newTempV128();
   3956       IRTemp t_a3c2b2a2 = newTempV128();
   3957       IRTemp t_a0c3b3a3 = newTempV128();
   3958       IRTemp p0 = newTempV128();
   3959       IRTemp p1 = newTempV128();
   3960       IRTemp p2 = newTempV128();
   3961       // Compute some intermediate values.
   3962       assign(t_a1c0b0a0, EX(i0));
   3963       assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
   3964       assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
   3965       assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
   3966       // First deinterleave into lane-pairs
   3967       assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
   3968       assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
   3969                          IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
   3970       assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
   3971       // Then deinterleave at 64x2 granularity.
   3972       math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
   3973       return;
   3974    }
   3975 
   3976    if (laneSzBlg2 == 1) {
   3977       // 16x8
   3978       // u2 == C7 C6 C5 C4 C3 C2 C1 C0
   3979       // u1 == B7 B6 B5 B4 B3 B2 B1 B0
   3980       // u0 == A7 A6 A5 A4 A3 A2 A1 A0
   3981       //
   3982       // i2 == C7 B7 A7 C6 B6 A6 C5 B5
   3983       // i1 == A5 C4 B4 A4 C4 B3 A3 C2
   3984       // i0 == B2 A2 C1 B1 A1 C0 B0 A0
   3985       //
   3986       // p2 == C7 C6 B7 B6 A7 A6 C5 C4
   3987       // p1 == B5 B4 A5 A4 C3 C2 B3 B2
   3988       // p0 == A3 A2 C1 C0 B1 B0 A1 A0
   3989 
   3990       IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
   3991       s0 = s1 = s2 = s3
   3992          = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
   3993       newTempsV128_4(&s0, &s1, &s2, &s3);
   3994       newTempsV128_4(&t0, &t1, &t2, &t3);
   3995       newTempsV128_4(&p0, &p1, &p2, &c00111111);
   3996 
   3997       // s0 == b2a2 c1b1a1 c0b0a0
   3998       // s1 == b4a4 c3b3c3 c2b2a2
   3999       // s2 == b6a6 c5b5a5 c4b4a4
   4000       // s3 == b0a0 c7b7a7 c6b6a6
   4001       assign(s0, EX(i0));
   4002       assign(s1, SL(EX(i1),EX(i0),6*2));
   4003       assign(s2, SL(EX(i2),EX(i1),4*2));
   4004       assign(s3, SL(EX(i0),EX(i2),2*2));
   4005 
   4006       // t0 == 0 0 c1c0 b1b0 a1a0
   4007       // t1 == 0 0 c3c2 b3b2 a3a2
   4008       // t2 == 0 0 c5c4 b5b4 a5a4
   4009       // t3 == 0 0 c7c6 b7b6 a7a6
   4010       assign(c00111111, mkV128(0x0FFF));
   4011       assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
   4012       assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
   4013       assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
   4014       assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));
   4015 
   4016       assign(p0, OR2(EX(t0),          SHL(EX(t1),6*2)));
   4017       assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
   4018       assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));
   4019 
   4020       // Then deinterleave at 32x4 granularity.
   4021       math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
   4022       return;
   4023    }
   4024 
   4025    if (laneSzBlg2 == 0) {
   4026       // 8x16.  This is the same scheme as for 16x8, with twice the
   4027       // number of intermediate values.
   4028       //
   4029       // u2 == C{f..0}
   4030       // u1 == B{f..0}
   4031       // u0 == A{f..0}
   4032       //
   4033       // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
   4034       // i1 ==  BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
   4035       // i0 ==   A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
   4036       //
   4037       // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
   4038       // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
   4039       // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
   4040       //
   4041       IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
   4042              t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
   4043       s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
   4044          = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
   4045          = IRTemp_INVALID;
   4046       newTempsV128_4(&s0, &s1, &s2, &s3);
   4047       newTempsV128_4(&s4, &s5, &s6, &s7);
   4048       newTempsV128_4(&t0, &t1, &t2, &t3);
   4049       newTempsV128_4(&t4, &t5, &t6, &t7);
   4050       newTempsV128_4(&p0, &p1, &p2, &cMASK);
   4051 
   4052       // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
   4053       // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
   4054       // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
   4055       // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
   4056       // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
   4057       // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
   4058       // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
   4059       // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
   4060       assign(s0, SL(EX(i1),EX(i0), 0));
   4061       assign(s1, SL(EX(i1),EX(i0), 6));
   4062       assign(s2, SL(EX(i1),EX(i0),12));
   4063       assign(s3, SL(EX(i2),EX(i1), 2));
   4064       assign(s4, SL(EX(i2),EX(i1), 8));
   4065       assign(s5, SL(EX(i2),EX(i1),14));
   4066       assign(s6, SL(EX(i0),EX(i2), 4));
   4067       assign(s7, SL(EX(i0),EX(i2),10));
   4068 
   4069       // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
   4070       // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
   4071       // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
   4072       // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
   4073       // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
   4074       // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
   4075       // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
   4076       // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
   4077       assign(cMASK, mkV128(0x003F));
   4078       assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
   4079       assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
   4080       assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
   4081       assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
   4082       assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
   4083       assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
   4084       assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
   4085       assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));
   4086 
   4087       assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
   4088       assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
   4089                  SHL(EX(t3),2), SHR(EX(t2),4) ));
   4090       assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));
   4091 
   4092       // Then deinterleave at 16x8 granularity.
   4093       math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
   4094       return;
   4095    }
   4096 
   4097    /*NOTREACHED*/
   4098    vassert(0);
   4099 }
   4100 
   4101 
   4102 /* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
   4103 static
   4104 void math_DEINTERLEAVE4_128(
   4105         /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
   4106         UInt laneSzBlg2,
   4107         IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
   4108 {
   4109    if (laneSzBlg2 == 3) {
   4110       // 64x2
   4111       assign(*u0, ILO64x2(EX(i2), EX(i0)));
   4112       assign(*u1, IHI64x2(EX(i2), EX(i0)));
   4113       assign(*u2, ILO64x2(EX(i3), EX(i1)));
   4114       assign(*u3, IHI64x2(EX(i3), EX(i1)));
   4115       return;
   4116    }
   4117    if (laneSzBlg2 == 2) {
   4118       // 32x4
   4119       IRTemp p0 = newTempV128();
   4120       IRTemp p2 = newTempV128();
   4121       IRTemp p1 = newTempV128();
   4122       IRTemp p3 = newTempV128();
   4123       assign(p0, ILO32x4(EX(i1), EX(i0)));
   4124       assign(p1, IHI32x4(EX(i1), EX(i0)));
   4125       assign(p2, ILO32x4(EX(i3), EX(i2)));
   4126       assign(p3, IHI32x4(EX(i3), EX(i2)));
   4127       // And now do what we did for the 64-bit case.
   4128       math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
   4129       return;
   4130    }
   4131    if (laneSzBlg2 == 1) {
   4132       // 16x8
   4133       // Deinterleave into 32-bit chunks, then do as the 32-bit case.
   4134       IRTemp p0 = newTempV128();
   4135       IRTemp p1 = newTempV128();
   4136       IRTemp p2 = newTempV128();
   4137       IRTemp p3 = newTempV128();
   4138       assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
   4139       assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
   4140       assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
   4141       assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
   4142       // From here on is like the 32 bit case.
   4143       math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
   4144       return;
   4145    }
   4146    if (laneSzBlg2 == 0) {
   4147       // 8x16
   4148       // Deinterleave into 16-bit chunks, then do as the 16-bit case.
   4149       IRTemp p0 = newTempV128();
   4150       IRTemp p1 = newTempV128();
   4151       IRTemp p2 = newTempV128();
   4152       IRTemp p3 = newTempV128();
   4153       assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
   4154                           ILO8x16(EX(i0),ROL(EX(i0),4)) ));
   4155       assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
   4156                           ILO8x16(EX(i1),ROL(EX(i1),4)) ));
   4157       assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
   4158                           ILO8x16(EX(i2),ROL(EX(i2),4)) ));
   4159       assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
   4160                           ILO8x16(EX(i3),ROL(EX(i3),4)) ));
   4161       // From here on is like the 16 bit case.
   4162       math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
   4163       return;
   4164    }
   4165    /*NOTREACHED*/
   4166    vassert(0);
   4167 }
   4168 
   4169 
   4170 /* Wrappers that use the full-width (de)interleavers to do half-width
   4171    (de)interleaving.  The scheme is to clone each input lane in the
   4172    lower half of each incoming value, do a full width (de)interleave
   4173    at the next lane size up, and remove every other lane of the the
   4174    result.  The returned values may have any old junk in the upper
   4175    64 bits -- the caller must ignore that. */
   4176 
   4177 /* Helper function -- get doubling and narrowing operations. */
   4178 static
   4179 void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
   4180                                    /*OUT*/IROp* halver,
   4181                                    UInt laneSzBlg2 )
   4182 {
   4183    switch (laneSzBlg2) {
   4184       case 2:
   4185          *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
   4186          break;
   4187       case 1:
   4188          *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
   4189          break;
   4190       case 0:
   4191          *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
   4192          break;
   4193       default:
   4194          vassert(0);
   4195    }
   4196 }
   4197 
   4198 /* Do interleaving for 1 64 bit vector, for ST1 insns. */
   4199 static
   4200 void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
   4201                           UInt laneSzBlg2, IRTemp u0 )
   4202 {
   4203    assign(*i0, mkexpr(u0));
   4204 }
   4205 
   4206 
   4207 /* Do interleaving for 2 64 bit vectors, for ST2 insns. */
   4208 static
   4209 void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
   4210                           UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
   4211 {
   4212    if (laneSzBlg2 == 3) {
   4213       // 1x64, degenerate case
   4214       assign(*i0, EX(u0));
   4215       assign(*i1, EX(u1));
   4216       return;
   4217    }
   4218 
   4219    vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   4220    IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   4221    math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
   4222 
   4223    IRTemp du0 = newTempV128();
   4224    IRTemp du1 = newTempV128();
   4225    assign(du0, binop(doubler, EX(u0), EX(u0)));
   4226    assign(du1, binop(doubler, EX(u1), EX(u1)));
   4227    IRTemp di0 = newTempV128();
   4228    IRTemp di1 = newTempV128();
   4229    math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
   4230    assign(*i0, binop(halver, EX(di0), EX(di0)));
   4231    assign(*i1, binop(halver, EX(di1), EX(di1)));
   4232 }
   4233 
   4234 
   4235 /* Do interleaving for 3 64 bit vectors, for ST3 insns. */
   4236 static
   4237 void math_INTERLEAVE3_64(
   4238         /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
   4239         UInt laneSzBlg2,
   4240         IRTemp u0, IRTemp u1, IRTemp u2 )
   4241 {
   4242    if (laneSzBlg2 == 3) {
   4243       // 1x64, degenerate case
   4244       assign(*i0, EX(u0));
   4245       assign(*i1, EX(u1));
   4246       assign(*i2, EX(u2));
   4247       return;
   4248    }
   4249 
   4250    vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   4251    IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   4252    math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
   4253 
   4254    IRTemp du0 = newTempV128();
   4255    IRTemp du1 = newTempV128();
   4256    IRTemp du2 = newTempV128();
   4257    assign(du0, binop(doubler, EX(u0), EX(u0)));
   4258    assign(du1, binop(doubler, EX(u1), EX(u1)));
   4259    assign(du2, binop(doubler, EX(u2), EX(u2)));
   4260    IRTemp di0 = newTempV128();
   4261    IRTemp di1 = newTempV128();
   4262    IRTemp di2 = newTempV128();
   4263    math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
   4264    assign(*i0, binop(halver, EX(di0), EX(di0)));
   4265    assign(*i1, binop(halver, EX(di1), EX(di1)));
   4266    assign(*i2, binop(halver, EX(di2), EX(di2)));
   4267 }
   4268 
   4269 
   4270 /* Do interleaving for 4 64 bit vectors, for ST4 insns. */
   4271 static
   4272 void math_INTERLEAVE4_64(
   4273         /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
   4274         UInt laneSzBlg2,
   4275         IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
   4276 {
   4277    if (laneSzBlg2 == 3) {
   4278       // 1x64, degenerate case
   4279       assign(*i0, EX(u0));
   4280       assign(*i1, EX(u1));
   4281       assign(*i2, EX(u2));
   4282       assign(*i3, EX(u3));
   4283       return;
   4284    }
   4285 
   4286    vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   4287    IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   4288    math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
   4289 
   4290    IRTemp du0 = newTempV128();
   4291    IRTemp du1 = newTempV128();
   4292    IRTemp du2 = newTempV128();
   4293    IRTemp du3 = newTempV128();
   4294    assign(du0, binop(doubler, EX(u0), EX(u0)));
   4295    assign(du1, binop(doubler, EX(u1), EX(u1)));
   4296    assign(du2, binop(doubler, EX(u2), EX(u2)));
   4297    assign(du3, binop(doubler, EX(u3), EX(u3)));
   4298    IRTemp di0 = newTempV128();
   4299    IRTemp di1 = newTempV128();
   4300    IRTemp di2 = newTempV128();
   4301    IRTemp di3 = newTempV128();
   4302    math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
   4303                         laneSzBlg2 + 1, du0, du1, du2, du3);
   4304    assign(*i0, binop(halver, EX(di0), EX(di0)));
   4305    assign(*i1, binop(halver, EX(di1), EX(di1)));
   4306    assign(*i2, binop(halver, EX(di2), EX(di2)));
   4307    assign(*i3, binop(halver, EX(di3), EX(di3)));
   4308 }
   4309 
   4310 
   4311 /* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
   4312 static
   4313 void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
   4314                             UInt laneSzBlg2, IRTemp i0 )
   4315 {
   4316    assign(*u0, mkexpr(i0));
   4317 }
   4318 
   4319 
   4320 /* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
   4321 static
   4322 void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
   4323                             UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
   4324 {
   4325    if (laneSzBlg2 == 3) {
   4326       // 1x64, degenerate case
   4327       assign(*u0, EX(i0));
   4328       assign(*u1, EX(i1));
   4329       return;
   4330    }
   4331 
   4332    vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   4333    IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   4334    math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
   4335 
   4336    IRTemp di0 = newTempV128();
   4337    IRTemp di1 = newTempV128();
   4338    assign(di0, binop(doubler, EX(i0), EX(i0)));
   4339    assign(di1, binop(doubler, EX(i1), EX(i1)));
   4340 
   4341    IRTemp du0 = newTempV128();
   4342    IRTemp du1 = newTempV128();
   4343    math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
   4344    assign(*u0, binop(halver, EX(du0), EX(du0)));
   4345    assign(*u1, binop(halver, EX(du1), EX(du1)));
   4346 }
   4347 
   4348 
   4349 /* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
   4350 static
   4351 void math_DEINTERLEAVE3_64(
   4352         /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
   4353         UInt laneSzBlg2,
   4354         IRTemp i0, IRTemp i1, IRTemp i2 )
   4355 {
   4356    if (laneSzBlg2 == 3) {
   4357       // 1x64, degenerate case
   4358       assign(*u0, EX(i0));
   4359       assign(*u1, EX(i1));
   4360       assign(*u2, EX(i2));
   4361       return;
   4362    }
   4363 
   4364    vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   4365    IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   4366    math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
   4367 
   4368    IRTemp di0 = newTempV128();
   4369    IRTemp di1 = newTempV128();
   4370    IRTemp di2 = newTempV128();
   4371    assign(di0, binop(doubler, EX(i0), EX(i0)));
   4372    assign(di1, binop(doubler, EX(i1), EX(i1)));
   4373    assign(di2, binop(doubler, EX(i2), EX(i2)));
   4374    IRTemp du0 = newTempV128();
   4375    IRTemp du1 = newTempV128();
   4376    IRTemp du2 = newTempV128();
   4377    math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
   4378    assign(*u0, binop(halver, EX(du0), EX(du0)));
   4379    assign(*u1, binop(halver, EX(du1), EX(du1)));
   4380    assign(*u2, binop(halver, EX(du2), EX(du2)));
   4381 }
   4382 
   4383 
   4384 /* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
   4385 static
   4386 void math_DEINTERLEAVE4_64(
   4387         /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
   4388         UInt laneSzBlg2,
   4389         IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
   4390 {
   4391    if (laneSzBlg2 == 3) {
   4392       // 1x64, degenerate case
   4393       assign(*u0, EX(i0));
   4394       assign(*u1, EX(i1));
   4395       assign(*u2, EX(i2));
   4396       assign(*u3, EX(i3));
   4397       return;
   4398    }
   4399 
   4400    vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   4401    IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   4402    math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
   4403 
   4404    IRTemp di0 = newTempV128();
   4405    IRTemp di1 = newTempV128();
   4406    IRTemp di2 = newTempV128();
   4407    IRTemp di3 = newTempV128();
   4408    assign(di0, binop(doubler, EX(i0), EX(i0)));
   4409    assign(di1, binop(doubler, EX(i1), EX(i1)));
   4410    assign(di2, binop(doubler, EX(i2), EX(i2)));
   4411    assign(di3, binop(doubler, EX(i3), EX(i3)));
   4412    IRTemp du0 = newTempV128();
   4413    IRTemp du1 = newTempV128();
   4414    IRTemp du2 = newTempV128();
   4415    IRTemp du3 = newTempV128();
   4416    math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
   4417                           laneSzBlg2 + 1, di0, di1, di2, di3);
   4418    assign(*u0, binop(halver, EX(du0), EX(du0)));
   4419    assign(*u1, binop(halver, EX(du1), EX(du1)));
   4420    assign(*u2, binop(halver, EX(du2), EX(du2)));
   4421    assign(*u3, binop(halver, EX(du3), EX(du3)));
   4422 }
   4423 
   4424 
   4425 #undef EX
   4426 #undef SL
   4427 #undef ROR
   4428 #undef ROL
   4429 #undef SHR
   4430 #undef SHL
   4431 #undef ILO64x2
   4432 #undef IHI64x2
   4433 #undef ILO32x4
   4434 #undef IHI32x4
   4435 #undef ILO16x8
   4436 #undef IHI16x8
   4437 #undef ILO16x8
   4438 #undef IHI16x8
   4439 #undef CEV32x4
   4440 #undef COD32x4
   4441 #undef COD16x8
   4442 #undef COD8x16
   4443 #undef CEV8x16
   4444 #undef AND
   4445 #undef OR2
   4446 #undef OR3
   4447 #undef OR4
   4448 
   4449 
   4450 /*------------------------------------------------------------*/
   4451 /*--- Load and Store instructions                          ---*/
   4452 /*------------------------------------------------------------*/
   4453 
   4454 /* Generate the EA for a "reg + reg" style amode.  This is done from
   4455    parts of the insn, but for sanity checking sake it takes the whole
   4456    insn.  This appears to depend on insn[15:12], with opt=insn[15:13]
   4457    and S=insn[12]:
   4458 
   4459    The possible forms, along with their opt:S values, are:
   4460       011:0   Xn|SP + Xm
   4461       111:0   Xn|SP + Xm
   4462       011:1   Xn|SP + Xm * transfer_szB
   4463       111:1   Xn|SP + Xm * transfer_szB
   4464       010:0   Xn|SP + 32Uto64(Wm)
   4465       010:1   Xn|SP + 32Uto64(Wm) * transfer_szB
   4466       110:0   Xn|SP + 32Sto64(Wm)
   4467       110:1   Xn|SP + 32Sto64(Wm) * transfer_szB
   4468 
   4469    Rm is insn[20:16].  Rn is insn[9:5].  Rt is insn[4:0].  Log2 of
   4470    the transfer size is insn[23,31,30].  For integer loads/stores,
   4471    insn[23] is zero, hence szLg2 can be at most 3 in such cases.
   4472 
   4473    If the decoding fails, it returns IRTemp_INVALID.
   4474 
   4475    isInt is True iff this is decoding is for transfers to/from integer
   4476    registers.  If False it is for transfers to/from vector registers.
   4477 */
   4478 static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
   4479 {
   4480    UInt    optS  = SLICE_UInt(insn, 15, 12);
   4481    UInt    mm    = SLICE_UInt(insn, 20, 16);
   4482    UInt    nn    = SLICE_UInt(insn, 9, 5);
   4483    UInt    szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
   4484                    | SLICE_UInt(insn, 31, 30); // Log2 of the size
   4485 
   4486    buf[0] = 0;
   4487 
   4488    /* Sanity checks, that this really is a load/store insn. */
   4489    if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
   4490       goto fail;
   4491 
   4492    if (isInt
   4493        && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
   4494        && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
   4495        && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
   4496        && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
   4497       goto fail;
   4498 
   4499    if (!isInt
   4500        && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
   4501       goto fail;
   4502 
   4503    /* Throw out non-verified but possibly valid cases. */
   4504    switch (szLg2) {
   4505       case BITS3(0,0,0): break; //  8 bit, valid for both int and vec
   4506       case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
   4507       case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
   4508       case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
   4509       case BITS3(1,0,0): // can only ever be valid for the vector case
   4510                          if (isInt) goto fail; else break;
   4511       case BITS3(1,0,1): // these sizes are never valid
   4512       case BITS3(1,1,0):
   4513       case BITS3(1,1,1): goto fail;
   4514 
   4515       default: vassert(0);
   4516    }
   4517 
   4518    IRExpr* rhs  = NULL;
   4519    switch (optS) {
   4520       case BITS4(1,1,1,0): goto fail; //ATC
   4521       case BITS4(0,1,1,0):
   4522          rhs = getIReg64orZR(mm);
   4523          vex_sprintf(buf, "[%s, %s]",
   4524                      nameIReg64orZR(nn), nameIReg64orZR(mm));
   4525          break;
   4526       case BITS4(1,1,1,1): goto fail; //ATC
   4527       case BITS4(0,1,1,1):
   4528          rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
   4529          vex_sprintf(buf, "[%s, %s lsl %u]",
   4530                      nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
   4531          break;
   4532       case BITS4(0,1,0,0):
   4533          rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
   4534          vex_sprintf(buf, "[%s, %s uxtx]",
   4535                      nameIReg64orZR(nn), nameIReg32orZR(mm));
   4536          break;
   4537       case BITS4(0,1,0,1):
   4538          rhs = binop(Iop_Shl64,
   4539                      unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
   4540          vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
   4541                      nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
   4542          break;
   4543       case BITS4(1,1,0,0):
   4544          rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
   4545          vex_sprintf(buf, "[%s, %s sxtx]",
   4546                      nameIReg64orZR(nn), nameIReg32orZR(mm));
   4547          break;
   4548       case BITS4(1,1,0,1):
   4549          rhs = binop(Iop_Shl64,
   4550                      unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
   4551          vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
   4552                      nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
   4553          break;
   4554       default:
   4555          /* The rest appear to be genuinely invalid */
   4556          goto fail;
   4557    }
   4558 
   4559    vassert(rhs);
   4560    IRTemp res = newTemp(Ity_I64);
   4561    assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
   4562    return res;
   4563 
   4564   fail:
   4565    vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
   4566    return IRTemp_INVALID;
   4567 }
   4568 
   4569 
   4570 /* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
   4571    bits of DATAE :: Ity_I64. */
   4572 static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
   4573 {
   4574    IRExpr* addrE = mkexpr(addr);
   4575    switch (szB) {
   4576       case 8:
   4577          storeLE(addrE, dataE);
   4578          break;
   4579       case 4:
   4580          storeLE(addrE, unop(Iop_64to32, dataE));
   4581          break;
   4582       case 2:
   4583          storeLE(addrE, unop(Iop_64to16, dataE));
   4584          break;
   4585       case 1:
   4586          storeLE(addrE, unop(Iop_64to8, dataE));
   4587          break;
   4588       default:
   4589          vassert(0);
   4590    }
   4591 }
   4592 
   4593 
   4594 /* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
   4595    placing the result in an Ity_I64 temporary. */
   4596 static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
   4597 {
   4598    IRTemp  res   = newTemp(Ity_I64);
   4599    IRExpr* addrE = mkexpr(addr);
   4600    switch (szB) {
   4601       case 8:
   4602          assign(res, loadLE(Ity_I64,addrE));
   4603          break;
   4604       case 4:
   4605          assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
   4606          break;
   4607       case 2:
   4608          assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
   4609          break;
   4610       case 1:
   4611          assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
   4612          break;
   4613       default:
   4614          vassert(0);
   4615    }
   4616    return res;
   4617 }
   4618 
   4619 
   4620 /* Generate a "standard 7" name, from bitQ and size.  But also
   4621    allow ".1d" since that's occasionally useful. */
   4622 static
   4623 const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
   4624 {
   4625    vassert(bitQ <= 1 && size <= 3);
   4626    const HChar* nms[8]
   4627       = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
   4628    UInt ix = (bitQ << 2) | size;
   4629    vassert(ix < 8);
   4630    return nms[ix];
   4631 }
   4632 
   4633 
   4634 static
   4635 Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
   4636 {
   4637 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   4638 
   4639    /* ------------ LDR,STR (immediate, uimm12) ----------- */
   4640    /* uimm12 is scaled by the transfer size
   4641 
   4642       31 29  26    21    9  4
   4643       |  |   |     |     |  |
   4644       11 111 00100 imm12 nn tt    STR  Xt, [Xn|SP, #imm12 * 8]
   4645       11 111 00101 imm12 nn tt    LDR  Xt, [Xn|SP, #imm12 * 8]
   4646 
   4647       10 111 00100 imm12 nn tt    STR  Wt, [Xn|SP, #imm12 * 4]
   4648       10 111 00101 imm12 nn tt    LDR  Wt, [Xn|SP, #imm12 * 4]
   4649 
   4650       01 111 00100 imm12 nn tt    STRH Wt, [Xn|SP, #imm12 * 2]
   4651       01 111 00101 imm12 nn tt    LDRH Wt, [Xn|SP, #imm12 * 2]
   4652 
   4653       00 111 00100 imm12 nn tt    STRB Wt, [Xn|SP, #imm12 * 1]
   4654       00 111 00101 imm12 nn tt    LDRB Wt, [Xn|SP, #imm12 * 1]
   4655    */
   4656    if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
   4657       UInt   szLg2 = INSN(31,30);
   4658       UInt   szB   = 1 << szLg2;
   4659       Bool   isLD  = INSN(22,22) == 1;
   4660       UInt   offs  = INSN(21,10) * szB;
   4661       UInt   nn    = INSN(9,5);
   4662       UInt   tt    = INSN(4,0);
   4663       IRTemp ta    = newTemp(Ity_I64);
   4664       assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
   4665       if (nn == 31) { /* FIXME generate stack alignment check */ }
   4666       vassert(szLg2 < 4);
   4667       if (isLD) {
   4668          putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
   4669       } else {
   4670          gen_narrowing_store(szB, ta, getIReg64orZR(tt));
   4671       }
   4672       const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
   4673       const HChar* st_name[4] = { "strb", "strh", "str", "str" };
   4674       DIP("%s %s, [%s, #%u]\n",
   4675           (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
   4676           nameIReg64orSP(nn), offs);
   4677       return True;
   4678    }
   4679 
   4680    /* ------------ LDUR,STUR (immediate, simm9) ----------- */
   4681    /*
   4682       31 29  26      20   11 9  4
   4683       |  |   |       |    |  |  |
   4684       (at-Rn-then-Rn=EA)  |  |  |
   4685       sz 111 00000 0 imm9 01 Rn Rt   STR Rt, [Xn|SP], #simm9
   4686       sz 111 00001 0 imm9 01 Rn Rt   LDR Rt, [Xn|SP], #simm9
   4687 
   4688       (at-EA-then-Rn=EA)
   4689       sz 111 00000 0 imm9 11 Rn Rt   STR Rt, [Xn|SP, #simm9]!
   4690       sz 111 00001 0 imm9 11 Rn Rt   LDR Rt, [Xn|SP, #simm9]!
   4691 
   4692       (at-EA)
   4693       sz 111 00000 0 imm9 00 Rn Rt   STR Rt, [Xn|SP, #simm9]
   4694       sz 111 00001 0 imm9 00 Rn Rt   LDR Rt, [Xn|SP, #simm9]
   4695 
   4696       simm9 is unscaled.
   4697 
   4698       The case 'wback && Rn == Rt && Rt != 31' is disallowed.  In the
   4699       load case this is because it would create two competing values for
   4700       Rt.  In the store case the reason is unclear, but the spec
   4701       disallows it anyway.
   4702 
   4703       Stores are narrowing, loads are unsigned widening.  sz encodes
   4704       the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
   4705    */
   4706    if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
   4707        == BITS9(1,1,1, 0,0,0,0,0, 0)) {
   4708       UInt szLg2  = INSN(31,30);
   4709       UInt szB    = 1 << szLg2;
   4710       Bool isLoad = INSN(22,22) == 1;
   4711       UInt imm9   = INSN(20,12);
   4712       UInt nn     = INSN(9,5);
   4713       UInt tt     = INSN(4,0);
   4714       Bool wBack  = INSN(10,10) == 1;
   4715       UInt how    = INSN(11,10);
   4716       if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
   4717          /* undecodable; fall through */
   4718       } else {
   4719          if (nn == 31) { /* FIXME generate stack alignment check */ }
   4720 
   4721          // Compute the transfer address TA and the writeback address WA.
   4722          IRTemp tRN = newTemp(Ity_I64);
   4723          assign(tRN, getIReg64orSP(nn));
   4724          IRTemp tEA = newTemp(Ity_I64);
   4725          Long simm9 = (Long)sx_to_64(imm9, 9);
   4726          assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
   4727 
   4728          IRTemp tTA = newTemp(Ity_I64);
   4729          IRTemp tWA = newTemp(Ity_I64);
   4730          switch (how) {
   4731             case BITS2(0,1):
   4732                assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
   4733             case BITS2(1,1):
   4734                assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
   4735             case BITS2(0,0):
   4736                assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
   4737             default:
   4738                vassert(0); /* NOTREACHED */
   4739          }
   4740 
   4741          /* Normally rN would be updated after the transfer.  However, in
   4742             the special case typified by
   4743                str x30, [sp,#-16]!
   4744             it is necessary to update SP before the transfer, (1)
   4745             because Memcheck will otherwise complain about a write
   4746             below the stack pointer, and (2) because the segfault
   4747             stack extension mechanism will otherwise extend the stack
   4748             only down to SP before the instruction, which might not be
   4749             far enough, if the -16 bit takes the actual access
   4750             address to the next page.
   4751          */
   4752          Bool earlyWBack
   4753            = wBack && simm9 < 0 && szB == 8
   4754              && how == BITS2(1,1) && nn == 31 && !isLoad && tt != nn;
   4755 
   4756          if (wBack && earlyWBack)
   4757             putIReg64orSP(nn, mkexpr(tEA));
   4758 
   4759          if (isLoad) {
   4760             putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
   4761          } else {
   4762             gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
   4763          }
   4764 
   4765          if (wBack && !earlyWBack)
   4766             putIReg64orSP(nn, mkexpr(tEA));
   4767 
   4768          const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
   4769          const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
   4770          const HChar* fmt_str = NULL;
   4771          switch (how) {
   4772             case BITS2(0,1):
   4773                fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
   4774                break;
   4775             case BITS2(1,1):
   4776                fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
   4777                break;
   4778             case BITS2(0,0):
   4779                fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
   4780                break;
   4781             default:
   4782                vassert(0);
   4783          }
   4784          DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
   4785                       nameIRegOrZR(szB == 8, tt),
   4786                       nameIReg64orSP(nn), simm9);
   4787          return True;
   4788       }
   4789    }
   4790 
   4791    /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
   4792    /* L==1 => mm==LD
   4793       L==0 => mm==ST
   4794       x==0 => 32 bit transfers, and zero extended loads
   4795       x==1 => 64 bit transfers
   4796       simm7 is scaled by the (single-register) transfer size
   4797 
   4798       (at-Rn-then-Rn=EA)
   4799       x0 101 0001 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP], #imm
   4800 
   4801       (at-EA-then-Rn=EA)
   4802       x0 101 0011 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP, #imm]!
   4803 
   4804       (at-EA)
   4805       x0 101 0010 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP, #imm]
   4806    */
   4807 
   4808    UInt insn_30_23 = INSN(30,23);
   4809    if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1)
   4810        || insn_30_23 == BITS8(0,1,0,1,0,0,1,1)
   4811        || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) {
   4812       UInt bL     = INSN(22,22);
   4813       UInt bX     = INSN(31,31);
   4814       UInt bWBack = INSN(23,23);
   4815       UInt rT1    = INSN(4,0);
   4816       UInt rN     = INSN(9,5);
   4817       UInt rT2    = INSN(14,10);
   4818       Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
   4819       if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
   4820           || (bL && rT1 == rT2)) {
   4821          /* undecodable; fall through */
   4822       } else {
   4823          if (rN == 31) { /* FIXME generate stack alignment check */ }
   4824 
   4825          // Compute the transfer address TA and the writeback address WA.
   4826          IRTemp tRN = newTemp(Ity_I64);
   4827          assign(tRN, getIReg64orSP(rN));
   4828          IRTemp tEA = newTemp(Ity_I64);
   4829          simm7 = (bX ? 8 : 4) * simm7;
   4830          assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
   4831 
   4832          IRTemp tTA = newTemp(Ity_I64);
   4833          IRTemp tWA = newTemp(Ity_I64);
   4834          switch (INSN(24,23)) {
   4835             case BITS2(0,1):
   4836                assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
   4837             case BITS2(1,1):
   4838                assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
   4839             case BITS2(1,0):
   4840                assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
   4841             default:
   4842                vassert(0); /* NOTREACHED */
   4843          }
   4844 
   4845          /* Normally rN would be updated after the transfer.  However, in
   4846             the special case typified by
   4847                stp x29, x30, [sp,#-112]!
   4848             it is necessary to update SP before the transfer, (1)
   4849             because Memcheck will otherwise complain about a write
   4850             below the stack pointer, and (2) because the segfault
   4851             stack extension mechanism will otherwise extend the stack
   4852             only down to SP before the instruction, which might not be
   4853             far enough, if the -112 bit takes the actual access
   4854             address to the next page.
   4855          */
   4856          Bool earlyWBack
   4857            = bWBack && simm7 < 0
   4858              && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0;
   4859 
   4860          if (bWBack && earlyWBack)
   4861             putIReg64orSP(rN, mkexpr(tEA));
   4862 
   4863          /**/ if (bL == 1 && bX == 1) {
   4864             // 64 bit load
   4865             putIReg64orZR(rT1, loadLE(Ity_I64,
   4866                                       binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
   4867             putIReg64orZR(rT2, loadLE(Ity_I64,
   4868                                       binop(Iop_Add64,mkexpr(tTA),mkU64(8))));
   4869          } else if (bL == 1 && bX == 0) {
   4870             // 32 bit load
   4871             putIReg32orZR(rT1, loadLE(Ity_I32,
   4872                                       binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
   4873             putIReg32orZR(rT2, loadLE(Ity_I32,
   4874                                       binop(Iop_Add64,mkexpr(tTA),mkU64(4))));
   4875          } else if (bL == 0 && bX == 1) {
   4876             // 64 bit store
   4877             storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
   4878                     getIReg64orZR(rT1));
   4879             storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)),
   4880                     getIReg64orZR(rT2));
   4881          } else {
   4882             vassert(bL == 0 && bX == 0);
   4883             // 32 bit store
   4884             storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
   4885                     getIReg32orZR(rT1));
   4886             storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)),
   4887                     getIReg32orZR(rT2));
   4888          }
   4889 
   4890          if (bWBack && !earlyWBack)
   4891             putIReg64orSP(rN, mkexpr(tEA));
   4892 
   4893          const HChar* fmt_str = NULL;
   4894          switch (INSN(24,23)) {
   4895             case BITS2(0,1):
   4896                fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
   4897                break;
   4898             case BITS2(1,1):
   4899                fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
   4900                break;
   4901             case BITS2(1,0):
   4902                fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
   4903                break;
   4904             default:
   4905                vassert(0);
   4906          }
   4907          DIP(fmt_str, bL == 0 ? "st" : "ld",
   4908                       nameIRegOrZR(bX == 1, rT1),
   4909                       nameIRegOrZR(bX == 1, rT2),
   4910                       nameIReg64orSP(rN), simm7);
   4911          return True;
   4912       }
   4913    }
   4914 
   4915    /* ---------------- LDR (literal, int reg) ---------------- */
   4916    /* 31 29      23    4
   4917       00 011 000 imm19 Rt   LDR   Wt, [PC + sxTo64(imm19 << 2)]
   4918       01 011 000 imm19 Rt   LDR   Xt, [PC + sxTo64(imm19 << 2)]
   4919       10 011 000 imm19 Rt   LDRSW Xt, [PC + sxTo64(imm19 << 2)]
   4920       11 011 000 imm19 Rt   prefetch  [PC + sxTo64(imm19 << 2)]
   4921       Just handles the first two cases for now.
   4922    */
   4923    if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
   4924       UInt  imm19 = INSN(23,5);
   4925       UInt  rT    = INSN(4,0);
   4926       UInt  bX    = INSN(30,30);
   4927       ULong ea    = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
   4928       if (bX) {
   4929          putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea)));
   4930       } else {
   4931          putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea)));
   4932       }
   4933       DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea);
   4934       return True;
   4935    }
   4936 
   4937    /* -------------- {LD,ST}R (integer register) --------------- */
   4938    /* 31 29        20 15     12 11 9  4
   4939       |  |         |  |      |  |  |  |
   4940       11 111000011 Rm option S  10 Rn Rt  LDR  Xt, [Xn|SP, R<m>{ext/sh}]
   4941       10 111000011 Rm option S  10 Rn Rt  LDR  Wt, [Xn|SP, R<m>{ext/sh}]
   4942       01 111000011 Rm option S  10 Rn Rt  LDRH Wt, [Xn|SP, R<m>{ext/sh}]
   4943       00 111000011 Rm option S  10 Rn Rt  LDRB Wt, [Xn|SP, R<m>{ext/sh}]
   4944 
   4945       11 111000001 Rm option S  10 Rn Rt  STR  Xt, [Xn|SP, R<m>{ext/sh}]
   4946       10 111000001 Rm option S  10 Rn Rt  STR  Wt, [Xn|SP, R<m>{ext/sh}]
   4947       01 111000001 Rm option S  10 Rn Rt  STRH Wt, [Xn|SP, R<m>{ext/sh}]
   4948       00 111000001 Rm option S  10 Rn Rt  STRB Wt, [Xn|SP, R<m>{ext/sh}]
   4949    */
   4950    if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
   4951        && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
   4952       HChar  dis_buf[64];
   4953       UInt   szLg2 = INSN(31,30);
   4954       Bool   isLD  = INSN(22,22) == 1;
   4955       UInt   tt    = INSN(4,0);
   4956       IRTemp ea    = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
   4957       if (ea != IRTemp_INVALID) {
   4958          switch (szLg2) {
   4959             case 3: /* 64 bit */
   4960                if (isLD) {
   4961                   putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea)));
   4962                   DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf);
   4963                } else {
   4964                   storeLE(mkexpr(ea), getIReg64orZR(tt));
   4965                   DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf);
   4966                }
   4967                break;
   4968             case 2: /* 32 bit */
   4969                if (isLD) {
   4970                   putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea)));
   4971                   DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf);
   4972                } else {
   4973                   storeLE(mkexpr(ea), getIReg32orZR(tt));
   4974                   DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf);
   4975                }
   4976                break;
   4977             case 1: /* 16 bit */
   4978                if (isLD) {
   4979                   putIReg64orZR(tt, unop(Iop_16Uto64,
   4980                                          loadLE(Ity_I16, mkexpr(ea))));
   4981                   DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf);
   4982                } else {
   4983                   storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt)));
   4984                   DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf);
   4985                }
   4986                break;
   4987             case 0: /* 8 bit */
   4988                if (isLD) {
   4989                   putIReg64orZR(tt, unop(Iop_8Uto64,
   4990                                          loadLE(Ity_I8, mkexpr(ea))));
   4991                   DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf);
   4992                } else {
   4993                   storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt)));
   4994                   DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf);
   4995                }
   4996                break;
   4997             default:
   4998                vassert(0);
   4999          }
   5000          return True;
   5001       }
   5002    }
   5003 
   5004    /* -------------- LDRS{B,H,W} (uimm12) -------------- */
   5005    /* 31 29  26  23 21    9 4
   5006       10 111 001 10 imm12 n t   LDRSW Xt, [Xn|SP, #pimm12 * 4]
   5007       01 111 001 1x imm12 n t   LDRSH Rt, [Xn|SP, #pimm12 * 2]
   5008       00 111 001 1x imm12 n t   LDRSB Rt, [Xn|SP, #pimm12 * 1]
   5009       where
   5010          Rt is Wt when x==1, Xt when x==0
   5011    */
   5012    if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
   5013       /* Further checks on bits 31:30 and 22 */
   5014       Bool valid = False;
   5015       switch ((INSN(31,30) << 1) | INSN(22,22)) {
   5016          case BITS3(1,0,0):
   5017          case BITS3(0,1,0): case BITS3(0,1,1):
   5018          case BITS3(0,0,0): case BITS3(0,0,1):
   5019             valid = True;
   5020             break;
   5021       }
   5022       if (valid) {
   5023          UInt    szLg2 = INSN(31,30);
   5024          UInt    bitX  = INSN(22,22);
   5025          UInt    imm12 = INSN(21,10);
   5026          UInt    nn    = INSN(9,5);
   5027          UInt    tt    = INSN(4,0);
   5028          UInt    szB   = 1 << szLg2;
   5029          IRExpr* ea    = binop(Iop_Add64,
   5030                                getIReg64orSP(nn), mkU64(imm12 * szB));
   5031          switch (szB) {
   5032             case 4:
   5033                vassert(bitX == 0);
   5034                putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea)));
   5035                DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt),
   5036                    nameIReg64orSP(nn), imm12 * szB);
   5037                break;
   5038             case 2:
   5039                if (bitX == 1) {
   5040                   putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea)));
   5041                } else {
   5042                   putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea)));
   5043                }
   5044                DIP("ldrsh %s, [%s, #%u]\n",
   5045                    nameIRegOrZR(bitX == 0, tt),
   5046                    nameIReg64orSP(nn), imm12 * szB);
   5047                break;
   5048             case 1:
   5049                if (bitX == 1) {
   5050                   putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea)));
   5051                } else {
   5052                   putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea)));
   5053                }
   5054                DIP("ldrsb %s, [%s, #%u]\n",
   5055                    nameIRegOrZR(bitX == 0, tt),
   5056                    nameIReg64orSP(nn), imm12 * szB);
   5057                break;
   5058             default:
   5059                vassert(0);
   5060          }
   5061          return True;
   5062       }
   5063       /* else fall through */
   5064    }
   5065 
   5066    /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
   5067    /* (at-Rn-then-Rn=EA)
   5068       31 29      23 21 20   11 9 4
   5069       00 111 000 1x 0  imm9 01 n t  LDRSB Rt, [Xn|SP], #simm9
   5070       01 111 000 1x 0  imm9 01 n t  LDRSH Rt, [Xn|SP], #simm9
   5071       10 111 000 10 0  imm9 01 n t  LDRSW Xt, [Xn|SP], #simm9
   5072 
   5073       (at-EA-then-Rn=EA)
   5074       00 111 000 1x 0  imm9 11 n t  LDRSB Rt, [Xn|SP, #simm9]!
   5075       01 111 000 1x 0  imm9 11 n t  LDRSH Rt, [Xn|SP, #simm9]!
   5076       10 111 000 10 0  imm9 11 n t  LDRSW Xt, [Xn|SP, #simm9]!
   5077       where
   5078          Rt is Wt when x==1, Xt when x==0
   5079          transfer-at-Rn when [11]==0, at EA when [11]==1
   5080    */
   5081    if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
   5082        && INSN(21,21) == 0 && INSN(10,10) == 1) {
   5083       /* Further checks on bits 31:30 and 22 */
   5084       Bool valid = False;
   5085       switch ((INSN(31,30) << 1) | INSN(22,22)) {
   5086          case BITS3(1,0,0):                    // LDRSW Xt
   5087          case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
   5088          case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
   5089             valid = True;
   5090             break;
   5091       }
   5092       if (valid) {
   5093          UInt   szLg2 = INSN(31,30);
   5094          UInt   imm9  = INSN(20,12);
   5095          Bool   atRN  = INSN(11,11) == 0;
   5096          UInt   nn    = INSN(9,5);
   5097          UInt   tt    = INSN(4,0);
   5098          IRTemp tRN   = newTemp(Ity_I64);
   5099          IRTemp tEA   = newTemp(Ity_I64);
   5100          IRTemp tTA   = IRTemp_INVALID;
   5101          ULong  simm9 = sx_to_64(imm9, 9);
   5102          Bool   is64  = INSN(22,22) == 0;
   5103          assign(tRN, getIReg64orSP(nn));
   5104          assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
   5105          tTA = atRN ? tRN : tEA;
   5106          HChar ch = '?';
   5107          /* There are 5 cases:
   5108                byte     load,           SX to 64
   5109                byte     load, SX to 32, ZX to 64
   5110                halfword load,           SX to 64
   5111                halfword load, SX to 32, ZX to 64
   5112                word     load,           SX to 64
   5113             The ifs below handle them in the listed order.
   5114          */
   5115          if (szLg2 == 0) {
   5116             ch = 'b';
   5117             if (is64) {
   5118                putIReg64orZR(tt, unop(Iop_8Sto64,
   5119                                       loadLE(Ity_I8, mkexpr(tTA))));
   5120             } else {
   5121                putIReg32orZR(tt, unop(Iop_8Sto32,
   5122                                       loadLE(Ity_I8, mkexpr(tTA))));
   5123             }
   5124          }
   5125          else if (szLg2 == 1) {
   5126             ch = 'h';
   5127             if (is64) {
   5128                putIReg64orZR(tt, unop(Iop_16Sto64,
   5129                                       loadLE(Ity_I16, mkexpr(tTA))));
   5130             } else {
   5131                putIReg32orZR(tt, unop(Iop_16Sto32,
   5132                                       loadLE(Ity_I16, mkexpr(tTA))));
   5133             }
   5134          }
   5135          else if (szLg2 == 2 && is64) {
   5136             ch = 'w';
   5137             putIReg64orZR(tt, unop(Iop_32Sto64,
   5138                                    loadLE(Ity_I32, mkexpr(tTA))));
   5139          }
   5140          else {
   5141             vassert(0);
   5142          }
   5143          putIReg64orSP(nn, mkexpr(tEA));
   5144          DIP(atRN ? "ldrs%c %s, [%s], #%llu\n" : "ldrs%c %s, [%s, #%llu]!",
   5145              ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9);
   5146          return True;
   5147       }
   5148       /* else fall through */
   5149    }
   5150 
   5151    /* -------------- LDURS{B,H,W} (simm9, noUpd) -------------- */
   5152    /* 31 29      23 21 20   11 9 4
   5153       00 111 000 1x 0  imm9 00 n t  LDURSB Rt, [Xn|SP, #simm9]
   5154       01 111 000 1x 0  imm9 00 n t  LDURSH Rt, [Xn|SP, #simm9]
   5155       10 111 000 10 0  imm9 00 n t  LDURSW Xt, [Xn|SP, #simm9]
   5156       where
   5157          Rt is Wt when x==1, Xt when x==0
   5158    */
   5159    if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
   5160        && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
   5161       /* Further checks on bits 31:30 and 22 */
   5162       Bool valid = False;
   5163       switch ((INSN(31,30) << 1) | INSN(22,22)) {
   5164          case BITS3(1,0,0):                    // LDURSW Xt
   5165          case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
   5166          case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
   5167             valid = True;
   5168             break;
   5169       }
   5170       if (valid) {
   5171          UInt   szLg2 = INSN(31,30);
   5172          UInt   imm9  = INSN(20,12);
   5173          UInt   nn    = INSN(9,5);
   5174          UInt   tt    = INSN(4,0);
   5175          IRTemp tRN   = newTemp(Ity_I64);
   5176          IRTemp tEA   = newTemp(Ity_I64);
   5177          ULong  simm9 = sx_to_64(imm9, 9);
   5178          Bool   is64  = INSN(22,22) == 0;
   5179          assign(tRN, getIReg64orSP(nn));
   5180          assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
   5181          HChar ch = '?';
   5182          /* There are 5 cases:
   5183                byte     load,           SX to 64
   5184                byte     load, SX to 32, ZX to 64
   5185                halfword load,           SX to 64
   5186                halfword load, SX to 32, ZX to 64
   5187                word     load,           SX to 64
   5188             The ifs below handle them in the listed order.
   5189          */
   5190          if (szLg2 == 0) {
   5191             ch = 'b';
   5192             if (is64) {
   5193                putIReg64orZR(tt, unop(Iop_8Sto64,
   5194                                       loadLE(Ity_I8, mkexpr(tEA))));
   5195             } else {
   5196                putIReg32orZR(tt, unop(Iop_8Sto32,
   5197                                       loadLE(Ity_I8, mkexpr(tEA))));
   5198             }
   5199          }
   5200          else if (szLg2 == 1) {
   5201             ch = 'h';
   5202             if (is64) {
   5203                putIReg64orZR(tt, unop(Iop_16Sto64,
   5204                                       loadLE(Ity_I16, mkexpr(tEA))));
   5205             } else {
   5206                putIReg32orZR(tt, unop(Iop_16Sto32,
   5207                                       loadLE(Ity_I16, mkexpr(tEA))));
   5208             }
   5209          }
   5210          else if (szLg2 == 2 && is64) {
   5211             ch = 'w';
   5212             putIReg64orZR(tt, unop(Iop_32Sto64,
   5213                                    loadLE(Ity_I32, mkexpr(tEA))));
   5214          }
   5215          else {
   5216             vassert(0);
   5217          }
   5218          DIP("ldurs%c %s, [%s, #%lld]",
   5219              ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
   5220          return True;
   5221       }
   5222       /* else fall through */
   5223    }
   5224 
   5225    /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
   5226    /* L==1    => mm==LD
   5227       L==0    => mm==ST
   5228       sz==00  => 32 bit (S) transfers
   5229       sz==01  => 64 bit (D) transfers
   5230       sz==10  => 128 bit (Q) transfers
   5231       sz==11  isn't allowed
   5232       simm7 is scaled by the (single-register) transfer size
   5233 
   5234       31 29  26   22 21   14 9 4
   5235 
   5236       sz 101 1000 L  imm7 t2 n t1   mmNP SDQt1, SDQt2, [Xn|SP, #imm]
   5237                                     (at-EA, with nontemporal hint)
   5238 
   5239       sz 101 1001 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP], #imm
   5240                                     (at-Rn-then-Rn=EA)
   5241 
   5242       sz 101 1010 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP, #imm]
   5243                                     (at-EA)
   5244 
   5245       sz 101 1011 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP, #imm]!
   5246                                     (at-EA-then-Rn=EA)
   5247    */
   5248    if (INSN(29,25) == BITS5(1,0,1,1,0)) {
   5249       UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
   5250       Bool isLD   = INSN(22,22) == 1;
   5251       Bool wBack  = INSN(23,23) == 1;
   5252       Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
   5253       UInt tt2    = INSN(14,10);
   5254       UInt nn     = INSN(9,5);
   5255       UInt tt1    = INSN(4,0);
   5256       if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) {
   5257          /* undecodable; fall through */
   5258       } else {
   5259          if (nn == 31) { /* FIXME generate stack alignment check */ }
   5260 
   5261          // Compute the transfer address TA and the writeback address WA.
   5262          UInt   szB = 4 << szSlg2; /* szB is the per-register size */
   5263          IRTemp tRN = newTemp(Ity_I64);
   5264          assign(tRN, getIReg64orSP(nn));
   5265          IRTemp tEA = newTemp(Ity_I64);
   5266          simm7 = szB * simm7;
   5267          assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
   5268 
   5269          IRTemp tTA = newTemp(Ity_I64);
   5270          IRTemp tWA = newTemp(Ity_I64);
   5271          switch (INSN(24,23)) {
   5272             case BITS2(0,1):
   5273                assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
   5274             case BITS2(1,1):
   5275                assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
   5276             case BITS2(1,0):
   5277             case BITS2(0,0):
   5278                assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
   5279             default:
   5280                vassert(0); /* NOTREACHED */
   5281          }
   5282 
   5283          IRType ty = Ity_INVALID;
   5284          switch (szB) {
   5285             case 4:  ty = Ity_F32;  break;
   5286             case 8:  ty = Ity_F64;  break;
   5287             case 16: ty = Ity_V128; break;
   5288             default: vassert(0);
   5289          }
   5290 
   5291          /* Normally rN would be updated after the transfer.  However, in
   5292             the special cases typified by
   5293                stp q0, q1, [sp,#-512]!
   5294                stp d0, d1, [sp,#-512]!
   5295                stp s0, s1, [sp,#-512]!
   5296             it is necessary to update SP before the transfer, (1)
   5297             because Memcheck will otherwise complain about a write
   5298             below the stack pointer, and (2) because the segfault
   5299             stack extension mechanism will otherwise extend the stack
   5300             only down to SP before the instruction, which might not be
   5301             far enough, if the -512 bit takes the actual access
   5302             address to the next page.
   5303          */
   5304          Bool earlyWBack
   5305            = wBack && simm7 < 0
   5306              && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD;
   5307 
   5308          if (wBack && earlyWBack)
   5309             putIReg64orSP(nn, mkexpr(tEA));
   5310 
   5311          if (isLD) {
   5312             if (szB < 16) {
   5313                putQReg128(tt1, mkV128(0x0000));
   5314             }
   5315             putQRegLO(tt1,
   5316                       loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0))));
   5317             if (szB < 16) {
   5318                putQReg128(tt2, mkV128(0x0000));
   5319             }
   5320             putQRegLO(tt2,
   5321                       loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB))));
   5322          } else {
   5323             storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)),
   5324                     getQRegLO(tt1, ty));
   5325             storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)),
   5326                     getQRegLO(tt2, ty));
   5327          }
   5328 
   5329          if (wBack && !earlyWBack)
   5330             putIReg64orSP(nn, mkexpr(tEA));
   5331 
   5332          const HChar* fmt_str = NULL;
   5333          switch (INSN(24,23)) {
   5334             case BITS2(0,1):
   5335                fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
   5336                break;
   5337             case BITS2(1,1):
   5338                fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
   5339                break;
   5340             case BITS2(1,0):
   5341                fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
   5342                break;
   5343             case BITS2(0,0):
   5344                fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
   5345                break;
   5346             default:
   5347                vassert(0);
   5348          }
   5349          DIP(fmt_str, isLD ? "ld" : "st",
   5350                       nameQRegLO(tt1, ty), nameQRegLO(tt2, ty),
   5351                       nameIReg64orSP(nn), simm7);
   5352          return True;
   5353       }
   5354    }
   5355 
   5356    /* -------------- {LD,ST}R (vector register) --------------- */
   5357    /* 31 29     23  20 15     12 11 9  4
   5358       |  |      |   |  |      |  |  |  |
   5359       00 111100 011 Rm option S  10 Rn Rt  LDR Bt, [Xn|SP, R<m>{ext/sh}]
   5360       01 111100 011 Rm option S  10 Rn Rt  LDR Ht, [Xn|SP, R<m>{ext/sh}]
   5361       10 111100 011 Rm option S  10 Rn Rt  LDR St, [Xn|SP, R<m>{ext/sh}]
   5362       11 111100 011 Rm option S  10 Rn Rt  LDR Dt, [Xn|SP, R<m>{ext/sh}]
   5363       00 111100 111 Rm option S  10 Rn Rt  LDR Qt, [Xn|SP, R<m>{ext/sh}]
   5364 
   5365       00 111100 001 Rm option S  10 Rn Rt  STR Bt, [Xn|SP, R<m>{ext/sh}]
   5366       01 111100 001 Rm option S  10 Rn Rt  STR Ht, [Xn|SP, R<m>{ext/sh}]
   5367       10 111100 001 Rm option S  10 Rn Rt  STR St, [Xn|SP, R<m>{ext/sh}]
   5368       11 111100 001 Rm option S  10 Rn Rt  STR Dt, [Xn|SP, R<m>{ext/sh}]
   5369       00 111100 101 Rm option S  10 Rn Rt  STR Qt, [Xn|SP, R<m>{ext/sh}]
   5370    */
   5371    if (INSN(29,24) == BITS6(1,1,1,1,0,0)
   5372        && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
   5373       HChar  dis_buf[64];
   5374       UInt   szLg2 = (INSN(23,23) << 2) | INSN(31,30);
   5375       Bool   isLD  = INSN(22,22) == 1;
   5376       UInt   tt    = INSN(4,0);
   5377       if (szLg2 > 4) goto after_LDR_STR_vector_register;
   5378       IRTemp ea    = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
   5379       if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
   5380       switch (szLg2) {
   5381          case 0: /* 8 bit */
   5382             if (isLD) {
   5383                putQReg128(tt, mkV128(0x0000));
   5384                putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea)));
   5385                DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
   5386             } else {
   5387                storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8));
   5388                DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
   5389             }
   5390             break;
   5391          case 1:
   5392             if (isLD) {
   5393                putQReg128(tt, mkV128(0x0000));
   5394                putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea)));
   5395                DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
   5396             } else {
   5397                storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16));
   5398                DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
   5399             }
   5400             break;
   5401          case 2: /* 32 bit */
   5402             if (isLD) {
   5403                putQReg128(tt, mkV128(0x0000));
   5404                putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea)));
   5405                DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
   5406             } else {
   5407                storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32));
   5408                DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
   5409             }
   5410             break;
   5411          case 3: /* 64 bit */
   5412             if (isLD) {
   5413                putQReg128(tt, mkV128(0x0000));
   5414                putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea)));
   5415                DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
   5416             } else {
   5417                storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64));
   5418                DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
   5419             }
   5420             break;
   5421          case 4:
   5422             if (isLD) {
   5423                putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
   5424                DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
   5425             } else {
   5426                storeLE(mkexpr(ea), getQReg128(tt));
   5427                DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
   5428             }
   5429             break;
   5430          default:
   5431             vassert(0);
   5432       }
   5433       return True;
   5434    }
   5435   after_LDR_STR_vector_register:
   5436 
   5437    /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
   5438    /* 31 29      22 20 15  12 11 9  4
   5439       |  |       |  |  |   |  |  |  |
   5440       10 1110001 01 Rm opt S 10 Rn Rt    LDRSW Xt, [Xn|SP, R<m>{ext/sh}]
   5441 
   5442       01 1110001 01 Rm opt S 10 Rn Rt    LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
   5443       01 1110001 11 Rm opt S 10 Rn Rt    LDRSH Wt, [Xn|SP, R<m>{ext/sh}]
   5444 
   5445       00 1110001 01 Rm opt S 10 Rn Rt    LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
   5446       00 1110001 11 Rm opt S 10 Rn Rt    LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
   5447    */
   5448    if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
   5449        && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
   5450       HChar  dis_buf[64];
   5451       UInt   szLg2  = INSN(31,30);
   5452       Bool   sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64
   5453       UInt   tt     = INSN(4,0);
   5454       if (szLg2 == 3) goto after_LDRS_integer_register;
   5455       IRTemp ea     = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
   5456       if (ea == IRTemp_INVALID) goto after_LDRS_integer_register;
   5457       /* Enumerate the 5 variants explicitly. */
   5458       if (szLg2 == 2/*32 bit*/ && sxTo64) {
   5459          putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea))));
   5460          DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf);
   5461          return True;
   5462       }
   5463       else
   5464       if (szLg2 == 1/*16 bit*/) {
   5465          if (sxTo64) {
   5466             putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea))));
   5467             DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf);
   5468          } else {
   5469             putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea))));
   5470             DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf);
   5471          }
   5472          return True;
   5473       }
   5474       else
   5475       if (szLg2 == 0/*8 bit*/) {
   5476          if (sxTo64) {
   5477             putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea))));
   5478             DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf);
   5479          } else {
   5480             putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea))));
   5481             DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf);
   5482          }
   5483          return True;
   5484       }
   5485       /* else it's an invalid combination */
   5486    }
   5487   after_LDRS_integer_register:
   5488 
   5489    /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
   5490    /* This is the Unsigned offset variant only.  The Post-Index and
   5491       Pre-Index variants are below.
   5492 
   5493       31 29      23 21    9 4
   5494       00 111 101 01 imm12 n t   LDR Bt, [Xn|SP + imm12 * 1]
   5495       01 111 101 01 imm12 n t   LDR Ht, [Xn|SP + imm12 * 2]
   5496       10 111 101 01 imm12 n t   LDR St, [Xn|SP + imm12 * 4]
   5497       11 111 101 01 imm12 n t   LDR Dt, [Xn|SP + imm12 * 8]
   5498       00 111 101 11 imm12 n t   LDR Qt, [Xn|SP + imm12 * 16]
   5499 
   5500       00 111 101 00 imm12 n t   STR Bt, [Xn|SP + imm12 * 1]
   5501       01 111 101 00 imm12 n t   STR Ht, [Xn|SP + imm12 * 2]
   5502       10 111 101 00 imm12 n t   STR St, [Xn|SP + imm12 * 4]
   5503       11 111 101 00 imm12 n t   STR Dt, [Xn|SP + imm12 * 8]
   5504       00 111 101 10 imm12 n t   STR Qt, [Xn|SP + imm12 * 16]
   5505    */
   5506    if (INSN(29,24) == BITS6(1,1,1,1,0,1)
   5507        && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
   5508       UInt   szLg2  = (INSN(23,23) << 2) | INSN(31,30);
   5509       Bool   isLD   = INSN(22,22) == 1;
   5510       UInt   pimm12 = INSN(21,10) << szLg2;
   5511       UInt   nn     = INSN(9,5);
   5512       UInt   tt     = INSN(4,0);
   5513       IRTemp tEA    = newTemp(Ity_I64);
   5514       IRType ty     = preferredVectorSubTypeFromSize(1 << szLg2);
   5515       assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12)));
   5516       if (isLD) {
   5517          if (szLg2 < 4) {
   5518             putQReg128(tt, mkV128(0x0000));
   5519          }
   5520          putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
   5521       } else {
   5522          storeLE(mkexpr(tEA), getQRegLO(tt, ty));
   5523       }
   5524       DIP("%s %s, [%s, #%u]\n",
   5525           isLD ? "ldr" : "str",
   5526           nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12);
   5527       return True;
   5528    }
   5529 
   5530    /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
   5531    /* These are the Post-Index and Pre-Index variants.
   5532 
   5533       31 29      23   20   11 9 4
   5534       (at-Rn-then-Rn=EA)
   5535       00 111 100 01 0 imm9 01 n t   LDR Bt, [Xn|SP], #simm
   5536       01 111 100 01 0 imm9 01 n t   LDR Ht, [Xn|SP], #simm
   5537       10 111 100 01 0 imm9 01 n t   LDR St, [Xn|SP], #simm
   5538       11 111 100 01 0 imm9 01 n t   LDR Dt, [Xn|SP], #simm
   5539       00 111 100 11 0 imm9 01 n t   LDR Qt, [Xn|SP], #simm
   5540 
   5541       (at-EA-then-Rn=EA)
   5542       00 111 100 01 0 imm9 11 n t   LDR Bt, [Xn|SP, #simm]!
   5543       01 111 100 01 0 imm9 11 n t   LDR Ht, [Xn|SP, #simm]!
   5544       10 111 100 01 0 imm9 11 n t   LDR St, [Xn|SP, #simm]!
   5545       11 111 100 01 0 imm9 11 n t   LDR Dt, [Xn|SP, #simm]!
   5546       00 111 100 11 0 imm9 11 n t   LDR Qt, [Xn|SP, #simm]!
   5547 
   5548       Stores are the same except with bit 22 set to 0.
   5549    */
   5550    if (INSN(29,24) == BITS6(1,1,1,1,0,0)
   5551        && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
   5552        && INSN(21,21) == 0 && INSN(10,10) == 1) {
   5553       UInt   szLg2  = (INSN(23,23) << 2) | INSN(31,30);
   5554       Bool   isLD   = INSN(22,22) == 1;
   5555       UInt   imm9   = INSN(20,12);
   5556       Bool   atRN   = INSN(11,11) == 0;
   5557       UInt   nn     = INSN(9,5);
   5558       UInt   tt     = INSN(4,0);
   5559       IRTemp tRN    = newTemp(Ity_I64);
   5560       IRTemp tEA    = newTemp(Ity_I64);
   5561       IRTemp tTA    = IRTemp_INVALID;
   5562       IRType ty     = preferredVectorSubTypeFromSize(1 << szLg2);
   5563       ULong  simm9  = sx_to_64(imm9, 9);
   5564       assign(tRN, getIReg64orSP(nn));
   5565       assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
   5566       tTA = atRN ? tRN : tEA;
   5567       if (isLD) {
   5568          if (szLg2 < 4) {
   5569             putQReg128(tt, mkV128(0x0000));
   5570          }
   5571          putQRegLO(tt, loadLE(ty, mkexpr(tTA)));
   5572       } else {
   5573          storeLE(mkexpr(tTA), getQRegLO(tt, ty));
   5574       }
   5575       putIReg64orSP(nn, mkexpr(tEA));
   5576       DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
   5577           isLD ? "ldr" : "str",
   5578           nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
   5579       return True;
   5580    }
   5581 
   5582    /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
   5583    /* 31 29      23   20   11 9 4
   5584       00 111 100 01 0 imm9 00 n t   LDUR Bt, [Xn|SP, #simm]
   5585       01 111 100 01 0 imm9 00 n t   LDUR Ht, [Xn|SP, #simm]
   5586       10 111 100 01 0 imm9 00 n t   LDUR St, [Xn|SP, #simm]
   5587       11 111 100 01 0 imm9 00 n t   LDUR Dt, [Xn|SP, #simm]
   5588       00 111 100 11 0 imm9 00 n t   LDUR Qt, [Xn|SP, #simm]
   5589 
   5590       00 111 100 00 0 imm9 00 n t   STUR Bt, [Xn|SP, #simm]
   5591       01 111 100 00 0 imm9 00 n t   STUR Ht, [Xn|SP, #simm]
   5592       10 111 100 00 0 imm9 00 n t   STUR St, [Xn|SP, #simm]
   5593       11 111 100 00 0 imm9 00 n t   STUR Dt, [Xn|SP, #simm]
   5594       00 111 100 10 0 imm9 00 n t   STUR Qt, [Xn|SP, #simm]
   5595    */
   5596    if (INSN(29,24) == BITS6(1,1,1,1,0,0)
   5597        && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
   5598        && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
   5599       UInt   szLg2  = (INSN(23,23) << 2) | INSN(31,30);
   5600       Bool   isLD   = INSN(22,22) == 1;
   5601       UInt   imm9   = INSN(20,12);
   5602       UInt   nn     = INSN(9,5);
   5603       UInt   tt     = INSN(4,0);
   5604       ULong  simm9  = sx_to_64(imm9, 9);
   5605       IRTemp tEA    = newTemp(Ity_I64);
   5606       IRType ty     = preferredVectorSubTypeFromSize(1 << szLg2);
   5607       assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
   5608       if (isLD) {
   5609          if (szLg2 < 4) {
   5610             putQReg128(tt, mkV128(0x0000));
   5611          }
   5612          putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
   5613       } else {
   5614          storeLE(mkexpr(tEA), getQRegLO(tt, ty));
   5615       }
   5616       DIP("%s %s, [%s, #%lld]\n",
   5617           isLD ? "ldur" : "stur",
   5618           nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
   5619       return True;
   5620    }
   5621 
   5622    /* ---------------- LDR (literal, SIMD&FP) ---------------- */
   5623    /* 31 29      23    4
   5624       00 011 100 imm19 t    LDR St, [PC + sxTo64(imm19 << 2)]
   5625       01 011 100 imm19 t    LDR Dt, [PC + sxTo64(imm19 << 2)]
   5626       10 011 100 imm19 t    LDR Qt, [PC + sxTo64(imm19 << 2)]
   5627    */
   5628    if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
   5629       UInt   szB   = 4 << INSN(31,30);
   5630       UInt   imm19 = INSN(23,5);
   5631       UInt   tt    = INSN(4,0);
   5632       ULong  ea    = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
   5633       IRType ty    = preferredVectorSubTypeFromSize(szB);
   5634       putQReg128(tt, mkV128(0x0000));
   5635       putQRegLO(tt, loadLE(ty, mkU64(ea)));
   5636       DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
   5637       return True;
   5638    }
   5639 
   5640    /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg)  ------ */
   5641    /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs) ------ */
   5642    /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs) ------ */
   5643    /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs) ------ */
   5644    /* 31 29  26   22 21 20    15   11 9 4
   5645 
   5646       0q 001 1000 L  0  00000 0000 sz n t  xx4 {Vt..t+3.T}, [Xn|SP]
   5647       0q 001 1001 L  0  m     0000 sz n t  xx4 {Vt..t+3.T}, [Xn|SP], step
   5648 
   5649       0q 001 1000 L  0  00000 0100 sz n t  xx3 {Vt..t+2.T}, [Xn|SP]
   5650       0q 001 1001 L  0  m     0100 sz n t  xx3 {Vt..t+2.T}, [Xn|SP], step
   5651 
   5652       0q 001 1000 L  0  00000 1000 sz n t  xx2 {Vt..t+1.T}, [Xn|SP]
   5653       0q 001 1001 L  0  m     1000 sz n t  xx2 {Vt..t+1.T}, [Xn|SP], step
   5654 
   5655       0q 001 1000 L  0  00000 0111 sz n t  xx1 {Vt.T},      [Xn|SP]
   5656       0q 001 1001 L  0  m     0111 sz n t  xx1 {Vt.T},      [Xn|SP], step
   5657 
   5658       T    = defined by Q and sz in the normal way
   5659       step = if m == 11111 then transfer-size else Xm
   5660       xx   = case L of 1 -> LD ; 0 -> ST
   5661    */
   5662    if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
   5663        && INSN(21,21) == 0) {
   5664       Bool bitQ  = INSN(30,30);
   5665       Bool isPX  = INSN(23,23) == 1;
   5666       Bool isLD  = INSN(22,22) == 1;
   5667       UInt mm    = INSN(20,16);
   5668       UInt opc   = INSN(15,12);
   5669       UInt sz    = INSN(11,10);
   5670       UInt nn    = INSN(9,5);
   5671       UInt tt    = INSN(4,0);
   5672       Bool isQ   = bitQ == 1;
   5673       Bool is1d  = sz == BITS2(1,1) && !isQ;
   5674       UInt nRegs = 0;
   5675       switch (opc) {
   5676          case BITS4(0,0,0,0): nRegs = 4; break;
   5677          case BITS4(0,1,0,0): nRegs = 3; break;
   5678          case BITS4(1,0,0,0): nRegs = 2; break;
   5679          case BITS4(0,1,1,1): nRegs = 1; break;
   5680          default: break;
   5681       }
   5682 
   5683       /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
   5684          If we see it, set nRegs to 0 so as to cause the next conditional
   5685          to fail. */
   5686       if (!isPX && mm != 0)
   5687          nRegs = 0;
   5688 
   5689       if (nRegs == 1                             /* .1d is allowed */
   5690           || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
   5691 
   5692          UInt xferSzB = (isQ ? 16 : 8) * nRegs;
   5693 
   5694          /* Generate the transfer address (TA) and if necessary the
   5695             writeback address (WB) */
   5696          IRTemp tTA = newTemp(Ity_I64);
   5697          assign(tTA, getIReg64orSP(nn));
   5698          if (nn == 31) { /* FIXME generate stack alignment check */ }
   5699          IRTemp tWB = IRTemp_INVALID;
   5700          if (isPX) {
   5701             tWB = newTemp(Ity_I64);
   5702             assign(tWB, binop(Iop_Add64,
   5703                               mkexpr(tTA),
   5704                               mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
   5705                                                      : getIReg64orZR(mm)));
   5706          }
   5707 
   5708          /* -- BEGIN generate the transfers -- */
   5709 
   5710          IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
   5711          u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
   5712          switch (nRegs) {
   5713             case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
   5714             case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
   5715             case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
   5716             case 1: u0 = newTempV128(); i0 = newTempV128(); break;
   5717             default: vassert(0);
   5718          }
   5719 
   5720          /* -- Multiple 128 or 64 bit stores -- */
   5721          if (!isLD) {
   5722             switch (nRegs) {
   5723                case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
   5724                case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
   5725                case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
   5726                case 1: assign(u0, getQReg128((tt+0) % 32)); break;
   5727                default: vassert(0);
   5728             }
   5729             switch (nRegs) {
   5730                case 4:  (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
   5731                            (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
   5732                         break;
   5733                case 3:  (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
   5734                            (&i0, &i1, &i2, sz, u0, u1, u2);
   5735                         break;
   5736                case 2:  (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
   5737                            (&i0, &i1, sz, u0, u1);
   5738                         break;
   5739                case 1:  (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
   5740                            (&i0, sz, u0);
   5741                         break;
   5742                default: vassert(0);
   5743             }
   5744 #           define MAYBE_NARROW_TO_64(_expr) \
   5745                       (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
   5746             UInt step = isQ ? 16 : 8;
   5747             switch (nRegs) {
   5748                case 4:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
   5749                                  MAYBE_NARROW_TO_64(mkexpr(i3)) );
   5750                         /* fallthru */
   5751                case 3:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
   5752                                  MAYBE_NARROW_TO_64(mkexpr(i2)) );
   5753                         /* fallthru */
   5754                case 2:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
   5755                                  MAYBE_NARROW_TO_64(mkexpr(i1)) );
   5756                         /* fallthru */
   5757                case 1:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
   5758                                  MAYBE_NARROW_TO_64(mkexpr(i0)) );
   5759                         break;
   5760                default: vassert(0);
   5761             }
   5762 #           undef MAYBE_NARROW_TO_64
   5763          }
   5764 
   5765          /* -- Multiple 128 or 64 bit loads -- */
   5766          else /* isLD */ {
   5767             UInt   step   = isQ ? 16 : 8;
   5768             IRType loadTy = isQ ? Ity_V128 : Ity_I64;
   5769 #           define MAYBE_WIDEN_FROM_64(_expr) \
   5770                       (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
   5771             switch (nRegs) {
   5772                case 4:
   5773                   assign(i3, MAYBE_WIDEN_FROM_64(
   5774                                 loadLE(loadTy,
   5775                                        binop(Iop_Add64, mkexpr(tTA),
   5776                                                         mkU64(3 * step)))));
   5777                   /* fallthru */
   5778                case 3:
   5779                   assign(i2, MAYBE_WIDEN_FROM_64(
   5780                                 loadLE(loadTy,
   5781                                        binop(Iop_Add64, mkexpr(tTA),
   5782                                                         mkU64(2 * step)))));
   5783                   /* fallthru */
   5784                case 2:
   5785                   assign(i1, MAYBE_WIDEN_FROM_64(
   5786                                 loadLE(loadTy,
   5787                                        binop(Iop_Add64, mkexpr(tTA),
   5788                                                         mkU64(1 * step)))));
   5789                   /* fallthru */
   5790                case 1:
   5791                   assign(i0, MAYBE_WIDEN_FROM_64(
   5792                                 loadLE(loadTy,
   5793                                        binop(Iop_Add64, mkexpr(tTA),
   5794                                                         mkU64(0 * step)))));
   5795                   break;
   5796                default:
   5797                   vassert(0);
   5798             }
   5799 #           undef MAYBE_WIDEN_FROM_64
   5800             switch (nRegs) {
   5801                case 4:  (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
   5802                            (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
   5803                         break;
   5804                case 3:  (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
   5805                            (&u0, &u1, &u2, sz, i0, i1, i2);
   5806                         break;
   5807                case 2:  (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
   5808                            (&u0, &u1, sz, i0, i1);
   5809                         break;
   5810                case 1:  (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
   5811                            (&u0, sz, i0);
   5812                         break;
   5813                default: vassert(0);
   5814             }
   5815             switch (nRegs) {
   5816                case 4:  putQReg128( (tt+3) % 32,
   5817                                     math_MAYBE_ZERO_HI64(bitQ, u3));
   5818                         /* fallthru */
   5819                case 3:  putQReg128( (tt+2) % 32,
   5820                                     math_MAYBE_ZERO_HI64(bitQ, u2));
   5821                         /* fallthru */
   5822                case 2:  putQReg128( (tt+1) % 32,
   5823                                     math_MAYBE_ZERO_HI64(bitQ, u1));
   5824                         /* fallthru */
   5825                case 1:  putQReg128( (tt+0) % 32,
   5826                                     math_MAYBE_ZERO_HI64(bitQ, u0));
   5827                         break;
   5828                default: vassert(0);
   5829             }
   5830          }
   5831 
   5832          /* -- END generate the transfers -- */
   5833 
   5834          /* Do the writeback, if necessary */
   5835          if (isPX) {
   5836             putIReg64orSP(nn, mkexpr(tWB));
   5837          }
   5838 
   5839          HChar pxStr[20];
   5840          pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
   5841          if (isPX) {
   5842             if (mm == BITS5(1,1,1,1,1))
   5843                vex_sprintf(pxStr, ", #%u", xferSzB);
   5844             else
   5845                vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
   5846          }
   5847          const HChar* arr = nameArr_Q_SZ(bitQ, sz);
   5848          DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
   5849              isLD ? "ld" : "st", nRegs,
   5850              (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
   5851              pxStr);
   5852 
   5853          return True;
   5854       }
   5855       /* else fall through */
   5856    }
   5857 
   5858    /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs)  ------ */
   5859    /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs)  ------ */
   5860    /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs)  ------ */
   5861    /* 31 29  26   22 21 20    15   11 9 4
   5862 
   5863       0q 001 1000 L  0  00000 0010 sz n t  xx1 {Vt..t+3.T}, [Xn|SP]
   5864       0q 001 1001 L  0  m     0010 sz n t  xx1 {Vt..t+3.T}, [Xn|SP], step
   5865 
   5866       0q 001 1000 L  0  00000 0110 sz n t  xx1 {Vt..t+2.T}, [Xn|SP]
   5867       0q 001 1001 L  0  m     0110 sz n t  xx1 {Vt..t+2.T}, [Xn|SP], step
   5868 
   5869       0q 001 1000 L  0  00000 1010 sz n t  xx1 {Vt..t+1.T}, [Xn|SP]
   5870       0q 001 1001 L  0  m     1010 sz n t  xx1 {Vt..t+1.T}, [Xn|SP], step
   5871 
   5872       T    = defined by Q and sz in the normal way
   5873       step = if m == 11111 then transfer-size else Xm
   5874       xx   = case L of 1 -> LD ; 0 -> ST
   5875    */
   5876    if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
   5877        && INSN(21,21) == 0) {
   5878       Bool bitQ  = INSN(30,30);
   5879       Bool isPX  = INSN(23,23) == 1;
   5880       Bool isLD  = INSN(22,22) == 1;
   5881       UInt mm    = INSN(20,16);
   5882       UInt opc   = INSN(15,12);
   5883       UInt sz    = INSN(11,10);
   5884       UInt nn    = INSN(9,5);
   5885       UInt tt    = INSN(4,0);
   5886       Bool isQ   = bitQ == 1;
   5887       UInt nRegs = 0;
   5888       switch (opc) {
   5889          case BITS4(0,0,1,0): nRegs = 4; break;
   5890          case BITS4(0,1,1,0): nRegs = 3; break;
   5891          case BITS4(1,0,1,0): nRegs = 2; break;
   5892          default: break;
   5893       }
   5894 
   5895       /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
   5896          If we see it, set nRegs to 0 so as to cause the next conditional
   5897          to fail. */
   5898       if (!isPX && mm != 0)
   5899          nRegs = 0;
   5900 
   5901       if (nRegs >= 2 && nRegs <= 4) {
   5902 
   5903          UInt xferSzB = (isQ ? 16 : 8) * nRegs;
   5904 
   5905          /* Generate the transfer address (TA) and if necessary the
   5906             writeback address (WB) */
   5907          IRTemp tTA = newTemp(Ity_I64);
   5908          assign(tTA, getIReg64orSP(nn));
   5909          if (nn == 31) { /* FIXME generate stack alignment check */ }
   5910          IRTemp tWB = IRTemp_INVALID;
   5911          if (isPX) {
   5912             tWB = newTemp(Ity_I64);
   5913             assign(tWB, binop(Iop_Add64,
   5914                               mkexpr(tTA),
   5915                               mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
   5916                                                      : getIReg64orZR(mm)));
   5917          }
   5918 
   5919          /* -- BEGIN generate the transfers -- */
   5920 
   5921          IRTemp u0, u1, u2, u3;
   5922          u0 = u1 = u2 = u3 = IRTemp_INVALID;
   5923          switch (nRegs) {
   5924             case 4: u3 = newTempV128(); /* fallthru */
   5925             case 3: u2 = newTempV128(); /* fallthru */
   5926             case 2: u1 = newTempV128();
   5927                     u0 = newTempV128(); break;
   5928             default: vassert(0);
   5929          }
   5930 
   5931          /* -- Multiple 128 or 64 bit stores -- */
   5932          if (!isLD) {
   5933             switch (nRegs) {
   5934                case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
   5935                case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
   5936                case 2: assign(u1, getQReg128((tt+1) % 32));
   5937                        assign(u0, getQReg128((tt+0) % 32)); break;
   5938                default: vassert(0);
   5939             }
   5940 #           define MAYBE_NARROW_TO_64(_expr) \
   5941                       (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
   5942             UInt step = isQ ? 16 : 8;
   5943             switch (nRegs) {
   5944                case 4:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
   5945                                  MAYBE_NARROW_TO_64(mkexpr(u3)) );
   5946                         /* fallthru */
   5947                case 3:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
   5948                                  MAYBE_NARROW_TO_64(mkexpr(u2)) );
   5949                         /* fallthru */
   5950                case 2:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
   5951                                  MAYBE_NARROW_TO_64(mkexpr(u1)) );
   5952                         storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
   5953                                  MAYBE_NARROW_TO_64(mkexpr(u0)) );
   5954                         break;
   5955                default: vassert(0);
   5956             }
   5957 #           undef MAYBE_NARROW_TO_64
   5958          }
   5959 
   5960          /* -- Multiple 128 or 64 bit loads -- */
   5961          else /* isLD */ {
   5962             UInt   step   = isQ ? 16 : 8;
   5963             IRType loadTy = isQ ? Ity_V128 : Ity_I64;
   5964 #           define MAYBE_WIDEN_FROM_64(_expr) \
   5965                       (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
   5966             switch (nRegs) {
   5967                case 4:
   5968                   assign(u3, MAYBE_WIDEN_FROM_64(
   5969                                 loadLE(loadTy,
   5970                                        binop(Iop_Add64, mkexpr(tTA),
   5971                                                         mkU64(3 * step)))));
   5972                   /* fallthru */
   5973                case 3:
   5974                   assign(u2, MAYBE_WIDEN_FROM_64(
   5975                                 loadLE(loadTy,
   5976                                        binop(Iop_Add64, mkexpr(tTA),
   5977                                                         mkU64(2 * step)))));
   5978                   /* fallthru */
   5979                case 2:
   5980                   assign(u1, MAYBE_WIDEN_FROM_64(
   5981                                 loadLE(loadTy,
   5982                                        binop(Iop_Add64, mkexpr(tTA),
   5983                                                         mkU64(1 * step)))));
   5984                   assign(u0, MAYBE_WIDEN_FROM_64(
   5985                                 loadLE(loadTy,
   5986                                        binop(Iop_Add64, mkexpr(tTA),
   5987                                                         mkU64(0 * step)))));
   5988                   break;
   5989                default:
   5990                   vassert(0);
   5991             }
   5992 #           undef MAYBE_WIDEN_FROM_64
   5993             switch (nRegs) {
   5994                case 4:  putQReg128( (tt+3) % 32,
   5995                                     math_MAYBE_ZERO_HI64(bitQ, u3));
   5996                         /* fallthru */
   5997                case 3:  putQReg128( (tt+2) % 32,
   5998                                     math_MAYBE_ZERO_HI64(bitQ, u2));
   5999                         /* fallthru */
   6000                case 2:  putQReg128( (tt+1) % 32,
   6001                                     math_MAYBE_ZERO_HI64(bitQ, u1));
   6002                         putQReg128( (tt+0) % 32,
   6003                                     math_MAYBE_ZERO_HI64(bitQ, u0));
   6004                         break;
   6005                default: vassert(0);
   6006             }
   6007          }
   6008 
   6009          /* -- END generate the transfers -- */
   6010 
   6011          /* Do the writeback, if necessary */
   6012          if (isPX) {
   6013             putIReg64orSP(nn, mkexpr(tWB));
   6014          }
   6015 
   6016          HChar pxStr[20];
   6017          pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
   6018          if (isPX) {
   6019             if (mm == BITS5(1,1,1,1,1))
   6020                vex_sprintf(pxStr, ", #%u", xferSzB);
   6021             else
   6022                vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
   6023          }
   6024          const HChar* arr = nameArr_Q_SZ(bitQ, sz);
   6025          DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
   6026              isLD ? "ld" : "st",
   6027              (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
   6028              pxStr);
   6029 
   6030          return True;
   6031       }
   6032       /* else fall through */
   6033    }
   6034 
   6035    /* ---------- LD1R (single structure, replicate) ---------- */
   6036    /* ---------- LD2R (single structure, replicate) ---------- */
   6037    /* ---------- LD3R (single structure, replicate) ---------- */
   6038    /* ---------- LD4R (single structure, replicate) ---------- */
   6039    /* 31 29       22 20    15    11 9 4
   6040       0q 001 1010 10 00000 110 0 sz n t  LD1R {Vt.T}, [Xn|SP]
   6041       0q 001 1011 10 m     110 0 sz n t  LD1R {Vt.T}, [Xn|SP], step
   6042 
   6043       0q 001 1010 11 00000 110 0 sz n t  LD2R {Vt..t+1.T}, [Xn|SP]
   6044       0q 001 1011 11 m     110 0 sz n t  LD2R {Vt..t+1.T}, [Xn|SP], step
   6045 
   6046       0q 001 1010 10 00000 111 0 sz n t  LD3R {Vt..t+2.T}, [Xn|SP]
   6047       0q 001 1011 10 m     111 0 sz n t  LD3R {Vt..t+2.T}, [Xn|SP], step
   6048 
   6049       0q 001 1010 11 00000 111 0 sz n t  LD4R {Vt..t+3.T}, [Xn|SP]
   6050       0q 001 1011 11 m     111 0 sz n t  LD4R {Vt..t+3.T}, [Xn|SP], step
   6051 
   6052       step = if m == 11111 then transfer-size else Xm
   6053    */
   6054    if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
   6055        && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
   6056        && INSN(12,12) == 0) {
   6057       UInt   bitQ  = INSN(30,30);
   6058       Bool   isPX  = INSN(23,23) == 1;
   6059       UInt   nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
   6060       UInt   mm    = INSN(20,16);
   6061       UInt   sz    = INSN(11,10);
   6062       UInt   nn    = INSN(9,5);
   6063       UInt   tt    = INSN(4,0);
   6064 
   6065       /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
   6066       if (isPX || mm == 0) {
   6067 
   6068          IRType ty    = integerIRTypeOfSize(1 << sz);
   6069 
   6070          UInt laneSzB = 1 << sz;
   6071          UInt xferSzB = laneSzB * nRegs;
   6072 
   6073          /* Generate the transfer address (TA) and if necessary the
   6074             writeback address (WB) */
   6075          IRTemp tTA = newTemp(Ity_I64);
   6076          assign(tTA, getIReg64orSP(nn));
   6077          if (nn == 31) { /* FIXME generate stack alignment check */ }
   6078          IRTemp tWB = IRTemp_INVALID;
   6079          if (isPX) {
   6080             tWB = newTemp(Ity_I64);
   6081             assign(tWB, binop(Iop_Add64,
   6082                               mkexpr(tTA),
   6083                               mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
   6084                                                      : getIReg64orZR(mm)));
   6085          }
   6086 
   6087          /* Do the writeback, if necessary */
   6088          if (isPX) {
   6089             putIReg64orSP(nn, mkexpr(tWB));
   6090          }
   6091 
   6092          IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
   6093          e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
   6094          switch (nRegs) {
   6095             case 4:
   6096                e3 = newTemp(ty);
   6097                assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
   6098                                                       mkU64(3 * laneSzB))));
   6099                v3 = math_DUP_TO_V128(e3, ty);
   6100                putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
   6101                /* fallthrough */
   6102             case 3:
   6103                e2 = newTemp(ty);
   6104                assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
   6105                                                       mkU64(2 * laneSzB))));
   6106                v2 = math_DUP_TO_V128(e2, ty);
   6107                putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
   6108                /* fallthrough */
   6109             case 2:
   6110                e1 = newTemp(ty);
   6111                assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
   6112                                                       mkU64(1 * laneSzB))));
   6113                v1 = math_DUP_TO_V128(e1, ty);
   6114                putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
   6115                /* fallthrough */
   6116             case 1:
   6117                e0 = newTemp(ty);
   6118                assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
   6119                                                       mkU64(0 * laneSzB))));
   6120                v0 = math_DUP_TO_V128(e0, ty);
   6121                putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
   6122                break;
   6123             default:
   6124                vassert(0);
   6125          }
   6126 
   6127          HChar pxStr[20];
   6128          pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
   6129          if (isPX) {
   6130             if (mm == BITS5(1,1,1,1,1))
   6131                vex_sprintf(pxStr, ", #%u", xferSzB);
   6132             else
   6133                vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
   6134          }
   6135          const HChar* arr = nameArr_Q_SZ(bitQ, sz);
   6136          DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
   6137              nRegs,
   6138              (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
   6139              pxStr);
   6140 
   6141          return True;
   6142       }
   6143       /* else fall through */
   6144    }
   6145 
   6146    /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
   6147    /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
   6148    /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
   6149    /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
   6150    /* 31 29       22 21 20    15    11 9 4
   6151       0q 001 1010 L  0  00000 xx0 S sz n t  op1 {Vt.T}[ix], [Xn|SP]
   6152       0q 001 1011 L  0  m     xx0 S sz n t  op1 {Vt.T}[ix], [Xn|SP], step
   6153 
   6154       0q 001 1010 L  1  00000 xx0 S sz n t  op2 {Vt..t+1.T}[ix], [Xn|SP]
   6155       0q 001 1011 L  1  m     xx0 S sz n t  op2 {Vt..t+1.T}[ix], [Xn|SP], step
   6156 
   6157       0q 001 1010 L  0  00000 xx1 S sz n t  op3 {Vt..t+2.T}[ix], [Xn|SP]
   6158       0q 001 1011 L  0  m     xx1 S sz n t  op3 {Vt..t+2.T}[ix], [Xn|SP], step
   6159 
   6160       0q 001 1010 L  1  00000 xx1 S sz n t  op4 {Vt..t+3.T}[ix], [Xn|SP]
   6161       0q 001 1011 L  1  m     xx1 S sz n t  op4 {Vt..t+3.T}[ix], [Xn|SP], step
   6162 
   6163       step = if m == 11111 then transfer-size else Xm
   6164       op   = case L of 1 -> LD ; 0 -> ST
   6165 
   6166       laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
   6167                                      01:b:b:b0 -> 2, bbb
   6168                                      10:b:b:00 -> 4, bb
   6169                                      10:b:0:01 -> 8, b
   6170    */
   6171    if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
   6172       UInt   bitQ  = INSN(30,30);
   6173       Bool   isPX  = INSN(23,23) == 1;
   6174       Bool   isLD  = INSN(22,22) == 1;
   6175       UInt   nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
   6176       UInt   mm    = INSN(20,16);
   6177       UInt   xx    = INSN(15,14);
   6178       UInt   bitS  = INSN(12,12);
   6179       UInt   sz    = INSN(11,10);
   6180       UInt   nn    = INSN(9,5);
   6181       UInt   tt    = INSN(4,0);
   6182 
   6183       Bool valid = True;
   6184 
   6185       /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
   6186       if (!isPX && mm != 0)
   6187          valid = False;
   6188 
   6189       UInt laneSzB = 0;  /* invalid */
   6190       UInt ix      = 16; /* invalid */
   6191 
   6192       UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
   6193       switch (xx_q_S_sz) {
   6194          case 0x00: case 0x01: case 0x02: case 0x03:
   6195          case 0x04: case 0x05: case 0x06: case 0x07:
   6196          case 0x08: case 0x09: case 0x0A: case 0x0B:
   6197          case 0x0C: case 0x0D: case 0x0E: case 0x0F:
   6198             laneSzB = 1; ix = xx_q_S_sz & 0xF;
   6199             break;
   6200          case 0x10: case 0x12: case 0x14: case 0x16:
   6201          case 0x18: case 0x1A: case 0x1C: case 0x1E:
   6202             laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
   6203             break;
   6204          case 0x20: case 0x24: case 0x28: case 0x2C:
   6205             laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
   6206             break;
   6207          case 0x21: case 0x29:
   6208             laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
   6209             break;
   6210          default:
   6211             break;
   6212       }
   6213 
   6214       if (valid && laneSzB != 0) {
   6215 
   6216          IRType ty      = integerIRTypeOfSize(laneSzB);
   6217          UInt   xferSzB = laneSzB * nRegs;
   6218 
   6219          /* Generate the transfer address (TA) and if necessary the
   6220             writeback address (WB) */
   6221          IRTemp tTA = newTemp(Ity_I64);
   6222          assign(tTA, getIReg64orSP(nn));
   6223          if (nn == 31) { /* FIXME generate stack alignment check */ }
   6224          IRTemp tWB = IRTemp_INVALID;
   6225          if (isPX) {
   6226             tWB = newTemp(Ity_I64);
   6227             assign(tWB, binop(Iop_Add64,
   6228                               mkexpr(tTA),
   6229                               mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
   6230                                                      : getIReg64orZR(mm)));
   6231          }
   6232 
   6233          /* Do the writeback, if necessary */
   6234          if (isPX) {
   6235             putIReg64orSP(nn, mkexpr(tWB));
   6236          }
   6237 
   6238          switch (nRegs) {
   6239             case 4: {
   6240                IRExpr* addr
   6241                   = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
   6242                if (isLD) {
   6243                   putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
   6244                } else {
   6245                   storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
   6246                }
   6247                /* fallthrough */
   6248             }
   6249             case 3: {
   6250                IRExpr* addr
   6251                   = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
   6252                if (isLD) {
   6253                   putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
   6254                } else {
   6255                   storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
   6256                }
   6257                /* fallthrough */
   6258             }
   6259             case 2: {
   6260                IRExpr* addr
   6261                   = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
   6262                if (isLD) {
   6263                   putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
   6264                } else {
   6265                   storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
   6266                }
   6267                /* fallthrough */
   6268             }
   6269             case 1: {
   6270                IRExpr* addr
   6271                   = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
   6272                if (isLD) {
   6273                   putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
   6274                } else {
   6275                   storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
   6276                }
   6277                break;
   6278             }
   6279             default:
   6280                vassert(0);
   6281          }
   6282 
   6283          HChar pxStr[20];
   6284          pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
   6285          if (isPX) {
   6286             if (mm == BITS5(1,1,1,1,1))
   6287                vex_sprintf(pxStr, ", #%u", xferSzB);
   6288             else
   6289                vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
   6290          }
   6291          const HChar* arr = nameArr_Q_SZ(bitQ, sz);
   6292          DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
   6293              isLD ? "ld" : "st", nRegs,
   6294              (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
   6295              ix, nameIReg64orSP(nn), pxStr);
   6296 
   6297          return True;
   6298       }
   6299       /* else fall through */
   6300    }
   6301 
   6302    /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
   6303    /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
   6304    /* 31 29     23  20      14    9 4
   6305       sz 001000 010 11111 0 11111 n t   LDX{R,RH,RB}  Rt, [Xn|SP]
   6306       sz 001000 010 11111 1 11111 n t   LDAX{R,RH,RB} Rt, [Xn|SP]
   6307       sz 001000 000 s     0 11111 n t   STX{R,RH,RB}  Ws, Rt, [Xn|SP]
   6308       sz 001000 000 s     1 11111 n t   STLX{R,RH,RB} Ws, Rt, [Xn|SP]
   6309    */
   6310    if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
   6311        && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
   6312        && INSN(14,10) == BITS5(1,1,1,1,1)) {
   6313       UInt szBlg2     = INSN(31,30);
   6314       Bool isLD       = INSN(22,22) == 1;
   6315       Bool isAcqOrRel = INSN(15,15) == 1;
   6316       UInt ss         = INSN(20,16);
   6317       UInt nn         = INSN(9,5);
   6318       UInt tt         = INSN(4,0);
   6319 
   6320       vassert(szBlg2 < 4);
   6321       UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
   6322       IRType ty  = integerIRTypeOfSize(szB);
   6323       const HChar* suffix[4] = { "rb", "rh", "r", "r" };
   6324 
   6325       IRTemp ea = newTemp(Ity_I64);
   6326       assign(ea, getIReg64orSP(nn));
   6327       /* FIXME generate check that ea is szB-aligned */
   6328 
   6329       if (isLD && ss == BITS5(1,1,1,1,1)) {
   6330          IRTemp res = newTemp(ty);
   6331          stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
   6332          putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
   6333          if (isAcqOrRel) {
   6334             stmt(IRStmt_MBE(Imbe_Fence));
   6335          }
   6336          DIP("ld%sx%s %s, [%s]\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
   6337              nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
   6338          return True;
   6339       }
   6340       if (!isLD) {
   6341          if (isAcqOrRel) {
   6342             stmt(IRStmt_MBE(Imbe_Fence));
   6343          }
   6344          IRTemp  res  = newTemp(Ity_I1);
   6345          IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
   6346          stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
   6347          /* IR semantics: res is 1 if store succeeds, 0 if it fails.
   6348             Need to set rS to 1 on failure, 0 on success. */
   6349          putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
   6350                                             mkU64(1)));
   6351          DIP("st%sx%s %s, %s, [%s]\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
   6352              nameIRegOrZR(False, ss),
   6353              nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
   6354          return True;
   6355       }
   6356       /* else fall through */
   6357    }
   6358 
   6359    /* ------------------ LDA{R,RH,RB} ------------------ */
   6360    /* ------------------ STL{R,RH,RB} ------------------ */
   6361    /* 31 29     23  20      14    9 4
   6362       sz 001000 110 11111 1 11111 n t   LDAR<sz> Rt, [Xn|SP]
   6363       sz 001000 100 11111 1 11111 n t   STLR<sz> Rt, [Xn|SP]
   6364    */
   6365    if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
   6366        && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
   6367       UInt szBlg2 = INSN(31,30);
   6368       Bool isLD   = INSN(22,22) == 1;
   6369       UInt nn     = INSN(9,5);
   6370       UInt tt     = INSN(4,0);
   6371 
   6372       vassert(szBlg2 < 4);
   6373       UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
   6374       IRType ty  = integerIRTypeOfSize(szB);
   6375       const HChar* suffix[4] = { "rb", "rh", "r", "r" };
   6376 
   6377       IRTemp ea = newTemp(Ity_I64);
   6378       assign(ea, getIReg64orSP(nn));
   6379       /* FIXME generate check that ea is szB-aligned */
   6380 
   6381       if (isLD) {
   6382          IRTemp res = newTemp(ty);
   6383          assign(res, loadLE(ty, mkexpr(ea)));
   6384          putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
   6385          stmt(IRStmt_MBE(Imbe_Fence));
   6386          DIP("lda%s %s, [%s]\n", suffix[szBlg2],
   6387              nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
   6388       } else {
   6389          stmt(IRStmt_MBE(Imbe_Fence));
   6390          IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
   6391          storeLE(mkexpr(ea), data);
   6392          DIP("stl%s %s, [%s]\n", suffix[szBlg2],
   6393              nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
   6394       }
   6395       return True;
   6396    }
   6397 
   6398    /* ------------------ PRFM (immediate) ------------------ */
   6399    /* 31           21    9 4
   6400       11 111 00110 imm12 n t   PRFM prfop=Rt, [Xn|SP, #pimm]
   6401    */
   6402    if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
   6403       UInt imm12 = INSN(21,10);
   6404       UInt nn    = INSN(9,5);
   6405       UInt tt    = INSN(4,0);
   6406       /* Generating any IR here is pointless, except for documentation
   6407          purposes, as it will get optimised away later. */
   6408       IRTemp ea = newTemp(Ity_I64);
   6409       assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
   6410       DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
   6411       return True;
   6412    }
   6413 
   6414    /* ------------------ PRFM (register) ------------------ */
   6415    /* 31 29      22 20 15  12 11 9  4
   6416       11 1110001 01 Rm opt S  10 Rn Rt    PRFM prfop=Rt, [Xn|SP, R<m>{ext/sh}]
   6417    */
   6418    if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
   6419        && INSN(11,10) == BITS2(1,0)) {
   6420       HChar  dis_buf[64];
   6421       UInt   tt = INSN(4,0);
   6422       IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
   6423       if (ea != IRTemp_INVALID) {
   6424          /* No actual code to generate. */
   6425          DIP("prfm prfop=%u, %s\n", tt, dis_buf);
   6426          return True;
   6427       }
   6428    }
   6429 
   6430    vex_printf("ARM64 front end: load_store\n");
   6431    return False;
   6432 #  undef INSN
   6433 }
   6434 
   6435 
   6436 /*------------------------------------------------------------*/
   6437 /*--- Control flow and misc instructions                   ---*/
   6438 /*------------------------------------------------------------*/
   6439 
   6440 static
   6441 Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
   6442                           const VexArchInfo* archinfo)
   6443 {
   6444 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   6445 
   6446    /* ---------------------- B cond ----------------------- */
   6447    /* 31        24    4 3
   6448       0101010 0 imm19 0 cond */
   6449    if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
   6450       UInt  cond   = INSN(3,0);
   6451       ULong uimm64 = INSN(23,5) << 2;
   6452       Long  simm64 = (Long)sx_to_64(uimm64, 21);
   6453       vassert(dres->whatNext    == Dis_Continue);
   6454       vassert(dres->len         == 4);
   6455       vassert(dres->continueAt  == 0);
   6456       vassert(dres->jk_StopHere == Ijk_INVALID);
   6457       stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
   6458                         Ijk_Boring,
   6459                         IRConst_U64(guest_PC_curr_instr + simm64),
   6460                         OFFB_PC) );
   6461       putPC(mkU64(guest_PC_curr_instr + 4));
   6462       dres->whatNext    = Dis_StopHere;
   6463       dres->jk_StopHere = Ijk_Boring;
   6464       DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
   6465       return True;
   6466    }
   6467 
   6468    /* -------------------- B{L} uncond -------------------- */
   6469    if (INSN(30,26) == BITS5(0,0,1,0,1)) {
   6470       /* 000101 imm26  B  (PC + sxTo64(imm26 << 2))
   6471          100101 imm26  B  (PC + sxTo64(imm26 << 2))
   6472       */
   6473       UInt  bLink  = INSN(31,31);
   6474       ULong uimm64 = INSN(25,0) << 2;
   6475       Long  simm64 = (Long)sx_to_64(uimm64, 28);
   6476       if (bLink) {
   6477          putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
   6478       }
   6479       putPC(mkU64(guest_PC_curr_instr + simm64));
   6480       dres->whatNext = Dis_StopHere;
   6481       dres->jk_StopHere = Ijk_Call;
   6482       DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
   6483                           guest_PC_curr_instr + simm64);
   6484       return True;
   6485    }
   6486 
   6487    /* --------------------- B{L} reg --------------------- */
   6488    /* 31      24 22 20    15     9  4
   6489       1101011 00 10 11111 000000 nn 00000  RET  Rn
   6490       1101011 00 01 11111 000000 nn 00000  CALL Rn
   6491       1101011 00 00 11111 000000 nn 00000  JMP  Rn
   6492    */
   6493    if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
   6494        && INSN(20,16) == BITS5(1,1,1,1,1)
   6495        && INSN(15,10) == BITS6(0,0,0,0,0,0)
   6496        && INSN(4,0) == BITS5(0,0,0,0,0)) {
   6497       UInt branch_type = INSN(22,21);
   6498       UInt nn          = INSN(9,5);
   6499       if (branch_type == BITS2(1,0) /* RET */) {
   6500          putPC(getIReg64orZR(nn));
   6501          dres->whatNext = Dis_StopHere;
   6502          dres->jk_StopHere = Ijk_Ret;
   6503          DIP("ret %s\n", nameIReg64orZR(nn));
   6504          return True;
   6505       }
   6506       if (branch_type == BITS2(0,1) /* CALL */) {
   6507          IRTemp dst = newTemp(Ity_I64);
   6508          assign(dst, getIReg64orZR(nn));
   6509          putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
   6510          putPC(mkexpr(dst));
   6511          dres->whatNext = Dis_StopHere;
   6512          dres->jk_StopHere = Ijk_Call;
   6513          DIP("blr %s\n", nameIReg64orZR(nn));
   6514          return True;
   6515       }
   6516       if (branch_type == BITS2(0,0) /* JMP */) {
   6517          putPC(getIReg64orZR(nn));
   6518          dres->whatNext = Dis_StopHere;
   6519          dres->jk_StopHere = Ijk_Boring;
   6520          DIP("jmp %s\n", nameIReg64orZR(nn));
   6521          return True;
   6522       }
   6523    }
   6524 
   6525    /* -------------------- CB{N}Z -------------------- */
   6526    /* sf 011 010 1 imm19 Rt   CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
   6527       sf 011 010 0 imm19 Rt   CBZ  Xt|Wt, (PC + sxTo64(imm19 << 2))
   6528    */
   6529    if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
   6530       Bool    is64   = INSN(31,31) == 1;
   6531       Bool    bIfZ   = INSN(24,24) == 0;
   6532       ULong   uimm64 = INSN(23,5) << 2;
   6533       UInt    rT     = INSN(4,0);
   6534       Long    simm64 = (Long)sx_to_64(uimm64, 21);
   6535       IRExpr* cond   = NULL;
   6536       if (is64) {
   6537          cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
   6538                       getIReg64orZR(rT), mkU64(0));
   6539       } else {
   6540          cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
   6541                       getIReg32orZR(rT), mkU32(0));
   6542       }
   6543       stmt( IRStmt_Exit(cond,
   6544                         Ijk_Boring,
   6545                         IRConst_U64(guest_PC_curr_instr + simm64),
   6546                         OFFB_PC) );
   6547       putPC(mkU64(guest_PC_curr_instr + 4));
   6548       dres->whatNext    = Dis_StopHere;
   6549       dres->jk_StopHere = Ijk_Boring;
   6550       DIP("cb%sz %s, 0x%llx\n",
   6551           bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
   6552           guest_PC_curr_instr + simm64);
   6553       return True;
   6554    }
   6555 
   6556    /* -------------------- TB{N}Z -------------------- */
   6557    /* 31 30      24 23  18  5 4
   6558       b5 011 011 1  b40 imm14 t  TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
   6559       b5 011 011 0  b40 imm14 t  TBZ  Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
   6560    */
   6561    if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
   6562       UInt    b5     = INSN(31,31);
   6563       Bool    bIfZ   = INSN(24,24) == 0;
   6564       UInt    b40    = INSN(23,19);
   6565       UInt    imm14  = INSN(18,5);
   6566       UInt    tt     = INSN(4,0);
   6567       UInt    bitNo  = (b5 << 5) | b40;
   6568       ULong   uimm64 = imm14 << 2;
   6569       Long    simm64 = sx_to_64(uimm64, 16);
   6570       IRExpr* cond
   6571          = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
   6572                  binop(Iop_And64,
   6573                        binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
   6574                        mkU64(1)),
   6575                  mkU64(0));
   6576       stmt( IRStmt_Exit(cond,
   6577                         Ijk_Boring,
   6578                         IRConst_U64(guest_PC_curr_instr + simm64),
   6579                         OFFB_PC) );
   6580       putPC(mkU64(guest_PC_curr_instr + 4));
   6581       dres->whatNext    = Dis_StopHere;
   6582       dres->jk_StopHere = Ijk_Boring;
   6583       DIP("tb%sz %s, #%u, 0x%llx\n",
   6584           bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
   6585           guest_PC_curr_instr + simm64);
   6586       return True;
   6587    }
   6588 
   6589    /* -------------------- SVC -------------------- */
   6590    /* 11010100 000 imm16 000 01
   6591       Don't bother with anything except the imm16==0 case.
   6592    */
   6593    if (INSN(31,0) == 0xD4000001) {
   6594       putPC(mkU64(guest_PC_curr_instr + 4));
   6595       dres->whatNext    = Dis_StopHere;
   6596       dres->jk_StopHere = Ijk_Sys_syscall;
   6597       DIP("svc #0\n");
   6598       return True;
   6599    }
   6600 
   6601    /* ------------------ M{SR,RS} ------------------ */
   6602    /* ---- Cases for TPIDR_EL0 ----
   6603       0xD51BD0 010 Rt   MSR tpidr_el0, rT
   6604       0xD53BD0 010 Rt   MRS rT, tpidr_el0
   6605    */
   6606    if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
   6607        || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
   6608       Bool toSys = INSN(21,21) == 0;
   6609       UInt tt    = INSN(4,0);
   6610       if (toSys) {
   6611          stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) );
   6612          DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt));
   6613       } else {
   6614          putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 ));
   6615          DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt));
   6616       }
   6617       return True;
   6618    }
   6619    /* ---- Cases for FPCR ----
   6620       0xD51B44 000 Rt  MSR fpcr, rT
   6621       0xD53B44 000 Rt  MRS rT, fpcr
   6622    */
   6623    if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
   6624        || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
   6625       Bool toSys = INSN(21,21) == 0;
   6626       UInt tt    = INSN(4,0);
   6627       if (toSys) {
   6628          stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) );
   6629          DIP("msr fpcr, %s\n", nameIReg64orZR(tt));
   6630       } else {
   6631          putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32));
   6632          DIP("mrs %s, fpcr\n", nameIReg64orZR(tt));
   6633       }
   6634       return True;
   6635    }
   6636    /* ---- Cases for FPSR ----
   6637       0xD51B44 001 Rt  MSR fpsr, rT
   6638       0xD53B44 001 Rt  MRS rT, fpsr
   6639       The only part of this we model is FPSR.QC.  All other bits
   6640       are ignored when writing to it and RAZ when reading from it.
   6641    */
   6642    if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
   6643        || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
   6644       Bool toSys = INSN(21,21) == 0;
   6645       UInt tt    = INSN(4,0);
   6646       if (toSys) {
   6647          /* Just deal with FPSR.QC.  Make up a V128 value which is
   6648             zero if Xt[27] is zero and any other value if Xt[27] is
   6649             nonzero. */
   6650          IRTemp qc64 = newTemp(Ity_I64);
   6651          assign(qc64, binop(Iop_And64,
   6652                             binop(Iop_Shr64, getIReg64orZR(tt), mkU8(27)),
   6653                             mkU64(1)));
   6654          IRExpr* qcV128 = binop(Iop_64HLtoV128, mkexpr(qc64), mkexpr(qc64));
   6655          stmt( IRStmt_Put( OFFB_QCFLAG, qcV128 ) );
   6656          DIP("msr fpsr, %s\n", nameIReg64orZR(tt));
   6657       } else {
   6658          /* Generate a value which is all zeroes except for bit 27,
   6659             which must be zero if QCFLAG is all zeroes and one otherwise. */
   6660          IRTemp qcV128 = newTempV128();
   6661          assign(qcV128, IRExpr_Get( OFFB_QCFLAG, Ity_V128 ));
   6662          IRTemp qc64 = newTemp(Ity_I64);
   6663          assign(qc64, binop(Iop_Or64, unop(Iop_V128HIto64, mkexpr(qcV128)),
   6664                                       unop(Iop_V128to64,   mkexpr(qcV128))));
   6665          IRExpr* res = binop(Iop_Shl64,
   6666                              unop(Iop_1Uto64,
   6667                                   binop(Iop_CmpNE64, mkexpr(qc64), mkU64(0))),
   6668                              mkU8(27));
   6669          putIReg64orZR(tt, res);
   6670          DIP("mrs %s, fpsr\n", nameIReg64orZR(tt));
   6671       }
   6672       return True;
   6673    }
   6674    /* ---- Cases for NZCV ----
   6675       D51B42 000 Rt  MSR nzcv, rT
   6676       D53B42 000 Rt  MRS rT, nzcv
   6677       The only parts of NZCV that actually exist are bits 31:28, which
   6678       are the N Z C and V bits themselves.  Hence the flags thunk provides
   6679       all the state we need.
   6680    */
   6681    if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
   6682        || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
   6683       Bool  toSys = INSN(21,21) == 0;
   6684       UInt  tt    = INSN(4,0);
   6685       if (toSys) {
   6686          IRTemp t = newTemp(Ity_I64);
   6687          assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL)));
   6688          setFlags_COPY(t);
   6689          DIP("msr %s, nzcv\n", nameIReg32orZR(tt));
   6690       } else {
   6691          IRTemp res = newTemp(Ity_I64);
   6692          assign(res, mk_arm64g_calculate_flags_nzcv());
   6693          putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res)));
   6694          DIP("mrs %s, nzcv\n", nameIReg64orZR(tt));
   6695       }
   6696       return True;
   6697    }
   6698    /* ---- Cases for DCZID_EL0 ----
   6699       Don't support arbitrary reads and writes to this register.  Just
   6700       return the value 16, which indicates that the DC ZVA instruction
   6701       is not permitted, so we don't have to emulate it.
   6702       D5 3B 00 111 Rt  MRS rT, dczid_el0
   6703    */
   6704    if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
   6705       UInt tt = INSN(4,0);
   6706       putIReg64orZR(tt, mkU64(1<<4));
   6707       DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt));
   6708       return True;
   6709    }
   6710    /* ---- Cases for CTR_EL0 ----
   6711       We just handle reads, and make up a value from the D and I line
   6712       sizes in the VexArchInfo we are given, and patch in the following
   6713       fields that the Foundation model gives ("natively"):
   6714       CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
   6715       D5 3B 00 001 Rt  MRS rT, ctr_el0
   6716    */
   6717    if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
   6718       UInt tt = INSN(4,0);
   6719       /* Need to generate a value from dMinLine_lg2_szB and
   6720          iMinLine_lg2_szB.  The value in the register is in 32-bit
   6721          units, so need to subtract 2 from the values in the
   6722          VexArchInfo.  We can assume that the values here are valid --
   6723          disInstr_ARM64 checks them -- so there's no need to deal with
   6724          out-of-range cases. */
   6725       vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
   6726               && archinfo->arm64_dMinLine_lg2_szB <= 17
   6727               && archinfo->arm64_iMinLine_lg2_szB >= 2
   6728               && archinfo->arm64_iMinLine_lg2_szB <= 17);
   6729       UInt val
   6730          = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16)
   6731                       | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0);
   6732       putIReg64orZR(tt, mkU64(val));
   6733       DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt));
   6734       return True;
   6735    }
   6736    /* ---- Cases for CNTVCT_EL0 ----
   6737       This is a timestamp counter of some sort.  Support reads of it only
   6738       by passing through to the host.
   6739       D5 3B E0 010 Rt  MRS Xt, cntvct_el0
   6740    */
   6741    if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
   6742       UInt     tt   = INSN(4,0);
   6743       IRTemp   val  = newTemp(Ity_I64);
   6744       IRExpr** args = mkIRExprVec_0();
   6745       IRDirty* d    = unsafeIRDirty_1_N (
   6746                          val,
   6747                          0/*regparms*/,
   6748                          "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
   6749                          &arm64g_dirtyhelper_MRS_CNTVCT_EL0,
   6750                          args
   6751                       );
   6752       /* execute the dirty call, dumping the result in val. */
   6753       stmt( IRStmt_Dirty(d) );
   6754       putIReg64orZR(tt, mkexpr(val));
   6755       DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt));
   6756       return True;
   6757    }
   6758 
   6759    /* ------------------ IC_IVAU ------------------ */
   6760    /* D5 0B 75 001 Rt  ic ivau, rT
   6761    */
   6762    if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
   6763       /* We will always be provided with a valid iMinLine value. */
   6764       vassert(archinfo->arm64_iMinLine_lg2_szB >= 2
   6765               && archinfo->arm64_iMinLine_lg2_szB <= 17);
   6766       /* Round the requested address, in rT, down to the start of the
   6767          containing block. */
   6768       UInt   tt      = INSN(4,0);
   6769       ULong  lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB;
   6770       IRTemp addr    = newTemp(Ity_I64);
   6771       assign( addr, binop( Iop_And64,
   6772                            getIReg64orZR(tt),
   6773                            mkU64(~(lineszB - 1))) );
   6774       /* Set the invalidation range, request exit-and-invalidate, with
   6775          continuation at the next instruction. */
   6776       stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
   6777       stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(lineszB)));
   6778       /* be paranoid ... */
   6779       stmt( IRStmt_MBE(Imbe_Fence) );
   6780       putPC(mkU64( guest_PC_curr_instr + 4 ));
   6781       dres->whatNext    = Dis_StopHere;
   6782       dres->jk_StopHere = Ijk_InvalICache;
   6783       DIP("ic ivau, %s\n", nameIReg64orZR(tt));
   6784       return True;
   6785    }
   6786 
   6787    /* ------------------ DC_CVAU ------------------ */
   6788    /* D5 0B 7B 001 Rt  dc cvau, rT
   6789    */
   6790    if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20) {
   6791       /* Exactly the same scheme as for IC IVAU, except we observe the
   6792          dMinLine size, and request an Ijk_FlushDCache instead of
   6793          Ijk_InvalICache. */
   6794       /* We will always be provided with a valid dMinLine value. */
   6795       vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
   6796               && archinfo->arm64_dMinLine_lg2_szB <= 17);
   6797       /* Round the requested address, in rT, down to the start of the
   6798          containing block. */
   6799       UInt   tt      = INSN(4,0);
   6800       ULong  lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB;
   6801       IRTemp addr    = newTemp(Ity_I64);
   6802       assign( addr, binop( Iop_And64,
   6803                            getIReg64orZR(tt),
   6804                            mkU64(~(lineszB - 1))) );
   6805       /* Set the flush range, request exit-and-flush, with
   6806          continuation at the next instruction. */
   6807       stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
   6808       stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(lineszB)));
   6809       /* be paranoid ... */
   6810       stmt( IRStmt_MBE(Imbe_Fence) );
   6811       putPC(mkU64( guest_PC_curr_instr + 4 ));
   6812       dres->whatNext    = Dis_StopHere;
   6813       dres->jk_StopHere = Ijk_FlushDCache;
   6814       DIP("dc cvau, %s\n", nameIReg64orZR(tt));
   6815       return True;
   6816    }
   6817 
   6818    /* ------------------ ISB, DMB, DSB ------------------ */
   6819    /* 31          21            11  7 6  4
   6820       11010 10100 0 00 011 0011 CRm 1 01 11111  DMB opt
   6821       11010 10100 0 00 011 0011 CRm 1 00 11111  DSB opt
   6822       11010 10100 0 00 011 0011 CRm 1 10 11111  ISB opt
   6823    */
   6824    if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
   6825        && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
   6826        && INSN(7,7) == 1
   6827        && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
   6828       UInt opc = INSN(6,5);
   6829       UInt CRm = INSN(11,8);
   6830       vassert(opc <= 2 && CRm <= 15);
   6831       stmt(IRStmt_MBE(Imbe_Fence));
   6832       const HChar* opNames[3]
   6833          = { "dsb", "dmb", "isb" };
   6834       const HChar* howNames[16]
   6835          = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
   6836              "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
   6837       DIP("%s %s\n", opNames[opc], howNames[CRm]);
   6838       return True;
   6839    }
   6840 
   6841    /* -------------------- NOP -------------------- */
   6842    if (INSN(31,0) == 0xD503201F) {
   6843       DIP("nop\n");
   6844       return True;
   6845    }
   6846 
   6847    /* -------------------- BRK -------------------- */
   6848    /* 31        23  20    4
   6849       1101 0100 001 imm16 00000  BRK #imm16
   6850    */
   6851    if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
   6852        && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
   6853       UInt imm16 = INSN(20,5);
   6854       /* Request SIGTRAP and then restart of this insn. */
   6855       putPC(mkU64(guest_PC_curr_instr + 0));
   6856       dres->whatNext    = Dis_StopHere;
   6857       dres->jk_StopHere = Ijk_SigTRAP;
   6858       DIP("brk #%u\n", imm16);
   6859       return True;
   6860    }
   6861 
   6862    /* ------------------- YIELD ------------------- */
   6863    /* 31        23        15        7
   6864       1101 0101 0000 0011 0010 0000 0011 1111
   6865    */
   6866    if (INSN(31,0) == 0xD503203F) {
   6867       /* Request yield followed by continuation at the next insn. */
   6868       putPC(mkU64(guest_PC_curr_instr + 4));
   6869       dres->whatNext    = Dis_StopHere;
   6870       dres->jk_StopHere = Ijk_Yield;
   6871       DIP("yield\n");
   6872       return True;
   6873    }
   6874 
   6875   //fail:
   6876    vex_printf("ARM64 front end: branch_etc\n");
   6877    return False;
   6878 #  undef INSN
   6879 }
   6880 
   6881 
   6882 /*------------------------------------------------------------*/
   6883 /*--- SIMD and FP instructions: helper functions           ---*/
   6884 /*------------------------------------------------------------*/
   6885 
   6886 /* Some constructors for interleave/deinterleave expressions. */
   6887 
   6888 static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
   6889    // returns a0 b0
   6890    return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
   6891 }
   6892 
   6893 static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
   6894    // returns a1 b1
   6895    return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
   6896 }
   6897 
   6898 static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
   6899    // returns a2 a0 b2 b0
   6900    return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
   6901 }
   6902 
   6903 static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
   6904    // returns a3 a1 b3 b1
   6905    return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
   6906 }
   6907 
   6908 static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
   6909    // returns a1 b1 a0 b0
   6910    return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
   6911 }
   6912 
   6913 static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
   6914    // returns a3 b3 a2 b2
   6915    return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
   6916 }
   6917 
   6918 static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   6919    // returns a6 a4 a2 a0 b6 b4 b2 b0
   6920    return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
   6921 }
   6922 
   6923 static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   6924    // returns a7 a5 a3 a1 b7 b5 b3 b1
   6925    return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
   6926 }
   6927 
   6928 static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   6929    // returns a3 b3 a2 b2 a1 b1 a0 b0
   6930    return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
   6931 }
   6932 
   6933 static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   6934    // returns a7 b7 a6 b6 a5 b5 a4 b4
   6935    return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
   6936 }
   6937 
   6938 static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
   6939                                      IRTemp bFEDCBA9876543210 ) {
   6940    // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
   6941    return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
   6942                                       mkexpr(bFEDCBA9876543210));
   6943 }
   6944 
   6945 static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
   6946                                     IRTemp bFEDCBA9876543210 ) {
   6947    // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
   6948    return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
   6949                                      mkexpr(bFEDCBA9876543210));
   6950 }
   6951 
   6952 static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
   6953                                      IRTemp bFEDCBA9876543210 ) {
   6954    // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
   6955    return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
   6956                                       mkexpr(bFEDCBA9876543210));
   6957 }
   6958 
   6959 static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
   6960                                      IRTemp bFEDCBA9876543210 ) {
   6961    // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
   6962    return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
   6963                                       mkexpr(bFEDCBA9876543210));
   6964 }
   6965 
   6966 /* Generate N copies of |bit| in the bottom of a ULong. */
   6967 static ULong Replicate ( ULong bit, Int N )
   6968 {
   6969    vassert(bit <= 1 && N >= 1 && N < 64);
   6970    if (bit == 0) {
   6971       return 0;
   6972     } else {
   6973       /* Careful.  This won't work for N == 64. */
   6974       return (1ULL << N) - 1;
   6975    }
   6976 }
   6977 
   6978 static ULong Replicate32x2 ( ULong bits32 )
   6979 {
   6980    vassert(0 == (bits32 & ~0xFFFFFFFFULL));
   6981    return (bits32 << 32) | bits32;
   6982 }
   6983 
   6984 static ULong Replicate16x4 ( ULong bits16 )
   6985 {
   6986    vassert(0 == (bits16 & ~0xFFFFULL));
   6987    return Replicate32x2((bits16 << 16) | bits16);
   6988 }
   6989 
   6990 static ULong Replicate8x8 ( ULong bits8 )
   6991 {
   6992    vassert(0 == (bits8 & ~0xFFULL));
   6993    return Replicate16x4((bits8 << 8) | bits8);
   6994 }
   6995 
   6996 /* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
   6997    |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
   6998    is 64.  In the former case, the upper 32 bits of the returned value
   6999    are guaranteed to be zero. */
   7000 static ULong VFPExpandImm ( ULong imm8, Int N )
   7001 {
   7002    vassert(imm8 <= 0xFF);
   7003    vassert(N == 32 || N == 64);
   7004    Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2.
   7005    Int F = N - E - 1;
   7006    ULong imm8_6 = (imm8 >> 6) & 1;
   7007    /* sign: 1 bit */
   7008    /* exp:  E bits */
   7009    /* frac: F bits */
   7010    ULong sign = (imm8 >> 7) & 1;
   7011    ULong exp  = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
   7012    ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
   7013    vassert(sign < (1ULL << 1));
   7014    vassert(exp  < (1ULL << E));
   7015    vassert(frac < (1ULL << F));
   7016    vassert(1 + E + F == N);
   7017    ULong res = (sign << (E+F)) | (exp << F) | frac;
   7018    return res;
   7019 }
   7020 
   7021 /* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
   7022    This might fail, as indicated by the returned Bool.  Page 2530 of
   7023    the manual. */
   7024 static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
   7025                                UInt op, UInt cmode, UInt imm8 )
   7026 {
   7027    vassert(op <= 1);
   7028    vassert(cmode <= 15);
   7029    vassert(imm8 <= 255);
   7030 
   7031    *res = 0; /* will overwrite iff returning True */
   7032 
   7033    ULong imm64    = 0;
   7034    Bool  testimm8 = False;
   7035 
   7036    switch (cmode >> 1) {
   7037       case 0:
   7038          testimm8 = False; imm64 = Replicate32x2(imm8); break;
   7039       case 1:
   7040          testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break;
   7041       case 2:
   7042          testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break;
   7043       case 3:
   7044          testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break;
   7045       case 4:
   7046           testimm8 = False; imm64 = Replicate16x4(imm8); break;
   7047       case 5:
   7048           testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
   7049       case 6:
   7050           testimm8 = True;
   7051           if ((cmode & 1) == 0)
   7052               imm64 = Replicate32x2((imm8 << 8) | 0xFF);
   7053           else
   7054               imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
   7055           break;
   7056       case 7:
   7057          testimm8 = False;
   7058          if ((cmode & 1) == 0 && op == 0)
   7059              imm64 = Replicate8x8(imm8);
   7060          if ((cmode & 1) == 0 && op == 1) {
   7061              imm64 = 0;   imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
   7062              imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
   7063              imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
   7064              imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
   7065              imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
   7066              imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
   7067              imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
   7068              imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
   7069          }
   7070          if ((cmode & 1) == 1 && op == 0) {
   7071             ULong imm8_7  = (imm8 >> 7) & 1;
   7072             ULong imm8_6  = (imm8 >> 6) & 1;
   7073             ULong imm8_50 = imm8 & 63;
   7074             ULong imm32 = (imm8_7                 << (1 + 5 + 6 + 19))
   7075                           | ((imm8_6 ^ 1)         << (5 + 6 + 19))
   7076                           | (Replicate(imm8_6, 5) << (6 + 19))
   7077                           | (imm8_50              << 19);
   7078             imm64 = Replicate32x2(imm32);
   7079          }
   7080          if ((cmode & 1) == 1 && op == 1) {
   7081             // imm64 = imm8<7>:NOT(imm8<6>)
   7082             //                :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
   7083             ULong imm8_7  = (imm8 >> 7) & 1;
   7084             ULong imm8_6  = (imm8 >> 6) & 1;
   7085             ULong imm8_50 = imm8 & 63;
   7086             imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
   7087                     | (Replicate(imm8_6, 8) << 54)
   7088                     | (imm8_50 << 48);
   7089          }
   7090          break;
   7091       default:
   7092         vassert(0);
   7093    }
   7094 
   7095    if (testimm8 && imm8 == 0)
   7096       return False;
   7097 
   7098    *res = imm64;
   7099    return True;
   7100 }
   7101 
   7102 /* Help a bit for decoding laneage for vector operations that can be
   7103    of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
   7104    and SZ bits, typically for vector floating point. */
   7105 static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI,  /*OUT*/IRType* tyF,
   7106                                /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
   7107                                /*OUT*/const HChar** arrSpec,
   7108                                Bool bitQ, Bool bitSZ )
   7109 {
   7110    vassert(bitQ == True || bitQ == False);
   7111    vassert(bitSZ == True || bitSZ == False);
   7112    if (bitQ && bitSZ) { // 2x64
   7113       if (tyI)       *tyI       = Ity_I64;
   7114       if (tyF)       *tyF       = Ity_F64;
   7115       if (nLanes)    *nLanes    = 2;
   7116       if (zeroUpper) *zeroUpper = False;
   7117       if (arrSpec)   *arrSpec   = "2d";
   7118       return True;
   7119    }
   7120    if (bitQ && !bitSZ) { // 4x32
   7121       if (tyI)       *tyI       = Ity_I32;
   7122       if (tyF)       *tyF       = Ity_F32;
   7123       if (nLanes)    *nLanes    = 4;
   7124       if (zeroUpper) *zeroUpper = False;
   7125       if (arrSpec)   *arrSpec   = "4s";
   7126       return True;
   7127    }
   7128    if (!bitQ && !bitSZ) { // 2x32
   7129       if (tyI)       *tyI       = Ity_I32;
   7130       if (tyF)       *tyF       = Ity_F32;
   7131       if (nLanes)    *nLanes    = 2;
   7132       if (zeroUpper) *zeroUpper = True;
   7133       if (arrSpec)   *arrSpec   = "2s";
   7134       return True;
   7135    }
   7136    // Else impliedly 1x64, which isn't allowed.
   7137    return False;
   7138 }
   7139 
   7140 /* Helper for decoding laneage for shift-style vector operations
   7141    that involve an immediate shift amount. */
   7142 static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
   7143                                     UInt immh, UInt immb )
   7144 {
   7145    vassert(immh < (1<<4));
   7146    vassert(immb < (1<<3));
   7147    UInt immhb = (immh << 3) | immb;
   7148    if (immh & 8) {
   7149       if (shift)  *shift  = 128 - immhb;
   7150       if (szBlg2) *szBlg2 = 3;
   7151       return True;
   7152    }
   7153    if (immh & 4) {
   7154       if (shift)  *shift  = 64 - immhb;
   7155       if (szBlg2) *szBlg2 = 2;
   7156       return True;
   7157    }
   7158    if (immh & 2) {
   7159       if (shift)  *shift  = 32 - immhb;
   7160       if (szBlg2) *szBlg2 = 1;
   7161       return True;
   7162    }
   7163    if (immh & 1) {
   7164       if (shift)  *shift  = 16 - immhb;
   7165       if (szBlg2) *szBlg2 = 0;
   7166       return True;
   7167    }
   7168    return False;
   7169 }
   7170 
   7171 /* Generate IR to fold all lanes of the V128 value in 'src' as
   7172    characterised by the operator 'op', and return the result in the
   7173    bottom bits of a V128, with all other bits set to zero. */
   7174 static IRTemp math_FOLDV ( IRTemp src, IROp op )
   7175 {
   7176    /* The basic idea is to use repeated applications of Iop_CatEven*
   7177       and Iop_CatOdd* operators to 'src' so as to clone each lane into
   7178       a complete vector.  Then fold all those vectors with 'op' and
   7179       zero out all but the least significant lane. */
   7180    switch (op) {
   7181       case Iop_Min8Sx16: case Iop_Min8Ux16:
   7182       case Iop_Max8Sx16: case Iop_Max8Ux16: case Iop_Add8x16: {
   7183          /* NB: temp naming here is misleading -- the naming is for 8
   7184             lanes of 16 bit, whereas what is being operated on is 16
   7185             lanes of 8 bits. */
   7186          IRTemp x76543210 = src;
   7187          IRTemp x76547654 = newTempV128();
   7188          IRTemp x32103210 = newTempV128();
   7189          assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
   7190          assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
   7191          IRTemp x76767676 = newTempV128();
   7192          IRTemp x54545454 = newTempV128();
   7193          IRTemp x32323232 = newTempV128();
   7194          IRTemp x10101010 = newTempV128();
   7195          assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
   7196          assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
   7197          assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
   7198          assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
   7199          IRTemp x77777777 = newTempV128();
   7200          IRTemp x66666666 = newTempV128();
   7201          IRTemp x55555555 = newTempV128();
   7202          IRTemp x44444444 = newTempV128();
   7203          IRTemp x33333333 = newTempV128();
   7204          IRTemp x22222222 = newTempV128();
   7205          IRTemp x11111111 = newTempV128();
   7206          IRTemp x00000000 = newTempV128();
   7207          assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
   7208          assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
   7209          assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
   7210          assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
   7211          assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
   7212          assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
   7213          assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
   7214          assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
   7215          /* Naming not misleading after here. */
   7216          IRTemp xAllF = newTempV128();
   7217          IRTemp xAllE = newTempV128();
   7218          IRTemp xAllD = newTempV128();
   7219          IRTemp xAllC = newTempV128();
   7220          IRTemp xAllB = newTempV128();
   7221          IRTemp xAllA = newTempV128();
   7222          IRTemp xAll9 = newTempV128();
   7223          IRTemp xAll8 = newTempV128();
   7224          IRTemp xAll7 = newTempV128();
   7225          IRTemp xAll6 = newTempV128();
   7226          IRTemp xAll5 = newTempV128();
   7227          IRTemp xAll4 = newTempV128();
   7228          IRTemp xAll3 = newTempV128();
   7229          IRTemp xAll2 = newTempV128();
   7230          IRTemp xAll1 = newTempV128();
   7231          IRTemp xAll0 = newTempV128();
   7232          assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777));
   7233          assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777));
   7234          assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666));
   7235          assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666));
   7236          assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555));
   7237          assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555));
   7238          assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444));
   7239          assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444));
   7240          assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333));
   7241          assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333));
   7242          assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222));
   7243          assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222));
   7244          assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111));
   7245          assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111));
   7246          assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000));
   7247          assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000));
   7248          IRTemp maxFE = newTempV128();
   7249          IRTemp maxDC = newTempV128();
   7250          IRTemp maxBA = newTempV128();
   7251          IRTemp max98 = newTempV128();
   7252          IRTemp max76 = newTempV128();
   7253          IRTemp max54 = newTempV128();
   7254          IRTemp max32 = newTempV128();
   7255          IRTemp max10 = newTempV128();
   7256          assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE)));
   7257          assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC)));
   7258          assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA)));
   7259          assign(max98, binop(op, mkexpr(xAll9), mkexpr(xAll8)));
   7260          assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6)));
   7261          assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4)));
   7262          assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2)));
   7263          assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0)));
   7264          IRTemp maxFEDC = newTempV128();
   7265          IRTemp maxBA98 = newTempV128();
   7266          IRTemp max7654 = newTempV128();
   7267          IRTemp max3210 = newTempV128();
   7268          assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC)));
   7269          assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98)));
   7270          assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
   7271          assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
   7272          IRTemp maxFEDCBA98 = newTempV128();
   7273          IRTemp max76543210 = newTempV128();
   7274          assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98)));
   7275          assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
   7276          IRTemp maxAllLanes = newTempV128();
   7277          assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98),
   7278                                        mkexpr(max76543210)));
   7279          IRTemp res = newTempV128();
   7280          assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes)));
   7281          return res;
   7282       }
   7283       case Iop_Min16Sx8: case Iop_Min16Ux8:
   7284       case Iop_Max16Sx8: case Iop_Max16Ux8: case Iop_Add16x8: {
   7285          IRTemp x76543210 = src;
   7286          IRTemp x76547654 = newTempV128();
   7287          IRTemp x32103210 = newTempV128();
   7288          assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
   7289          assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
   7290          IRTemp x76767676 = newTempV128();
   7291          IRTemp x54545454 = newTempV128();
   7292          IRTemp x32323232 = newTempV128();
   7293          IRTemp x10101010 = newTempV128();
   7294          assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
   7295          assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
   7296          assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
   7297          assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
   7298          IRTemp x77777777 = newTempV128();
   7299          IRTemp x66666666 = newTempV128();
   7300          IRTemp x55555555 = newTempV128();
   7301          IRTemp x44444444 = newTempV128();
   7302          IRTemp x33333333 = newTempV128();
   7303          IRTemp x22222222 = newTempV128();
   7304          IRTemp x11111111 = newTempV128();
   7305          IRTemp x00000000 = newTempV128();
   7306          assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
   7307          assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
   7308          assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
   7309          assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
   7310          assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
   7311          assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
   7312          assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
   7313          assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
   7314          IRTemp max76 = newTempV128();
   7315          IRTemp max54 = newTempV128();
   7316          IRTemp max32 = newTempV128();
   7317          IRTemp max10 = newTempV128();
   7318          assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666)));
   7319          assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444)));
   7320          assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222)));
   7321          assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000)));
   7322          IRTemp max7654 = newTempV128();
   7323          IRTemp max3210 = newTempV128();
   7324          assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
   7325          assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
   7326          IRTemp max76543210 = newTempV128();
   7327          assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
   7328          IRTemp res = newTempV128();
   7329          assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210)));
   7330          return res;
   7331       }
   7332       case Iop_Max32Fx4: case Iop_Min32Fx4:
   7333       case Iop_Min32Sx4: case Iop_Min32Ux4:
   7334       case Iop_Max32Sx4: case Iop_Max32Ux4: case Iop_Add32x4: {
   7335          IRTemp x3210 = src;
   7336          IRTemp x3232 = newTempV128();
   7337          IRTemp x1010 = newTempV128();
   7338          assign(x3232, mk_CatOddLanes64x2 (x3210, x3210));
   7339          assign(x1010, mk_CatEvenLanes64x2(x3210, x3210));
   7340          IRTemp x3333 = newTempV128();
   7341          IRTemp x2222 = newTempV128();
   7342          IRTemp x1111 = newTempV128();
   7343          IRTemp x0000 = newTempV128();
   7344          assign(x3333, mk_CatOddLanes32x4 (x3232, x3232));
   7345          assign(x2222, mk_CatEvenLanes32x4(x3232, x3232));
   7346          assign(x1111, mk_CatOddLanes32x4 (x1010, x1010));
   7347          assign(x0000, mk_CatEvenLanes32x4(x1010, x1010));
   7348          IRTemp max32 = newTempV128();
   7349          IRTemp max10 = newTempV128();
   7350          assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222)));
   7351          assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000)));
   7352          IRTemp max3210 = newTempV128();
   7353          assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
   7354          IRTemp res = newTempV128();
   7355          assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
   7356          return res;
   7357       }
   7358       case Iop_Add64x2: {
   7359          IRTemp x10 = src;
   7360          IRTemp x00 = newTempV128();
   7361          IRTemp x11 = newTempV128();
   7362          assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
   7363          assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
   7364          IRTemp max10 = newTempV128();
   7365          assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
   7366          IRTemp res = newTempV128();
   7367          assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
   7368          return res;
   7369       }
   7370       default:
   7371          vassert(0);
   7372    }
   7373 }
   7374 
   7375 
   7376 /* Generate IR for TBL and TBX.  This deals with the 128 bit case
   7377    only. */
   7378 static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
   7379                              IRTemp oor_values )
   7380 {
   7381    vassert(len >= 0 && len <= 3);
   7382 
   7383    /* Generate some useful constants as concisely as possible. */
   7384    IRTemp half15 = newTemp(Ity_I64);
   7385    assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
   7386    IRTemp half16 = newTemp(Ity_I64);
   7387    assign(half16, mkU64(0x1010101010101010ULL));
   7388 
   7389    /* A zero vector */
   7390    IRTemp allZero = newTempV128();
   7391    assign(allZero, mkV128(0x0000));
   7392    /* A vector containing 15 in each 8-bit lane */
   7393    IRTemp all15 = newTempV128();
   7394    assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
   7395    /* A vector containing 16 in each 8-bit lane */
   7396    IRTemp all16 = newTempV128();
   7397    assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
   7398    /* A vector containing 32 in each 8-bit lane */
   7399    IRTemp all32 = newTempV128();
   7400    assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
   7401    /* A vector containing 48 in each 8-bit lane */
   7402    IRTemp all48 = newTempV128();
   7403    assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
   7404    /* A vector containing 64 in each 8-bit lane */
   7405    IRTemp all64 = newTempV128();
   7406    assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));
   7407 
   7408    /* Group the 16/32/48/64 vectors so as to be indexable. */
   7409    IRTemp allXX[4] = { all16, all32, all48, all64 };
   7410 
   7411    /* Compute the result for each table vector, with zeroes in places
   7412       where the index values are out of range, and OR them into the
   7413       running vector. */
   7414    IRTemp running_result = newTempV128();
   7415    assign(running_result, mkV128(0));
   7416 
   7417    UInt tabent;
   7418    for (tabent = 0; tabent <= len; tabent++) {
   7419       vassert(tabent >= 0 && tabent < 4);
   7420       IRTemp bias = newTempV128();
   7421       assign(bias,
   7422              mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
   7423       IRTemp biased_indices = newTempV128();
   7424       assign(biased_indices,
   7425              binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
   7426       IRTemp valid_mask = newTempV128();
   7427       assign(valid_mask,
   7428              binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
   7429       IRTemp safe_biased_indices = newTempV128();
   7430       assign(safe_biased_indices,
   7431              binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
   7432       IRTemp results_or_junk = newTempV128();
   7433       assign(results_or_junk,
   7434              binop(Iop_Perm8x16, mkexpr(tab[tabent]),
   7435                                  mkexpr(safe_biased_indices)));
   7436       IRTemp results_or_zero = newTempV128();
   7437       assign(results_or_zero,
   7438              binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
   7439       /* And OR that into the running result. */
   7440       IRTemp tmp = newTempV128();
   7441       assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
   7442                         mkexpr(running_result)));
   7443       running_result = tmp;
   7444    }
   7445 
   7446    /* So now running_result holds the overall result where the indices
   7447       are in range, and zero in out-of-range lanes.  Now we need to
   7448       compute an overall validity mask and use this to copy in the
   7449       lanes in the oor_values for out of range indices.  This is
   7450       unnecessary for TBL but will get folded out by iropt, so we lean
   7451       on that and generate the same code for TBL and TBX here. */
   7452    IRTemp overall_valid_mask = newTempV128();
   7453    assign(overall_valid_mask,
   7454           binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
   7455    IRTemp result = newTempV128();
   7456    assign(result,
   7457           binop(Iop_OrV128,
   7458                 mkexpr(running_result),
   7459                 binop(Iop_AndV128,
   7460                       mkexpr(oor_values),
   7461                       unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
   7462    return result;
   7463 }
   7464 
   7465 
   7466 /* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
   7467    an op which takes two I64s and produces a V128.  That is, a widening
   7468    operator.  Generate IR which applies |opI64x2toV128| to either the
   7469    lower (if |is2| is False) or upper (if |is2| is True) halves of
   7470    |argL| and |argR|, and return the value in a new IRTemp.
   7471 */
   7472 static
   7473 IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
   7474                                    IRExpr* argL, IRExpr* argR )
   7475 {
   7476    IRTemp res   = newTempV128();
   7477    IROp   slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
   7478    assign(res, binop(opI64x2toV128, unop(slice, argL),
   7479                                     unop(slice, argR)));
   7480    return res;
   7481 }
   7482 
   7483 
   7484 /* Generate signed/unsigned absolute difference vector IR. */
   7485 static
   7486 IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
   7487 {
   7488    vassert(size <= 3);
   7489    IRTemp argL = newTempV128();
   7490    IRTemp argR = newTempV128();
   7491    IRTemp msk  = newTempV128();
   7492    IRTemp res  = newTempV128();
   7493    assign(argL, argLE);
   7494    assign(argR, argRE);
   7495    assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
   7496                      mkexpr(argL), mkexpr(argR)));
   7497    assign(res,
   7498           binop(Iop_OrV128,
   7499                 binop(Iop_AndV128,
   7500                       binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
   7501                       mkexpr(msk)),
   7502                 binop(Iop_AndV128,
   7503                       binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
   7504                       unop(Iop_NotV128, mkexpr(msk)))));
   7505    return res;
   7506 }
   7507 
   7508 
   7509 /* Generate IR that takes a V128 and sign- or zero-widens
   7510    either the lower or upper set of lanes to twice-as-wide,
   7511    resulting in a new V128 value. */
   7512 static
   7513 IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
   7514                                    UInt sizeNarrow, IRExpr* srcE )
   7515 {
   7516    IRTemp src = newTempV128();
   7517    IRTemp res = newTempV128();
   7518    assign(src, srcE);
   7519    switch (sizeNarrow) {
   7520       case X10:
   7521          assign(res,
   7522                 binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
   7523                       binop(fromUpperHalf ? Iop_InterleaveHI32x4
   7524                                           : Iop_InterleaveLO32x4,
   7525                             mkexpr(src),
   7526                             mkexpr(src)),
   7527                       mkU8(32)));
   7528          break;
   7529       case X01:
   7530          assign(res,
   7531                 binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
   7532                       binop(fromUpperHalf ? Iop_InterleaveHI16x8
   7533                                           : Iop_InterleaveLO16x8,
   7534                             mkexpr(src),
   7535                             mkexpr(src)),
   7536                       mkU8(16)));
   7537          break;
   7538       case X00:
   7539          assign(res,
   7540                 binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
   7541                       binop(fromUpperHalf ? Iop_InterleaveHI8x16
   7542                                           : Iop_InterleaveLO8x16,
   7543                             mkexpr(src),
   7544                             mkexpr(src)),
   7545                       mkU8(8)));
   7546          break;
   7547       default:
   7548          vassert(0);
   7549    }
   7550    return res;
   7551 }
   7552 
   7553 
   7554 /* Generate IR that takes a V128 and sign- or zero-widens
   7555    either the even or odd lanes to twice-as-wide,
   7556    resulting in a new V128 value. */
   7557 static
   7558 IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
   7559                                       UInt sizeNarrow, IRExpr* srcE )
   7560 {
   7561    IRTemp src   = newTempV128();
   7562    IRTemp res   = newTempV128();
   7563    IROp   opSAR = mkVecSARN(sizeNarrow+1);
   7564    IROp   opSHR = mkVecSHRN(sizeNarrow+1);
   7565    IROp   opSHL = mkVecSHLN(sizeNarrow+1);
   7566    IROp   opSxR = zWiden ? opSHR : opSAR;
   7567    UInt   amt   = 0;
   7568    switch (sizeNarrow) {
   7569       case X10: amt = 32; break;
   7570       case X01: amt = 16; break;
   7571       case X00: amt = 8;  break;
   7572       default: vassert(0);
   7573    }
   7574    assign(src, srcE);
   7575    if (fromOdd) {
   7576       assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
   7577    } else {
   7578       assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
   7579                                mkU8(amt)));
   7580    }
   7581    return res;
   7582 }
   7583 
   7584 
   7585 /* Generate IR that takes two V128s and narrows (takes lower half)
   7586    of each lane, producing a single V128 value. */
   7587 static
   7588 IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
   7589 {
   7590    IRTemp res = newTempV128();
   7591    assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
   7592                      mkexpr(argHi), mkexpr(argLo)));
   7593    return res;
   7594 }
   7595 
   7596 
   7597 /* Return a temp which holds the vector dup of the lane of width
   7598    (1 << size) obtained from src[laneNo]. */
   7599 static
   7600 IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
   7601 {
   7602    vassert(size <= 3);
   7603    /* Normalise |laneNo| so it is of the form
   7604       x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
   7605       This puts the bits we want to inspect at constant offsets
   7606       regardless of the value of |size|.
   7607    */
   7608    UInt ix = laneNo << size;
   7609    vassert(ix <= 15);
   7610    IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
   7611    switch (size) {
   7612       case 0: /* B */
   7613          ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
   7614          /* fallthrough */
   7615       case 1: /* H */
   7616          ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
   7617          /* fallthrough */
   7618       case 2: /* S */
   7619          ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
   7620          /* fallthrough */
   7621       case 3: /* D */
   7622          ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
   7623          break;
   7624       default:
   7625          vassert(0);
   7626    }
   7627    IRTemp res = newTempV128();
   7628    assign(res, src);
   7629    Int i;
   7630    for (i = 3; i >= 0; i--) {
   7631       if (ops[i] == Iop_INVALID)
   7632          break;
   7633       IRTemp tmp = newTempV128();
   7634       assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
   7635       res = tmp;
   7636    }
   7637    return res;
   7638 }
   7639 
   7640 
   7641 /* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
   7642    selector encoded as shown below.  Return a new V128 holding the
   7643    selected lane from |srcV| dup'd out to V128, and also return the
   7644    lane number, log2 of the lane size in bytes, and width-character via
   7645    *laneNo, *laneSzLg2 and *laneCh respectively.  It may be that imm5
   7646    is an invalid selector, in which case return
   7647    IRTemp_INVALID, 0, 0 and '?' respectively.
   7648 
   7649    imm5 = xxxx1   signifies .b[xxxx]
   7650         = xxx10   .h[xxx]
   7651         = xx100   .s[xx]
   7652         = x1000   .d[x]
   7653         otherwise invalid
   7654 */
   7655 static
   7656 IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
   7657                              /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
   7658                              IRExpr* srcV, UInt imm5 )
   7659 {
   7660    *laneNo    = 0;
   7661    *laneSzLg2 = 0;
   7662    *laneCh    = '?';
   7663 
   7664    if (imm5 & 1) {
   7665       *laneNo    = (imm5 >> 1) & 15;
   7666       *laneSzLg2 = 0;
   7667       *laneCh    = 'b';
   7668    }
   7669    else if (imm5 & 2) {
   7670       *laneNo    = (imm5 >> 2) & 7;
   7671       *laneSzLg2 = 1;
   7672       *laneCh    = 'h';
   7673    }
   7674    else if (imm5 & 4) {
   7675       *laneNo    = (imm5 >> 3) & 3;
   7676       *laneSzLg2 = 2;
   7677       *laneCh    = 's';
   7678    }
   7679    else if (imm5 & 8) {
   7680       *laneNo    = (imm5 >> 4) & 1;
   7681       *laneSzLg2 = 3;
   7682       *laneCh    = 'd';
   7683    }
   7684    else {
   7685       /* invalid */
   7686       return IRTemp_INVALID;
   7687    }
   7688 
   7689    return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
   7690 }
   7691 
   7692 
   7693 /* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
   7694 static
   7695 IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
   7696 {
   7697    IRType ty  = Ity_INVALID;
   7698    IRTemp rcS = IRTemp_INVALID;
   7699    switch (size) {
   7700       case X01:
   7701          vassert(imm <= 0xFFFFULL);
   7702          ty  = Ity_I16;
   7703          rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
   7704          break;
   7705       case X10:
   7706          vassert(imm <= 0xFFFFFFFFULL);
   7707          ty  = Ity_I32;
   7708          rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
   7709          break;
   7710       case X11:
   7711          ty  = Ity_I64;
   7712          rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
   7713       default:
   7714          vassert(0);
   7715    }
   7716    IRTemp rcV = math_DUP_TO_V128(rcS, ty);
   7717    return rcV;
   7718 }
   7719 
   7720 
   7721 /* Let |new64| be a V128 in which only the lower 64 bits are interesting,
   7722    and the upper can contain any value -- it is ignored.  If |is2| is False,
   7723    generate IR to put |new64| in the lower half of vector reg |dd| and zero
   7724    the upper half.  If |is2| is True, generate IR to put |new64| in the upper
   7725    half of vector reg |dd| and leave the lower half unchanged.  This
   7726    simulates the behaviour of the "foo/foo2" instructions in which the
   7727    destination is half the width of sources, for example addhn/addhn2.
   7728 */
   7729 static
   7730 void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
   7731 {
   7732    if (is2) {
   7733       /* Get the old contents of Vdd, zero the upper half, and replace
   7734          it with 'x'. */
   7735       IRTemp t_zero_oldLO = newTempV128();
   7736       assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
   7737       IRTemp t_newHI_zero = newTempV128();
   7738       assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
   7739                                                        mkV128(0x0000)));
   7740       IRTemp res = newTempV128();
   7741       assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
   7742                                     mkexpr(t_newHI_zero)));
   7743       putQReg128(dd, mkexpr(res));
   7744    } else {
   7745       /* This is simple. */
   7746       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
   7747    }
   7748 }
   7749 
   7750 
   7751 /* Compute vector SQABS at lane size |size| for |srcE|, returning
   7752    the q result in |*qabs| and the normal result in |*nabs|. */
   7753 static
   7754 void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
   7755                   IRExpr* srcE, UInt size )
   7756 {
   7757       IRTemp src, mask, maskn, nsub, qsub;
   7758       src = mask = maskn = nsub = qsub = IRTemp_INVALID;
   7759       newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
   7760       assign(src,   srcE);
   7761       assign(mask,  binop(mkVecCMPGTS(size),  mkV128(0x0000), mkexpr(src)));
   7762       assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
   7763       assign(nsub,  binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   7764       assign(qsub,  binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
   7765       assign(*nabs, binop(Iop_OrV128,
   7766                           binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
   7767                           binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
   7768       assign(*qabs, binop(Iop_OrV128,
   7769                           binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
   7770                           binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
   7771 }
   7772 
   7773 
   7774 /* Compute vector SQNEG at lane size |size| for |srcE|, returning
   7775    the q result in |*qneg| and the normal result in |*nneg|. */
   7776 static
   7777 void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
   7778                   IRExpr* srcE, UInt size )
   7779 {
   7780       IRTemp src = IRTemp_INVALID;
   7781       newTempsV128_3(&src, nneg, qneg);
   7782       assign(src,   srcE);
   7783       assign(*nneg, binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   7784       assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
   7785 }
   7786 
   7787 
   7788 /* Zero all except the least significant lane of |srcE|, where |size|
   7789    indicates the lane size in the usual way. */
   7790 static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
   7791 {
   7792    vassert(size < 4);
   7793    IRTemp t = newTempV128();
   7794    assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
   7795    return t;
   7796 }
   7797 
   7798 
   7799 /* Generate IR to compute vector widening MULL from either the lower
   7800    (is2==False) or upper (is2==True) halves of vecN and vecM.  The
   7801    widening multiplies are unsigned when isU==True and signed when
   7802    isU==False.  |size| is the narrow lane size indication.  Optionally,
   7803    the product may be added to or subtracted from vecD, at the wide lane
   7804    size.  This happens when |mas| is 'a' (add) or 's' (sub).  When |mas|
   7805    is 'm' (only multiply) then the accumulate part does not happen, and
   7806    |vecD| is expected to == IRTemp_INVALID.
   7807 
   7808    Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
   7809    are allowed.  The result is returned in a new IRTemp, which is
   7810    returned in *res. */
   7811 static
   7812 void math_MULL_ACC ( /*OUT*/IRTemp* res,
   7813                      Bool is2, Bool isU, UInt size, HChar mas,
   7814                      IRTemp vecN, IRTemp vecM, IRTemp vecD )
   7815 {
   7816    vassert(res && *res == IRTemp_INVALID);
   7817    vassert(size <= 2);
   7818    vassert(mas == 'm' || mas == 'a' || mas == 's');
   7819    if (mas == 'm') vassert(vecD == IRTemp_INVALID);
   7820    IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
   7821    IROp   accOp = (mas == 'a') ? mkVecADD(size+1)
   7822                   : (mas == 's' ? mkVecSUB(size+1)
   7823                   : Iop_INVALID);
   7824    IRTemp mul   = math_BINARY_WIDENING_V128(is2, mulOp,
   7825                                             mkexpr(vecN), mkexpr(vecM));
   7826    *res = newTempV128();
   7827    assign(*res, mas == 'm' ? mkexpr(mul)
   7828                            : binop(accOp, mkexpr(vecD), mkexpr(mul)));
   7829 }
   7830 
   7831 
   7832 /* Same as math_MULL_ACC, except the multiply is signed widening,
   7833    the multiplied value is then doubled, before being added to or
   7834    subtracted from the accumulated value.  And everything is
   7835    saturated.  In all cases, saturation residuals are returned
   7836    via (sat1q, sat1n), and in the accumulate cases,
   7837    via (sat2q, sat2n) too.  All results are returned in new temporaries.
   7838    In the no-accumulate case, *sat2q and *sat2n are never instantiated,
   7839    so the caller can tell this has happened. */
   7840 static
   7841 void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
   7842                         /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
   7843                         /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
   7844                         Bool is2, UInt size, HChar mas,
   7845                         IRTemp vecN, IRTemp vecM, IRTemp vecD )
   7846 {
   7847    vassert(size <= 2);
   7848    vassert(mas == 'm' || mas == 'a' || mas == 's');
   7849    /* Compute
   7850          sat1q = vecN.D[is2] *sq vecM.d[is2] *q 2
   7851          sat1n = vecN.D[is2] *s  vecM.d[is2] *  2
   7852       IOW take either the low or high halves of vecN and vecM, signed widen,
   7853       multiply, double that, and signedly saturate.  Also compute the same
   7854       but without saturation.
   7855    */
   7856    vassert(sat2q && *sat2q == IRTemp_INVALID);
   7857    vassert(sat2n && *sat2n == IRTemp_INVALID);
   7858    newTempsV128_3(sat1q, sat1n, res);
   7859    IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
   7860                                          mkexpr(vecN), mkexpr(vecM));
   7861    IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
   7862                                          mkexpr(vecN), mkexpr(vecM));
   7863    assign(*sat1q, mkexpr(tq));
   7864    assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
   7865 
   7866    /* If there is no accumulation, the final result is sat1q,
   7867       and there's no assignment to sat2q or sat2n. */
   7868    if (mas == 'm') {
   7869       assign(*res, mkexpr(*sat1q));
   7870       return;
   7871    }
   7872 
   7873    /* Compute
   7874          sat2q  = vecD +sq/-sq sat1q
   7875          sat2n  = vecD +/-     sat1n
   7876          result = sat2q
   7877    */
   7878    newTempsV128_2(sat2q, sat2n);
   7879    assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
   7880                         mkexpr(vecD), mkexpr(*sat1q)));
   7881    assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
   7882                         mkexpr(vecD), mkexpr(*sat1n)));
   7883    assign(*res, mkexpr(*sat2q));
   7884 }
   7885 
   7886 
   7887 /* Generate IR for widening signed vector multiplies.  The operands
   7888    have their lane width signedly widened, and they are then multiplied
   7889    at the wider width, returning results in two new IRTemps. */
   7890 static
   7891 void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
   7892                   UInt sizeNarrow, IRTemp argL, IRTemp argR )
   7893 {
   7894    vassert(sizeNarrow <= 2);
   7895    newTempsV128_2(resHI, resLO);
   7896    IRTemp argLhi = newTemp(Ity_I64);
   7897    IRTemp argLlo = newTemp(Ity_I64);
   7898    IRTemp argRhi = newTemp(Ity_I64);
   7899    IRTemp argRlo = newTemp(Ity_I64);
   7900    assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
   7901    assign(argLlo, unop(Iop_V128to64,   mkexpr(argL)));
   7902    assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
   7903    assign(argRlo, unop(Iop_V128to64,   mkexpr(argR)));
   7904    IROp opMulls = mkVecMULLS(sizeNarrow);
   7905    assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
   7906    assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
   7907 }
   7908 
   7909 
   7910 /* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
   7911    double that, possibly add a rounding constant (R variants), and take
   7912    the high half. */
   7913 static
   7914 void math_SQDMULH ( /*OUT*/IRTemp* res,
   7915                     /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
   7916                     Bool isR, UInt size, IRTemp vN, IRTemp vM )
   7917 {
   7918    vassert(size == X01 || size == X10); /* s or h only */
   7919 
   7920    newTempsV128_3(res, sat1q, sat1n);
   7921 
   7922    IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
   7923    math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
   7924 
   7925    IRTemp addWide = mkVecADD(size+1);
   7926 
   7927    if (isR) {
   7928       assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
   7929 
   7930       Int    rcShift    = size == X01 ? 15 : 31;
   7931       IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
   7932       assign(*sat1n,
   7933              binop(mkVecCATODDLANES(size),
   7934                    binop(addWide,
   7935                          binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
   7936                          mkexpr(roundConst)),
   7937                    binop(addWide,
   7938                          binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
   7939                          mkexpr(roundConst))));
   7940    } else {
   7941       assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
   7942 
   7943       assign(*sat1n,
   7944              binop(mkVecCATODDLANES(size),
   7945                    binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
   7946                    binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
   7947    }
   7948 
   7949    assign(*res, mkexpr(*sat1q));
   7950 }
   7951 
   7952 
   7953 /* Generate IR for SQSHL, UQSHL, SQSHLU by imm.  Put the result in
   7954    a new temp in *res, and the Q difference pair in new temps in
   7955    *qDiff1 and *qDiff2 respectively.  |nm| denotes which of the
   7956    three operations it is. */
   7957 static
   7958 void math_QSHL_IMM ( /*OUT*/IRTemp* res,
   7959                      /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2,
   7960                      IRTemp src, UInt size, UInt shift, const HChar* nm )
   7961 {
   7962    vassert(size <= 3);
   7963    UInt laneBits = 8 << size;
   7964    vassert(shift < laneBits);
   7965    newTempsV128_3(res, qDiff1, qDiff2);
   7966    IRTemp z128 = newTempV128();
   7967    assign(z128, mkV128(0x0000));
   7968 
   7969    /* UQSHL */
   7970    if (vex_streq(nm, "uqshl")) {
   7971       IROp qop = mkVecQSHLNSATUU(size);
   7972       assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
   7973       if (shift == 0) {
   7974          /* No shift means no saturation. */
   7975          assign(*qDiff1, mkexpr(z128));
   7976          assign(*qDiff2, mkexpr(z128));
   7977       } else {
   7978          /* Saturation has occurred if any of the shifted-out bits are
   7979             nonzero.  We get the shifted-out bits by right-shifting the
   7980             original value. */
   7981          UInt rshift = laneBits - shift;
   7982          vassert(rshift >= 1 && rshift < laneBits);
   7983          assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
   7984          assign(*qDiff2, mkexpr(z128));
   7985       }
   7986       return;
   7987    }
   7988 
   7989    /* SQSHL */
   7990    if (vex_streq(nm, "sqshl")) {
   7991       IROp qop = mkVecQSHLNSATSS(size);
   7992       assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
   7993       if (shift == 0) {
   7994          /* No shift means no saturation. */
   7995          assign(*qDiff1, mkexpr(z128));
   7996          assign(*qDiff2, mkexpr(z128));
   7997       } else {
   7998          /* Saturation has occurred if any of the shifted-out bits are
   7999             different from the top bit of the original value. */
   8000          UInt rshift = laneBits - 1 - shift;
   8001          vassert(rshift >= 0 && rshift < laneBits-1);
   8002          /* qDiff1 is the shifted out bits, and the top bit of the original
   8003             value, preceded by zeroes. */
   8004          assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
   8005          /* qDiff2 is the top bit of the original value, cloned the
   8006             correct number of times. */
   8007          assign(*qDiff2, binop(mkVecSHRN(size),
   8008                                binop(mkVecSARN(size), mkexpr(src),
   8009                                                       mkU8(laneBits-1)),
   8010                                mkU8(rshift)));
   8011          /* This also succeeds in comparing the top bit of the original
   8012             value to itself, which is a bit stupid, but not wrong. */
   8013       }
   8014       return;
   8015    }
   8016 
   8017    /* SQSHLU */
   8018    if (vex_streq(nm, "sqshlu")) {
   8019       IROp qop = mkVecQSHLNSATSU(size);
   8020       assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
   8021       if (shift == 0) {
   8022          /* If there's no shift, saturation depends on the top bit
   8023             of the source. */
   8024          assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(laneBits-1)));
   8025          assign(*qDiff2, mkexpr(z128));
   8026       } else {
   8027          /* Saturation has occurred if any of the shifted-out bits are
   8028             nonzero.  We get the shifted-out bits by right-shifting the
   8029             original value. */
   8030          UInt rshift = laneBits - shift;
   8031          vassert(rshift >= 1 && rshift < laneBits);
   8032          assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
   8033          assign(*qDiff2, mkexpr(z128));
   8034       }
   8035       return;
   8036    }
   8037 
   8038    vassert(0);
   8039 }
   8040 
   8041 
   8042 /* Generate IR to do SRHADD and URHADD. */
   8043 static
   8044 IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
   8045 {
   8046    /* Generate this:
   8047       (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
   8048    */
   8049    vassert(size <= 3);
   8050    IROp opSHR = isU ? mkVecSHRN(size) : mkVecSARN(size);
   8051    IROp opADD = mkVecADD(size);
   8052    /* The only tricky bit is to generate the correct vector 1 constant. */
   8053    const ULong ones64[4]
   8054       = { 0x0101010101010101ULL, 0x0001000100010001ULL,
   8055           0x0000000100000001ULL, 0x0000000000000001ULL };
   8056    IRTemp imm64 = newTemp(Ity_I64);
   8057    assign(imm64, mkU64(ones64[size]));
   8058    IRTemp vecOne = newTempV128();
   8059    assign(vecOne, binop(Iop_64HLtoV128, mkexpr(imm64), mkexpr(imm64)));
   8060    IRTemp scaOne = newTemp(Ity_I8);
   8061    assign(scaOne, mkU8(1));
   8062    IRTemp res = newTempV128();
   8063    assign(res,
   8064           binop(opADD,
   8065                 binop(opSHR, mkexpr(aa), mkexpr(scaOne)),
   8066                 binop(opADD,
   8067                       binop(opSHR, mkexpr(bb), mkexpr(scaOne)),
   8068                       binop(opSHR,
   8069                             binop(opADD,
   8070                                   binop(opADD,
   8071                                         binop(Iop_AndV128, mkexpr(aa),
   8072                                                            mkexpr(vecOne)),
   8073                                         binop(Iop_AndV128, mkexpr(bb),
   8074                                                            mkexpr(vecOne))
   8075                                   ),
   8076                                   mkexpr(vecOne)
   8077                             ),
   8078                             mkexpr(scaOne)
   8079                       )
   8080                 )
   8081           )
   8082    );
   8083    return res;
   8084 }
   8085 
   8086 
   8087 /* QCFLAG tracks the SIMD sticky saturation status.  Update the status
   8088    thusly: if, after application of |opZHI| to both |qres| and |nres|,
   8089    they have the same value, leave QCFLAG unchanged.  Otherwise, set it
   8090    (implicitly) to 1.  |opZHI| may only be one of the Iop_ZeroHIxxofV128
   8091    operators, or Iop_INVALID, in which case |qres| and |nres| are used
   8092    unmodified.  The presence |opZHI| means this function can be used to
   8093    generate QCFLAG update code for both scalar and vector SIMD operations.
   8094 */
   8095 static
   8096 void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
   8097 {
   8098    IRTemp diff      = newTempV128();
   8099    IRTemp oldQCFLAG = newTempV128();
   8100    IRTemp newQCFLAG = newTempV128();
   8101    if (opZHI == Iop_INVALID) {
   8102       assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
   8103    } else {
   8104       vassert(opZHI == Iop_ZeroHI64ofV128
   8105               || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
   8106       assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
   8107    }
   8108    assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
   8109    assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
   8110    stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
   8111 }
   8112 
   8113 
   8114 /* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
   8115    are used unmodified, hence suitable for QCFLAG updates for whole-vector
   8116    operations. */
   8117 static
   8118 void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
   8119 {
   8120    updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
   8121 }
   8122 
   8123 
   8124 /* Generate IR to rearrange two vector values in a way which is useful
   8125    for doing S/D add-pair etc operations.  There are 3 cases:
   8126 
   8127    2d:  [m1 m0] [n1 n0]  -->  [m1 n1] [m0 n0]
   8128 
   8129    4s:  [m3 m2 m1 m0] [n3 n2 n1 n0]  -->  [m3 m1 n3 n1] [m2 m0 n2 n0]
   8130 
   8131    2s:  [m2 m2 m1 m0] [n3 n2 n1 n0]  -->  [0 0 m1 n1] [0 0 m0 n0]
   8132 
   8133    The cases are distinguished as follows:
   8134    isD == True,  bitQ == 1  =>  2d
   8135    isD == False, bitQ == 1  =>  4s
   8136    isD == False, bitQ == 0  =>  2s
   8137 */
   8138 static
   8139 void math_REARRANGE_FOR_FLOATING_PAIRWISE (
   8140         /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
   8141         IRTemp vecM, IRTemp vecN, Bool isD, UInt bitQ
   8142      )
   8143 {
   8144    vassert(rearrL && *rearrL == IRTemp_INVALID);
   8145    vassert(rearrR && *rearrR == IRTemp_INVALID);
   8146    *rearrL = newTempV128();
   8147    *rearrR = newTempV128();
   8148    if (isD) {
   8149       // 2d case
   8150       vassert(bitQ == 1);
   8151       assign(*rearrL, binop(Iop_InterleaveHI64x2, mkexpr(vecM), mkexpr(vecN)));
   8152       assign(*rearrR, binop(Iop_InterleaveLO64x2, mkexpr(vecM), mkexpr(vecN)));
   8153    }
   8154    else if (!isD && bitQ == 1) {
   8155       // 4s case
   8156       assign(*rearrL, binop(Iop_CatOddLanes32x4,  mkexpr(vecM), mkexpr(vecN)));
   8157       assign(*rearrR, binop(Iop_CatEvenLanes32x4, mkexpr(vecM), mkexpr(vecN)));
   8158    } else {
   8159       // 2s case
   8160       vassert(!isD && bitQ == 0);
   8161       IRTemp m1n1m0n0 = newTempV128();
   8162       IRTemp m0n0m1n1 = newTempV128();
   8163       assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
   8164                              mkexpr(vecM), mkexpr(vecN)));
   8165       assign(m0n0m1n1, triop(Iop_SliceV128,
   8166                              mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8)));
   8167       assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0)));
   8168       assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1)));
   8169    }
   8170 }
   8171 
   8172 
   8173 /* Returns 2.0 ^ (-n) for n in 1 .. 64 */
   8174 static Double two_to_the_minus ( Int n )
   8175 {
   8176    if (n == 1) return 0.5;
   8177    vassert(n >= 2 && n <= 64);
   8178    Int half = n / 2;
   8179    return two_to_the_minus(half) * two_to_the_minus(n - half);
   8180 }
   8181 
   8182 
   8183 /* Returns 2.0 ^ n for n in 1 .. 64 */
   8184 static Double two_to_the_plus ( Int n )
   8185 {
   8186    if (n == 1) return 2.0;
   8187    vassert(n >= 2 && n <= 64);
   8188    Int half = n / 2;
   8189    return two_to_the_plus(half) * two_to_the_plus(n - half);
   8190 }
   8191 
   8192 
   8193 /*------------------------------------------------------------*/
   8194 /*--- SIMD and FP instructions                             ---*/
   8195 /*------------------------------------------------------------*/
   8196 
   8197 static
   8198 Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
   8199 {
   8200    /* 31  29     23  21 20 15 14   10 9 4
   8201       0 q 101110 op2 0  m  0  imm4 0  n d
   8202       Decode fields: op2
   8203    */
   8204 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8205    if (INSN(31,31) != 0
   8206        || INSN(29,24) != BITS6(1,0,1,1,1,0)
   8207        || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
   8208       return False;
   8209    }
   8210    UInt bitQ = INSN(30,30);
   8211    UInt op2  = INSN(23,22);
   8212    UInt mm   = INSN(20,16);
   8213    UInt imm4 = INSN(14,11);
   8214    UInt nn   = INSN(9,5);
   8215    UInt dd   = INSN(4,0);
   8216 
   8217    if (op2 == BITS2(0,0)) {
   8218       /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
   8219       IRTemp sHi = newTempV128();
   8220       IRTemp sLo = newTempV128();
   8221       IRTemp res = newTempV128();
   8222       assign(sHi, getQReg128(mm));
   8223       assign(sLo, getQReg128(nn));
   8224       if (bitQ == 1) {
   8225          if (imm4 == 0) {
   8226             assign(res, mkexpr(sLo));
   8227          } else {
   8228             vassert(imm4 >= 1 && imm4 <= 15);
   8229             assign(res, triop(Iop_SliceV128,
   8230                               mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
   8231          }
   8232          putQReg128(dd, mkexpr(res));
   8233          DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
   8234       } else {
   8235          if (imm4 >= 8) return False;
   8236          if (imm4 == 0) {
   8237             assign(res, mkexpr(sLo));
   8238          } else {
   8239             vassert(imm4 >= 1 && imm4 <= 7);
   8240             IRTemp hi64lo64 = newTempV128();
   8241             assign(hi64lo64, binop(Iop_InterleaveLO64x2,
   8242                                    mkexpr(sHi), mkexpr(sLo)));
   8243             assign(res, triop(Iop_SliceV128,
   8244                               mkexpr(hi64lo64), mkexpr(hi64lo64), mkU8(imm4)));
   8245          }
   8246          putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   8247          DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
   8248       }
   8249       return True;
   8250    }
   8251 
   8252    return False;
   8253 #  undef INSN
   8254 }
   8255 
   8256 
   8257 static
   8258 Bool dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult* dres, UInt insn)
   8259 {
   8260    /* 31  29     23  21 20 15 14  12 11 9 4
   8261       0 q 001110 op2 0  m  0  len op 00 n d
   8262       Decode fields: op2,len,op
   8263    */
   8264 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8265    if (INSN(31,31) != 0
   8266        || INSN(29,24) != BITS6(0,0,1,1,1,0)
   8267        || INSN(21,21) != 0
   8268        || INSN(15,15) != 0
   8269        || INSN(11,10) != BITS2(0,0)) {
   8270       return False;
   8271    }
   8272    UInt bitQ  = INSN(30,30);
   8273    UInt op2   = INSN(23,22);
   8274    UInt mm    = INSN(20,16);
   8275    UInt len   = INSN(14,13);
   8276    UInt bitOP = INSN(12,12);
   8277    UInt nn    = INSN(9,5);
   8278    UInt dd    = INSN(4,0);
   8279 
   8280    if (op2 == X00) {
   8281       /* -------- 00,xx,0 TBL, xx register table -------- */
   8282       /* -------- 00,xx,1 TBX, xx register table -------- */
   8283       /* 31  28        20 15 14  12  9 4
   8284          0q0 01110 000 m  0  len 000 n d  TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
   8285          0q0 01110 000 m  0  len 100 n d  TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
   8286          where Ta = 16b(q=1) or 8b(q=0)
   8287       */
   8288       Bool isTBX = bitOP == 1;
   8289       /* The out-of-range values to use. */
   8290       IRTemp oor_values = newTempV128();
   8291       assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
   8292       /* src value */
   8293       IRTemp src = newTempV128();
   8294       assign(src, getQReg128(mm));
   8295       /* The table values */
   8296       IRTemp tab[4];
   8297       UInt   i;
   8298       for (i = 0; i <= len; i++) {
   8299          vassert(i < 4);
   8300          tab[i] = newTempV128();
   8301          assign(tab[i], getQReg128((nn + i) % 32));
   8302       }
   8303       IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
   8304       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   8305       const HChar* Ta = bitQ ==1 ? "16b" : "8b";
   8306       const HChar* nm = isTBX ? "tbx" : "tbl";
   8307       DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
   8308           nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
   8309       return True;
   8310    }
   8311 
   8312 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8313    return False;
   8314 #  undef INSN
   8315 }
   8316 
   8317 
   8318 static
   8319 Bool dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult* dres, UInt insn)
   8320 {
   8321    /* 31  29     23   21 20 15 14     11 9 4
   8322       0 q 001110 size 0  m  0  opcode 10 n d
   8323       Decode fields: opcode
   8324    */
   8325 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8326    if (INSN(31,31) != 0
   8327        || INSN(29,24) != BITS6(0,0,1,1,1,0)
   8328        || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(11,10) != BITS2(1,0)) {
   8329       return False;
   8330    }
   8331    UInt bitQ   = INSN(30,30);
   8332    UInt size   = INSN(23,22);
   8333    UInt mm     = INSN(20,16);
   8334    UInt opcode = INSN(14,12);
   8335    UInt nn     = INSN(9,5);
   8336    UInt dd     = INSN(4,0);
   8337 
   8338    if (opcode == BITS3(0,0,1) || opcode == BITS3(1,0,1)) {
   8339       /* -------- 001 UZP1 std7_std7_std7 -------- */
   8340       /* -------- 101 UZP2 std7_std7_std7 -------- */
   8341       if (bitQ == 0 && size == X11) return False; // implied 1d case
   8342       Bool   isUZP1 = opcode == BITS3(0,0,1);
   8343       IROp   op     = isUZP1 ? mkVecCATEVENLANES(size)
   8344                              : mkVecCATODDLANES(size);
   8345       IRTemp preL = newTempV128();
   8346       IRTemp preR = newTempV128();
   8347       IRTemp res  = newTempV128();
   8348       if (bitQ == 0) {
   8349          assign(preL, binop(Iop_InterleaveLO64x2, getQReg128(mm),
   8350                                                   getQReg128(nn)));
   8351          assign(preR, mkexpr(preL));
   8352       } else {
   8353          assign(preL, getQReg128(mm));
   8354          assign(preR, getQReg128(nn));
   8355       }
   8356       assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
   8357       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   8358       const HChar* nm  = isUZP1 ? "uzp1" : "uzp2";
   8359       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   8360       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   8361           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   8362       return True;
   8363    }
   8364 
   8365    if (opcode == BITS3(0,1,0) || opcode == BITS3(1,1,0)) {
   8366       /* -------- 010 TRN1 std7_std7_std7 -------- */
   8367       /* -------- 110 TRN2 std7_std7_std7 -------- */
   8368       if (bitQ == 0 && size == X11) return False; // implied 1d case
   8369       Bool   isTRN1 = opcode == BITS3(0,1,0);
   8370       IROp   op1    = isTRN1 ? mkVecCATEVENLANES(size)
   8371                              : mkVecCATODDLANES(size);
   8372       IROp op2 = mkVecINTERLEAVEHI(size);
   8373       IRTemp srcM = newTempV128();
   8374       IRTemp srcN = newTempV128();
   8375       IRTemp res  = newTempV128();
   8376       assign(srcM, getQReg128(mm));
   8377       assign(srcN, getQReg128(nn));
   8378       assign(res, binop(op2, binop(op1, mkexpr(srcM), mkexpr(srcM)),
   8379                              binop(op1, mkexpr(srcN), mkexpr(srcN))));
   8380       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   8381       const HChar* nm  = isTRN1 ? "trn1" : "trn2";
   8382       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   8383       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   8384           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   8385       return True;
   8386    }
   8387 
   8388    if (opcode == BITS3(0,1,1) || opcode == BITS3(1,1,1)) {
   8389       /* -------- 011 ZIP1 std7_std7_std7 -------- */
   8390       /* -------- 111 ZIP2 std7_std7_std7 -------- */
   8391       if (bitQ == 0 && size == X11) return False; // implied 1d case
   8392       Bool   isZIP1 = opcode == BITS3(0,1,1);
   8393       IROp   op     = isZIP1 ? mkVecINTERLEAVELO(size)
   8394                              : mkVecINTERLEAVEHI(size);
   8395       IRTemp preL = newTempV128();
   8396       IRTemp preR = newTempV128();
   8397       IRTemp res  = newTempV128();
   8398       if (bitQ == 0 && !isZIP1) {
   8399          IRTemp z128 = newTempV128();
   8400          assign(z128, mkV128(0x0000));
   8401          // preL = Vm shifted left 32 bits
   8402          // preR = Vn shifted left 32 bits
   8403          assign(preL, triop(Iop_SliceV128,
   8404                             getQReg128(mm), mkexpr(z128), mkU8(12)));
   8405          assign(preR, triop(Iop_SliceV128,
   8406                             getQReg128(nn), mkexpr(z128), mkU8(12)));
   8407 
   8408       } else {
   8409          assign(preL, getQReg128(mm));
   8410          assign(preR, getQReg128(nn));
   8411       }
   8412       assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
   8413       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   8414       const HChar* nm  = isZIP1 ? "zip1" : "zip2";
   8415       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   8416       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   8417           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   8418       return True;
   8419    }
   8420 
   8421    return False;
   8422 #  undef INSN
   8423 }
   8424 
   8425 
   8426 static
   8427 Bool dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult* dres, UInt insn)
   8428 {
   8429    /* 31    28    23   21    16     11 9 4
   8430       0 q u 01110 size 11000 opcode 10 n d
   8431       Decode fields: u,size,opcode
   8432    */
   8433 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8434    if (INSN(31,31) != 0
   8435        || INSN(28,24) != BITS5(0,1,1,1,0)
   8436        || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
   8437       return False;
   8438    }
   8439    UInt bitQ   = INSN(30,30);
   8440    UInt bitU   = INSN(29,29);
   8441    UInt size   = INSN(23,22);
   8442    UInt opcode = INSN(16,12);
   8443    UInt nn     = INSN(9,5);
   8444    UInt dd     = INSN(4,0);
   8445 
   8446    if (opcode == BITS5(0,0,0,1,1)) {
   8447       /* -------- 0,xx,00011 SADDLV -------- */
   8448       /* -------- 1,xx,00011 UADDLV -------- */
   8449       /* size is the narrow size */
   8450       if (size == X11 || (size == X10 && bitQ == 0)) return False;
   8451       Bool   isU = bitU == 1;
   8452       IRTemp src = newTempV128();
   8453       assign(src, getQReg128(nn));
   8454       /* The basic plan is to widen the lower half, and if Q = 1,
   8455          the upper half too.  Add them together (if Q = 1), and in
   8456          either case fold with add at twice the lane width.
   8457       */
   8458       IRExpr* widened
   8459          = mkexpr(math_WIDEN_LO_OR_HI_LANES(
   8460                      isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
   8461       if (bitQ == 1) {
   8462          widened
   8463             = binop(mkVecADD(size+1),
   8464                     widened,
   8465                     mkexpr(math_WIDEN_LO_OR_HI_LANES(
   8466                               isU, True/*fromUpperHalf*/, size, mkexpr(src)))
   8467               );
   8468       }
   8469       /* Now fold. */
   8470       IRTemp tWi = newTempV128();
   8471       assign(tWi, widened);
   8472       IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
   8473       putQReg128(dd, mkexpr(res));
   8474       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   8475       const HChar  ch  = "bhsd"[size];
   8476       DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
   8477           nameQReg128(dd), ch, nameQReg128(nn), arr);
   8478       return True;
   8479    }
   8480 
   8481    UInt ix = 0;
   8482    /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
   8483    else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
   8484    else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
   8485    /**/
   8486    if (ix != 0) {
   8487       /* -------- 0,xx,01010: SMAXV -------- (1) */
   8488       /* -------- 1,xx,01010: UMAXV -------- (2) */
   8489       /* -------- 0,xx,11010: SMINV -------- (3) */
   8490       /* -------- 1,xx,11010: UMINV -------- (4) */
   8491       /* -------- 0,xx,11011: ADDV  -------- (5) */
   8492       vassert(ix >= 1 && ix <= 5);
   8493       if (size == X11) return False; // 1d,2d cases not allowed
   8494       if (size == X10 && bitQ == 0) return False; // 2s case not allowed
   8495       const IROp opMAXS[3]
   8496          = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
   8497       const IROp opMAXU[3]
   8498          = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
   8499       const IROp opMINS[3]
   8500          = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
   8501       const IROp opMINU[3]
   8502          = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
   8503       const IROp opADD[3]
   8504          = { Iop_Add8x16,  Iop_Add16x8,  Iop_Add32x4 };
   8505       vassert(size < 3);
   8506       IROp op = Iop_INVALID;
   8507       const HChar* nm = NULL;
   8508       switch (ix) {
   8509          case 1: op = opMAXS[size]; nm = "smaxv"; break;
   8510          case 2: op = opMAXU[size]; nm = "umaxv"; break;
   8511          case 3: op = opMINS[size]; nm = "sminv"; break;
   8512          case 4: op = opMINU[size]; nm = "uminv"; break;
   8513          case 5: op = opADD[size];  nm = "addv";  break;
   8514          default: vassert(0);
   8515       }
   8516       vassert(op != Iop_INVALID && nm != NULL);
   8517       IRTemp tN1 = newTempV128();
   8518       assign(tN1, getQReg128(nn));
   8519       /* If Q == 0, we're just folding lanes in the lower half of
   8520          the value.  In which case, copy the lower half of the
   8521          source into the upper half, so we can then treat it the
   8522          same as the full width case.  Except for the addition case,
   8523          in which we have to zero out the upper half. */
   8524       IRTemp tN2 = newTempV128();
   8525       assign(tN2, bitQ == 0
   8526                      ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
   8527                                 : mk_CatEvenLanes64x2(tN1,tN1))
   8528                      : mkexpr(tN1));
   8529       IRTemp res = math_FOLDV(tN2, op);
   8530       if (res == IRTemp_INVALID)
   8531          return False; /* means math_FOLDV
   8532                           doesn't handle this case yet */
   8533       putQReg128(dd, mkexpr(res));
   8534       const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
   8535       IRType laneTy = tys[size];
   8536       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   8537       DIP("%s %s, %s.%s\n", nm,
   8538           nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
   8539       return True;
   8540    }
   8541 
   8542    if ((size == X00 || size == X10)
   8543        && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
   8544       /* -------- 0,00,01100: FMAXMNV s_4s -------- */
   8545       /* -------- 0,10,01100: FMINMNV s_4s -------- */
   8546       /* -------- 1,00,01111: FMAXV   s_4s -------- */
   8547       /* -------- 1,10,01111: FMINV   s_4s -------- */
   8548       /* FMAXNM, FMINNM: FIXME -- KLUDGED */
   8549       if (bitQ == 0) return False; // Only 4s is allowed
   8550       Bool   isMIN = (size & 2) == 2;
   8551       Bool   isNM  = opcode == BITS5(0,1,1,0,0);
   8552       IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
   8553       IRTemp src = newTempV128();
   8554       assign(src, getQReg128(nn));
   8555       IRTemp res = math_FOLDV(src, opMXX);
   8556       putQReg128(dd, mkexpr(res));
   8557       DIP("%s%sv s%u, %u.4s\n",
   8558           isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
   8559       return True;
   8560    }
   8561 
   8562 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8563    return False;
   8564 #  undef INSN
   8565 }
   8566 
   8567 
   8568 static
   8569 Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
   8570 {
   8571    /* 31     28       20   15 14   10 9 4
   8572       0 q op 01110000 imm5 0  imm4 1  n d
   8573       Decode fields: q,op,imm4
   8574    */
   8575 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8576    if (INSN(31,31) != 0
   8577        || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
   8578        || INSN(15,15) != 0 || INSN(10,10) != 1) {
   8579       return False;
   8580    }
   8581    UInt bitQ  = INSN(30,30);
   8582    UInt bitOP = INSN(29,29);
   8583    UInt imm5  = INSN(20,16);
   8584    UInt imm4  = INSN(14,11);
   8585    UInt nn    = INSN(9,5);
   8586    UInt dd    = INSN(4,0);
   8587 
   8588    /* -------- x,0,0000: DUP (element, vector) -------- */
   8589    /* 31  28       20   15     9 4
   8590       0q0 01110000 imm5 000001 n d  DUP Vd.T, Vn.Ts[index]
   8591    */
   8592    if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
   8593       UInt   laneNo    = 0;
   8594       UInt   laneSzLg2 = 0;
   8595       HChar  laneCh    = '?';
   8596       IRTemp res       = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
   8597                                              getQReg128(nn), imm5);
   8598       if (res == IRTemp_INVALID)
   8599          return False;
   8600       if (bitQ == 0 && laneSzLg2 == X11)
   8601          return False; /* .1d case */
   8602       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   8603       const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
   8604       DIP("dup %s.%s, %s.%c[%u]\n",
   8605            nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
   8606       return True;
   8607    }
   8608 
   8609    /* -------- x,0,0001: DUP (general, vector) -------- */
   8610    /* 31  28       20   15       9 4
   8611       0q0 01110000 imm5 0 0001 1 n d  DUP Vd.T, Rn
   8612       Q=0 writes 64, Q=1 writes 128
   8613       imm5: xxxx1  8B(q=0)      or 16b(q=1),     R=W
   8614             xxx10  4H(q=0)      or 8H(q=1),      R=W
   8615             xx100  2S(q=0)      or 4S(q=1),      R=W
   8616             x1000  Invalid(q=0) or 2D(q=1),      R=X
   8617             x0000  Invalid(q=0) or Invalid(q=1)
   8618       Require op=0, imm4=0001
   8619    */
   8620    if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
   8621       Bool   isQ = bitQ == 1;
   8622       IRTemp w0  = newTemp(Ity_I64);
   8623       const HChar* arT = "??";
   8624       IRType laneTy = Ity_INVALID;
   8625       if (imm5 & 1) {
   8626          arT    = isQ ? "16b" : "8b";
   8627          laneTy = Ity_I8;
   8628          assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
   8629       }
   8630       else if (imm5 & 2) {
   8631          arT    = isQ ? "8h" : "4h";
   8632          laneTy = Ity_I16;
   8633          assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
   8634       }
   8635       else if (imm5 & 4) {
   8636          arT    = isQ ? "4s" : "2s";
   8637          laneTy = Ity_I32;
   8638          assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
   8639       }
   8640       else if ((imm5 & 8) && isQ) {
   8641          arT    = "2d";
   8642          laneTy = Ity_I64;
   8643          assign(w0, getIReg64orZR(nn));
   8644       }
   8645       else {
   8646          /* invalid; leave laneTy unchanged. */
   8647       }
   8648       /* */
   8649       if (laneTy != Ity_INVALID) {
   8650          IRTemp w1 = math_DUP_TO_64(w0, laneTy);
   8651          putQReg128(dd, binop(Iop_64HLtoV128,
   8652                               isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
   8653          DIP("dup %s.%s, %s\n",
   8654              nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
   8655          return True;
   8656       }
   8657       /* invalid */
   8658       return False;
   8659    }
   8660 
   8661    /* -------- 1,0,0011: INS (general) -------- */
   8662    /* 31  28       20   15     9 4
   8663       010 01110000 imm5 000111 n d  INS Vd.Ts[ix], Rn
   8664       where Ts,ix = case imm5 of xxxx1 -> B, xxxx
   8665                                  xxx10 -> H, xxx
   8666                                  xx100 -> S, xx
   8667                                  x1000 -> D, x
   8668    */
   8669    if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
   8670       HChar   ts     = '?';
   8671       UInt    laneNo = 16;
   8672       IRExpr* src    = NULL;
   8673       if (imm5 & 1) {
   8674          src    = unop(Iop_64to8, getIReg64orZR(nn));
   8675          laneNo = (imm5 >> 1) & 15;
   8676          ts     = 'b';
   8677       }
   8678       else if (imm5 & 2) {
   8679          src    = unop(Iop_64to16, getIReg64orZR(nn));
   8680          laneNo = (imm5 >> 2) & 7;
   8681          ts     = 'h';
   8682       }
   8683       else if (imm5 & 4) {
   8684          src    = unop(Iop_64to32, getIReg64orZR(nn));
   8685          laneNo = (imm5 >> 3) & 3;
   8686          ts     = 's';
   8687       }
   8688       else if (imm5 & 8) {
   8689          src    = getIReg64orZR(nn);
   8690          laneNo = (imm5 >> 4) & 1;
   8691          ts     = 'd';
   8692       }
   8693       /* */
   8694       if (src) {
   8695          vassert(laneNo < 16);
   8696          putQRegLane(dd, laneNo, src);
   8697          DIP("ins %s.%c[%u], %s\n",
   8698              nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
   8699          return True;
   8700       }
   8701       /* invalid */
   8702       return False;
   8703    }
   8704 
   8705    /* -------- x,0,0101: SMOV -------- */
   8706    /* -------- x,0,0111: UMOV -------- */
   8707    /* 31  28        20   15     9 4
   8708       0q0 01110 000 imm5 001111 n d  UMOV Xd/Wd, Vn.Ts[index]
   8709       0q0 01110 000 imm5 001011 n d  SMOV Xd/Wd, Vn.Ts[index]
   8710       dest is Xd when q==1, Wd when q==0
   8711       UMOV:
   8712          Ts,index,ops = case q:imm5 of
   8713                           0:xxxx1 -> B, xxxx, 8Uto64
   8714                           1:xxxx1 -> invalid
   8715                           0:xxx10 -> H, xxx,  16Uto64
   8716                           1:xxx10 -> invalid
   8717                           0:xx100 -> S, xx,   32Uto64
   8718                           1:xx100 -> invalid
   8719                           1:x1000 -> D, x,    copy64
   8720                           other   -> invalid
   8721       SMOV:
   8722          Ts,index,ops = case q:imm5 of
   8723                           0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
   8724                           1:xxxx1 -> B, xxxx, 8Sto64
   8725                           0:xxx10 -> H, xxx,  (32Uto64 . 16Sto32)
   8726                           1:xxx10 -> H, xxx,  16Sto64
   8727                           0:xx100 -> invalid
   8728                           1:xx100 -> S, xx,   32Sto64
   8729                           1:x1000 -> invalid
   8730                           other   -> invalid
   8731    */
   8732    if (bitOP == 0 && (imm4 == BITS4(0,1,0,1) || imm4 == BITS4(0,1,1,1))) {
   8733       Bool isU  = (imm4 & 2) == 2;
   8734       const HChar* arTs = "??";
   8735       UInt    laneNo = 16; /* invalid */
   8736       // Setting 'res' to non-NULL determines valid/invalid
   8737       IRExpr* res    = NULL;
   8738       if (!bitQ && (imm5 & 1)) { // 0:xxxx1
   8739          laneNo = (imm5 >> 1) & 15;
   8740          IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
   8741          res = isU ? unop(Iop_8Uto64, lane)
   8742                    : unop(Iop_32Uto64, unop(Iop_8Sto32, lane));
   8743          arTs = "b";
   8744       }
   8745       else if (bitQ && (imm5 & 1)) { // 1:xxxx1
   8746          laneNo = (imm5 >> 1) & 15;
   8747          IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
   8748          res = isU ? NULL
   8749                    : unop(Iop_8Sto64, lane);
   8750          arTs = "b";
   8751       }
   8752       else if (!bitQ && (imm5 & 2)) { // 0:xxx10
   8753          laneNo = (imm5 >> 2) & 7;
   8754          IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
   8755          res = isU ? unop(Iop_16Uto64, lane)
   8756                    : unop(Iop_32Uto64, unop(Iop_16Sto32, lane));
   8757          arTs = "h";
   8758       }
   8759       else if (bitQ && (imm5 & 2)) { // 1:xxx10
   8760          laneNo = (imm5 >> 2) & 7;
   8761          IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
   8762          res = isU ? NULL
   8763                    : unop(Iop_16Sto64, lane);
   8764          arTs = "h";
   8765       }
   8766       else if (!bitQ && (imm5 & 4)) { // 0:xx100
   8767          laneNo = (imm5 >> 3) & 3;
   8768          IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
   8769          res = isU ? unop(Iop_32Uto64, lane)
   8770                    : NULL;
   8771          arTs = "s";
   8772       }
   8773       else if (bitQ && (imm5 & 4)) { // 1:xxx10
   8774          laneNo = (imm5 >> 3) & 3;
   8775          IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
   8776          res = isU ? NULL
   8777                    : unop(Iop_32Sto64, lane);
   8778          arTs = "s";
   8779       }
   8780       else if (bitQ && (imm5 & 8)) { // 1:x1000
   8781          laneNo = (imm5 >> 4) & 1;
   8782          IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64);
   8783          res = isU ? lane
   8784                    : NULL;
   8785          arTs = "d";
   8786       }
   8787       /* */
   8788       if (res) {
   8789          vassert(laneNo < 16);
   8790          putIReg64orZR(dd, res);
   8791          DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's',
   8792              nameIRegOrZR(bitQ == 1, dd),
   8793              nameQReg128(nn), arTs, laneNo);
   8794          return True;
   8795       }
   8796       /* invalid */
   8797       return False;
   8798    }
   8799 
   8800    /* -------- 1,1,xxxx: INS (element) -------- */
   8801    /* 31  28       20     14   9 4
   8802       011 01110000 imm5 0 imm4 n d  INS Vd.Ts[ix1], Vn.Ts[ix2]
   8803       where Ts,ix1,ix2
   8804                = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
   8805                               xxx10 -> H, xxx,  imm4[3:1]
   8806                               xx100 -> S, xx,   imm4[3:2]
   8807                               x1000 -> D, x,    imm4[3:3]
   8808    */
   8809    if (bitQ == 1 && bitOP == 1) {
   8810       HChar   ts  = '?';
   8811       IRType  ity = Ity_INVALID;
   8812       UInt    ix1 = 16;
   8813       UInt    ix2 = 16;
   8814       if (imm5 & 1) {
   8815          ts  = 'b';
   8816          ity = Ity_I8;
   8817          ix1 = (imm5 >> 1) & 15;
   8818          ix2 = (imm4 >> 0) & 15;
   8819       }
   8820       else if (imm5 & 2) {
   8821          ts  = 'h';
   8822          ity = Ity_I16;
   8823          ix1 = (imm5 >> 2) & 7;
   8824          ix2 = (imm4 >> 1) & 7;
   8825       }
   8826       else if (imm5 & 4) {
   8827          ts  = 's';
   8828          ity = Ity_I32;
   8829          ix1 = (imm5 >> 3) & 3;
   8830          ix2 = (imm4 >> 2) & 3;
   8831       }
   8832       else if (imm5 & 8) {
   8833          ts  = 'd';
   8834          ity = Ity_I64;
   8835          ix1 = (imm5 >> 4) & 1;
   8836          ix2 = (imm4 >> 3) & 1;
   8837       }
   8838       /* */
   8839       if (ity != Ity_INVALID) {
   8840          vassert(ix1 < 16);
   8841          vassert(ix2 < 16);
   8842          putQRegLane(dd, ix1, getQRegLane(nn, ix2, ity));
   8843          DIP("ins %s.%c[%u], %s.%c[%u]\n",
   8844              nameQReg128(dd), ts, ix1, nameQReg128(nn), ts, ix2);
   8845          return True;
   8846       }
   8847       /* invalid */
   8848       return False;
   8849    }
   8850 
   8851    return False;
   8852 #  undef INSN
   8853 }
   8854 
   8855 
   8856 static
   8857 Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
   8858 {
   8859    /* 31    28          18  15    11 9     4
   8860       0q op 01111 00000 abc cmode 01 defgh d
   8861       Decode fields: q,op,cmode
   8862       Bit 11 is really "o2", but it is always zero.
   8863    */
   8864 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   8865    if (INSN(31,31) != 0
   8866        || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
   8867        || INSN(11,10) != BITS2(0,1)) {
   8868       return False;
   8869    }
   8870    UInt bitQ     = INSN(30,30);
   8871    UInt bitOP    = INSN(29,29);
   8872    UInt cmode    = INSN(15,12);
   8873    UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
   8874    UInt dd       = INSN(4,0);
   8875 
   8876    ULong imm64lo  = 0;
   8877    UInt  op_cmode = (bitOP << 4) | cmode;
   8878    Bool  ok       = False;
   8879    Bool  isORR    = False;
   8880    Bool  isBIC    = False;
   8881    Bool  isMOV    = False;
   8882    Bool  isMVN    = False;
   8883    Bool  isFMOV   = False;
   8884    switch (op_cmode) {
   8885       /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
   8886       /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
   8887       /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
   8888       /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
   8889       case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
   8890       case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
   8891          ok = True; isMOV = True; break;
   8892 
   8893       /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
   8894       /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
   8895       /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
   8896       /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
   8897       case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
   8898       case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
   8899          ok = True; isORR = True; break;
   8900 
   8901       /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
   8902       /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
   8903       case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
   8904          ok = True; isMOV = True; break;
   8905 
   8906       /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
   8907       /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
   8908       case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
   8909          ok = True; isORR = True; break;
   8910 
   8911       /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
   8912       /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
   8913       case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
   8914          ok = True; isMOV = True; break;
   8915 
   8916       /* -------- x,0,1110 MOVI 8-bit -------- */
   8917       case BITS5(0,1,1,1,0):
   8918          ok = True; isMOV = True; break;
   8919 
   8920       /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
   8921       case BITS5(0,1,1,1,1): // 0:1111
   8922          ok = True; isFMOV = True; break;
   8923 
   8924       /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
   8925       /* -------- x,1,0010 MVNI 32-bit shifted imm  -------- */
   8926       /* -------- x,1,0100 MVNI 32-bit shifted imm  -------- */
   8927       /* -------- x,1,0110 MVNI 32-bit shifted imm  -------- */
   8928       case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
   8929       case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
   8930          ok = True; isMVN = True; break;
   8931 
   8932       /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
   8933       /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
   8934       /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
   8935       /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
   8936       case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
   8937       case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
   8938          ok = True; isBIC = True; break;
   8939 
   8940       /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
   8941       /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
   8942       case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
   8943          ok = True; isMVN = True; break;
   8944 
   8945       /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
   8946       /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
   8947       case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
   8948          ok = True; isBIC = True; break;
   8949 
   8950       /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
   8951       /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
   8952       case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
   8953          ok = True; isMVN = True; break;
   8954 
   8955       /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
   8956       /* -------- 1,1,1110 MOVI 64-bit vector -------- */
   8957       case BITS5(1,1,1,1,0):
   8958          ok = True; isMOV = True; break;
   8959 
   8960       /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
   8961       case BITS5(1,1,1,1,1): // 1:1111
   8962          ok = bitQ == 1; isFMOV = True; break;
   8963 
   8964       default:
   8965         break;
   8966    }
   8967    if (ok) {
   8968       vassert(1 == (isMOV ? 1 : 0) + (isMVN ? 1 : 0)
   8969                    + (isORR ? 1 : 0) + (isBIC ? 1 : 0) + (isFMOV ? 1 : 0));
   8970       ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, abcdefgh);
   8971    }
   8972    if (ok) {
   8973       if (isORR || isBIC) {
   8974          ULong inv
   8975             = isORR ? 0ULL : ~0ULL;
   8976          IRExpr* immV128
   8977             = binop(Iop_64HLtoV128, mkU64(inv ^ imm64lo), mkU64(inv ^ imm64lo));
   8978          IRExpr* res
   8979             = binop(isORR ? Iop_OrV128 : Iop_AndV128, getQReg128(dd), immV128);
   8980          const HChar* nm = isORR ? "orr" : "bic";
   8981          if (bitQ == 0) {
   8982             putQReg128(dd, unop(Iop_ZeroHI64ofV128, res));
   8983             DIP("%s %s.1d, %016llx\n", nm, nameQReg128(dd), imm64lo);
   8984          } else {
   8985             putQReg128(dd, res);
   8986             DIP("%s %s.2d, #0x%016llx'%016llx\n", nm,
   8987                 nameQReg128(dd), imm64lo, imm64lo);
   8988          }
   8989       }
   8990       else if (isMOV || isMVN || isFMOV) {
   8991          if (isMVN) imm64lo = ~imm64lo;
   8992          ULong   imm64hi = bitQ == 0  ? 0  :  imm64lo;
   8993          IRExpr* immV128 = binop(Iop_64HLtoV128, mkU64(imm64hi),
   8994                                                  mkU64(imm64lo));
   8995          putQReg128(dd, immV128);
   8996          DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi, imm64lo);
   8997       }
   8998       return True;
   8999    }
   9000    /* else fall through */
   9001 
   9002    return False;
   9003 #  undef INSN
   9004 }
   9005 
   9006 
   9007 static
   9008 Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
   9009 {
   9010    /* 31    28       20   15 14   10 9 4
   9011       01 op 11110000 imm5 0  imm4 1  n d
   9012       Decode fields: op,imm4
   9013    */
   9014 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   9015    if (INSN(31,30) != BITS2(0,1)
   9016        || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
   9017        || INSN(15,15) != 0 || INSN(10,10) != 1) {
   9018       return False;
   9019    }
   9020    UInt bitOP = INSN(29,29);
   9021    UInt imm5  = INSN(20,16);
   9022    UInt imm4  = INSN(14,11);
   9023    UInt nn    = INSN(9,5);
   9024    UInt dd    = INSN(4,0);
   9025 
   9026    if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
   9027       /* -------- 0,0000 DUP (element, scalar) -------- */
   9028       IRTemp w0     = newTemp(Ity_I64);
   9029       const HChar* arTs = "??";
   9030       IRType laneTy = Ity_INVALID;
   9031       UInt   laneNo = 16; /* invalid */
   9032       if (imm5 & 1) {
   9033          arTs   = "b";
   9034          laneNo = (imm5 >> 1) & 15;
   9035          laneTy = Ity_I8;
   9036          assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
   9037       }
   9038       else if (imm5 & 2) {
   9039          arTs   = "h";
   9040          laneNo = (imm5 >> 2) & 7;
   9041          laneTy = Ity_I16;
   9042          assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
   9043       }
   9044       else if (imm5 & 4) {
   9045          arTs   = "s";
   9046          laneNo = (imm5 >> 3) & 3;
   9047          laneTy = Ity_I32;
   9048          assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
   9049       }
   9050       else if (imm5 & 8) {
   9051          arTs   = "d";
   9052          laneNo = (imm5 >> 4) & 1;
   9053          laneTy = Ity_I64;
   9054          assign(w0, getQRegLane(nn, laneNo, laneTy));
   9055       }
   9056       else {
   9057          /* invalid; leave laneTy unchanged. */
   9058       }
   9059       /* */
   9060       if (laneTy != Ity_INVALID) {
   9061          vassert(laneNo < 16);
   9062          putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
   9063          DIP("dup %s, %s.%s[%u]\n",
   9064              nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
   9065          return True;
   9066       }
   9067       /* else fall through */
   9068    }
   9069 
   9070    return False;
   9071 #  undef INSN
   9072 }
   9073 
   9074 
   9075 static
   9076 Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn)
   9077 {
   9078    /* 31   28    23 21    16     11 9 4
   9079       01 u 11110 sz 11000 opcode 10 n d
   9080       Decode fields: u,sz,opcode
   9081    */
   9082 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   9083    if (INSN(31,30) != BITS2(0,1)
   9084        || INSN(28,24) != BITS5(1,1,1,1,0)
   9085        || INSN(21,17) != BITS5(1,1,0,0,0)
   9086        || INSN(11,10) != BITS2(1,0)) {
   9087       return False;
   9088    }
   9089    UInt bitU   = INSN(29,29);
   9090    UInt sz     = INSN(23,22);
   9091    UInt opcode = INSN(16,12);
   9092    UInt nn     = INSN(9,5);
   9093    UInt dd     = INSN(4,0);
   9094 
   9095    if (bitU == 0 && sz == X11 && opcode == BITS5(1,1,0,1,1)) {
   9096       /* -------- 0,11,11011 ADDP d_2d -------- */
   9097       IRTemp xy = newTempV128();
   9098       IRTemp xx = newTempV128();
   9099       assign(xy, getQReg128(nn));
   9100       assign(xx, binop(Iop_InterleaveHI64x2, mkexpr(xy), mkexpr(xy)));
   9101       putQReg128(dd, unop(Iop_ZeroHI64ofV128,
   9102                           binop(Iop_Add64x2, mkexpr(xy), mkexpr(xx))));
   9103       DIP("addp d%u, %s.2d\n", dd, nameQReg128(nn));
   9104       return True;
   9105    }
   9106 
   9107    if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
   9108       /* -------- 1,00,01101 ADDP s_2s -------- */
   9109       /* -------- 1,01,01101 ADDP d_2d -------- */
   9110       Bool   isD   = sz == X01;
   9111       IROp   opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
   9112       IROp   opADD = mkVecADDF(isD ? 3 : 2);
   9113       IRTemp src   = newTempV128();
   9114       IRTemp argL  = newTempV128();
   9115       IRTemp argR  = newTempV128();
   9116       assign(src, getQReg128(nn));
   9117       assign(argL, unop(opZHI, mkexpr(src)));
   9118       assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
   9119                                                     mkU8(isD ? 8 : 4))));
   9120       putQReg128(dd, unop(opZHI,
   9121                           triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
   9122                                               mkexpr(argL), mkexpr(argR))));
   9123       DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
   9124       return True;
   9125    }
   9126 
   9127    if (bitU == 1
   9128        && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
   9129       /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
   9130       /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
   9131       /* -------- 1,0x,01111 FMAXP   d_2d, s_2s -------- */
   9132       /* -------- 1,1x,01111 FMINP   d_2d, s_2s -------- */
   9133       /* FMAXNM, FMINNM: FIXME -- KLUDGED */
   9134       Bool   isD   = (sz & 1) == 1;
   9135       Bool   isMIN = (sz & 2) == 2;
   9136       Bool   isNM  = opcode == BITS5(0,1,1,0,0);
   9137       IROp   opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
   9138       IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
   9139       IRTemp src   = newTempV128();
   9140       IRTemp argL  = newTempV128();
   9141       IRTemp argR  = newTempV128();
   9142       assign(src, getQReg128(nn));
   9143       assign(argL, unop(opZHI, mkexpr(src)));
   9144       assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
   9145                                                     mkU8(isD ? 8 : 4))));
   9146       putQReg128(dd, unop(opZHI,
   9147                           binop(opMXX, mkexpr(argL), mkexpr(argR))));
   9148       HChar c = isD ? 'd' : 's';
   9149       DIP("%s%sp %c%u, v%u.2%c\n",
   9150            isMIN ? "fmin" : "fmax", isNM ? "nm" : "", c, dd, nn, c);
   9151       return True;
   9152    }
   9153 
   9154    return False;
   9155 #  undef INSN
   9156 }
   9157 
   9158 
   9159 static
   9160 Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
   9161 {
   9162    /* 31   28     22   18   15     10 9 4
   9163       01 u 111110 immh immb opcode 1  n d
   9164       Decode fields: u,immh,opcode
   9165    */
   9166 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   9167    if (INSN(31,30) != BITS2(0,1)
   9168        || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
   9169       return False;
   9170    }
   9171    UInt bitU   = INSN(29,29);
   9172    UInt immh   = INSN(22,19);
   9173    UInt immb   = INSN(18,16);
   9174    UInt opcode = INSN(15,11);
   9175    UInt nn     = INSN(9,5);
   9176    UInt dd     = INSN(4,0);
   9177    UInt immhb  = (immh << 3) | immb;
   9178 
   9179    if ((immh & 8) == 8
   9180        && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
   9181       /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
   9182       /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
   9183       /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
   9184       /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
   9185       Bool isU   = bitU == 1;
   9186       Bool isAcc = opcode == BITS5(0,0,0,1,0);
   9187       UInt sh    = 128 - immhb;
   9188       vassert(sh >= 1 && sh <= 64);
   9189       IROp    op  = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
   9190       IRExpr* src = getQReg128(nn);
   9191       IRTemp  shf = newTempV128();
   9192       IRTemp  res = newTempV128();
   9193       if (sh == 64 && isU) {
   9194          assign(shf, mkV128(0x0000));
   9195       } else {
   9196          UInt nudge = 0;
   9197          if (sh == 64) {
   9198             vassert(!isU);
   9199             nudge = 1;
   9200          }
   9201          assign(shf, binop(op, src, mkU8(sh - nudge)));
   9202       }
   9203       assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
   9204                         : mkexpr(shf));
   9205       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9206       const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
   9207                               : (isU ? "ushr" : "sshr");
   9208       DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
   9209       return True;
   9210    }
   9211 
   9212    if ((immh & 8) == 8
   9213        && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
   9214       /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
   9215       /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
   9216       /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
   9217       /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
   9218       Bool isU   = bitU == 1;
   9219       Bool isAcc = opcode == BITS5(0,0,1,1,0);
   9220       UInt sh    = 128 - immhb;
   9221       vassert(sh >= 1 && sh <= 64);
   9222       IROp    op  = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
   9223       vassert(sh >= 1 && sh <= 64);
   9224       IRExpr* src  = getQReg128(nn);
   9225       IRTemp  imm8 = newTemp(Ity_I8);
   9226       assign(imm8, mkU8((UChar)(-sh)));
   9227       IRExpr* amt  = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
   9228       IRTemp  shf  = newTempV128();
   9229       IRTemp  res  = newTempV128();
   9230       assign(shf, binop(op, src, amt));
   9231       assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
   9232                         : mkexpr(shf));
   9233       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9234       const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
   9235                               : (isU ? "urshr" : "srshr");
   9236       DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
   9237       return True;
   9238    }
   9239 
   9240    if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
   9241       /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
   9242       UInt sh = 128 - immhb;
   9243       vassert(sh >= 1 && sh <= 64);
   9244       if (sh == 64) {
   9245          putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
   9246       } else {
   9247          /* sh is in range 1 .. 63 */
   9248          ULong   nmask  = (ULong)(((Long)0x8000000000000000ULL) >> (sh-1));
   9249          IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
   9250          IRTemp  res    = newTempV128();
   9251          assign(res, binop(Iop_OrV128,
   9252                            binop(Iop_AndV128, getQReg128(dd), nmaskV),
   9253                            binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
   9254          putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9255       }
   9256       DIP("sri d%u, d%u, #%u\n", dd, nn, sh);
   9257       return True;
   9258    }
   9259 
   9260    if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
   9261       /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
   9262       UInt sh = immhb - 64;
   9263       vassert(sh >= 0 && sh < 64);
   9264       putQReg128(dd,
   9265                  unop(Iop_ZeroHI64ofV128,
   9266                       sh == 0 ? getQReg128(nn)
   9267                               : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
   9268       DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
   9269       return True;
   9270    }
   9271 
   9272    if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
   9273       /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
   9274       UInt sh = immhb - 64;
   9275       vassert(sh >= 0 && sh < 64);
   9276       if (sh == 0) {
   9277          putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(nn)));
   9278       } else {
   9279          /* sh is in range 1 .. 63 */
   9280          ULong   nmask  = (1ULL << sh) - 1;
   9281          IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
   9282          IRTemp  res    = newTempV128();
   9283          assign(res, binop(Iop_OrV128,
   9284                            binop(Iop_AndV128, getQReg128(dd), nmaskV),
   9285                            binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
   9286          putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9287       }
   9288       DIP("sli d%u, d%u, #%u\n", dd, nn, sh);
   9289       return True;
   9290    }
   9291 
   9292    if (opcode == BITS5(0,1,1,1,0)
   9293        || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
   9294       /* -------- 0,01110  SQSHL  #imm -------- */
   9295       /* -------- 1,01110  UQSHL  #imm -------- */
   9296       /* -------- 1,01100  SQSHLU #imm -------- */
   9297       UInt size  = 0;
   9298       UInt shift = 0;
   9299       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   9300       if (!ok) return False;
   9301       vassert(size >= 0 && size <= 3);
   9302       /* The shift encoding has opposite sign for the leftwards case.
   9303          Adjust shift to compensate. */
   9304       UInt lanebits = 8 << size;
   9305       shift = lanebits - shift;
   9306       vassert(shift >= 0 && shift < lanebits);
   9307       const HChar* nm = NULL;
   9308       /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
   9309       else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
   9310       else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
   9311       else vassert(0);
   9312       IRTemp qDiff1 = IRTemp_INVALID;
   9313       IRTemp qDiff2 = IRTemp_INVALID;
   9314       IRTemp res = IRTemp_INVALID;
   9315       IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
   9316       /* This relies on the fact that the zeroed out lanes generate zeroed
   9317          result lanes and don't saturate, so there's no point in trimming
   9318          the resulting res, qDiff1 or qDiff2 values. */
   9319       math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
   9320       putQReg128(dd, mkexpr(res));
   9321       updateQCFLAGwithDifference(qDiff1, qDiff2);
   9322       const HChar arr = "bhsd"[size];
   9323       DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
   9324       return True;
   9325    }
   9326 
   9327    if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
   9328        || (bitU == 1
   9329            && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
   9330       /* -------- 0,10010   SQSHRN #imm -------- */
   9331       /* -------- 1,10010   UQSHRN #imm -------- */
   9332       /* -------- 0,10011  SQRSHRN #imm -------- */
   9333       /* -------- 1,10011  UQRSHRN #imm -------- */
   9334       /* -------- 1,10000  SQSHRUN #imm -------- */
   9335       /* -------- 1,10001 SQRSHRUN #imm -------- */
   9336       UInt size  = 0;
   9337       UInt shift = 0;
   9338       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   9339       if (!ok || size == X11) return False;
   9340       vassert(size >= X00 && size <= X10);
   9341       vassert(shift >= 1 && shift <= (8 << size));
   9342       const HChar* nm = "??";
   9343       IROp op = Iop_INVALID;
   9344       /* Decide on the name and the operation. */
   9345       /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
   9346          nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
   9347       }
   9348       else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
   9349          nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
   9350       }
   9351       else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
   9352          nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
   9353       }
   9354       else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
   9355          nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
   9356       }
   9357       else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
   9358          nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
   9359       }
   9360       else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
   9361          nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
   9362       }
   9363       else vassert(0);
   9364       /* Compute the result (Q, shifted value) pair. */
   9365       IRTemp src128 = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size+1, getQReg128(nn));
   9366       IRTemp pair   = newTempV128();
   9367       assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
   9368       /* Update the result reg */
   9369       IRTemp res64in128 = newTempV128();
   9370       assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
   9371       putQReg128(dd, mkexpr(res64in128));
   9372       /* Update the Q flag. */
   9373       IRTemp q64q64 = newTempV128();
   9374       assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
   9375       IRTemp z128 = newTempV128();
   9376       assign(z128, mkV128(0x0000));
   9377       updateQCFLAGwithDifference(q64q64, z128);
   9378       /* */
   9379       const HChar arrNarrow = "bhsd"[size];
   9380       const HChar arrWide   = "bhsd"[size+1];
   9381       DIP("%s %c%u, %c%u, #%u\n", nm, arrNarrow, dd, arrWide, nn, shift);
   9382       return True;
   9383    }
   9384 
   9385    if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,0,0)) {
   9386       /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
   9387       /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
   9388       UInt size  = 0;
   9389       UInt fbits = 0;
   9390       Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
   9391       /* The following holds because immh is never zero. */
   9392       vassert(ok);
   9393       /* The following holds because immh >= 0100. */
   9394       vassert(size == X10 || size == X11);
   9395       Bool isD = size == X11;
   9396       Bool isU = bitU == 1;
   9397       vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
   9398       Double  scale  = two_to_the_minus(fbits);
   9399       IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
   9400                              : IRExpr_Const(IRConst_F32( (Float)scale ));
   9401       IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
   9402       IROp    opCVT  = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
   9403                            : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
   9404       IRType tyF = isD ? Ity_F64 : Ity_F32;
   9405       IRType tyI = isD ? Ity_I64 : Ity_I32;
   9406       IRTemp src = newTemp(tyI);
   9407       IRTemp res = newTemp(tyF);
   9408       IRTemp rm  = mk_get_IR_rounding_mode();
   9409       assign(src, getQRegLane(nn, 0, tyI));
   9410       assign(res, triop(opMUL, mkexpr(rm),
   9411                                binop(opCVT, mkexpr(rm), mkexpr(src)), scaleE));
   9412       putQRegLane(dd, 0, mkexpr(res));
   9413       if (!isD) {
   9414          putQRegLane(dd, 1, mkU32(0));
   9415       }
   9416       putQRegLane(dd, 1, mkU64(0));
   9417       const HChar ch = isD ? 'd' : 's';
   9418       DIP("%s %c%u, %c%u, #%u\n", isU ? "ucvtf" : "scvtf",
   9419           ch, dd, ch, nn, fbits);
   9420       return True;
   9421    }
   9422 
   9423    if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,1,1)) {
   9424       /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
   9425       /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
   9426       UInt size  = 0;
   9427       UInt fbits = 0;
   9428       Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
   9429       /* The following holds because immh is never zero. */
   9430       vassert(ok);
   9431       /* The following holds because immh >= 0100. */
   9432       vassert(size == X10 || size == X11);
   9433       Bool isD = size == X11;
   9434       Bool isU = bitU == 1;
   9435       vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
   9436       Double  scale  = two_to_the_plus(fbits);
   9437       IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
   9438                            : IRExpr_Const(IRConst_F32( (Float)scale ));
   9439       IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
   9440       IROp    opCVT  = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
   9441                            : (isD ? Iop_F64toI64S : Iop_F32toI32S);
   9442       IRType tyF = isD ? Ity_F64 : Ity_F32;
   9443       IRType tyI = isD ? Ity_I64 : Ity_I32;
   9444       IRTemp src = newTemp(tyF);
   9445       IRTemp res = newTemp(tyI);
   9446       IRTemp rm  = newTemp(Ity_I32);
   9447       assign(src, getQRegLane(nn, 0, tyF));
   9448       assign(rm,  mkU32(Irrm_ZERO));
   9449       assign(res, binop(opCVT, mkexpr(rm),
   9450                                triop(opMUL, mkexpr(rm), mkexpr(src), scaleE)));
   9451       putQRegLane(dd, 0, mkexpr(res));
   9452       if (!isD) {
   9453          putQRegLane(dd, 1, mkU32(0));
   9454       }
   9455       putQRegLane(dd, 1, mkU64(0));
   9456       const HChar ch = isD ? 'd' : 's';
   9457       DIP("%s %c%u, %c%u, #%u\n", isU ? "fcvtzu" : "fcvtzs",
   9458           ch, dd, ch, nn, fbits);
   9459       return True;
   9460    }
   9461 
   9462 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   9463    return False;
   9464 #  undef INSN
   9465 }
   9466 
   9467 
   9468 static
   9469 Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
   9470 {
           /* Try to decode |insn| as a member of the AdvSIMD "scalar three
              different" encoding group, whose bit layout is shown in the
              comment that follows.

              Returns False if the fixed bits do not match or if no case
              below accepts the (U, size, opcode) combination.  Otherwise IR
              for the instruction is generated, its disassembly text is
              handed to DIP, and True is returned.

              |dres| is not referenced anywhere in this function.  The only
              instructions handled here are SQDMULL, SQDMLAL and SQDMLSL. */
   9471    /* 31 29 28    23   21 20 15     11 9 4
   9472       01 U  11110 size 1  m  opcode 00 n d
   9473       Decode fields: u,opcode
   9474    */
   9475 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   9476    if (INSN(31,30) != BITS2(0,1)
   9477        || INSN(28,24) != BITS5(1,1,1,1,0)
   9478        || INSN(21,21) != 1
   9479        || INSN(11,10) != BITS2(0,0)) {
   9480       return False;
   9481    }
   9482    UInt bitU   = INSN(29,29);
   9483    UInt size   = INSN(23,22);
   9484    UInt mm     = INSN(20,16);
   9485    UInt opcode = INSN(15,12);
   9486    UInt nn     = INSN(9,5);
   9487    UInt dd     = INSN(4,0);
   9488    vassert(size < 4);
   9489 
   9490    if (bitU == 0
   9491        && (opcode == BITS4(1,1,0,1)
   9492            || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
   9493       /* -------- 0,1101  SQDMULL -------- */ // 0 (ks)
   9494       /* -------- 0,1001  SQDMLAL -------- */ // 1
   9495       /* -------- 0,1011  SQDMLSL -------- */ // 2
   9496       /* Widens, and size refers to the narrowed lanes. */
              /* ks selects which of the three instructions this is.  It starts
                 out as 3, a value none of the cases below assigns, so the
                 assertion after the switch would catch a path that failed to
                 set it. */
   9497       UInt ks = 3;
   9498       switch (opcode) {
   9499          case BITS4(1,1,0,1): ks = 0; break;
   9500          case BITS4(1,0,0,1): ks = 1; break;
   9501          case BITS4(1,0,1,1): ks = 2; break;
   9502          default: vassert(0);
   9503       }
              /* ks is unsigned, so only the upper bound is a meaningful check. */
   9504       vassert(ks <= 2);
              /* Only the size encodings 01 and 10 are accepted. */
   9505       if (size == X00 || size == X11) return False;
   9506       vassert(size <= 2);
   9507       IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
   9508       vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
   9509       newTempsV128_3(&vecN, &vecM, &vecD);
   9510       assign(vecN, getQReg128(nn));
   9511       assign(vecM, getQReg128(mm));
   9512       assign(vecD, getQReg128(dd));
              /* "mas"[ks] turns ks into the mode character given to the helper:
                 'm' for SQDMULL, 'a' for SQDMLAL, 's' for SQDMLSL.  The old
                 value of the destination (vecD) is only supplied for the two
                 accumulating forms; SQDMULL passes IRTemp_INVALID instead. */
   9513       math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
   9514                        False/*!is2*/, size, "mas"[ks],
   9515                        vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
              /* size+1 because the result lanes are one size wider than the
                 source lanes.  NOTE(review): presumably opZHI zeroes everything
                 above the lowest widened lane -- confirm against
                 mkVecZEROHIxxOFV128. */
   9516       IROp opZHI = mkVecZEROHIxxOFV128(size+1);
   9517       putQReg128(dd, unop(opZHI, mkexpr(res)));
   9518       vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
   9519       updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
              /* The second saturation pair is optional: it is only folded into
                 the QC flag if the helper filled it in. */
   9520       if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
   9521          updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
   9522       }
   9523       const HChar* nm        = ks == 0 ? "sqdmull"
   9524                                        : (ks == 1 ? "sqdmlal" : "sqdmlsl");
              /* size is 1 or 2 here, so both string indices are in range. */
   9525       const HChar  arrNarrow = "bhsd"[size];
   9526       const HChar  arrWide   = "bhsd"[size+1];
   9527       DIP("%s %c%u, %c%u, %c%u\n",
   9528           nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
   9529       return True;
   9530    }
   9531 
   9532    return False;
   9533 #  undef INSN
   9534 }
   9535 
   9536 
   9537 static
   9538 Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
   9539 {
           /* Try to decode |insn| as a member of the AdvSIMD "scalar three
              same" encoding group, whose bit layout is shown in the comment
              that follows.

              Returns False if the fixed bits do not match or if none of the
              cases below accepts the (U, size, opcode) combination.
              Otherwise IR for the instruction is generated, its disassembly
              text is handed to DIP, and True is returned.

              |dres| is not referenced anywhere in this function.  Each case
              is introduced by comment lines of the form "U,size,opcode NAME"
              listing the encodings it covers. */
   9540    /* 31 29 28    23   21 20 15     10 9 4
   9541       01 U  11110 size 1  m  opcode 1  n d
   9542       Decode fields: u,size,opcode
   9543    */
   9544 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   9545    if (INSN(31,30) != BITS2(0,1)
   9546        || INSN(28,24) != BITS5(1,1,1,1,0)
   9547        || INSN(21,21) != 1
   9548        || INSN(10,10) != 1) {
   9549       return False;
   9550    }
   9551    UInt bitU   = INSN(29,29);
   9552    UInt size   = INSN(23,22);
   9553    UInt mm     = INSN(20,16);
   9554    UInt opcode = INSN(15,11);
   9555    UInt nn     = INSN(9,5);
   9556    UInt dd     = INSN(4,0);
   9557    vassert(size < 4);
   9558 
   9559    if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
   9560       /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
   9561       /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
   9562       /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
   9563       /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
   9564       Bool isADD = opcode == BITS5(0,0,0,0,1);
   9565       Bool isU   = bitU == 1;
              /* qop is the saturating operation, nop the plain one.  Both
                 results are computed below and handed to
                 updateQCFLAGwithDifference. */
   9566       IROp qop   = Iop_INVALID;
   9567       IROp nop   = Iop_INVALID;
   9568       if (isADD) {
   9569          qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
   9570          nop = mkVecADD(size);
   9571       } else {
   9572          qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
   9573          nop = mkVecSUB(size);
   9574       }
   9575       IRTemp argL = newTempV128();
   9576       IRTemp argR = newTempV128();
   9577       IRTemp qres = newTempV128();
   9578       IRTemp nres = newTempV128();
   9579       assign(argL, getQReg128(nn));
   9580       assign(argR, getQReg128(mm));
   9581       assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
   9582                              size, binop(qop, mkexpr(argL), mkexpr(argR)))));
   9583       assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
   9584                              size, binop(nop, mkexpr(argL), mkexpr(argR)))));
              /* Only the saturating result is written back to the register. */
   9585       putQReg128(dd, mkexpr(qres));
   9586       updateQCFLAGwithDifference(qres, nres);
   9587       const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd")
   9588                                : (isU ? "uqsub" : "sqsub");
   9589       const HChar  arr = "bhsd"[size];
   9590       DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
   9591       return True;
   9592    }
   9593 
   9594    if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
   9595       /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
   9596       /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
   9597       Bool    isGT = bitU == 0;
   9598       IRExpr* argL = getQReg128(nn);
   9599       IRExpr* argR = getQReg128(mm);
   9600       IRTemp  res  = newTempV128();
   9601       assign(res,
   9602              isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
   9603                   : binop(Iop_CmpGT64Ux2, argL, argR));
   9604       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9605       DIP("%s %s, %s, %s\n",isGT ? "cmgt" : "cmhi",
   9606           nameQRegLO(dd, Ity_I64),
   9607           nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
   9608       return True;
   9609    }
   9610 
   9611    if (size == X11 && opcode == BITS5(0,0,1,1,1)) {
   9612       /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
   9613       /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
   9614       Bool    isGE = bitU == 0;
   9615       IRExpr* argL = getQReg128(nn);
   9616       IRExpr* argR = getQReg128(mm);
   9617       IRTemp  res  = newTempV128();
              /* L >= R is generated as NOT(R > L), using the same signed or
                 unsigned greater-than ops as the CMGT/CMHI case. */
   9618       assign(res,
   9619              isGE ? unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL))
   9620                   : unop(Iop_NotV128, binop(Iop_CmpGT64Ux2, argR, argL)));
   9621       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9622       DIP("%s %s, %s, %s\n", isGE ? "cmge" : "cmhs",
   9623           nameQRegLO(dd, Ity_I64),
   9624           nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
   9625       return True;
   9626    }
   9627 
   9628    if (size == X11 && (opcode == BITS5(0,1,0,0,0)
   9629                        || opcode == BITS5(0,1,0,1,0))) {
   9630       /* -------- 0,11,01000 SSHL  d_d_d -------- */
   9631       /* -------- 0,11,01010 SRSHL d_d_d -------- */
   9632       /* -------- 1,11,01000 USHL  d_d_d -------- */
   9633       /* -------- 1,11,01010 URSHL d_d_d -------- */
              /* Only size == 11 reaches this point, so the mkVec* selectors
                 below are always called with the 64-bit lane size. */
   9634       Bool isU = bitU == 1;
   9635       Bool isR = opcode == BITS5(0,1,0,1,0);
   9636       IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
   9637                      : (isU ? mkVecSHU(size)  : mkVecSHS(size));
   9638       IRTemp res = newTempV128();
   9639       assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
   9640       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9641       const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
   9642                              : (isU ? "ushl"  : "sshl");
   9643       DIP("%s %s, %s, %s\n", nm,
   9644           nameQRegLO(dd, Ity_I64),
   9645           nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
   9646       return True;
   9647    }
   9648 
   9649    if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
   9650       /* -------- 0,xx,01001 SQSHL  std4_std4_std4 -------- */
   9651       /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
   9652       /* -------- 1,xx,01001 UQSHL  std4_std4_std4 -------- */
   9653       /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
   9654       Bool isU = bitU == 1;
   9655       Bool isR = opcode == BITS5(0,1,0,1,1);
   9656       IROp op  = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
   9657                      : (isU ? mkVecQANDUQSH(size)  : mkVecQANDSQSH(size));
   9658       /* This is a bit tricky.  Since we're only interested in the lowest
   9659          lane of the result, we zero out all the rest in the operands, so
   9660          as to ensure that other lanes don't pollute the returned Q value.
   9661          This works because it means, for the lanes we don't care about, we
   9662          are shifting zero by zero, which can never saturate. */
              /* The op yields a V256.  Its V128_0 half is written to the
                 destination register; its V128_1 half is compared against an
                 all-zeroes vector to update the QC flag. */
   9663       IRTemp res256 = newTemp(Ity_V256);
   9664       IRTemp resSH  = newTempV128();
   9665       IRTemp resQ   = newTempV128();
   9666       IRTemp zero   = newTempV128();
   9667       assign(
   9668          res256,
   9669          binop(op,
   9670                mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
   9671                mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
   9672       assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
   9673       assign(resQ,  unop(Iop_V256toV128_1, mkexpr(res256)));
   9674       assign(zero,  mkV128(0x0000));
   9675       putQReg128(dd, mkexpr(resSH));
   9676       updateQCFLAGwithDifference(resQ, zero);
   9677       const HChar* nm  = isR ? (isU ? "uqrshl" : "sqrshl")
   9678                              : (isU ? "uqshl"  : "sqshl");
   9679       const HChar  arr = "bhsd"[size];
   9680       DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
   9681       return True;
   9682    }
   9683 
   9684    if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
   9685       /* -------- 0,11,10000 ADD d_d_d -------- */
   9686       /* -------- 1,11,10000 SUB d_d_d -------- */
   9687       Bool   isSUB = bitU == 1;
   9688       IRTemp res   = newTemp(Ity_I64);
   9689       assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64,
   9690                         getQRegLane(nn, 0, Ity_I64),
   9691                         getQRegLane(mm, 0, Ity_I64)));
              /* 64-bit lane 0 receives the result; 64-bit lane 1 is zeroed. */
   9692       putQRegLane(dd, 0, mkexpr(res));
   9693       putQRegLane(dd, 1, mkU64(0));
   9694       DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add",
   9695           nameQRegLO(dd, Ity_I64),
   9696           nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
   9697       return True;
   9698    }
   9699 
   9700    if (size == X11 && opcode == BITS5(1,0,0,0,1)) {
   9701       /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
   9702       /* -------- 1,11,10001 CMEQ  d_d_d -------- */ // ==
   9703       Bool    isEQ = bitU == 1;
   9704       IRExpr* argL = getQReg128(nn);
   9705       IRExpr* argR = getQReg128(mm);
   9706       IRTemp  res  = newTempV128();
              /* CMTST is generated as NOT((L & R) == 0). */
   9707       assign(res,
   9708              isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
   9709                   : unop(Iop_NotV128, binop(Iop_CmpEQ64x2,
   9710                                             binop(Iop_AndV128, argL, argR),
   9711                                             mkV128(0x0000))));
   9712       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9713       DIP("%s %s, %s, %s\n", isEQ ? "cmeq" : "cmtst",
   9714           nameQRegLO(dd, Ity_I64),
   9715           nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
   9716       return True;
   9717    }
   9718 
   9719    if (opcode == BITS5(1,0,1,1,0)) {
   9720       /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
   9721       /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
              /* Reject the size encodings 00 and 11; only 01 and 10 decode. */
   9722       if (size == X00 || size == X11) return False;
   9723       Bool isR = bitU == 1;
   9724       IRTemp res, sat1q, sat1n, vN, vM;
   9725       res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
   9726       newTempsV128_2(&vN, &vM);
   9727       assign(vN, getQReg128(nn));
   9728       assign(vM, getQReg128(mm));
   9729       math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
              /* The result and both saturation values are each passed through
                 math_ZERO_ALL_EXCEPT_LOWEST_LANE before being used. */
   9730       putQReg128(dd,
   9731                  mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
   9732       updateQCFLAGwithDifference(
   9733          math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
   9734          math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
   9735       const HChar  arr = "bhsd"[size];
   9736       const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
   9737       DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
   9738       return True;
   9739    }
   9740 
   9741    if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
   9742       /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
   9743       IRType ity = size == X11 ? Ity_F64 : Ity_F32;
   9744       IRTemp res = newTemp(ity);
              /* Computed as abs(N - M), with the subtraction using the rounding
                 mode obtained from mk_get_IR_rounding_mode(). */
   9745       assign(res, unop(mkABSF(ity),
   9746                        triop(mkSUBF(ity),
   9747                              mkexpr(mk_get_IR_rounding_mode()),
   9748                              getQRegLO(nn,ity), getQRegLO(mm,ity))));
              /* Zero the whole destination first, then write the low part. */
   9749       putQReg128(dd, mkV128(0x0000));
   9750       putQRegLO(dd, mkexpr(res));
   9751       DIP("fabd %s, %s, %s\n",
   9752           nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
   9753       return True;
   9754    }
   9755 
   9756    if (bitU == 0 && size <= X01 && opcode == BITS5(1,1,0,1,1)) {
   9757       /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
   9758       // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
   9759       IRType ity = size == X01 ? Ity_F64 : Ity_F32;
   9760       IRTemp res = newTemp(ity);
   9761       assign(res, triop(mkMULF(ity),
   9762                         mkexpr(mk_get_IR_rounding_mode()),
   9763                         getQRegLO(nn,ity), getQRegLO(mm,ity)));
              /* Zero the whole destination first, then write the low part. */
   9764       putQReg128(dd, mkV128(0x0000));
   9765       putQRegLO(dd, mkexpr(res));
   9766       DIP("fmulx %s, %s, %s\n",
   9767           nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
   9768       return True;
   9769    }
   9770 
   9771    if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
   9772       /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
   9773       /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
   9774       Bool   isD   = size == X01;
   9775       IRType ity   = isD ? Ity_F64 : Ity_F32;
   9776       Bool   isGE  = bitU == 1;
              /* FCMGE is generated with a CmpLE op and the operands swapped
                 (M <= N); FCMEQ uses a CmpEQ op with the operands in order. */
   9777       IROp   opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
   9778                           : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
   9779       IRTemp res   = newTempV128();
   9780       assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
   9781                        : binop(opCMP, getQReg128(nn), getQReg128(mm)));
   9782       putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
   9783                                                              mkexpr(res))));
   9784       DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
   9785           nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
   9786       return True;
   9787    }
   9788 
   9789    if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
   9790       /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
   9791       Bool   isD   = size == X11;
   9792       IRType ity   = isD ? Ity_F64 : Ity_F32;
              /* N > M is generated as M < N: a CmpLT op with operands swapped. */
   9793       IROp   opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
   9794       IRTemp res   = newTempV128();
   9795       assign(res, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
   9796       putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
   9797                                                              mkexpr(res))));
   9798       DIP("%s %s, %s, %s\n", "fcmgt",
   9799           nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
   9800       return True;
   9801    }
   9802 
   9803    if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
   9804       /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
   9805       /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
              /* Bit 0 of size selects double vs single precision; bit 1 selects
                 FACGT vs FACGE. */
   9806       Bool   isD   = (size & 1) == 1;
   9807       IRType ity   = isD ? Ity_F64 : Ity_F32;
   9808       Bool   isGT  = (size & 2) == 2;
   9809       IROp   opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
   9810                           : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
   9811       IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
   9812       IRTemp res   = newTempV128();
              /* Absolute values are compared with LT/LE and swapped operands. */
   9813       assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
   9814                                unop(opABS, getQReg128(nn)))); // swapd
   9815       putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
   9816                                                              mkexpr(res))));
   9817       DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
   9818           nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
   9819       return True;
   9820    }
   9821 
   9822    if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
   9823       /* -------- 0,0x,11111: FRECPS  d_d_d, s_s_s -------- */
   9824       /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
              /* Bit 1 of size selects FRSQRTS vs FRECPS; bit 0 selects double
                 vs single precision. */
   9825       Bool isSQRT = (size & 2) == 2;
   9826       Bool isD    = (size & 1) == 1;
   9827       IROp op     = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
   9828                            : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
   9829       IRTemp res = newTempV128();
   9830       assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
   9831       putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
   9832                                                              mkexpr(res))));
   9833       HChar c = isD ? 'd' : 's';
   9834       DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
   9835           c, dd, c, nn, c, mm);
   9836       return True;
   9837    }
   9838 
   9839    return False;
   9840 #  undef INSN
   9841 }
   9842 
   9843 
   9844 static
   9845 Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
   9846 {
   9847    /* 31 29 28    23   21    16     11 9 4
   9848       01 U  11110 size 10000 opcode 10 n d
   9849       Decode fields: u,size,opcode
   9850    */
   9851 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   9852    if (INSN(31,30) != BITS2(0,1)
   9853        || INSN(28,24) != BITS5(1,1,1,1,0)
   9854        || INSN(21,17) != BITS5(1,0,0,0,0)
   9855        || INSN(11,10) != BITS2(1,0)) {
   9856       return False;
   9857    }
   9858    UInt bitU   = INSN(29,29);
   9859    UInt size   = INSN(23,22);
   9860    UInt opcode = INSN(16,12);
   9861    UInt nn     = INSN(9,5);
   9862    UInt dd     = INSN(4,0);
   9863    vassert(size < 4);
   9864 
   9865    if (opcode == BITS5(0,0,0,1,1)) {
   9866       /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
   9867       /* -------- 1,xx,00011: USQADD std4_std4 -------- */
   9868       /* These are a bit tricky (to say the least).  See comments on
   9869          the vector variants (in dis_AdvSIMD_two_reg_misc) below for
   9870          details. */
   9871       Bool   isUSQADD = bitU == 1;
   9872       IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
   9873                              : mkVecQADDEXTUSSATSS(size);
   9874       IROp   nop  = mkVecADD(size);
   9875       IRTemp argL = newTempV128();
   9876       IRTemp argR = newTempV128();
   9877       assign(argL, getQReg128(nn));
   9878       assign(argR, getQReg128(dd));
   9879       IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
   9880                        size, binop(qop, mkexpr(argL), mkexpr(argR)));
   9881       IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
   9882                        size, binop(nop, mkexpr(argL), mkexpr(argR)));
   9883       putQReg128(dd, mkexpr(qres));
   9884       updateQCFLAGwithDifference(qres, nres);
   9885       const HChar arr = "bhsd"[size];
   9886       DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
   9887       return True;
   9888    }
   9889 
   9890    if (opcode == BITS5(0,0,1,1,1)) {
   9891       /* -------- 0,xx,00111 SQABS std4_std4 -------- */
   9892       /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
   9893       Bool isNEG = bitU == 1;
   9894       IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
   9895       (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
   9896                                          getQReg128(nn), size );
   9897       IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
   9898       IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
   9899       putQReg128(dd, mkexpr(qres));
   9900       updateQCFLAGwithDifference(qres, nres);
   9901       const HChar arr = "bhsd"[size];
   9902       DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
   9903       return True;
   9904    }
   9905 
   9906    if (size == X11 && opcode == BITS5(0,1,0,0,0)) {
   9907       /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
   9908       /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
   9909       Bool    isGT = bitU == 0;
   9910       IRExpr* argL = getQReg128(nn);
   9911       IRExpr* argR = mkV128(0x0000);
   9912       IRTemp  res  = newTempV128();
   9913       assign(res, isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
   9914                        : unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL)));
   9915       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9916       DIP("cm%s d%u, d%u, #0\n", isGT ? "gt" : "ge", dd, nn);
   9917       return True;
   9918    }
   9919 
   9920    if (size == X11 && opcode == BITS5(0,1,0,0,1)) {
   9921       /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
   9922       /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
   9923       Bool    isEQ = bitU == 0;
   9924       IRExpr* argL = getQReg128(nn);
   9925       IRExpr* argR = mkV128(0x0000);
   9926       IRTemp  res  = newTempV128();
   9927       assign(res, isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
   9928                        : unop(Iop_NotV128,
   9929                               binop(Iop_CmpGT64Sx2, argL, argR)));
   9930       putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
   9931       DIP("cm%s d%u, d%u, #0\n", isEQ ? "eq" : "le", dd, nn);
   9932       return True;
   9933    }
   9934 
   9935    if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,0)) {
   9936       /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
   9937       putQReg128(dd, unop(Iop_ZeroHI64ofV128,
   9938                           binop(Iop_CmpGT64Sx2, mkV128(0x0000),
   9939                                                 getQReg128(nn))));
   9940       DIP("cm%s d%u, d%u, #0\n", "lt", dd, nn);
   9941       return True;
   9942    }
   9943 
   9944    if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
   9945       /* -------- 0,11,01011 ABS d_d -------- */
   9946       putQReg128(dd, unop(Iop_ZeroHI64ofV128,
   9947                           unop(Iop_Abs64x2, getQReg128(nn))));
   9948       DIP("abs d%u, d%u\n", dd, nn);
   9949       return True;
   9950    }
   9951 
   9952    if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
   9953       /* -------- 1,11,01011 NEG d_d -------- */
   9954       putQReg128(dd, unop(Iop_ZeroHI64ofV128,
   9955                           binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
   9956       DIP("neg d%u, d%u\n", dd, nn);
   9957       return True;
   9958    }
   9959 
   9960    UInt ix = 0; /*INVALID*/
   9961    if (size >= X10) {
   9962       switch (opcode) {
   9963          case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
   9964          case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
   9965          case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
   9966          default: break;
   9967       }
   9968    }
   9969    if (ix > 0) {
   9970       /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
   9971       /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
   9972       /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
   9973       /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
   9974       /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
   9975       Bool   isD     = size == X11;
   9976       IRType ity     = isD ? Ity_F64 : Ity_F32;
   9977       IROp   opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
   9978       IROp   opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
   9979       IROp   opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
   9980       IROp   opCmp   = Iop_INVALID;
   9981       Bool   swap    = False;
   9982       const HChar* nm = "??";
   9983       switch (ix) {
   9984          case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
   9985          case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
   9986          case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
   9987          case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
   9988          case 5: nm = "fcmle"; opCmp = opCmpLE; break;
   9989          default: vassert(0);
   9990       }
   9991       IRExpr* zero = mkV128(0x0000);
   9992       IRTemp res = newTempV128();
   9993       assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
   9994                        : binop(opCmp, getQReg128(nn), zero));
   9995       putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
   9996                                                              mkexpr(res))));
   9997 
   9998       DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
   9999       return True;
   10000    }
   10001 
   10002    if (opcode == BITS5(1,0,1,0,0)
   10003        || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
   10004       /* -------- 0,xx,10100: SQXTN -------- */
   10005       /* -------- 1,xx,10100: UQXTN -------- */
   10006       /* -------- 1,xx,10010: SQXTUN -------- */
   10007       if (size == X11) return False;
   10008       vassert(size < 3);
   10009       IROp  opN    = Iop_INVALID;
   10010       Bool  zWiden = True;
   10011       const HChar* nm = "??";
   10012       /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
   10013          opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
   10014       }
   10015       else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
   10016          opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
   10017       }
   10018       else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
   10019          opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
   10020       }
   10021       else vassert(0);
   10022       IRTemp src  = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
   10023                        size+1, getQReg128(nn));
   10024       IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
   10025                        size, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
   10026       putQReg128(dd, mkexpr(resN));
   10027       /* This widens zero lanes to zero, and compares it against zero, so all
   10028          of the non-participating lanes make no contribution to the
   10029          Q flag state. */
   10030       IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
   10031                                               size, mkexpr(resN));
   10032       updateQCFLAGwithDifference(src, resW);
   10033       const HChar arrNarrow = "bhsd"[size];
   10034       const HChar arrWide   = "bhsd"[size+1];
   10035       DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn);
   10036       return True;
   10037    }
   10038 
   10039    if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
   10040       /* -------- 1,01,10110 FCVTXN s_d -------- */
   10041       /* Using Irrm_NEAREST here isn't right.  The docs say "round to
   10042          odd" but I don't know what that really means. */
   10043       putQRegLO(dd,
   10044                 binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
   10045                                     getQRegLO(nn, Ity_F64)));
   10046       putQRegLane(dd, 1, mkU32(0));
   10047       putQRegLane(dd, 1, mkU64(0));
   10048       DIP("fcvtxn s%u, d%u\n", dd, nn);
   10049       return True;
   10050    }
   10051 
   10052    ix = 0; /*INVALID*/
   10053    switch (opcode) {
   10054       case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
   10055       case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
   10056       case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
   10057       default: break;
   10058    }
   10059    if (ix > 0) {
   10060       /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
   10061       /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
   10062       /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
   10063       /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
   10064       /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
   10065       /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
   10066       /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
   10067       /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
   10068       /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
   10069       /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
   10070       Bool           isD  = (size & 1) == 1;
   10071       IRType         tyF  = isD ? Ity_F64 : Ity_F32;
   10072       IRType         tyI  = isD ? Ity_I64 : Ity_I32;
   10073       IRRoundingMode irrm = 8; /*impossible*/
   10074       HChar          ch   = '?';
   10075       switch (ix) {
   10076          case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
   10077          case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
   10078          case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
   10079          case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
   10080          case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
   10081          default: vassert(0);
   10082       }
   10083       IROp cvt = Iop_INVALID;
   10084       if (bitU == 1) {
   10085          cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
   10086       } else {
   10087          cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
   10088       }
   10089       IRTemp src = newTemp(tyF);
   10090       IRTemp res = newTemp(tyI);
   10091       assign(src, getQRegLane(nn, 0, tyF));
   10092       assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
   10093       putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
   10094       if (!isD) {
   10095          putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
   10096       }
   10097       putQRegLane(dd, 1, mkU64(0));    /* bits 127-64 */
   10098       HChar sOrD = isD ? 'd' : 's';
   10099       DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
   10100           sOrD, dd, sOrD, nn);
   10101       return True;
   10102    }
   10103 
   10104    if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
   10105       /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
   10106       /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
   10107       Bool   isU = bitU == 1;
   10108       Bool   isD = (size & 1) == 1;
   10109       IRType tyI = isD ? Ity_I64 : Ity_I32;
   10110       IROp   iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
   10111                        : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
   10112       IRTemp rm  = mk_get_IR_rounding_mode();
   10113       putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
   10114       if (!isD) {
   10115          putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
   10116       }
   10117       putQRegLane(dd, 1, mkU64(0));    /* bits 127-64 */
   10118       HChar c = isD ? 'd' : 's';
   10119       DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
   10120       return True;
   10121    }
   10122 
   10123    if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
   10124       /* -------- 0,1x,11101: FRECPE  d_d, s_s -------- */
   10125       /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
   10126       Bool isSQRT = bitU == 1;
   10127       Bool isD    = (size & 1) == 1;
   10128       IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
   10129                            : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
   10130       IRTemp resV = newTempV128();
   10131       assign(resV, unop(op, getQReg128(nn)));
   10132       putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
   10133                                                              mkexpr(resV))));
   10134       HChar c = isD ? 'd' : 's';
   10135       DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
   10136       return True;
   10137    }
   10138 
   10139    if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
   10140       /* -------- 0,1x,11111: FRECPX  d_d, s_s -------- */
   10141       Bool   isD = (size & 1) == 1;
   10142       IRType ty  = isD ? Ity_F64 : Ity_F32;
   10143       IROp   op  = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
   10144       IRTemp res = newTemp(ty);
   10145       IRTemp rm  = mk_get_IR_rounding_mode();
   10146       assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
   10147       putQReg128(dd, mkV128(0x0000));
   10148       putQRegLane(dd, 0, mkexpr(res));
   10149       HChar c = isD ? 'd' : 's';
   10150       DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
   10151       return True;
   10152    }
   10153 
   10154    return False;
   10155 #  undef INSN
   10156 }
   10157 
   10158 
   10159 static
   10160 Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
   10161 {
   10162    /* 31   28    23   21 20 19 15     11   9 4
   10163       01 U 11111 size L  M  m  opcode H  0 n d
   10164       Decode fields are: u,size,opcode
   10165       M is really part of the mm register number.  Individual
   10166       cases need to inspect L and H though.
   10167    */
   10168 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   10169    if (INSN(31,30) != BITS2(0,1)
   10170        || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) !=0) {
   10171       return False;
   10172    }
   10173    UInt bitU   = INSN(29,29);
   10174    UInt size   = INSN(23,22);
   10175    UInt bitL   = INSN(21,21);
   10176    UInt bitM   = INSN(20,20);
   10177    UInt mmLO4  = INSN(19,16);
   10178    UInt opcode = INSN(15,12);
   10179    UInt bitH   = INSN(11,11);
   10180    UInt nn     = INSN(9,5);
   10181    UInt dd     = INSN(4,0);
   10182    vassert(size < 4);
   10183    vassert(bitH < 2 && bitM < 2 && bitL < 2);
   10184 
   10185    if (bitU == 0 && size >= X10
   10186        && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
   10187       /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
   10188       /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
   10189       Bool isD   = (size & 1) == 1;
   10190       Bool isSUB = opcode == BITS4(0,1,0,1);
   10191       UInt index;
   10192       if      (!isD)             index = (bitH << 1) | bitL;
   10193       else if (isD && bitL == 0) index = bitH;
   10194       else return False; // sz:L == x11 => unallocated encoding
   10195       vassert(index < (isD ? 2 : 4));
   10196       IRType ity   = isD ? Ity_F64 : Ity_F32;
   10197       IRTemp elem  = newTemp(ity);
   10198       UInt   mm    = (bitM << 4) | mmLO4;
   10199       assign(elem, getQRegLane(mm, index, ity));
   10200       IRTemp dupd  = math_DUP_TO_V128(elem, ity);
   10201       IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
   10202       IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
   10203       IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
   10204       IRTemp rm    = mk_get_IR_rounding_mode();
   10205       IRTemp t1    = newTempV128();
   10206       IRTemp t2    = newTempV128();
   10207       // FIXME: double rounding; use FMA primops instead
   10208       assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
   10209       assign(t2, triop(isSUB ? opSUB : opADD,
   10210                        mkexpr(rm), getQReg128(dd), mkexpr(t1)));
   10211       putQReg128(dd,
   10212                  mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
   10213                                                          mkexpr(t2))));
   10214       const HChar c = isD ? 'd' : 's';
   10215       DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
   10216           c, dd, c, nn, nameQReg128(mm), c, index);
   10217       return True;
   10218    }
   10219 
   10220    if (size >= X10 && opcode == BITS4(1,0,0,1)) {
   10221       /* -------- 0,1x,1001 FMUL  d_d_d[], s_s_s[] -------- */
   10222       /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
   10223       Bool isD    = (size & 1) == 1;
   10224       Bool isMULX = bitU == 1;
   10225       UInt index;
   10226       if      (!isD)             index = (bitH << 1) | bitL;
   10227       else if (isD && bitL == 0) index = bitH;
   10228       else return False; // sz:L == x11 => unallocated encoding
   10229       vassert(index < (isD ? 2 : 4));
   10230       IRType ity   = isD ? Ity_F64 : Ity_F32;
   10231       IRTemp elem  = newTemp(ity);
   10232       UInt   mm    = (bitM << 4) | mmLO4;
   10233       assign(elem, getQRegLane(mm, index, ity));
   10234       IRTemp dupd  = math_DUP_TO_V128(elem, ity);
   10235       IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
   10236       IRTemp rm    = mk_get_IR_rounding_mode();
   10237       IRTemp t1    = newTempV128();
   10238       // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
   10239       assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
   10240       putQReg128(dd,
   10241                  mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
   10242                                                          mkexpr(t1))));
   10243       const HChar c = isD ? 'd' : 's';
   10244       DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
   10245           c, dd, c, nn, nameQReg128(mm), c, index);
   10246       return True;
   10247    }
   10248 
   10249    if (bitU == 0
   10250        && (opcode == BITS4(1,0,1,1)
   10251            || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
   10252       /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
   10253       /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
   10254       /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
   10255       /* Widens, and size refers to the narrowed lanes. */
   10256       UInt ks = 3;
   10257       switch (opcode) {
   10258          case BITS4(1,0,1,1): ks = 0; break;
   10259          case BITS4(0,0,1,1): ks = 1; break;
   10260          case BITS4(0,1,1,1): ks = 2; break;
   10261          default: vassert(0);
   10262       }
   10263       vassert(ks >= 0 && ks <= 2);
   10264       UInt mm  = 32; // invalid
   10265       UInt ix  = 16; // invalid
   10266       switch (size) {
   10267          case X00:
   10268             return False; // h_b_b[] case is not allowed
   10269          case X01:
   10270             mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
   10271          case X10:
   10272             mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
   10273          case X11:
   10274             return False; // q_d_d[] case is not allowed
   10275          default:
   10276             vassert(0);
   10277       }
   10278       vassert(mm < 32 && ix < 16);
   10279       IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
   10280       vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
   10281       newTempsV128_2(&vecN, &vecD);
   10282       assign(vecN, getQReg128(nn));
   10283       IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
   10284       assign(vecD, getQReg128(dd));
   10285       math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
   10286                        False/*!is2*/, size, "mas"[ks],
   10287                        vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
   10288       IROp opZHI = mkVecZEROHIxxOFV128(size+1);
   10289       putQReg128(dd, unop(opZHI, mkexpr(res)));
   10290       vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
   10291       updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
   10292       if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
   10293          updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
   10294       }
   10295       const HChar* nm        = ks == 0 ? "sqmull"
   10296                                        : (ks == 1 ? "sqdmlal" : "sqdmlsl");
   10297       const HChar  arrNarrow = "bhsd"[size];
   10298       const HChar  arrWide   = "bhsd"[size+1];
   10299       DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
   10300           nm, arrWide, dd, arrNarrow, nn, dd, arrNarrow, ix);
   10301       return True;
   10302    }
   10303 
   10304    if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
   10305       /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
   10306       /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
   10307       UInt mm  = 32; // invalid
   10308       UInt ix  = 16; // invalid
   10309       switch (size) {
   10310          case X00:
   10311             return False; // b case is not allowed
   10312          case X01:
   10313             mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
   10314          case X10:
   10315             mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
   10316          case X11:
   10317             return False; // q case is not allowed
   10318          default:
   10319             vassert(0);
   10320       }
   10321       vassert(mm < 32 && ix < 16);
   10322       Bool isR = opcode == BITS4(1,1,0,1);
   10323       IRTemp res, sat1q, sat1n, vN, vM;
   10324       res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
   10325       vN = newTempV128();
   10326       assign(vN, getQReg128(nn));
   10327       vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
   10328       math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
   10329       IROp opZHI = mkVecZEROHIxxOFV128(size);
   10330       putQReg128(dd, unop(opZHI, mkexpr(res)));
   10331       updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
   10332       const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
   10333       HChar ch         = size == X01 ? 'h' : 's';
   10334       DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, ch, (Int)dd, ix);
   10335       return True;
   10336    }
   10337 
   10338    return False;
   10339 #  undef INSN
   10340 }
   10341 
   10342 
   10343 static
   10344 Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
   10345 {
   10346    /* 31    28     22   18   15     10 9 4
   10347       0 q u 011110 immh immb opcode 1  n d
   10348       Decode fields: u,opcode
   10349    */
   10350 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   10351    if (INSN(31,31) != 0
   10352        || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
   10353       return False;
   10354    }
   10355    UInt bitQ   = INSN(30,30);
   10356    UInt bitU   = INSN(29,29);
   10357    UInt immh   = INSN(22,19);
   10358    UInt immb   = INSN(18,16);
   10359    UInt opcode = INSN(15,11);
   10360    UInt nn     = INSN(9,5);
   10361    UInt dd     = INSN(4,0);
   10362 
   10363    if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
   10364       /* -------- 0,00000 SSHR std7_std7_#imm -------- */
   10365       /* -------- 1,00000 USHR std7_std7_#imm -------- */
   10366       /* -------- 0,00010 SSRA std7_std7_#imm -------- */
   10367       /* -------- 1,00010 USRA std7_std7_#imm -------- */
   10368       /* laneTy, shift = case immh:immb of
   10369                          0001:xxx -> B, SHR:8-xxx
   10370                          001x:xxx -> H, SHR:16-xxxx
   10371                          01xx:xxx -> S, SHR:32-xxxxx
   10372                          1xxx:xxx -> D, SHR:64-xxxxxx
   10373                          other    -> invalid
   10374       */
   10375       UInt size  = 0;
   10376       UInt shift = 0;
   10377       Bool isQ   = bitQ == 1;
   10378       Bool isU   = bitU == 1;
   10379       Bool isAcc = opcode == BITS5(0,0,0,1,0);
   10380       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   10381       if (!ok || (bitQ == 0 && size == X11)) return False;
   10382       vassert(size >= 0 && size <= 3);
   10383       UInt lanebits = 8 << size;
   10384       vassert(shift >= 1 && shift <= lanebits);
   10385       IROp    op  = isU ? mkVecSHRN(size) : mkVecSARN(size);
   10386       IRExpr* src = getQReg128(nn);
   10387       IRTemp  shf = newTempV128();
   10388       IRTemp  res = newTempV128();
   10389       if (shift == lanebits && isU) {
   10390          assign(shf, mkV128(0x0000));
   10391       } else {
   10392          UInt nudge = 0;
   10393          if (shift == lanebits) {
   10394             vassert(!isU);
   10395             nudge = 1;
   10396          }
   10397          assign(shf, binop(op, src, mkU8(shift - nudge)));
   10398       }
   10399       assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
   10400                         : mkexpr(shf));
   10401       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   10402       HChar laneCh = "bhsd"[size];
   10403       UInt  nLanes = (isQ ? 128 : 64) / lanebits;
   10404       const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
   10405                               : (isU ? "ushr" : "sshr");
   10406       DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
   10407           nameQReg128(dd), nLanes, laneCh,
   10408           nameQReg128(nn), nLanes, laneCh, shift);
   10409       return True;
   10410    }
   10411 
   10412    if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
   10413       /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
   10414       /* -------- 1,00100 URSHR std7_std7_#imm -------- */
   10415       /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
   10416       /* -------- 1,00110 URSRA std7_std7_#imm -------- */
   10417       /* laneTy, shift = case immh:immb of
   10418                          0001:xxx -> B, SHR:8-xxx
   10419                          001x:xxx -> H, SHR:16-xxxx
   10420                          01xx:xxx -> S, SHR:32-xxxxx
   10421                          1xxx:xxx -> D, SHR:64-xxxxxx
   10422                          other    -> invalid
   10423       */
   10424       UInt size  = 0;
   10425       UInt shift = 0;
   10426       Bool isQ   = bitQ == 1;
   10427       Bool isU   = bitU == 1;
   10428       Bool isAcc = opcode == BITS5(0,0,1,1,0);
   10429       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   10430       if (!ok || (bitQ == 0 && size == X11)) return False;
   10431       vassert(size >= 0 && size <= 3);
   10432       UInt lanebits = 8 << size;
   10433       vassert(shift >= 1 && shift <= lanebits);
   10434       IROp    op   = isU ? mkVecRSHU(size) : mkVecRSHS(size);
   10435       IRExpr* src  = getQReg128(nn);
   10436       IRTemp  imm8 = newTemp(Ity_I8);
   10437       assign(imm8, mkU8((UChar)(-shift)));
   10438       IRExpr* amt  = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
   10439       IRTemp  shf  = newTempV128();
   10440       IRTemp  res  = newTempV128();
   10441       assign(shf, binop(op, src, amt));
   10442       assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
   10443                         : mkexpr(shf));
   10444       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   10445       HChar laneCh = "bhsd"[size];
   10446       UInt  nLanes = (isQ ? 128 : 64) / lanebits;
   10447       const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
   10448                               : (isU ? "urshr" : "srshr");
   10449       DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
   10450           nameQReg128(dd), nLanes, laneCh,
   10451           nameQReg128(nn), nLanes, laneCh, shift);
   10452       return True;
   10453    }
   10454 
   10455    if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
   10456       /* -------- 1,01000 SRI std7_std7_#imm -------- */
   10457       /* laneTy, shift = case immh:immb of
   10458                          0001:xxx -> B, SHR:8-xxx
   10459                          001x:xxx -> H, SHR:16-xxxx
   10460                          01xx:xxx -> S, SHR:32-xxxxx
   10461                          1xxx:xxx -> D, SHR:64-xxxxxx
   10462                          other    -> invalid
   10463       */
   10464       UInt size  = 0;
   10465       UInt shift = 0;
   10466       Bool isQ   = bitQ == 1;
   10467       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   10468       if (!ok || (bitQ == 0 && size == X11)) return False;
   10469       vassert(size >= 0 && size <= 3);
   10470       UInt lanebits = 8 << size;
   10471       vassert(shift >= 1 && shift <= lanebits);
   10472       IRExpr* src = getQReg128(nn);
   10473       IRTemp  res = newTempV128();
   10474       if (shift == lanebits) {
   10475          assign(res, getQReg128(dd));
   10476       } else {
   10477          assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
   10478          IRExpr* nmask = binop(mkVecSHLN(size),
   10479                                mkV128(0xFFFF), mkU8(lanebits - shift));
   10480          IRTemp  tmp   = newTempV128();
   10481          assign(tmp, binop(Iop_OrV128,
   10482                            mkexpr(res),
   10483                            binop(Iop_AndV128, getQReg128(dd), nmask)));
   10484          res = tmp;
   10485       }
   10486       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   10487       HChar laneCh = "bhsd"[size];
   10488       UInt  nLanes = (isQ ? 128 : 64) / lanebits;
   10489       DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
   10490           nameQReg128(dd), nLanes, laneCh,
   10491           nameQReg128(nn), nLanes, laneCh, shift);
   10492       return True;
   10493    }
   10494 
   10495    if (opcode == BITS5(0,1,0,1,0)) {
   10496       /* -------- 0,01010 SHL std7_std7_#imm -------- */
   10497       /* -------- 1,01010 SLI std7_std7_#imm -------- */
   10498       /* laneTy, shift = case immh:immb of
   10499                          0001:xxx -> B, xxx
   10500                          001x:xxx -> H, xxxx
   10501                          01xx:xxx -> S, xxxxx
   10502                          1xxx:xxx -> D, xxxxxx
   10503                          other    -> invalid
   10504       */
   10505       UInt size  = 0;
   10506       UInt shift = 0;
   10507       Bool isSLI = bitU == 1;
   10508       Bool isQ   = bitQ == 1;
   10509       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   10510       if (!ok || (bitQ == 0 && size == X11)) return False;
   10511       vassert(size >= 0 && size <= 3);
   10512       /* The shift encoding has opposite sign for the leftwards case.
   10513          Adjust shift to compensate. */
   10514       UInt lanebits = 8 << size;
   10515       shift = lanebits - shift;
   10516       vassert(shift >= 0 && shift < lanebits);
   10517       IROp    op  = mkVecSHLN(size);
   10518       IRExpr* src = getQReg128(nn);
   10519       IRTemp  res = newTempV128();
   10520       if (shift == 0) {
   10521          assign(res, src);
   10522       } else {
   10523          assign(res, binop(op, src, mkU8(shift)));
   10524          if (isSLI) {
   10525             IRExpr* nmask = binop(mkVecSHRN(size),
   10526                                   mkV128(0xFFFF), mkU8(lanebits - shift));
   10527             IRTemp  tmp   = newTempV128();
   10528             assign(tmp, binop(Iop_OrV128,
   10529                               mkexpr(res),
   10530                               binop(Iop_AndV128, getQReg128(dd), nmask)));
   10531             res = tmp;
   10532          }
   10533       }
   10534       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   10535       HChar laneCh = "bhsd"[size];
   10536       UInt  nLanes = (isQ ? 128 : 64) / lanebits;
   10537       const HChar* nm = isSLI ? "sli" : "shl";
   10538       DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
   10539           nameQReg128(dd), nLanes, laneCh,
   10540           nameQReg128(nn), nLanes, laneCh, shift);
   10541       return True;
   10542    }
   10543 
   10544    if (opcode == BITS5(0,1,1,1,0)
   10545        || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
   10546       /* -------- 0,01110  SQSHL  std7_std7_#imm -------- */
   10547       /* -------- 1,01110  UQSHL  std7_std7_#imm -------- */
   10548       /* -------- 1,01100  SQSHLU std7_std7_#imm -------- */
   10549       UInt size  = 0;
   10550       UInt shift = 0;
   10551       Bool isQ   = bitQ == 1;
   10552       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   10553       if (!ok || (bitQ == 0 && size == X11)) return False;
   10554       vassert(size >= 0 && size <= 3);
   10555       /* The shift encoding has opposite sign for the leftwards case.
   10556          Adjust shift to compensate. */
   10557       UInt lanebits = 8 << size;
   10558       shift = lanebits - shift;
   10559       vassert(shift >= 0 && shift < lanebits);
   10560       const HChar* nm = NULL;
   10561       /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
   10562       else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
   10563       else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
   10564       else vassert(0);
   10565       IRTemp qDiff1 = IRTemp_INVALID;
   10566       IRTemp qDiff2 = IRTemp_INVALID;
   10567       IRTemp res = IRTemp_INVALID;
   10568       IRTemp src = newTempV128();
   10569       assign(src, getQReg128(nn));
   10570       math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
   10571       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   10572       updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
   10573                                     isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
   10574       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   10575       DIP("%s %s.%s, %s.%s, #%u\n", nm,
   10576           nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
   10577       return True;
   10578    }
   10579 
   10580    if (bitU == 0
   10581        && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
   10582       /* -------- 0,10000  SHRN{,2} #imm -------- */
   10583       /* -------- 0,10001 RSHRN{,2} #imm -------- */
   10584       /* Narrows, and size is the narrow size. */
   10585       UInt size  = 0;
   10586       UInt shift = 0;
   10587       Bool is2   = bitQ == 1;
   10588       Bool isR   = opcode == BITS5(1,0,0,0,1);
   10589       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   10590       if (!ok || size == X11) return False;
   10591       vassert(shift >= 1);
   10592       IRTemp t1 = newTempV128();
   10593       IRTemp t2 = newTempV128();
   10594       IRTemp t3 = newTempV128();
   10595       assign(t1, getQReg128(nn));
   10596       assign(t2, isR ? binop(mkVecADD(size+1),
   10597                              mkexpr(t1),
   10598                              mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
   10599                      : mkexpr(t1));
   10600       assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
   10601       IRTemp t4 = math_NARROW_LANES(t3, t3, size);
   10602       putLO64andZUorPutHI64(is2, dd, t4);
   10603       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   10604       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   10605       DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
   10606           nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
   10607       return True;
   10608    }
   10609 
   10610    if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
   10611        || (bitU == 1
   10612            && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
   10613       /* -------- 0,10010   SQSHRN{,2} #imm -------- */
   10614       /* -------- 1,10010   UQSHRN{,2} #imm -------- */
   10615       /* -------- 0,10011  SQRSHRN{,2} #imm -------- */
   10616       /* -------- 1,10011  UQRSHRN{,2} #imm -------- */
   10617       /* -------- 1,10000  SQSHRUN{,2} #imm -------- */
   10618       /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
   10619       UInt size  = 0;
   10620       UInt shift = 0;
   10621       Bool is2   = bitQ == 1;
   10622       Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
   10623       if (!ok || size == X11) return False;
   10624       vassert(shift >= 1 && shift <= (8 << size));
   10625       const HChar* nm = "??";
   10626       IROp op = Iop_INVALID;
   10627       /* Decide on the name and the operation. */
   10628       /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
   10629          nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
   10630       }
   10631       else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
   10632          nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
   10633       }
   10634       else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
   10635          nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
   10636       }
   10637       else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
   10638          nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
   10639       }
   10640       else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
   10641          nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
   10642       }
   10643       else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
   10644          nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
   10645       }
   10646       else vassert(0);
   10647       /* Compute the result (Q, shifted value) pair. */
   10648       IRTemp src128 = newTempV128();
   10649       assign(src128, getQReg128(nn));
   10650       IRTemp pair = newTempV128();
   10651       assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
   10652       /* Update the result reg */
   10653       IRTemp res64in128 = newTempV128();
   10654       assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
   10655       putLO64andZUorPutHI64(is2, dd, res64in128);
   10656       /* Update the Q flag. */
   10657       IRTemp q64q64 = newTempV128();
   10658       assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
   10659       IRTemp z128 = newTempV128();
   10660       assign(z128, mkV128(0x0000));
   10661       updateQCFLAGwithDifference(q64q64, z128);
   10662       /* */
   10663       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   10664       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   10665       DIP("%s %s.%s, %s.%s, #%u\n", nm,
   10666           nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
   10667       return True;
   10668    }
   10669 
   10670    if (opcode == BITS5(1,0,1,0,0)) {
   10671       /* -------- 0,10100 SSHLL{,2} #imm -------- */
   10672       /* -------- 1,10100 USHLL{,2} #imm -------- */
   10673       /* 31  28     22   18   15     9 4
   10674          0q0 011110 immh immb 101001 n d  SSHLL Vd.Ta, Vn.Tb, #sh
   10675          0q1 011110 immh immb 101001 n d  USHLL Vd.Ta, Vn.Tb, #sh
   10676          where Ta,Tb,sh
   10677            = case immh of 1xxx -> invalid
   10678                           01xx -> 2d, 2s(q0)/4s(q1),  immh:immb - 32 (0..31)
   10679                           001x -> 4s, 4h(q0)/8h(q1),  immh:immb - 16 (0..15)
   10680                           0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8  (0..7)
   10681                           0000 -> AdvSIMD modified immediate (???)
   10682       */
   10683       Bool    isQ   = bitQ == 1;
   10684       Bool    isU   = bitU == 1;
   10685       UInt    immhb = (immh << 3) | immb;
   10686       IRTemp  src   = newTempV128();
   10687       IRTemp  zero  = newTempV128();
   10688       IRExpr* res   = NULL;
   10689       UInt    sh    = 0;
   10690       const HChar* ta = "??";
   10691       const HChar* tb = "??";
   10692       assign(src, getQReg128(nn));
   10693       assign(zero, mkV128(0x0000));
   10694       if (immh & 8) {
   10695          /* invalid; don't assign to res */
   10696       }
   10697       else if (immh & 4) {
   10698          sh = immhb - 32;
   10699          vassert(sh < 32); /* so 32-sh is 1..32 */
   10700          ta = "2d";
   10701          tb = isQ ? "4s" : "2s";
   10702          IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
   10703                            : mk_InterleaveLO32x4(src, zero);
   10704          res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
   10705       }
   10706       else if (immh & 2) {
   10707          sh = immhb - 16;
   10708          vassert(sh < 16); /* so 16-sh is 1..16 */
   10709          ta = "4s";
   10710          tb = isQ ? "8h" : "4h";
   10711          IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
   10712                            : mk_InterleaveLO16x8(src, zero);
   10713          res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
   10714       }
   10715       else if (immh & 1) {
   10716          sh = immhb - 8;
   10717          vassert(sh < 8); /* so 8-sh is 1..8 */
   10718          ta = "8h";
   10719          tb = isQ ? "16b" : "8b";
   10720          IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
   10721                            : mk_InterleaveLO8x16(src, zero);
   10722          res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
   10723       } else {
   10724          vassert(immh == 0);
   10725          /* invalid; don't assign to res */
   10726       }
   10727       /* */
   10728       if (res) {
   10729          putQReg128(dd, res);
   10730          DIP("%cshll%s %s.%s, %s.%s, #%u\n",
   10731              isU ? 'u' : 's', isQ ? "2" : "",
   10732              nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
   10733          return True;
   10734       }
   10735       return False;
   10736    }
   10737 
   10738    if (opcode == BITS5(1,1,1,0,0)) {
   10739       /* -------- 0,11100 SCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
   10740       /* -------- 1,11100 UCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
   10741       /* If immh is of the form 00xx, the insn is invalid. */
   10742       if (immh < BITS4(0,1,0,0)) return False;
   10743       UInt size  = 0;
   10744       UInt fbits = 0;
   10745       Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
   10746       /* The following holds because immh is never zero. */
   10747       vassert(ok);
   10748       /* The following holds because immh >= 0100. */
   10749       vassert(size == X10 || size == X11);
   10750       Bool isD = size == X11;
   10751       Bool isU = bitU == 1;
   10752       Bool isQ = bitQ == 1;
   10753       if (isD && !isQ) return False; /* reject .1d case */
   10754       vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
   10755       Double  scale  = two_to_the_minus(fbits);
   10756       IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
   10757                            : IRExpr_Const(IRConst_F32( (Float)scale ));
   10758       IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
   10759       IROp    opCVT  = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
   10760                            : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
   10761       IRType tyF = isD ? Ity_F64 : Ity_F32;
   10762       IRType tyI = isD ? Ity_I64 : Ity_I32;
   10763       UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
   10764       vassert(nLanes == 2 || nLanes == 4);
   10765       for (UInt i = 0; i < nLanes; i++) {
   10766          IRTemp src = newTemp(tyI);
   10767          IRTemp res = newTemp(tyF);
   10768          IRTemp rm  = mk_get_IR_rounding_mode();
   10769          assign(src, getQRegLane(nn, i, tyI));
   10770          assign(res, triop(opMUL, mkexpr(rm),
   10771                                   binop(opCVT, mkexpr(rm), mkexpr(src)),
   10772                                   scaleE));
   10773          putQRegLane(dd, i, mkexpr(res));
   10774       }
   10775       if (!isQ) {
   10776          putQRegLane(dd, 1, mkU64(0));
   10777       }
   10778       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   10779       DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
   10780           nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
   10781       return True;
   10782    }
   10783 
   10784    if (opcode == BITS5(1,1,1,1,1)) {
   10785       /* -------- 0,11111 FCVTZS {2d_2d,4s_4s,2s_2s}_imm -------- */
   10786       /* -------- 1,11111 FCVTZU {2d_2d,4s_4s,2s_2s}_imm -------- */
   10787       /* If immh is of the form 00xx, the insn is invalid. */
   10788       if (immh < BITS4(0,1,0,0)) return False;
   10789       UInt size  = 0;
   10790       UInt fbits = 0;
   10791       Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
   10792       /* The following holds because immh is never zero. */
   10793       vassert(ok);
   10794       /* The following holds because immh >= 0100. */
   10795       vassert(size == X10 || size == X11);
   10796       Bool isD = size == X11;
   10797       Bool isU = bitU == 1;
   10798       Bool isQ = bitQ == 1;
   10799       if (isD && !isQ) return False; /* reject .1d case */
   10800       vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
   10801       Double  scale  = two_to_the_plus(fbits);
   10802       IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
   10803                            : IRExpr_Const(IRConst_F32( (Float)scale ));
   10804       IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
   10805       IROp    opCVT  = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
   10806                            : (isD ? Iop_F64toI64S : Iop_F32toI32S);
   10807       IRType tyF = isD ? Ity_F64 : Ity_F32;
   10808       IRType tyI = isD ? Ity_I64 : Ity_I32;
   10809       UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
   10810       vassert(nLanes == 2 || nLanes == 4);
   10811       for (UInt i = 0; i < nLanes; i++) {
   10812          IRTemp src = newTemp(tyF);
   10813          IRTemp res = newTemp(tyI);
   10814          IRTemp rm  = newTemp(Ity_I32);
   10815          assign(src, getQRegLane(nn, i, tyF));
   10816          assign(rm,  mkU32(Irrm_ZERO));
   10817          assign(res, binop(opCVT, mkexpr(rm),
   10818                                   triop(opMUL, mkexpr(rm),
   10819                                                mkexpr(src), scaleE)));
   10820          putQRegLane(dd, i, mkexpr(res));
   10821       }
   10822       if (!isQ) {
   10823          putQRegLane(dd, 1, mkU64(0));
   10824       }
   10825       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   10826       DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
   10827           nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
   10828       return True;
   10829    }
   10830 
   10831 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   10832    return False;
   10833 #  undef INSN
   10834 }
   10835 
   10836 
   10837 static
   10838 Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
   10839 {
   10840    /* 31 30 29 28    23   21 20 15     11 9 4
   10841       0  Q  U  01110 size 1  m  opcode 00 n d
   10842       Decode fields: u,opcode
   10843    */
   10844 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   10845    if (INSN(31,31) != 0
   10846        || INSN(28,24) != BITS5(0,1,1,1,0)
   10847        || INSN(21,21) != 1
   10848        || INSN(11,10) != BITS2(0,0)) {
   10849       return False;
   10850    }
   10851    UInt bitQ   = INSN(30,30);
   10852    UInt bitU   = INSN(29,29);
   10853    UInt size   = INSN(23,22);
   10854    UInt mm     = INSN(20,16);
   10855    UInt opcode = INSN(15,12);
   10856    UInt nn     = INSN(9,5);
   10857    UInt dd     = INSN(4,0);
   10858    vassert(size < 4);
   10859    Bool is2    = bitQ == 1;
   10860 
   10861    if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
   10862       /* -------- 0,0000 SADDL{2} -------- */
   10863       /* -------- 1,0000 UADDL{2} -------- */
   10864       /* -------- 0,0010 SSUBL{2} -------- */
   10865       /* -------- 1,0010 USUBL{2} -------- */
   10866       /* Widens, and size refers to the narrowed lanes. */
   10867       if (size == X11) return False;
   10868       vassert(size <= 2);
   10869       Bool   isU   = bitU == 1;
   10870       Bool   isADD = opcode == BITS4(0,0,0,0);
   10871       IRTemp argL  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
   10872       IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
   10873       IRTemp res   = newTempV128();
   10874       assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
   10875                         mkexpr(argL), mkexpr(argR)));
   10876       putQReg128(dd, mkexpr(res));
   10877       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   10878       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   10879       const HChar* nm        = isADD ? (isU ? "uaddl" : "saddl")
   10880                                      : (isU ? "usubl" : "ssubl");
   10881       DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
   10882           nameQReg128(dd), arrWide,
   10883           nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
   10884       return True;
   10885    }
   10886 
   10887    if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
   10888       /* -------- 0,0001 SADDW{2} -------- */
   10889       /* -------- 1,0001 UADDW{2} -------- */
   10890       /* -------- 0,0011 SSUBW{2} -------- */
   10891       /* -------- 1,0011 USUBW{2} -------- */
   10892       /* Widens, and size refers to the narrowed lanes. */
   10893       if (size == X11) return False;
   10894       vassert(size <= 2);
   10895       Bool   isU   = bitU == 1;
   10896       Bool   isADD = opcode == BITS4(0,0,0,1);
   10897       IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
   10898       IRTemp res   = newTempV128();
   10899       assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
   10900                         getQReg128(nn), mkexpr(argR)));
   10901       putQReg128(dd, mkexpr(res));
   10902       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   10903       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   10904       const HChar* nm        = isADD ? (isU ? "uaddw" : "saddw")
   10905                                      : (isU ? "usubw" : "ssubw");
   10906       DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
   10907           nameQReg128(dd), arrWide,
   10908           nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
   10909       return True;
   10910    }
   10911 
   10912    if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
   10913       /* -------- 0,0100  ADDHN{2} -------- */
   10914       /* -------- 1,0100 RADDHN{2} -------- */
   10915       /* -------- 0,0110  SUBHN{2} -------- */
   10916       /* -------- 1,0110 RSUBHN{2} -------- */
   10917       /* Narrows, and size refers to the narrowed lanes. */
   10918       if (size == X11) return False;
   10919       vassert(size <= 2);
   10920       const UInt shift[3] = { 8, 16, 32 };
   10921       Bool isADD = opcode == BITS4(0,1,0,0);
   10922       Bool isR   = bitU == 1;
   10923       /* Combined elements in wide lanes */
   10924       IRTemp  wide  = newTempV128();
   10925       IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
   10926                             getQReg128(nn), getQReg128(mm));
   10927       if (isR) {
   10928          wideE = binop(mkVecADD(size+1),
   10929                        wideE,
   10930                        mkexpr(math_VEC_DUP_IMM(size+1,
   10931                                                1ULL << (shift[size]-1))));
   10932       }
   10933       assign(wide, wideE);
   10934       /* Top halves of elements, still in wide lanes */
   10935       IRTemp shrd = newTempV128();
   10936       assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
   10937       /* Elements now compacted into lower 64 bits */
   10938       IRTemp new64 = newTempV128();
   10939       assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
   10940       putLO64andZUorPutHI64(is2, dd, new64);
   10941       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   10942       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   10943       const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
   10944                               : (isR ? "rsubhn" : "subhn");
   10945       DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
   10946           nameQReg128(dd), arrNarrow,
   10947           nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
   10948       return True;
   10949    }
   10950 
   10951    if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
   10952       /* -------- 0,0101 SABAL{2} -------- */
   10953       /* -------- 1,0101 UABAL{2} -------- */
   10954       /* -------- 0,0111 SABDL{2} -------- */
   10955       /* -------- 1,0111 UABDL{2} -------- */
   10956       /* Widens, and size refers to the narrowed lanes. */
   10957       if (size == X11) return False;
   10958       vassert(size <= 2);
   10959       Bool   isU   = bitU == 1;
   10960       Bool   isACC = opcode == BITS4(0,1,0,1);
   10961       IRTemp argL  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
   10962       IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
   10963       IRTemp abd   = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
   10964       IRTemp res   = newTempV128();
   10965       assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
   10966                         : mkexpr(abd));
   10967       putQReg128(dd, mkexpr(res));
   10968       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   10969       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   10970       const HChar* nm        = isACC ? (isU ? "uabal" : "sabal")
   10971                                      : (isU ? "uabdl" : "sabdl");
   10972       DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
   10973           nameQReg128(dd), arrWide,
   10974           nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
   10975       return True;
   10976    }
   10977 
   10978    if (opcode == BITS4(1,1,0,0)
   10979        || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
   10980       /* -------- 0,1100  SMULL{2} -------- */ // 0 (ks)
   10981       /* -------- 1,1100  UMULL{2} -------- */ // 0
   10982       /* -------- 0,1000  SMLAL{2} -------- */ // 1
   10983       /* -------- 1,1000  UMLAL{2} -------- */ // 1
   10984       /* -------- 0,1010  SMLSL{2} -------- */ // 2
   10985       /* -------- 1,1010  UMLSL{2} -------- */ // 2
   10986       /* Widens, and size refers to the narrowed lanes. */
   10987       UInt ks = 3;
   10988       switch (opcode) {
   10989          case BITS4(1,1,0,0): ks = 0; break;
   10990          case BITS4(1,0,0,0): ks = 1; break;
   10991          case BITS4(1,0,1,0): ks = 2; break;
   10992          default: vassert(0);
   10993       }
   10994       vassert(ks >= 0 && ks <= 2);
   10995       if (size == X11) return False;
   10996       vassert(size <= 2);
   10997       Bool   isU  = bitU == 1;
   10998       IRTemp vecN = newTempV128();
   10999       IRTemp vecM = newTempV128();
   11000       IRTemp vecD = newTempV128();
   11001       assign(vecN, getQReg128(nn));
   11002       assign(vecM, getQReg128(mm));
   11003       assign(vecD, getQReg128(dd));
   11004       IRTemp res = IRTemp_INVALID;
   11005       math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
   11006                     vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
   11007       putQReg128(dd, mkexpr(res));
   11008       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   11009       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   11010       const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
   11011       DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
   11012           nameQReg128(dd), arrWide,
   11013           nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
   11014       return True;
   11015    }
   11016 
   11017    if (bitU == 0
   11018        && (opcode == BITS4(1,1,0,1)
   11019            || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
   11020       /* -------- 0,1101  SQDMULL{2} -------- */ // 0 (ks)
   11021       /* -------- 0,1001  SQDMLAL{2} -------- */ // 1
   11022       /* -------- 0,1011  SQDMLSL{2} -------- */ // 2
   11023       /* Widens, and size refers to the narrowed lanes. */
   11024       UInt ks = 3;
   11025       switch (opcode) {
   11026          case BITS4(1,1,0,1): ks = 0; break;
   11027          case BITS4(1,0,0,1): ks = 1; break;
   11028          case BITS4(1,0,1,1): ks = 2; break;
   11029          default: vassert(0);
   11030       }
   11031       vassert(ks >= 0 && ks <= 2);
   11032       if (size == X00 || size == X11) return False;
   11033       vassert(size <= 2);
   11034       IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
   11035       vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
   11036       newTempsV128_3(&vecN, &vecM, &vecD);
   11037       assign(vecN, getQReg128(nn));
   11038       assign(vecM, getQReg128(mm));
   11039       assign(vecD, getQReg128(dd));
   11040       math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
   11041                        is2, size, "mas"[ks],
   11042                        vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
   11043       putQReg128(dd, mkexpr(res));
   11044       vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
   11045       updateQCFLAGwithDifference(sat1q, sat1n);
   11046       if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
   11047          updateQCFLAGwithDifference(sat2q, sat2n);
   11048       }
   11049       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   11050       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   11051       const HChar* nm        = ks == 0 ? "sqdmull"
   11052                                        : (ks == 1 ? "sqdmlal" : "sqdmlsl");
   11053       DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
   11054           nameQReg128(dd), arrWide,
   11055           nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
   11056       return True;
   11057    }
   11058 
   11059    if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
   11060       /* -------- 0,1110  PMULL{2} -------- */
   11061       /* Widens, and size refers to the narrowed lanes. */
   11062       if (size != X00) return False;
   11063       IRTemp res
   11064          = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
   11065                                      getQReg128(nn), getQReg128(mm));
   11066       putQReg128(dd, mkexpr(res));
   11067       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   11068       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   11069       DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
   11070           nameQReg128(dd), arrNarrow,
   11071           nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
   11072       return True;
   11073    }
   11074 
   11075    return False;
   11076 #  undef INSN
   11077 }
   11078 
   11079 
   11080 static
   11081 Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
   11082 {
   11083    /* 31 30 29 28    23   21 20 15     10 9 4
   11084       0  Q  U  01110 size 1  m  opcode 1  n d
   11085       Decode fields: u,size,opcode
   11086    */
   11087 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   11088    if (INSN(31,31) != 0
   11089        || INSN(28,24) != BITS5(0,1,1,1,0)
   11090        || INSN(21,21) != 1
   11091        || INSN(10,10) != 1) {
   11092       return False;
   11093    }
   11094    UInt bitQ   = INSN(30,30);
   11095    UInt bitU   = INSN(29,29);
   11096    UInt size   = INSN(23,22);
   11097    UInt mm     = INSN(20,16);
   11098    UInt opcode = INSN(15,11);
   11099    UInt nn     = INSN(9,5);
   11100    UInt dd     = INSN(4,0);
   11101    vassert(size < 4);
   11102 
   11103    if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
   11104       /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
   11105       /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
   11106       /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
   11107       /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
   11108       if (size == X11) return False;
   11109       Bool isADD = opcode == BITS5(0,0,0,0,0);
   11110       Bool isU   = bitU == 1;
   11111       /* Widen both args out, do the math, narrow to final result. */
   11112       IRTemp argL   = newTempV128();
   11113       IRTemp argLhi = IRTemp_INVALID;
   11114       IRTemp argLlo = IRTemp_INVALID;
   11115       IRTemp argR   = newTempV128();
   11116       IRTemp argRhi = IRTemp_INVALID;
   11117       IRTemp argRlo = IRTemp_INVALID;
   11118       IRTemp resHi  = newTempV128();
   11119       IRTemp resLo  = newTempV128();
   11120       IRTemp res    = IRTemp_INVALID;
   11121       assign(argL, getQReg128(nn));
   11122       argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
   11123       argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argL));
   11124       assign(argR, getQReg128(mm));
   11125       argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
   11126       argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argR));
   11127       IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
   11128       IROp opSxR = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
   11129       assign(resHi, binop(opSxR,
   11130                           binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
   11131                           mkU8(1)));
   11132       assign(resLo, binop(opSxR,
   11133                           binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
   11134                           mkU8(1)));
   11135       res = math_NARROW_LANES ( resHi, resLo, size );
   11136       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11137       const HChar* nm  = isADD ? (isU ? "uhadd" : "shadd")
   11138                                : (isU ? "uhsub" : "shsub");
   11139       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11140       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11141           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11142       return True;
   11143    }
   11144 
   11145    if (opcode == BITS5(0,0,0,1,0)) {
   11146       /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
   11147       /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
   11148       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11149       Bool   isU  = bitU == 1;
   11150       IRTemp argL = newTempV128();
   11151       IRTemp argR = newTempV128();
   11152       assign(argL, getQReg128(nn));
   11153       assign(argR, getQReg128(mm));
   11154       IRTemp res = math_RHADD(size, isU, argL, argR);
   11155       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11156       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11157       DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
   11158           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11159       return True;
   11160    }
   11161 
   11162    if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
   11163       /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
   11164       /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
   11165       /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
   11166       /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
   11167       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11168       Bool isADD = opcode == BITS5(0,0,0,0,1);
   11169       Bool isU   = bitU == 1;
   11170       IROp qop   = Iop_INVALID;
   11171       IROp nop   = Iop_INVALID;
   11172       if (isADD) {
   11173          qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
   11174          nop = mkVecADD(size);
   11175       } else {
   11176          qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
   11177          nop = mkVecSUB(size);
   11178       }
   11179       IRTemp argL = newTempV128();
   11180       IRTemp argR = newTempV128();
   11181       IRTemp qres = newTempV128();
   11182       IRTemp nres = newTempV128();
   11183       assign(argL, getQReg128(nn));
   11184       assign(argR, getQReg128(mm));
   11185       assign(qres, math_MAYBE_ZERO_HI64_fromE(
   11186                       bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
   11187       assign(nres, math_MAYBE_ZERO_HI64_fromE(
   11188                       bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
   11189       putQReg128(dd, mkexpr(qres));
   11190       updateQCFLAGwithDifference(qres, nres);
   11191       const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd")
   11192                                : (isU ? "uqsub" : "sqsub");
   11193       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11194       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11195           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11196       return True;
   11197    }
   11198 
   11199    if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
   11200       /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
   11201       /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
   11202       /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
   11203       /* -------- 0,10,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
   11204       Bool   isORx  = (size & 2) == 2;
   11205       Bool   invert = (size & 1) == 1;
   11206       IRTemp res    = newTempV128();
   11207       assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
   11208                         getQReg128(nn),
   11209                         invert ? unop(Iop_NotV128, getQReg128(mm))
   11210                                : getQReg128(mm)));
   11211       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11212       const HChar* names[4] = { "and", "bic", "orr", "orn" };
   11213       const HChar* ar = bitQ == 1 ? "16b" : "8b";
   11214       DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
   11215           nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
   11216       return True;
   11217    }
   11218 
   11219    if (bitU == 1 && opcode == BITS5(0,0,0,1,1)) {
   11220       /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
   11221       /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
   11222       /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
   11223       /* -------- 1,10,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
   11224       IRTemp argD = newTempV128();
   11225       IRTemp argN = newTempV128();
   11226       IRTemp argM = newTempV128();
   11227       assign(argD, getQReg128(dd));
   11228       assign(argN, getQReg128(nn));
   11229       assign(argM, getQReg128(mm));
   11230       const IROp opXOR = Iop_XorV128;
   11231       const IROp opAND = Iop_AndV128;
   11232       const IROp opNOT = Iop_NotV128;
   11233       IRTemp res = newTempV128();
   11234       switch (size) {
   11235          case BITS2(0,0): /* EOR */
   11236             assign(res, binop(opXOR, mkexpr(argM), mkexpr(argN)));
   11237             break;
   11238          case BITS2(0,1): /* BSL */
   11239             assign(res, binop(opXOR, mkexpr(argM),
   11240                               binop(opAND,
   11241                                     binop(opXOR, mkexpr(argM), mkexpr(argN)),
   11242                                           mkexpr(argD))));
   11243             break;
   11244          case BITS2(1,0): /* BIT */
   11245             assign(res, binop(opXOR, mkexpr(argD),
   11246                               binop(opAND,
   11247                                     binop(opXOR, mkexpr(argD), mkexpr(argN)),
   11248                                     mkexpr(argM))));
   11249             break;
   11250          case BITS2(1,1): /* BIF */
   11251             assign(res, binop(opXOR, mkexpr(argD),
   11252                               binop(opAND,
   11253                                     binop(opXOR, mkexpr(argD), mkexpr(argN)),
   11254                                     unop(opNOT, mkexpr(argM)))));
   11255             break;
   11256          default:
   11257             vassert(0);
   11258       }
   11259       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11260       const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
   11261       const HChar* arr = bitQ == 1 ? "16b" : "8b";
   11262       DIP("%s %s.%s, %s.%s, %s.%s\n", nms[size],
   11263           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11264       return True;
   11265    }
   11266 
   11267    if (opcode == BITS5(0,0,1,1,0)) {
   11268       /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
   11269       /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
   11270       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11271       Bool   isGT  = bitU == 0;
   11272       IRExpr* argL = getQReg128(nn);
   11273       IRExpr* argR = getQReg128(mm);
   11274       IRTemp  res  = newTempV128();
   11275       assign(res,
   11276              isGT ? binop(mkVecCMPGTS(size), argL, argR)
   11277                   : binop(mkVecCMPGTU(size), argL, argR));
   11278       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11279       const HChar* nm  = isGT ? "cmgt" : "cmhi";
   11280       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11281       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11282           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11283       return True;
   11284    }
   11285 
   11286    if (opcode == BITS5(0,0,1,1,1)) {
   11287       /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
   11288       /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
   11289       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11290       Bool    isGE = bitU == 0;
   11291       IRExpr* argL = getQReg128(nn);
   11292       IRExpr* argR = getQReg128(mm);
   11293       IRTemp  res  = newTempV128();
   11294       assign(res,
   11295              isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
   11296                   : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
   11297       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11298       const HChar* nm  = isGE ? "cmge" : "cmhs";
   11299       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11300       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11301           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11302       return True;
   11303    }
   11304 
   11305    if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
   11306       /* -------- 0,xx,01000 SSHL  std7_std7_std7 -------- */
   11307       /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
   11308       /* -------- 1,xx,01000 USHL  std7_std7_std7 -------- */
   11309       /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
   11310       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11311       Bool isU = bitU == 1;
   11312       Bool isR = opcode == BITS5(0,1,0,1,0);
   11313       IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
   11314                      : (isU ? mkVecSHU(size)  : mkVecSHS(size));
   11315       IRTemp res = newTempV128();
   11316       assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
   11317       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11318       const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
   11319                              : (isU ? "ushl"  : "sshl");
   11320       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11321       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11322           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11323       return True;
   11324    }
   11325 
   11326    if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
   11327       /* -------- 0,xx,01001 SQSHL  std7_std7_std7 -------- */
   11328       /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
   11329       /* -------- 1,xx,01001 UQSHL  std7_std7_std7 -------- */
   11330       /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
   11331       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11332       Bool isU = bitU == 1;
   11333       Bool isR = opcode == BITS5(0,1,0,1,1);
   11334       IROp op  = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
   11335                      : (isU ? mkVecQANDUQSH(size)  : mkVecQANDSQSH(size));
   11336       /* This is a bit tricky.  If we're only interested in the lowest 64 bits
   11337          of the result (viz, bitQ == 0), then we must adjust the operands to
   11338          ensure that the upper part of the result, that we don't care about,
   11339          doesn't pollute the returned Q value.  To do this, zero out the upper
   11340          operand halves beforehand.  This works because it means, for the
   11341          lanes we don't care about, we are shifting zero by zero, which can
   11342          never saturate. */
   11343       IRTemp res256 = newTemp(Ity_V256);
   11344       IRTemp resSH  = newTempV128();
   11345       IRTemp resQ   = newTempV128();
   11346       IRTemp zero   = newTempV128();
   11347       assign(res256, binop(op,
   11348                            math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
   11349                            math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
   11350       assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
   11351       assign(resQ,  unop(Iop_V256toV128_1, mkexpr(res256)));
   11352       assign(zero,  mkV128(0x0000));
   11353       putQReg128(dd, mkexpr(resSH));
   11354       updateQCFLAGwithDifference(resQ, zero);
   11355       const HChar* nm  = isR ? (isU ? "uqrshl" : "sqrshl")
   11356                              : (isU ? "uqshl"  : "sqshl");
   11357       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11358       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11359           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11360       return True;
   11361    }
   11362 
   11363    if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
   11364       /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
   11365       /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
   11366       /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
   11367       /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
   11368       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11369       Bool isU   = bitU == 1;
   11370       Bool isMAX = (opcode & 1) == 0;
   11371       IROp op    = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
   11372                          : (isU ? mkVecMINU(size) : mkVecMINS(size));
   11373       IRTemp t   = newTempV128();
   11374       assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
   11375       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
   11376       const HChar* nm = isMAX ? (isU ? "umax" : "smax")
   11377                               : (isU ? "umin" : "smin");
   11378       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11379       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11380           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11381       return True;
   11382    }
   11383 
   11384    if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
   11385       /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
   11386       /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
   11387       /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
   11388       /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
   11389       if (size == X11) return False; // 1d/2d cases not allowed
   11390       Bool isU   = bitU == 1;
   11391       Bool isACC = opcode == BITS5(0,1,1,1,1);
   11392       vassert(size <= 2);
   11393       IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
   11394       IRTemp t2 = newTempV128();
   11395       assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
   11396                        : mkexpr(t1));
   11397       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
   11398       const HChar* nm  = isACC ? (isU ? "uaba" : "saba")
   11399                                : (isU ? "uabd" : "sabd");
   11400       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11401       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11402           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11403       return True;
   11404    }
   11405 
   11406    if (opcode == BITS5(1,0,0,0,0)) {
   11407       /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
   11408       /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
   11409       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11410       Bool   isSUB = bitU == 1;
   11411       IROp   op    = isSUB ? mkVecSUB(size) : mkVecADD(size);
   11412       IRTemp t     = newTempV128();
   11413       assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
   11414       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
   11415       const HChar* nm  = isSUB ? "sub" : "add";
   11416       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11417       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11418           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11419       return True;
   11420    }
   11421 
   11422    if (opcode == BITS5(1,0,0,0,1)) {
   11423       /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
   11424       /* -------- 1,xx,10001 CMEQ  std7_std7_std7 -------- */ // ==
   11425       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11426       Bool    isEQ = bitU == 1;
   11427       IRExpr* argL = getQReg128(nn);
   11428       IRExpr* argR = getQReg128(mm);
   11429       IRTemp  res  = newTempV128();
   11430       assign(res,
   11431              isEQ ? binop(mkVecCMPEQ(size), argL, argR)
   11432                   : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
   11433                                             binop(Iop_AndV128, argL, argR),
   11434                                             mkV128(0x0000))));
   11435       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11436       const HChar* nm  = isEQ ? "cmeq" : "cmtst";
   11437       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11438       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11439           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11440       return True;
   11441    }
   11442 
   11443    if (opcode == BITS5(1,0,0,1,0)) {
   11444       /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
   11445       /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
   11446       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11447       Bool isMLS = bitU == 1;
   11448       IROp   opMUL    = mkVecMUL(size);
   11449       IROp   opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
   11450       IRTemp res      = newTempV128();
   11451       if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
   11452          assign(res, binop(opADDSUB,
   11453                            getQReg128(dd),
   11454                            binop(opMUL, getQReg128(nn), getQReg128(mm))));
   11455          putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11456          const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11457          DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
   11458              nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11459          return True;
   11460       }
   11461       return False;
   11462    }
   11463 
   11464    if (opcode == BITS5(1,0,0,1,1)) {
   11465       /* -------- 0,xx,10011 MUL  std7_std7_std7 -------- */
   11466       /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
   11467       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11468       Bool isPMUL = bitU == 1;
   11469       const IROp opsPMUL[4]
   11470          = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID };
   11471       IROp   opMUL = isPMUL ? opsPMUL[size] : mkVecMUL(size);
   11472       IRTemp res   = newTempV128();
   11473       if (opMUL != Iop_INVALID) {
   11474          assign(res, binop(opMUL, getQReg128(nn), getQReg128(mm)));
   11475          putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11476          const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11477          DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL ? "pmul" : "mul",
   11478              nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11479          return True;
   11480       }
   11481       return False;
   11482    }
   11483 
   11484    if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
   11485       /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
   11486       /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
   11487       /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
   11488       /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
   11489       if (size == X11) return False;
   11490       Bool isU   = bitU == 1;
   11491       Bool isMAX = opcode == BITS5(1,0,1,0,0);
   11492       IRTemp vN  = newTempV128();
   11493       IRTemp vM  = newTempV128();
   11494       IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
   11495                       : (isU ? mkVecMINU(size) : mkVecMINS(size));
   11496       assign(vN, getQReg128(nn));
   11497       assign(vM, getQReg128(mm));
   11498       IRTemp res128 = newTempV128();
   11499       assign(res128,
   11500              binop(op,
   11501                    binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
   11502                    binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
   11503       /* In the half-width case, use CatEL32x4 to extract the half-width
   11504          result from the full-width result. */
   11505       IRExpr* res
   11506          = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
   11507                             binop(Iop_CatEvenLanes32x4, mkexpr(res128),
   11508                                                         mkexpr(res128)))
   11509                      : mkexpr(res128);
   11510       putQReg128(dd, res);
   11511       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11512       const HChar* nm  = isMAX ? (isU ? "umaxp" : "smaxp")
   11513                                : (isU ? "uminp" : "sminp");
   11514       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11515           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11516       return True;
   11517    }
   11518 
   11519    if (opcode == BITS5(1,0,1,1,0)) {
   11520       /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
   11521       /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
   11522       if (size == X00 || size == X11) return False;
   11523       Bool isR = bitU == 1;
   11524       IRTemp res, sat1q, sat1n, vN, vM;
   11525       res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
   11526       newTempsV128_2(&vN, &vM);
   11527       assign(vN, getQReg128(nn));
   11528       assign(vM, getQReg128(mm));
   11529       math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
   11530       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11531       IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
   11532       updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
   11533       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11534       const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
   11535       DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
   11536           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11537       return True;
   11538    }
   11539 
   11540    if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
   11541       /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
   11542       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11543       IRTemp vN = newTempV128();
   11544       IRTemp vM = newTempV128();
   11545       assign(vN, getQReg128(nn));
   11546       assign(vM, getQReg128(mm));
   11547       IRTemp res128 = newTempV128();
   11548       assign(res128,
   11549              binop(mkVecADD(size),
   11550                    binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
   11551                    binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
   11552       /* In the half-width case, use CatEL32x4 to extract the half-width
   11553          result from the full-width result. */
   11554       IRExpr* res
   11555          = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
   11556                             binop(Iop_CatEvenLanes32x4, mkexpr(res128),
   11557                                                         mkexpr(res128)))
   11558                      : mkexpr(res128);
   11559       putQReg128(dd, res);
   11560       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11561       DIP("addp %s.%s, %s.%s, %s.%s\n",
   11562           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11563       return True;
   11564    }
   11565 
   11566    if (bitU == 0
   11567        && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
   11568       /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11569       /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11570       /* -------- 0,0x,11110 FMAX   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11571       /* -------- 0,1x,11110 FMIN   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11572       /* FMAXNM, FMINNM: FIXME -- KLUDGED */
   11573       Bool   isD   = (size & 1) == 1;
   11574       if (bitQ == 0 && isD) return False; // implied 1d case
   11575       Bool   isMIN = (size & 2) == 2;
   11576       Bool   isNM  = opcode == BITS5(1,1,0,0,0);
   11577       IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
   11578       IRTemp res   = newTempV128();
   11579       assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
   11580       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11581       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11582       DIP("%s%s %s.%s, %s.%s, %s.%s\n",
   11583           isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
   11584           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11585       return True;
   11586    }
   11587 
   11588    if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
   11589       /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11590       /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11591       Bool isD   = (size & 1) == 1;
   11592       Bool isSUB = (size & 2) == 2;
   11593       if (bitQ == 0 && isD) return False; // implied 1d case
   11594       IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
   11595       IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
   11596       IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
   11597       IRTemp rm = mk_get_IR_rounding_mode();
   11598       IRTemp t1 = newTempV128();
   11599       IRTemp t2 = newTempV128();
   11600       // FIXME: double rounding; use FMA primops instead
   11601       assign(t1, triop(opMUL,
   11602                        mkexpr(rm), getQReg128(nn), getQReg128(mm)));
   11603       assign(t2, triop(isSUB ? opSUB : opADD,
   11604                        mkexpr(rm), getQReg128(dd), mkexpr(t1)));
   11605       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
   11606       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11607       DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
   11608           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11609       return True;
   11610    }
   11611 
   11612    if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
   11613       /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11614       /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11615       Bool isD   = (size & 1) == 1;
   11616       Bool isSUB = (size & 2) == 2;
   11617       if (bitQ == 0 && isD) return False; // implied 1d case
   11618       const IROp ops[4]
   11619          = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
   11620       IROp   op = ops[size];
   11621       IRTemp rm = mk_get_IR_rounding_mode();
   11622       IRTemp t1 = newTempV128();
   11623       IRTemp t2 = newTempV128();
   11624       assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
   11625       assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
   11626       putQReg128(dd, mkexpr(t2));
   11627       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11628       DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
   11629           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11630       return True;
   11631    }
   11632 
   11633    if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
   11634       /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11635       Bool isD = (size & 1) == 1;
   11636       if (bitQ == 0 && isD) return False; // implied 1d case
   11637       IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
   11638       IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
   11639       IRTemp rm    = mk_get_IR_rounding_mode();
   11640       IRTemp t1    = newTempV128();
   11641       IRTemp t2    = newTempV128();
   11642       // FIXME: use Abd primop instead?
   11643       assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
   11644       assign(t2, unop(opABS, mkexpr(t1)));
   11645       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
   11646       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11647       DIP("fabd %s.%s, %s.%s, %s.%s\n",
   11648           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11649       return True;
   11650    }
   11651 
   11652    if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
   11653       /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11654       /* -------- 1,0x,11011 FMUL  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11655       // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
   11656       Bool isD    = (size & 1) == 1;
   11657       Bool isMULX = bitU == 0;
   11658       if (bitQ == 0 && isD) return False; // implied 1d case
   11659       IRTemp rm = mk_get_IR_rounding_mode();
   11660       IRTemp t1 = newTempV128();
   11661       assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
   11662                        mkexpr(rm), getQReg128(nn), getQReg128(mm)));
   11663       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
   11664       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11665       DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
   11666           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11667       return True;
   11668    }
   11669 
   11670    if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
   11671       /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11672       /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11673       Bool isD = (size & 1) == 1;
   11674       if (bitQ == 0 && isD) return False; // implied 1d case
   11675       Bool   isGE  = bitU == 1;
   11676       IROp   opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
   11677                           : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
   11678       IRTemp t1    = newTempV128();
   11679       assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
   11680                       : binop(opCMP, getQReg128(nn), getQReg128(mm)));
   11681       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
   11682       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11683       DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
   11684           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11685       return True;
   11686    }
   11687 
   11688    if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
   11689       /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11690       Bool isD = (size & 1) == 1;
   11691       if (bitQ == 0 && isD) return False; // implied 1d case
   11692       IROp   opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
   11693       IRTemp t1    = newTempV128();
   11694       assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
   11695       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
   11696       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11697       DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
   11698           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11699       return True;
   11700    }
   11701 
   11702    if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
   11703       /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11704       /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11705       Bool isD  = (size & 1) == 1;
   11706       Bool isGT = (size & 2) == 2;
   11707       if (bitQ == 0 && isD) return False; // implied 1d case
   11708       IROp   opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
   11709                           : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
   11710       IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
   11711       IRTemp t1    = newTempV128();
   11712       assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
   11713                               unop(opABS, getQReg128(nn)))); // swapd
   11714       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
   11715       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11716       DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
   11717           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11718       return True;
   11719    }
   11720 
   11721    if (bitU == 1
   11722        && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
   11723       /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11724       /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11725       /* -------- 1,0x,11110 FMAXP   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11726       /* -------- 1,1x,11110 FMINP   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11727       /* FMAXNMP, FMINNMP: FIXME -- KLUDGED */
   11728       Bool isD = (size & 1) == 1;
   11729       if (bitQ == 0 && isD) return False; // implied 1d case
   11730       Bool   isMIN = (size & 2) == 2;
   11731       Bool   isNM  = opcode == BITS5(1,1,0,0,0);
   11732       IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
   11733       IRTemp srcN  = newTempV128();
   11734       IRTemp srcM  = newTempV128();
   11735       IRTemp preL  = IRTemp_INVALID;
   11736       IRTemp preR  = IRTemp_INVALID;
   11737       assign(srcN, getQReg128(nn));
   11738       assign(srcM, getQReg128(mm));
   11739       math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
   11740                                            srcM, srcN, isD, bitQ);
   11741       putQReg128(
   11742          dd, math_MAYBE_ZERO_HI64_fromE(
   11743                 bitQ,
   11744                 binop(opMXX, mkexpr(preL), mkexpr(preR))));
   11745       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11746       DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
   11747           isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
   11748           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11749       return True;
   11750    }
   11751 
   11752    if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
   11753       /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11754       Bool isD = size == X01;
   11755       if (bitQ == 0 && isD) return False; // implied 1d case
   11756       IRTemp srcN = newTempV128();
   11757       IRTemp srcM = newTempV128();
   11758       IRTemp preL = IRTemp_INVALID;
   11759       IRTemp preR = IRTemp_INVALID;
   11760       assign(srcN, getQReg128(nn));
   11761       assign(srcM, getQReg128(mm));
   11762       math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
   11763                                            srcM, srcN, isD, bitQ);
   11764       putQReg128(
   11765          dd, math_MAYBE_ZERO_HI64_fromE(
   11766                 bitQ,
   11767                 triop(mkVecADDF(isD ? 3 : 2),
   11768                       mkexpr(mk_get_IR_rounding_mode()),
   11769                       mkexpr(preL), mkexpr(preR))));
   11770       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11771       DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
   11772           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11773       return True;
   11774    }
   11775 
   11776    if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
   11777       /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11778       Bool isD = (size & 1) == 1;
   11779       if (bitQ == 0 && isD) return False; // implied 1d case
   11780       vassert(size <= 1);
   11781       const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
   11782       IROp   op = ops[size];
   11783       IRTemp rm = mk_get_IR_rounding_mode();
   11784       IRTemp t1 = newTempV128();
   11785       IRTemp t2 = newTempV128();
   11786       assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
   11787       assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
   11788       putQReg128(dd, mkexpr(t2));
   11789       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11790       DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
   11791           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11792       return True;
   11793    }
   11794 
   11795    if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
   11796       /* -------- 0,0x,11111: FRECPS  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11797       /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
   11798       Bool isSQRT = (size & 2) == 2;
   11799       Bool isD    = (size & 1) == 1;
   11800       if (bitQ == 0 && isD) return False; // implied 1d case
   11801       IROp op     = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
   11802                            : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
   11803       IRTemp res = newTempV128();
   11804       assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
   11805       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11806       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   11807       DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
   11808           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
   11809       return True;
   11810    }
   11811 
   11812    return False;
   11813 #  undef INSN
   11814 }
   11815 
   11816 
   11817 static
   11818 Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
   11819 {
   11820    /* 31 30 29 28    23   21    16     11 9 4
   11821       0  Q  U  01110 size 10000 opcode 10 n d
   11822       Decode fields: U,size,opcode
   11823    */
   11824 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   11825    if (INSN(31,31) != 0
   11826        || INSN(28,24) != BITS5(0,1,1,1,0)
   11827        || INSN(21,17) != BITS5(1,0,0,0,0)
   11828        || INSN(11,10) != BITS2(1,0)) {
   11829       return False;
   11830    }
   11831    UInt bitQ   = INSN(30,30);
   11832    UInt bitU   = INSN(29,29);
   11833    UInt size   = INSN(23,22);
   11834    UInt opcode = INSN(16,12);
   11835    UInt nn     = INSN(9,5);
   11836    UInt dd     = INSN(4,0);
   11837    vassert(size < 4);
   11838 
   11839    if (bitU == 0 && size <= X10 && opcode == BITS5(0,0,0,0,0)) {
   11840       /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
   11841       /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
   11842       /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
   11843       const IROp iops[3] = { Iop_Reverse8sIn64_x2,
   11844                              Iop_Reverse16sIn64_x2, Iop_Reverse32sIn64_x2 };
   11845       vassert(size <= 2);
   11846       IRTemp res = newTempV128();
   11847       assign(res, unop(iops[size], getQReg128(nn)));
   11848       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11849       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11850       DIP("%s %s.%s, %s.%s\n", "rev64",
   11851           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11852       return True;
   11853    }
   11854 
   11855    if (bitU == 1 && size <= X01 && opcode == BITS5(0,0,0,0,0)) {
   11856       /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
   11857       /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
   11858       Bool   isH = size == X01;
   11859       IRTemp res = newTempV128();
   11860       IROp   iop = isH ? Iop_Reverse16sIn32_x4 : Iop_Reverse8sIn32_x4;
   11861       assign(res, unop(iop, getQReg128(nn)));
   11862       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11863       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11864       DIP("%s %s.%s, %s.%s\n", "rev32",
   11865           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11866       return True;
   11867    }
   11868 
   11869    if (bitU == 0 && size == X00 && opcode == BITS5(0,0,0,0,1)) {
   11870       /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
   11871       IRTemp res = newTempV128();
   11872       assign(res, unop(Iop_Reverse8sIn16_x8, getQReg128(nn)));
   11873       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11874       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11875       DIP("%s %s.%s, %s.%s\n", "rev16",
   11876           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11877       return True;
   11878    }
   11879 
   11880    if (opcode == BITS5(0,0,0,1,0) || opcode == BITS5(0,0,1,1,0)) {
   11881       /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
   11882       /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
   11883       /* -------- 0,xx,00110: SADALP std6_std6 -------- */
   11884       /* -------- 1,xx,00110: UADALP std6_std6 -------- */
   11885       /* Widens, and size refers to the narrow size. */
   11886       if (size == X11) return False; // no 1d or 2d cases
   11887       Bool   isU   = bitU == 1;
   11888       Bool   isACC = opcode == BITS5(0,0,1,1,0);
   11889       IRTemp src   = newTempV128();
   11890       IRTemp sum   = newTempV128();
   11891       IRTemp res   = newTempV128();
   11892       assign(src, getQReg128(nn));
   11893       assign(sum,
   11894              binop(mkVecADD(size+1),
   11895                    mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
   11896                              isU, True/*fromOdd*/, size, mkexpr(src))),
   11897                    mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
   11898                              isU, False/*!fromOdd*/, size, mkexpr(src)))));
   11899       assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
   11900                         : mkexpr(sum));
   11901       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11902       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   11903       const HChar* arrWide   = nameArr_Q_SZ(bitQ, size+1);
   11904       DIP("%s %s.%s, %s.%s\n", isACC ? (isU ? "uadalp" : "sadalp")
   11905                                      : (isU ? "uaddlp" : "saddlp"),
   11906           nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
   11907       return True;
   11908    }
   11909 
   11910    if (opcode == BITS5(0,0,0,1,1)) {
   11911       /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
   11912       /* -------- 1,xx,00011: USQADD std7_std7 -------- */
   11913       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11914       Bool isUSQADD = bitU == 1;
   11915       /* This is switched (in the US vs SU sense) deliberately.
   11916          SUQADD corresponds to the ExtUSsatSS variants and
   11917          USQADD corresponds to the ExtSUsatUU variants.
   11918          See libvex_ir for more details. */
   11919       IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
   11920                              : mkVecQADDEXTUSSATSS(size);
   11921       IROp   nop  = mkVecADD(size);
   11922       IRTemp argL = newTempV128();
   11923       IRTemp argR = newTempV128();
   11924       IRTemp qres = newTempV128();
   11925       IRTemp nres = newTempV128();
   11926       /* Because the two arguments to the addition are implicitly
   11927          extended differently (one signedly, the other unsignedly) it is
   11928          important to present them to the primop in the correct order. */
   11929       assign(argL, getQReg128(nn));
   11930       assign(argR, getQReg128(dd));
   11931       assign(qres, math_MAYBE_ZERO_HI64_fromE(
   11932                       bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
   11933       assign(nres, math_MAYBE_ZERO_HI64_fromE(
   11934                       bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
   11935       putQReg128(dd, mkexpr(qres));
   11936       updateQCFLAGwithDifference(qres, nres);
   11937       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11938       DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
   11939           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11940       return True;
   11941    }
   11942 
   11943    if (opcode == BITS5(0,0,1,0,0)) {
   11944       /* -------- 0,xx,00100: CLS std6_std6 -------- */
   11945       /* -------- 1,xx,00100: CLZ std6_std6 -------- */
   11946       if (size == X11) return False; // no 1d or 2d cases
   11947       const IROp opsCLS[3] = { Iop_Cls8x16, Iop_Cls16x8, Iop_Cls32x4 };
   11948       const IROp opsCLZ[3] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4 };
   11949       Bool   isCLZ = bitU == 1;
   11950       IRTemp res   = newTempV128();
   11951       vassert(size <= 2);
   11952       assign(res, unop(isCLZ ? opsCLZ[size] : opsCLS[size], getQReg128(nn)));
   11953       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11954       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11955       DIP("%s %s.%s, %s.%s\n", isCLZ ? "clz" : "cls",
   11956           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11957       return True;
   11958    }
   11959 
   11960    if (size == X00 && opcode == BITS5(0,0,1,0,1)) {
   11961       /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
   11962       /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
   11963       IRTemp res = newTempV128();
   11964       assign(res, unop(bitU == 0 ? Iop_Cnt8x16 : Iop_NotV128, getQReg128(nn)));
   11965       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11966       const HChar* arr = nameArr_Q_SZ(bitQ, 0);
   11967       DIP("%s %s.%s, %s.%s\n", bitU == 0 ? "cnt" : "not",
   11968           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11969       return True;
   11970    }
   11971 
   11972    if (bitU == 1 && size == X01 && opcode == BITS5(0,0,1,0,1)) {
   11973       /* -------- 1,01,00101  RBIT 16b_16b, 8b_8b -------- */
   11974       IRTemp res = newTempV128();
   11975       assign(res, unop(Iop_Reverse1sIn8_x16, getQReg128(nn)));
   11976       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   11977       const HChar* arr = nameArr_Q_SZ(bitQ, 0);
   11978       DIP("%s %s.%s, %s.%s\n", "rbit",
   11979           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11980       return True;
   11981    }
   11982 
   11983    if (opcode == BITS5(0,0,1,1,1)) {
   11984       /* -------- 0,xx,00111 SQABS std7_std7 -------- */
   11985       /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
   11986       if (bitQ == 0 && size == X11) return False; // implied 1d case
   11987       Bool   isNEG  = bitU == 1;
   11988       IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
   11989       (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
   11990                                          getQReg128(nn), size );
   11991       IRTemp qres = newTempV128(), nres = newTempV128();
   11992       assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
   11993       assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
   11994       putQReg128(dd, mkexpr(qres));
   11995       updateQCFLAGwithDifference(qres, nres);
   11996       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   11997       DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
   11998           nameQReg128(dd), arr, nameQReg128(nn), arr);
   11999       return True;
   12000    }
   12001 
   12002    if (opcode == BITS5(0,1,0,0,0)) {
   12003       /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
   12004       /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
   12005       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12006       Bool    isGT  = bitU == 0;
   12007       IRExpr* argL  = getQReg128(nn);
   12008       IRExpr* argR  = mkV128(0x0000);
   12009       IRTemp  res   = newTempV128();
   12010       IROp    opGTS = mkVecCMPGTS(size);
   12011       assign(res, isGT ? binop(opGTS, argL, argR)
   12012                        : unop(Iop_NotV128, binop(opGTS, argR, argL)));
   12013       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12014       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12015       DIP("cm%s %s.%s, %s.%s, #0\n", isGT ? "gt" : "ge",
   12016           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12017       return True;
   12018    }
   12019 
   12020    if (opcode == BITS5(0,1,0,0,1)) {
   12021       /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
   12022       /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
   12023       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12024       Bool    isEQ = bitU == 0;
   12025       IRExpr* argL = getQReg128(nn);
   12026       IRExpr* argR = mkV128(0x0000);
   12027       IRTemp  res  = newTempV128();
   12028       assign(res, isEQ ? binop(mkVecCMPEQ(size), argL, argR)
   12029                        : unop(Iop_NotV128,
   12030                               binop(mkVecCMPGTS(size), argL, argR)));
   12031       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12032       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12033       DIP("cm%s %s.%s, %s.%s, #0\n", isEQ ? "eq" : "le",
   12034           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12035       return True;
   12036    }
   12037 
   12038    if (bitU == 0 && opcode == BITS5(0,1,0,1,0)) {
   12039       /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
   12040       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12041       IRExpr* argL = getQReg128(nn);
   12042       IRExpr* argR = mkV128(0x0000);
   12043       IRTemp  res  = newTempV128();
   12044       assign(res, binop(mkVecCMPGTS(size), argR, argL));
   12045       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12046       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12047       DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
   12048           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12049       return True;
   12050    }
   12051 
   12052    if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
   12053       /* -------- 0,xx,01011: ABS std7_std7 -------- */
   12054       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12055       IRTemp res = newTempV128();
   12056       assign(res, unop(mkVecABS(size), getQReg128(nn)));
   12057       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12058       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12059       DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
   12060       return True;
   12061    }
   12062 
   12063    if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
   12064       /* -------- 1,xx,01011: NEG std7_std7 -------- */
   12065       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12066       IRTemp res = newTempV128();
   12067       assign(res, binop(mkVecSUB(size), mkV128(0x0000), getQReg128(nn)));
   12068       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12069       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12070       DIP("neg %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
   12071       return True;
   12072    }
   12073 
   12074    UInt ix = 0; /*INVALID*/
   12075    if (size >= X10) {
   12076       switch (opcode) {
   12077          case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
   12078          case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
   12079          case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
   12080          default: break;
   12081       }
   12082    }
   12083    if (ix > 0) {
   12084       /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
   12085       /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
   12086       /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
   12087       /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
   12088       /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
   12089       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12090       Bool   isD     = size == X11;
   12091       IROp   opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
   12092       IROp   opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
   12093       IROp   opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
   12094       IROp   opCmp   = Iop_INVALID;
   12095       Bool   swap    = False;
   12096       const HChar* nm = "??";
   12097       switch (ix) {
   12098          case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
   12099          case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
   12100          case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
   12101          case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
   12102          case 5: nm = "fcmle"; opCmp = opCmpLE; break;
   12103          default: vassert(0);
   12104       }
   12105       IRExpr* zero = mkV128(0x0000);
   12106       IRTemp res = newTempV128();
   12107       assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
   12108                        : binop(opCmp, getQReg128(nn), zero));
   12109       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12110       const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
   12111       DIP("%s %s.%s, %s.%s, #0.0\n", nm,
   12112           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12113       return True;
   12114    }
   12115 
   12116    if (size >= X10 && opcode == BITS5(0,1,1,1,1)) {
   12117       /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
   12118       /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
   12119       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12120       Bool   isFNEG = bitU == 1;
   12121       IROp   op     = isFNEG ? (size == X10 ? Iop_Neg32Fx4 : Iop_Neg64Fx2)
   12122                              : (size == X10 ? Iop_Abs32Fx4 : Iop_Abs64Fx2);
   12123       IRTemp res = newTempV128();
   12124       assign(res, unop(op, getQReg128(nn)));
   12125       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12126       const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
   12127       DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
   12128           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12129       return True;
   12130    }
   12131 
   12132    if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
   12133       /* -------- 0,xx,10010: XTN{,2} -------- */
   12134       if (size == X11) return False;
   12135       vassert(size < 3);
   12136       Bool   is2  = bitQ == 1;
   12137       IROp   opN  = mkVecNARROWUN(size);
   12138       IRTemp resN = newTempV128();
   12139       assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn))));
   12140       putLO64andZUorPutHI64(is2, dd, resN);
   12141       const HChar* nm        = "xtn";
   12142       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   12143       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   12144       DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
   12145           nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
   12146       return True;
   12147    }
   12148 
   12149    if (opcode == BITS5(1,0,1,0,0)
   12150        || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
   12151       /* -------- 0,xx,10100: SQXTN{,2} -------- */
   12152       /* -------- 1,xx,10100: UQXTN{,2} -------- */
   12153       /* -------- 1,xx,10010: SQXTUN{,2} -------- */
   12154       if (size == X11) return False;
   12155       vassert(size < 3);
   12156       Bool  is2    = bitQ == 1;
   12157       IROp  opN    = Iop_INVALID;
   12158       Bool  zWiden = True;
   12159       const HChar* nm = "??";
   12160       /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
   12161          opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
   12162       }
   12163       else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
   12164          opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
   12165       }
   12166       else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
   12167          opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
   12168       }
   12169       else vassert(0);
   12170       IRTemp src  = newTempV128();
   12171       assign(src, getQReg128(nn));
   12172       IRTemp resN = newTempV128();
   12173       assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
   12174       putLO64andZUorPutHI64(is2, dd, resN);
   12175       IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
   12176                                               size, mkexpr(resN));
   12177       updateQCFLAGwithDifference(src, resW);
   12178       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   12179       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   12180       DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
   12181           nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
   12182       return True;
   12183    }
   12184 
   12185    if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
   12186       /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
   12187       /* Widens, and size is the narrow size. */
   12188       if (size == X11) return False;
   12189       Bool is2   = bitQ == 1;
   12190       IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
   12191       IROp opSHL = mkVecSHLN(size+1);
   12192       IRTemp src = newTempV128();
   12193       IRTemp res = newTempV128();
   12194       assign(src, getQReg128(nn));
   12195       assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
   12196                                mkU8(8 << size)));
   12197       putQReg128(dd, mkexpr(res));
   12198       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   12199       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   12200       DIP("shll%s %s.%s, %s.%s, #%d\n", is2 ? "2" : "",
   12201           nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
   12202       return True;
   12203    }
   12204 
   12205    if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
   12206       /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
   12207       UInt   nLanes = size == X00 ? 4 : 2;
   12208       IRType srcTy  = size == X00 ? Ity_F32 : Ity_F64;
   12209       IROp   opCvt  = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
   12210       IRTemp rm     = mk_get_IR_rounding_mode();
   12211       IRTemp src[nLanes];
   12212       for (UInt i = 0; i < nLanes; i++) {
   12213          src[i] = newTemp(srcTy);
   12214          assign(src[i], getQRegLane(nn, i, srcTy));
   12215       }
   12216       for (UInt i = 0; i < nLanes; i++) {
   12217          putQRegLane(dd, nLanes * bitQ + i,
   12218                          binop(opCvt, mkexpr(rm), mkexpr(src[i])));
   12219       }
   12220       if (bitQ == 0) {
   12221          putQRegLane(dd, 1, mkU64(0));
   12222       }
   12223       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
   12224       const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
   12225       DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
   12226           nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
   12227       return True;
   12228    }
   12229 
   12230    if (bitU == 1 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
   12231       /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
   12232       /* Using Irrm_NEAREST here isn't right.  The docs say "round to
   12233          odd" but I don't know what that really means. */
   12234       IRType srcTy = Ity_F64;
   12235       IROp   opCvt = Iop_F64toF32;
   12236       IRTemp src[2];
   12237       for (UInt i = 0; i < 2; i++) {
   12238          src[i] = newTemp(srcTy);
   12239          assign(src[i], getQRegLane(nn, i, srcTy));
   12240       }
   12241       for (UInt i = 0; i < 2; i++) {
   12242          putQRegLane(dd, 2 * bitQ + i,
   12243                          binop(opCvt, mkU32(Irrm_NEAREST), mkexpr(src[i])));
   12244       }
   12245       if (bitQ == 0) {
   12246          putQRegLane(dd, 1, mkU64(0));
   12247       }
   12248       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
   12249       const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
   12250       DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
   12251           nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
   12252       return True;
   12253    }
   12254 
   12255    if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
   12256       /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
   12257       UInt   nLanes = size == X00 ? 4 : 2;
   12258       IRType srcTy  = size == X00 ? Ity_F16 : Ity_F32;
   12259       IROp   opCvt  = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
   12260       IRTemp src[nLanes];
   12261       for (UInt i = 0; i < nLanes; i++) {
   12262          src[i] = newTemp(srcTy);
   12263          assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
   12264       }
   12265       for (UInt i = 0; i < nLanes; i++) {
   12266          putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
   12267       }
   12268       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
   12269       const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
   12270       DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
   12271           nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
   12272       return True;
   12273    }
   12274 
   12275    ix = 0;
   12276    if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
   12277       ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
   12278       // = 1 + bitU[0]:size[1]:opcode[0]
   12279       vassert(ix >= 1 && ix <= 8);
   12280       if (ix == 7) ix = 0;
   12281    }
   12282    if (ix > 0) {
   12283       /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
   12284       /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
   12285       /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
   12286       /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
   12287       /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
   12288       /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
   12289       /* -------- 1,1x,11000 (apparently unassigned)    (7) -------- */
   12290       /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
   12291       /* rm plan:
   12292          FRINTN: tieeven -- !! FIXME KLUDGED !!
   12293          FRINTM: -inf
   12294          FRINTP: +inf
   12295          FRINTZ: zero
   12296          FRINTA: tieaway -- !! FIXME KLUDGED !!
   12297          FRINTX: per FPCR + "exact = TRUE"
   12298          FRINTI: per FPCR
   12299       */
   12300       Bool isD = (size & 1) == 1;
   12301       if (bitQ == 0 && isD) return False; // implied 1d case
   12302 
   12303       IRTemp irrmRM = mk_get_IR_rounding_mode();
   12304 
   12305       UChar ch = '?';
   12306       IRTemp irrm = newTemp(Ity_I32);
   12307       switch (ix) {
   12308          case 1: ch = 'n'; assign(irrm, mkU32(Irrm_NEAREST)); break;
   12309          case 2: ch = 'm'; assign(irrm, mkU32(Irrm_NegINF)); break;
   12310          case 3: ch = 'p'; assign(irrm, mkU32(Irrm_PosINF)); break;
   12311          case 4: ch = 'z'; assign(irrm, mkU32(Irrm_ZERO)); break;
   12312          // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
   12313          case 5: ch = 'a'; assign(irrm, mkU32(Irrm_NEAREST)); break;
   12314          // I am unsure about the following, due to the "integral exact"
   12315          // description in the manual.  What does it mean? (frintx, that is)
   12316          case 6: ch = 'x'; assign(irrm, mkexpr(irrmRM)); break;
   12317          case 8: ch = 'i'; assign(irrm, mkexpr(irrmRM)); break;
   12318          default: vassert(0);
   12319       }
   12320 
   12321       IROp opRND = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
   12322       if (isD) {
   12323          for (UInt i = 0; i < 2; i++) {
   12324             putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
   12325                                             getQRegLane(nn, i, Ity_F64)));
   12326          }
   12327       } else {
   12328          UInt n = bitQ==1 ? 4 : 2;
   12329          for (UInt i = 0; i < n; i++) {
   12330             putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
   12331                                             getQRegLane(nn, i, Ity_F32)));
   12332          }
   12333          if (bitQ == 0)
   12334             putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
   12335       }
   12336       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12337       DIP("frint%c %s.%s, %s.%s\n", ch,
   12338           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12339       return True;
   12340    }
   12341 
   12342    ix = 0; /*INVALID*/
   12343    switch (opcode) {
   12344       case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
   12345       case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
   12346       case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
   12347       default: break;
   12348    }
   12349    if (ix > 0) {
   12350       /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
   12351       /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
   12352       /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
   12353       /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
   12354       /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
   12355       /* -------- 1,0x,11010 FCVTNU 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
   12356       /* -------- 1,0x,11011 FCVTMU 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
   12357       /* -------- 1,0x,11100 FCVTAU 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
   12358       /* -------- 1,1x,11010 FCVTPU 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
   12359       /* -------- 1,1x,11011 FCVTZU 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
   12360       Bool isD = (size & 1) == 1;
   12361       if (bitQ == 0 && isD) return False; // implied 1d case
   12362 
   12363       IRRoundingMode irrm = 8; /*impossible*/
   12364       HChar          ch   = '?';
   12365       switch (ix) {
   12366          case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
   12367          case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
   12368          case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
   12369          case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
   12370          case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
   12371          default: vassert(0);
   12372       }
   12373       IROp cvt = Iop_INVALID;
   12374       if (bitU == 1) {
   12375          cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
   12376       } else {
   12377          cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
   12378       }
   12379       if (isD) {
   12380          for (UInt i = 0; i < 2; i++) {
   12381             putQRegLane(dd, i, binop(cvt, mkU32(irrm),
   12382                                             getQRegLane(nn, i, Ity_F64)));
   12383          }
   12384       } else {
   12385          UInt n = bitQ==1 ? 4 : 2;
   12386          for (UInt i = 0; i < n; i++) {
   12387             putQRegLane(dd, i, binop(cvt, mkU32(irrm),
   12388                                             getQRegLane(nn, i, Ity_F32)));
   12389          }
   12390          if (bitQ == 0)
   12391             putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
   12392       }
   12393       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12394       DIP("fcvt%c%c %s.%s, %s.%s\n", ch, bitU == 1 ? 'u' : 's',
   12395           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12396       return True;
   12397    }
   12398 
   12399    if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
   12400       /* -------- 0,10,11100: URECPE  4s_4s, 2s_2s -------- */
   12401       /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
   12402       Bool isREC = bitU == 0;
   12403       IROp op    = isREC ? Iop_RecipEst32Ux4 : Iop_RSqrtEst32Ux4;
   12404       IRTemp res = newTempV128();
   12405       assign(res, unop(op, getQReg128(nn)));
   12406       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12407       const HChar* nm  = isREC ? "urecpe" : "ursqrte";
   12408       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12409       DIP("%s %s.%s, %s.%s\n", nm,
   12410           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12411       return True;
   12412    }
   12413 
   12414    if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
   12415       /* -------- 0,0x,11101: SCVTF -------- */
   12416       /* -------- 1,0x,11101: UCVTF -------- */
   12417       /* 31  28      22 21       15     9 4
   12418          0q0 01110 0 sz 1  00001 110110 n d  SCVTF Vd, Vn
   12419          0q1 01110 0 sz 1  00001 110110 n d  UCVTF Vd, Vn
   12420          with laneage:
   12421          case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
   12422       */
   12423       Bool isQ   = bitQ == 1;
   12424       Bool isU   = bitU == 1;
   12425       Bool isF64 = (size & 1) == 1;
   12426       if (isQ || !isF64) {
   12427          IRType tyF = Ity_INVALID, tyI = Ity_INVALID;
   12428          UInt   nLanes = 0;
   12429          Bool   zeroHI = False;
   12430          const HChar* arrSpec = NULL;
   12431          Bool   ok  = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec,
   12432                                        isQ, isF64 );
   12433          IROp   iop = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32)
   12434                           : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32);
   12435          IRTemp rm  = mk_get_IR_rounding_mode();
   12436          UInt   i;
   12437          vassert(ok); /* the 'if' above should ensure this */
   12438          for (i = 0; i < nLanes; i++) {
   12439             putQRegLane(dd, i,
   12440                         binop(iop, mkexpr(rm), getQRegLane(nn, i, tyI)));
   12441          }
   12442          if (zeroHI) {
   12443             putQRegLane(dd, 1, mkU64(0));
   12444          }
   12445          DIP("%ccvtf %s.%s, %s.%s\n", isU ? 'u' : 's',
   12446              nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
   12447          return True;
   12448       }
   12449       /* else fall through */
   12450    }
   12451 
   12452    if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
   12453       /* -------- 0,1x,11101: FRECPE  2d_2d, 4s_4s, 2s_2s -------- */
   12454       /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
   12455       Bool isSQRT = bitU == 1;
   12456       Bool isD    = (size & 1) == 1;
   12457       IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
   12458                            : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
   12459       if (bitQ == 0 && isD) return False; // implied 1d case
   12460       IRTemp resV = newTempV128();
   12461       assign(resV, unop(op, getQReg128(nn)));
   12462       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
   12463       const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
   12464       DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
   12465           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12466       return True;
   12467    }
   12468 
   12469    if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
   12470       /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
   12471       Bool isD = (size & 1) == 1;
   12472       IROp op  = isD ? Iop_Sqrt64Fx2 : Iop_Sqrt32Fx4;
   12473       if (bitQ == 0 && isD) return False; // implied 1d case
   12474       IRTemp resV = newTempV128();
   12475       assign(resV, binop(op, mkexpr(mk_get_IR_rounding_mode()),
   12476                              getQReg128(nn)));
   12477       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
   12478       const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
   12479       DIP("%s %s.%s, %s.%s\n", "fsqrt",
   12480           nameQReg128(dd), arr, nameQReg128(nn), arr);
   12481       return True;
   12482    }
   12483 
   12484    return False;
   12485 #  undef INSN
   12486 }
   12487 
   12488 
   12489 static
   12490 Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
   12491 {
   12492    /* 31    28    23   21 20 19 15     11   9 4
   12493       0 Q U 01111 size L  M  m  opcode H  0 n d
   12494       Decode fields are: u,size,opcode
   12495       M is really part of the mm register number.  Individual
   12496       cases need to inspect L and H though.
   12497    */
   12498 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12499    if (INSN(31,31) != 0
   12500        || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) !=0) {
   12501       return False;
   12502    }
   12503    UInt bitQ   = INSN(30,30);
   12504    UInt bitU   = INSN(29,29);
   12505    UInt size   = INSN(23,22);
   12506    UInt bitL   = INSN(21,21);
   12507    UInt bitM   = INSN(20,20);
   12508    UInt mmLO4  = INSN(19,16);
   12509    UInt opcode = INSN(15,12);
   12510    UInt bitH   = INSN(11,11);
   12511    UInt nn     = INSN(9,5);
   12512    UInt dd     = INSN(4,0);
   12513    vassert(size < 4);
   12514    vassert(bitH < 2 && bitM < 2 && bitL < 2);
   12515 
   12516    if (bitU == 0 && size >= X10
   12517        && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
   12518       /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
   12519       /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
   12520       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12521       Bool isD   = (size & 1) == 1;
   12522       Bool isSUB = opcode == BITS4(0,1,0,1);
   12523       UInt index;
   12524       if      (!isD)             index = (bitH << 1) | bitL;
   12525       else if (isD && bitL == 0) index = bitH;
   12526       else return False; // sz:L == x11 => unallocated encoding
   12527       vassert(index < (isD ? 2 : 4));
   12528       IRType ity   = isD ? Ity_F64 : Ity_F32;
   12529       IRTemp elem  = newTemp(ity);
   12530       UInt   mm    = (bitM << 4) | mmLO4;
   12531       assign(elem, getQRegLane(mm, index, ity));
   12532       IRTemp dupd  = math_DUP_TO_V128(elem, ity);
   12533       IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
   12534       IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
   12535       IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
   12536       IRTemp rm    = mk_get_IR_rounding_mode();
   12537       IRTemp t1    = newTempV128();
   12538       IRTemp t2    = newTempV128();
   12539       // FIXME: double rounding; use FMA primops instead
   12540       assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
   12541       assign(t2, triop(isSUB ? opSUB : opADD,
   12542                        mkexpr(rm), getQReg128(dd), mkexpr(t1)));
   12543       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
   12544       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   12545       DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
   12546           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
   12547           isD ? 'd' : 's', index);
   12548       return True;
   12549    }
   12550 
   12551    if (size >= X10 && opcode == BITS4(1,0,0,1)) {
   12552       /* -------- 0,1x,1001 FMUL  2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
   12553       /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
   12554       if (bitQ == 0 && size == X11) return False; // implied 1d case
   12555       Bool isD    = (size & 1) == 1;
   12556       Bool isMULX = bitU == 1;
   12557       UInt index;
   12558       if      (!isD)             index = (bitH << 1) | bitL;
   12559       else if (isD && bitL == 0) index = bitH;
   12560       else return False; // sz:L == x11 => unallocated encoding
   12561       vassert(index < (isD ? 2 : 4));
   12562       IRType ity  = isD ? Ity_F64 : Ity_F32;
   12563       IRTemp elem = newTemp(ity);
   12564       UInt   mm   = (bitM << 4) | mmLO4;
   12565       assign(elem, getQRegLane(mm, index, ity));
   12566       IRTemp dupd = math_DUP_TO_V128(elem, ity);
   12567       // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
   12568       IRTemp res  = newTempV128();
   12569       assign(res, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
   12570                         mkexpr(mk_get_IR_rounding_mode()),
   12571                         getQReg128(nn), mkexpr(dupd)));
   12572       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12573       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
   12574       DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
   12575           isMULX ? "fmulx" : "fmul", nameQReg128(dd), arr,
   12576           nameQReg128(nn), arr, nameQReg128(mm), isD ? 'd' : 's', index);
   12577       return True;
   12578    }
   12579 
   12580    if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
   12581        || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
   12582       /* -------- 1,xx,0000 MLA s/h variants only -------- */
   12583       /* -------- 1,xx,0100 MLS s/h variants only -------- */
   12584       /* -------- 0,xx,1000 MUL s/h variants only -------- */
   12585       Bool isMLA = opcode == BITS4(0,0,0,0);
   12586       Bool isMLS = opcode == BITS4(0,1,0,0);
   12587       UInt mm    = 32; // invalid
   12588       UInt ix    = 16; // invalid
   12589       switch (size) {
   12590          case X00:
   12591             return False; // b case is not allowed
   12592          case X01:
   12593             mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
   12594          case X10:
   12595             mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
   12596          case X11:
   12597             return False; // d case is not allowed
   12598          default:
   12599             vassert(0);
   12600       }
   12601       vassert(mm < 32 && ix < 16);
   12602       IROp   opMUL = mkVecMUL(size);
   12603       IROp   opADD = mkVecADD(size);
   12604       IROp   opSUB = mkVecSUB(size);
   12605       HChar  ch    = size == X01 ? 'h' : 's';
   12606       IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
   12607       IRTemp vecD  = newTempV128();
   12608       IRTemp vecN  = newTempV128();
   12609       IRTemp res   = newTempV128();
   12610       assign(vecD, getQReg128(dd));
   12611       assign(vecN, getQReg128(nn));
   12612       IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
   12613       if (isMLA || isMLS) {
   12614          assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
   12615       } else {
   12616          assign(res, prod);
   12617       }
   12618       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12619       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12620       DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
   12621                                                 : (isMLS ? "mls" : "mul"),
   12622           nameQReg128(dd), arr,
   12623           nameQReg128(nn), arr, nameQReg128(dd), ch, ix);
   12624       return True;
   12625    }
   12626 
   12627    if (opcode == BITS4(1,0,1,0)
   12628        || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
   12629       /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
   12630       /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
   12631       /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
   12632       /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
   12633       /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
   12634       /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
   12635       /* Widens, and size refers to the narrowed lanes. */
   12636       UInt ks = 3;
   12637       switch (opcode) {
   12638          case BITS4(1,0,1,0): ks = 0; break;
   12639          case BITS4(0,0,1,0): ks = 1; break;
   12640          case BITS4(0,1,1,0): ks = 2; break;
   12641          default: vassert(0);
   12642       }
   12643       vassert(ks >= 0 && ks <= 2);
   12644       Bool isU = bitU == 1;
   12645       Bool is2 = bitQ == 1;
   12646       UInt mm  = 32; // invalid
   12647       UInt ix  = 16; // invalid
   12648       switch (size) {
   12649          case X00:
   12650             return False; // h_b_b[] case is not allowed
   12651          case X01:
   12652             mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
   12653          case X10:
   12654             mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
   12655          case X11:
   12656             return False; // q_d_d[] case is not allowed
   12657          default:
   12658             vassert(0);
   12659       }
   12660       vassert(mm < 32 && ix < 16);
   12661       IRTemp vecN  = newTempV128();
   12662       IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
   12663       IRTemp vecD  = newTempV128();
   12664       assign(vecN, getQReg128(nn));
   12665       assign(vecD, getQReg128(dd));
   12666       IRTemp res = IRTemp_INVALID;
   12667       math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
   12668                     vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
   12669       putQReg128(dd, mkexpr(res));
   12670       const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
   12671       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   12672       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   12673       HChar ch               = size == X01 ? 'h' : 's';
   12674       DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
   12675           isU ? 'u' : 's', nm, is2 ? "2" : "",
   12676           nameQReg128(dd), arrWide,
   12677           nameQReg128(nn), arrNarrow, nameQReg128(dd), ch, ix);
   12678       return True;
   12679    }
   12680 
   12681    if (bitU == 0
   12682        && (opcode == BITS4(1,0,1,1)
   12683            || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
   12684       /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
   12685       /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
   12686       /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
   12687       /* Widens, and size refers to the narrowed lanes. */
   12688       UInt ks = 3;
   12689       switch (opcode) {
   12690          case BITS4(1,0,1,1): ks = 0; break;
   12691          case BITS4(0,0,1,1): ks = 1; break;
   12692          case BITS4(0,1,1,1): ks = 2; break;
   12693          default: vassert(0);
   12694       }
   12695       vassert(ks >= 0 && ks <= 2);
   12696       Bool is2 = bitQ == 1;
   12697       UInt mm  = 32; // invalid
   12698       UInt ix  = 16; // invalid
   12699       switch (size) {
   12700          case X00:
   12701             return False; // h_b_b[] case is not allowed
   12702          case X01:
   12703             mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
   12704          case X10:
   12705             mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
   12706          case X11:
   12707             return False; // q_d_d[] case is not allowed
   12708          default:
   12709             vassert(0);
   12710       }
   12711       vassert(mm < 32 && ix < 16);
   12712       IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
   12713       vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
   12714       newTempsV128_2(&vecN, &vecD);
   12715       assign(vecN, getQReg128(nn));
   12716       IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
   12717       assign(vecD, getQReg128(dd));
   12718       math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
   12719                        is2, size, "mas"[ks],
   12720                        vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
   12721       putQReg128(dd, mkexpr(res));
   12722       vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
   12723       updateQCFLAGwithDifference(sat1q, sat1n);
   12724       if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
   12725          updateQCFLAGwithDifference(sat2q, sat2n);
   12726       }
   12727       const HChar* nm        = ks == 0 ? "sqdmull"
   12728                                        : (ks == 1 ? "sqdmlal" : "sqdmlsl");
   12729       const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
   12730       const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
   12731       HChar ch               = size == X01 ? 'h' : 's';
   12732       DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
   12733           nm, is2 ? "2" : "",
   12734           nameQReg128(dd), arrWide,
   12735           nameQReg128(nn), arrNarrow, nameQReg128(dd), ch, ix);
   12736       return True;
   12737    }
   12738 
   12739    if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
   12740       /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
   12741       /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
   12742       UInt mm  = 32; // invalid
   12743       UInt ix  = 16; // invalid
   12744       switch (size) {
   12745          case X00:
   12746             return False; // b case is not allowed
   12747          case X01:
   12748             mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
   12749          case X10:
   12750             mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
   12751          case X11:
   12752             return False; // q case is not allowed
   12753          default:
   12754             vassert(0);
   12755       }
   12756       vassert(mm < 32 && ix < 16);
   12757       Bool isR = opcode == BITS4(1,1,0,1);
   12758       IRTemp res, sat1q, sat1n, vN, vM;
   12759       res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
   12760       vN = newTempV128();
   12761       assign(vN, getQReg128(nn));
   12762       vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
   12763       math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
   12764       putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
   12765       IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
   12766       updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
   12767       const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
   12768       const HChar* arr = nameArr_Q_SZ(bitQ, size);
   12769       HChar ch         = size == X01 ? 'h' : 's';
   12770       DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
   12771           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(dd), ch, ix);
   12772       return True;
   12773    }
   12774 
   12775    return False;
   12776 #  undef INSN
   12777 }
   12778 
   12779 
   12780 static
   12781 Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
   12782 {
   12783 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12784    return False;
   12785 #  undef INSN
   12786 }
   12787 
   12788 
   12789 static
   12790 Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
   12791 {
   12792 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12793    return False;
   12794 #  undef INSN
   12795 }
   12796 
   12797 
   12798 static
   12799 Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
   12800 {
   12801 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12802    return False;
   12803 #  undef INSN
   12804 }
   12805 
   12806 
   12807 static
   12808 Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
   12809 {
   12810    /* 31  28    23 21 20 15 13   9 4
   12811       000 11110 ty 1  m  op 1000 n opcode2
   12812       The first 3 bits are really "M 0 S", but M and S are always zero.
   12813       Decode fields are: ty,op,opcode2
   12814    */
   12815 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12816    if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
   12817        || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
   12818       return False;
   12819    }
   12820    UInt ty      = INSN(23,22);
   12821    UInt mm      = INSN(20,16);
   12822    UInt op      = INSN(15,14);
   12823    UInt nn      = INSN(9,5);
   12824    UInt opcode2 = INSN(4,0);
   12825    vassert(ty < 4);
   12826 
   12827    if (ty <= X01 && op == X00
   12828        && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
   12829       /* -------- 0x,00,00000 FCMP  d_d,   s_s -------- */
   12830       /* -------- 0x,00,01000 FCMP  d_#0, s_#0 -------- */
   12831       /* -------- 0x,00,10000 FCMPE d_d,   s_s -------- */
   12832       /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
   12833       /* 31        23   20    15      9 4
   12834          000 11110 01 1     m 00 1000 n 10 000  FCMPE Dn, Dm
   12835          000 11110 01 1 00000 00 1000 n 11 000  FCMPE Dn, #0.0
   12836          000 11110 01 1     m 00 1000 n 00 000  FCMP  Dn, Dm
   12837          000 11110 01 1 00000 00 1000 n 01 000  FCMP  Dn, #0.0
   12838 
   12839          000 11110 00 1     m 00 1000 n 10 000  FCMPE Sn, Sm
   12840          000 11110 00 1 00000 00 1000 n 11 000  FCMPE Sn, #0.0
   12841          000 11110 00 1     m 00 1000 n 00 000  FCMP  Sn, Sm
   12842          000 11110 00 1 00000 00 1000 n 01 000  FCMP  Sn, #0.0
   12843 
   12844          FCMPE generates Invalid Operation exn if either arg is any kind
   12845          of NaN.  FCMP generates Invalid Operation exn if either arg is a
   12846          signalling NaN.  We ignore this detail here and produce the same
   12847          IR for both.
   12848       */
   12849       Bool   isD     = (ty & 1) == 1;
   12850       Bool   isCMPE  = (opcode2 & 16) == 16;
   12851       Bool   cmpZero = (opcode2 & 8) == 8;
   12852       IRType ity     = isD ? Ity_F64 : Ity_F32;
   12853       Bool   valid   = True;
   12854       if (cmpZero && mm != 0) valid = False;
   12855       if (valid) {
   12856          IRTemp argL  = newTemp(ity);
   12857          IRTemp argR  = newTemp(ity);
   12858          IRTemp irRes = newTemp(Ity_I32);
   12859          assign(argL, getQRegLO(nn, ity));
   12860          assign(argR,
   12861                 cmpZero
   12862                    ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
   12863                    : getQRegLO(mm, ity));
   12864          assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
   12865                              mkexpr(argL), mkexpr(argR)));
   12866          IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
   12867          IRTemp nzcv_28x0 = newTemp(Ity_I64);
   12868          assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
   12869          setFlags_COPY(nzcv_28x0);
   12870          DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
   12871              cmpZero ? "#0.0" : nameQRegLO(mm, ity));
   12872          return True;
   12873       }
   12874       return False;
   12875    }
   12876 
   12877    return False;
   12878 #  undef INSN
   12879 }
   12880 
   12881 
   12882 static
   12883 Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn)
   12884 {
   12885    /* 31  28    23 21 20 15   11 9 4  3
   12886       000 11110 ty 1  m  cond 01 n op nzcv
   12887       The first 3 bits are really "M 0 S", but M and S are always zero.
   12888       Decode fields are: ty,op
   12889    */
   12890 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12891    if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
   12892        || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
   12893       return False;
   12894    }
   12895    UInt ty   = INSN(23,22);
   12896    UInt mm   = INSN(20,16);
   12897    UInt cond = INSN(15,12);
   12898    UInt nn   = INSN(9,5);
   12899    UInt op   = INSN(4,4);
   12900    UInt nzcv = INSN(3,0);
   12901    vassert(ty < 4 && op <= 1);
   12902 
   12903    if (ty <= BITS2(0,1)) {
   12904       /* -------- 00,0 FCCMP  s_s -------- */
   12905       /* -------- 00,1 FCCMPE s_s -------- */
   12906       /* -------- 01,0 FCCMP  d_d -------- */
   12907       /* -------- 01,1 FCCMPE d_d -------- */
   12908 
   12909       /* FCCMPE generates Invalid Operation exn if either arg is any kind
   12910          of NaN.  FCCMP generates Invalid Operation exn if either arg is a
   12911          signalling NaN.  We ignore this detail here and produce the same
   12912          IR for both.
   12913       */
   12914       Bool   isD    = (ty & 1) == 1;
   12915       Bool   isCMPE = op == 1;
   12916       IRType ity    = isD ? Ity_F64 : Ity_F32;
   12917       IRTemp argL   = newTemp(ity);
   12918       IRTemp argR   = newTemp(ity);
   12919       IRTemp irRes  = newTemp(Ity_I32);
   12920       assign(argL,  getQRegLO(nn, ity));
   12921       assign(argR,  getQRegLO(mm, ity));
   12922       assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
   12923                           mkexpr(argL), mkexpr(argR)));
   12924       IRTemp condT = newTemp(Ity_I1);
   12925       assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
   12926       IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);
   12927 
   12928       IRTemp nzcvT_28x0 = newTemp(Ity_I64);
   12929       assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));
   12930 
   12931       IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);
   12932 
   12933       IRTemp nzcv_28x0 = newTemp(Ity_I64);
   12934       assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
   12935                                    mkexpr(nzcvT_28x0), nzcvF_28x0));
   12936       setFlags_COPY(nzcv_28x0);
   12937       DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
   12938           nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
   12939       return True;
   12940    }
   12941 
   12942    return False;
   12943 #  undef INSN
   12944 }
   12945 
   12946 
   12947 static
   12948 Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
   12949 {
   12950    /* 31        23 21 20 15   11 9 5
   12951       000 11110 ty 1  m  cond 11 n d
   12952       The first 3 bits are really "M 0 S", but M and S are always zero.
   12953       Decode fields: ty
   12954    */
   12955 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12956    if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
   12957        || INSN(11,10) != BITS2(1,1)) {
   12958       return False;
   12959    }
   12960    UInt ty   = INSN(23,22);
   12961    UInt mm   = INSN(20,16);
   12962    UInt cond = INSN(15,12);
   12963    UInt nn   = INSN(9,5);
   12964    UInt dd   = INSN(4,0);
   12965    if (ty <= X01) {
   12966       /* -------- 00: FCSEL s_s -------- */
   12967       /* -------- 00: FCSEL d_d -------- */
   12968       IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
   12969       IRTemp srcT = newTemp(ity);
   12970       IRTemp srcF = newTemp(ity);
   12971       IRTemp res  = newTemp(ity);
   12972       assign(srcT, getQRegLO(nn, ity));
   12973       assign(srcF, getQRegLO(mm, ity));
   12974       assign(res, IRExpr_ITE(
   12975                      unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
   12976                      mkexpr(srcT), mkexpr(srcF)));
   12977       putQReg128(dd, mkV128(0x0000));
   12978       putQRegLO(dd, mkexpr(res));
   12979       DIP("fcsel %s, %s, %s, %s\n",
   12980           nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
   12981           nameCC(cond));
   12982       return True;
   12983    }
   12984    return False;
   12985 #  undef INSN
   12986 }
   12987 
   12988 
   12989 static
   12990 Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
   12991 {
   12992    /* 31  28    23 21 20     14    9 4
   12993       000 11110 ty 1  opcode 10000 n d
   12994       The first 3 bits are really "M 0 S", but M and S are always zero.
   12995       Decode fields: ty,opcode
   12996    */
   12997 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   12998    if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
   12999        || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
   13000       return False;
   13001    }
   13002    UInt ty     = INSN(23,22);
   13003    UInt opcode = INSN(20,15);
   13004    UInt nn     = INSN(9,5);
   13005    UInt dd     = INSN(4,0);
   13006 
   13007    if (ty <= X01 && opcode <= BITS6(0,0,0,0,1,1)) {
   13008       /* -------- 0x,000000: FMOV  d_d, s_s -------- */
   13009       /* -------- 0x,000001: FABS  d_d, s_s -------- */
   13010       /* -------- 0x,000010: FNEG  d_d, s_s -------- */
   13011       /* -------- 0x,000011: FSQRT d_d, s_s -------- */
   13012       IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
   13013       IRTemp src = newTemp(ity);
   13014       IRTemp res = newTemp(ity);
   13015       const HChar* nm = "??";
   13016       assign(src, getQRegLO(nn, ity));
   13017       switch (opcode) {
   13018          case BITS6(0,0,0,0,0,0):
   13019             nm = "fmov"; assign(res, mkexpr(src)); break;
   13020          case BITS6(0,0,0,0,0,1):
   13021             nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
   13022          case BITS6(0,0,0,0,1,0):
   13023             nm = "fabs"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
   13024          case BITS6(0,0,0,0,1,1):
   13025             nm = "fsqrt";
   13026             assign(res, binop(mkSQRTF(ity),
   13027                               mkexpr(mk_get_IR_rounding_mode()),
   13028                               mkexpr(src))); break;
   13029          default:
   13030             vassert(0);
   13031       }
   13032       putQReg128(dd, mkV128(0x0000));
   13033       putQRegLO(dd, mkexpr(res));
   13034       DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
   13035       return True;
   13036    }
   13037 
   13038    if (   (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
   13039                          || opcode == BITS6(0,0,0,1,0,1)))
   13040        || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
   13041                          || opcode == BITS6(0,0,0,1,0,1)))
   13042        || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
   13043                          || opcode == BITS6(0,0,0,1,0,0)))) {
   13044       /* -------- 11,000100: FCVT s_h -------- */
   13045       /* -------- 11,000101: FCVT d_h -------- */
   13046       /* -------- 00,000111: FCVT h_s -------- */
   13047       /* -------- 00,000101: FCVT d_s -------- */
   13048       /* -------- 01,000111: FCVT h_d -------- */
   13049       /* -------- 01,000100: FCVT s_d -------- */
   13050       /* 31        23 21    16 14    9 4
   13051          000 11110 11 10001 00 10000 n d   FCVT Sd, Hn
   13052          --------- 11 ----- 01 ---------   FCVT Dd, Hn
   13053          --------- 00 ----- 11 ---------   FCVT Hd, Sn
   13054          --------- 00 ----- 01 ---------   FCVT Dd, Sn
   13055          --------- 01 ----- 11 ---------   FCVT Hd, Dn
   13056          --------- 01 ----- 00 ---------   FCVT Sd, Dn
   13057          Rounding, when dst is smaller than src, is per the FPCR.
   13058       */
   13059       UInt b2322 = ty;
   13060       UInt b1615 = opcode & BITS2(1,1);
   13061       switch ((b2322 << 2) | b1615) {
   13062          case BITS4(0,0,0,1):   // S -> D
   13063          case BITS4(1,1,0,1): { // H -> D
   13064             Bool   srcIsH = b2322 == BITS2(1,1);
   13065             IRType srcTy  = srcIsH ? Ity_F16 : Ity_F32;
   13066             IRTemp res    = newTemp(Ity_F64);
   13067             assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
   13068                              getQRegLO(nn, srcTy)));
   13069             putQReg128(dd, mkV128(0x0000));
   13070             putQRegLO(dd, mkexpr(res));
   13071             DIP("fcvt %s, %s\n",
   13072                 nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
   13073             return True;
   13074          }
   13075          case BITS4(0,1,0,0):   // D -> S
   13076          case BITS4(0,1,1,1): { // D -> H
   13077             Bool   dstIsH = b1615 == BITS2(1,1);
   13078             IRType dstTy  = dstIsH ? Ity_F16 : Ity_F32;
   13079             IRTemp res    = newTemp(dstTy);
   13080             assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
   13081                               mkexpr(mk_get_IR_rounding_mode()),
   13082                               getQRegLO(nn, Ity_F64)));
   13083             putQReg128(dd, mkV128(0x0000));
   13084             putQRegLO(dd, mkexpr(res));
   13085             DIP("fcvt %s, %s\n",
   13086                 nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
   13087             return True;
   13088          }
   13089          case BITS4(0,0,1,1):   // S -> H
   13090          case BITS4(1,1,0,0): { // H -> S
   13091             Bool   toH   = b1615 == BITS2(1,1);
   13092             IRType srcTy = toH ? Ity_F32 : Ity_F16;
   13093             IRType dstTy = toH ? Ity_F16 : Ity_F32;
   13094             IRTemp res = newTemp(dstTy);
   13095             if (toH) {
   13096                assign(res, binop(Iop_F32toF16,
   13097                                  mkexpr(mk_get_IR_rounding_mode()),
   13098                                  getQRegLO(nn, srcTy)));
   13099 
   13100             } else {
   13101                assign(res, unop(Iop_F16toF32,
   13102                                 getQRegLO(nn, srcTy)));
   13103             }
   13104             putQReg128(dd, mkV128(0x0000));
   13105             putQRegLO(dd, mkexpr(res));
   13106             DIP("fcvt %s, %s\n",
   13107                 nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
   13108             return True;
   13109          }
   13110          default:
   13111             break;
   13112       }
   13113       /* else unhandled */
   13114       return False;
   13115    }
   13116 
   13117    if (ty <= X01
   13118        && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
   13119        && opcode != BITS6(0,0,1,1,0,1)) {
   13120       /* -------- 0x,001000 FRINTN d_d, s_s -------- */
   13121       /* -------- 0x,001001 FRINTP d_d, s_s -------- */
   13122       /* -------- 0x,001010 FRINTM d_d, s_s -------- */
   13123       /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
   13124       /* -------- 0x,001100 FRINTA d_d, s_s -------- */
   13125       /* -------- 0x,001110 FRINTX d_d, s_s -------- */
   13126       /* -------- 0x,001111 FRINTI d_d, s_s -------- */
   13127       /* 31        23 21   17  14    9 4
   13128          000 11110 0x 1001 111 10000 n d  FRINTI Fd, Fm (round per FPCR)
   13129                            rm
   13130          x==0 => S-registers, x==1 => D-registers
   13131          rm (17:15) encodings:
   13132             111 per FPCR  (FRINTI)
   13133             001 +inf      (FRINTP)
   13134             010 -inf      (FRINTM)
   13135             011 zero      (FRINTZ)
   13136             000 tieeven   (FRINTN) -- !! FIXME KLUDGED !!
   13137             100 tieaway   (FRINTA) -- !! FIXME KLUDGED !!
   13138             110 per FPCR + "exact = TRUE" (FRINTX)
   13139             101 unallocated
   13140       */
   13141       Bool    isD   = (ty & 1) == 1;
   13142       UInt    rm    = opcode & BITS6(0,0,0,1,1,1);
   13143       IRType  ity   = isD ? Ity_F64 : Ity_F32;
   13144       IRExpr* irrmE = NULL;
   13145       UChar   ch    = '?';
   13146       switch (rm) {
   13147          case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
   13148          case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
   13149          case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
   13150          // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
   13151          case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
   13152          // I am unsure about the following, due to the "integral exact"
   13153          // description in the manual.  What does it mean? (frintx, that is)
   13154          case BITS3(1,1,0):
   13155             ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
   13156          case BITS3(1,1,1):
   13157             ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
   13158          // The following is a kludge.  There's no Irrm_ value to represent
   13159          // this ("to nearest, with ties to even")
   13160          case BITS3(0,0,0): ch = 'n'; irrmE = mkU32(Irrm_NEAREST); break;
   13161          default: break;
   13162       }
   13163       if (irrmE) {
   13164          IRTemp src = newTemp(ity);
   13165          IRTemp dst = newTemp(ity);
   13166          assign(src, getQRegLO(nn, ity));
   13167          assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   13168                            irrmE, mkexpr(src)));
   13169          putQReg128(dd, mkV128(0x0000));
   13170          putQRegLO(dd, mkexpr(dst));
   13171          DIP("frint%c %s, %s\n",
   13172              ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
   13173          return True;
   13174       }
   13175       return False;
   13176    }
   13177 
   13178    return False;
   13179 #  undef INSN
   13180 }
   13181 
   13182 
   13183 static
   13184 Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn)
   13185 {
   13186    /* 31  28    23 21 20 15     11 9 4
   13187       000 11110 ty 1  m  opcode 10 n d
   13188       The first 3 bits are really "M 0 S", but M and S are always zero.
   13189       Decode fields: ty, opcode
   13190    */
   13191 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   13192    if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
   13193        || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
   13194       return False;
   13195    }
   13196    UInt ty     = INSN(23,22);
   13197    UInt mm     = INSN(20,16);
   13198    UInt opcode = INSN(15,12);
   13199    UInt nn     = INSN(9,5);
   13200    UInt dd     = INSN(4,0);
   13201 
   13202    if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
   13203       /* ------- 0x,0000: FMUL d_d, s_s ------- */
   13204       /* ------- 0x,0001: FDIV d_d, s_s ------- */
   13205       /* ------- 0x,0010: FADD d_d, s_s ------- */
   13206       /* ------- 0x,0011: FSUB d_d, s_s ------- */
   13207       /* ------- 0x,0100: FMAX d_d, s_s ------- */
   13208       /* ------- 0x,0101: FMIN d_d, s_s ------- */
   13209       /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
   13210       /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
   13211       IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
   13212       IROp   iop = Iop_INVALID;
   13213       const HChar* nm = "???";
   13214       switch (opcode) {
   13215          case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
   13216          case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
   13217          case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
   13218          case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
   13219          case BITS4(0,1,0,0): nm = "fmax"; iop = mkVecMAXF(ty+2); break;
   13220          case BITS4(0,1,0,1): nm = "fmin"; iop = mkVecMINF(ty+2); break;
   13221          case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
   13222          case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
   13223          default: vassert(0);
   13224       }
   13225       if (opcode <= BITS4(0,0,1,1)) {
   13226          // This is really not good code.  TODO: avoid width-changing
   13227          IRTemp res = newTemp(ity);
   13228          assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
   13229                                 getQRegLO(nn, ity), getQRegLO(mm, ity)));
   13230          putQReg128(dd, mkV128(0));
   13231          putQRegLO(dd, mkexpr(res));
   13232       } else {
   13233          putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
   13234                              binop(iop, getQReg128(nn), getQReg128(mm))));
   13235       }
   13236       DIP("%s %s, %s, %s\n",
   13237           nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
   13238       return True;
   13239    }
   13240 
   13241    if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
   13242       /* ------- 0x,1000: FNMUL d_d, s_s ------- */
   13243       IRType ity  = ty == X00 ? Ity_F32 : Ity_F64;
   13244       IROp   iop  = mkMULF(ity);
   13245       IROp   iopn = mkNEGF(ity);
   13246       const HChar* nm = "fnmul";
   13247       IRExpr* resE = unop(iopn,
   13248                           triop(iop, mkexpr(mk_get_IR_rounding_mode()),
   13249                                 getQRegLO(nn, ity), getQRegLO(mm, ity)));
   13250       IRTemp  res  = newTemp(ity);
   13251       assign(res, resE);
   13252       putQReg128(dd, mkV128(0));
   13253       putQRegLO(dd, mkexpr(res));
   13254       DIP("%s %s, %s, %s\n",
   13255           nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
   13256       return True;
   13257    }
   13258 
   13259    return False;
   13260 #  undef INSN
   13261 }
   13262 
   13263 
   13264 static
   13265 Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
   13266 {
   13267    /* 31  28    23 21 20 15 14 9 4
   13268       000 11111 ty o1 m  o0 a  n d
   13269       The first 3 bits are really "M 0 S", but M and S are always zero.
   13270       Decode fields: ty,o1,o0
   13271    */
   13272 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   13273    if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
   13274       return False;
   13275    }
   13276    UInt ty    = INSN(23,22);
   13277    UInt bitO1 = INSN(21,21);
   13278    UInt mm    = INSN(20,16);
   13279    UInt bitO0 = INSN(15,15);
   13280    UInt aa    = INSN(14,10);
   13281    UInt nn    = INSN(9,5);
   13282    UInt dd    = INSN(4,0);
   13283    vassert(ty < 4);
   13284 
   13285    if (ty <= X01) {
   13286       /* -------- 0x,0,0 FMADD  d_d_d_d, s_s_s_s -------- */
   13287       /* -------- 0x,0,1 FMSUB  d_d_d_d, s_s_s_s -------- */
   13288       /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
   13289       /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
   13290       /* -------------------- F{N}M{ADD,SUB} -------------------- */
   13291       /* 31          22   20 15 14 9 4   ix
   13292          000 11111 0 sz 0 m  0  a  n d   0   FMADD  Fd,Fn,Fm,Fa
   13293          000 11111 0 sz 0 m  1  a  n d   1   FMSUB  Fd,Fn,Fm,Fa
   13294          000 11111 0 sz 1 m  0  a  n d   2   FNMADD Fd,Fn,Fm,Fa
   13295          000 11111 0 sz 1 m  1  a  n d   3   FNMSUB Fd,Fn,Fm,Fa
   13296          where Fx=Dx when sz=1, Fx=Sx when sz=0
   13297 
   13298                   -----SPEC------    ----IMPL----
   13299          fmadd       a +    n * m    a + n * m
   13300          fmsub       a + (-n) * m    a - n * m
   13301          fnmadd   (-a) + (-n) * m    -(a + n * m)
   13302          fnmsub   (-a) +    n * m    -(a - n * m)
   13303       */
   13304       Bool    isD   = (ty & 1) == 1;
   13305       UInt    ix    = (bitO1 << 1) | bitO0;
   13306       IRType  ity   = isD ? Ity_F64 : Ity_F32;
   13307       IROp    opADD = mkADDF(ity);
   13308       IROp    opSUB = mkSUBF(ity);
   13309       IROp    opMUL = mkMULF(ity);
   13310       IROp    opNEG = mkNEGF(ity);
   13311       IRTemp  res   = newTemp(ity);
   13312       IRExpr* eA    = getQRegLO(aa, ity);
   13313       IRExpr* eN    = getQRegLO(nn, ity);
   13314       IRExpr* eM    = getQRegLO(mm, ity);
   13315       IRExpr* rm    = mkexpr(mk_get_IR_rounding_mode());
   13316       IRExpr* eNxM  = triop(opMUL, rm, eN, eM);
   13317       switch (ix) {
   13318          case 0:  assign(res, triop(opADD, rm, eA, eNxM)); break;
   13319          case 1:  assign(res, triop(opSUB, rm, eA, eNxM)); break;
   13320          case 2:  assign(res, unop(opNEG, triop(opADD, rm, eA, eNxM))); break;
   13321          case 3:  assign(res, unop(opNEG, triop(opSUB, rm, eA, eNxM))); break;
   13322          default: vassert(0);
   13323       }
   13324       putQReg128(dd, mkV128(0x0000));
   13325       putQRegLO(dd, mkexpr(res));
   13326       const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
   13327       DIP("%s %s, %s, %s, %s\n",
   13328           names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
   13329                      nameQRegLO(mm, ity), nameQRegLO(aa, ity));
   13330       return True;
   13331    }
   13332 
   13333    return False;
   13334 #  undef INSN
   13335 }
   13336 
   13337 
   13338 static
   13339 Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
   13340 {
   13341    /* 31  28    23 21 20   12  9    4
   13342       000 11110 ty 1  imm8 100 imm5 d
   13343       The first 3 bits are really "M 0 S", but M and S are always zero.
   13344    */
   13345 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   13346    if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
   13347        || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
   13348       return False;
   13349    }
   13350    UInt ty     = INSN(23,22);
   13351    UInt imm8   = INSN(20,13);
   13352    UInt imm5   = INSN(9,5);
   13353    UInt dd     = INSN(4,0);
   13354 
   13355    /* ------- 00,00000: FMOV s_imm ------- */
   13356    /* ------- 01,00000: FMOV d_imm ------- */
   13357    if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
   13358       Bool  isD  = (ty & 1) == 1;
   13359       ULong imm  = VFPExpandImm(imm8, isD ? 64 : 32);
   13360       if (!isD) {
   13361          vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
   13362       }
   13363       putQReg128(dd, mkV128(0));
   13364       putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
   13365       DIP("fmov %s, #0x%llx\n",
   13366           nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
   13367       return True;
   13368    }
   13369 
   13370    return False;
   13371 #  undef INSN
   13372 }
   13373 
   13374 
   13375 static
   13376 Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
   13377 {
   13378 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   13379    /* 31 30 29 28    23   21 20    18     15    9 4
   13380       sf  0  0 11110 type 0  rmode opcode scale n d
   13381       The first 3 bits are really "sf 0 S", but S is always zero.
   13382       Decode fields: sf,type,rmode,opcode
   13383    */
   13384 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   13385    if (INSN(30,29) != BITS2(0,0)
   13386        || INSN(28,24) != BITS5(1,1,1,1,0)
   13387        || INSN(21,21) != 0) {
   13388       return False;
   13389    }
   13390    UInt bitSF = INSN(31,31);
   13391    UInt ty    = INSN(23,22); // type
   13392    UInt rm    = INSN(20,19); // rmode
   13393    UInt op    = INSN(18,16); // opcode
   13394    UInt sc    = INSN(15,10); // scale
   13395    UInt nn    = INSN(9,5);
   13396    UInt dd    = INSN(4,0);
   13397 
   13398    if (ty <= X01 && rm == X11
   13399        && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
   13400       /* -------- (ix) sf ty rm opc -------- */
   13401       /* -------- 0    0  00 11 000: FCVTZS w_s_#fbits -------- */
   13402       /* -------- 1    0  01 11 000: FCVTZS w_d_#fbits -------- */
   13403       /* -------- 2    1  00 11 000: FCVTZS x_s_#fbits -------- */
   13404       /* -------- 3    1  01 11 000: FCVTZS x_d_#fbits -------- */
   13405 
   13406       /* -------- 4    0  00 11 001: FCVTZU w_s_#fbits -------- */
   13407       /* -------- 5    0  01 11 001: FCVTZU w_d_#fbits -------- */
   13408       /* -------- 6    1  00 11 001: FCVTZU x_s_#fbits -------- */
   13409       /* -------- 7    1  01 11 001: FCVTZU x_d_#fbits -------- */
   13410       Bool isI64 = bitSF == 1;
   13411       Bool isF64 = (ty & 1) == 1;
   13412       Bool isU   = (op & 1) == 1;
   13413       UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
   13414 
   13415       Int fbits = 64 - sc;
   13416       vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
   13417 
   13418       Double  scale  = two_to_the_plus(fbits);
   13419       IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
   13420                              : IRExpr_Const(IRConst_F32( (Float)scale ));
   13421       IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;
   13422 
   13423       const IROp ops[8]
   13424         = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
   13425             Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
   13426       IRTemp irrm = newTemp(Ity_I32);
   13427       assign(irrm, mkU32(Irrm_ZERO));
   13428 
   13429       IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
   13430       IRExpr* res = binop(ops[ix], mkexpr(irrm),
   13431                                    triop(opMUL, mkexpr(irrm), src, scaleE));
   13432       putIRegOrZR(isI64, dd, res);
   13433 
   13434       DIP("fcvtz%c %s, %s, #%d\n",
   13435           isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
   13436           nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
   13437       return True;
   13438    }
   13439 
   13440    /* ------ sf,ty,rm,opc ------ */
   13441    /* ------ x,0x,00,010  SCVTF s/d, w/x, #fbits  ------ */
   13442    /* ------ x,0x,00,011  UCVTF s/d, w/x, #fbits  ------ */
   13443    /* (ix) sf  S 28    ty   rm opc 15    9 4
   13444       0    0 0 0 11110 00 0 00 010 scale n d  SCVTF Sd, Wn, #fbits
   13445       1    0 0 0 11110 01 0 00 010 scale n d  SCVTF Dd, Wn, #fbits
   13446       2    1 0 0 11110 00 0 00 010 scale n d  SCVTF Sd, Xn, #fbits
   13447       3    1 0 0 11110 01 0 00 010 scale n d  SCVTF Dd, Xn, #fbits
   13448 
   13449       4    0 0 0 11110 00 0 00 011 scale n d  UCVTF Sd, Wn, #fbits
   13450       5    0 0 0 11110 01 0 00 011 scale n d  UCVTF Dd, Wn, #fbits
   13451       6    1 0 0 11110 00 0 00 011 scale n d  UCVTF Sd, Xn, #fbits
   13452       7    1 0 0 11110 01 0 00 011 scale n d  UCVTF Dd, Xn, #fbits
   13453 
   13454       These are signed/unsigned conversion from integer registers to
   13455       FP registers, all 4 32/64-bit combinations, rounded per FPCR,
   13456       scaled per |scale|.
   13457    */
   13458    if (ty <= X01 && rm == X00
   13459        && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
   13460        && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
   13461       Bool isI64 = bitSF == 1;
   13462       Bool isF64 = (ty & 1) == 1;
   13463       Bool isU   = (op & 1) == 1;
   13464       UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
   13465 
   13466       Int fbits = 64 - sc;
   13467       vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
   13468 
   13469       Double  scale  = two_to_the_minus(fbits);
   13470       IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
   13471                              : IRExpr_Const(IRConst_F32( (Float)scale ));
   13472       IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;
   13473 
   13474       const IROp ops[8]
   13475         = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
   13476             Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
   13477       IRExpr* src = getIRegOrZR(isI64, nn);
   13478       IRExpr* res = (isF64 && !isI64)
   13479                        ? unop(ops[ix], src)
   13480                        : binop(ops[ix],
   13481                                mkexpr(mk_get_IR_rounding_mode()), src);
   13482       putQReg128(dd, mkV128(0));
   13483       putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));
   13484 
   13485       DIP("%ccvtf %s, %s, #%d\n",
   13486           isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
   13487           nameIRegOrZR(isI64, nn), fbits);
   13488       return True;
   13489    }
   13490 
   13491    return False;
   13492 #  undef INSN
   13493 }
   13494 
   13495 
   13496 static
   13497 Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
   13498 {
   13499    /* 31 30 29 28    23   21 20    18     15     9 4
   13500       sf  0  0 11110 type 1  rmode opcode 000000 n d
   13501       The first 3 bits are really "sf 0 S", but S is always zero.
   13502       Decode fields: sf,type,rmode,opcode
   13503    */
   13504 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   13505    if (INSN(30,29) != BITS2(0,0)
   13506        || INSN(28,24) != BITS5(1,1,1,1,0)
   13507        || INSN(21,21) != 1
   13508        || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
   13509       return False;
   13510    }
   13511    UInt bitSF = INSN(31,31);
   13512    UInt ty    = INSN(23,22); // type
   13513    UInt rm    = INSN(20,19); // rmode
   13514    UInt op    = INSN(18,16); // opcode
   13515    UInt nn    = INSN(9,5);
   13516    UInt dd    = INSN(4,0);
   13517 
   13518    // op = 000, 001
   13519    /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
   13520    /*    30       23   20 18  15     9 4
   13521       sf 00 11110 0x 1 00 000 000000 n d  FCVTNS Rd, Fn (round to
   13522       sf 00 11110 0x 1 00 001 000000 n d  FCVTNU Rd, Fn  nearest)
   13523       ---------------- 01 --------------  FCVTP-------- (round to +inf)
   13524       ---------------- 10 --------------  FCVTM-------- (round to -inf)
   13525       ---------------- 11 --------------  FCVTZ-------- (round to zero)
   13526       ---------------- 00 100 ----------  FCVTAS------- (nearest, ties away)
   13527       ---------------- 00 101 ----------  FCVTAU------- (nearest, ties away)
   13528 
   13529       Rd is Xd when sf==1, Wd when sf==0
   13530       Fn is Dn when x==1, Sn when x==0
   13531       20:19 carry the rounding mode, using the same encoding as FPCR
   13532    */
   13533    if (ty <= X01
   13534        && (   ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
   13535            || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
   13536           )
   13537       ) {
   13538       Bool isI64 = bitSF == 1;
   13539       Bool isF64 = (ty & 1) == 1;
   13540       Bool isU   = (op & 1) == 1;
   13541       /* Decide on the IR rounding mode to use. */
   13542       IRRoundingMode irrm = 8; /*impossible*/
   13543       HChar ch = '?';
   13544       if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
   13545          switch (rm) {
   13546             case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
   13547             case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
   13548             case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
   13549             case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
   13550             default: vassert(0);
   13551          }
   13552       } else {
   13553          vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
   13554          switch (rm) {
   13555             case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
   13556             default: vassert(0);
   13557          }
   13558       }
   13559       vassert(irrm != 8);
   13560       /* Decide on the conversion primop, based on the source size,
   13561          dest size and signedness (8 possibilities).  Case coding:
   13562             F32 ->s I32   0
   13563             F32 ->u I32   1
   13564             F32 ->s I64   2
   13565             F32 ->u I64   3
   13566             F64 ->s I32   4
   13567             F64 ->u I32   5
   13568             F64 ->s I64   6
   13569             F64 ->u I64   7
   13570       */
   13571       UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);
   13572       vassert(ix < 8);
   13573       const IROp iops[8]
   13574          = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
   13575              Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
   13576       IROp iop = iops[ix];
   13577       // A bit of ATCery: bounce all cases we haven't seen an example of.
   13578       if (/* F32toI32S */
   13579              (iop == Iop_F32toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Sn */
   13580           || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
   13581           || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
   13582           || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */
   13583           /* F32toI32U */
   13584           || (iop == Iop_F32toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Sn */
   13585           || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
   13586           || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
   13587           || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */
   13588           /* F32toI64S */
   13589           || (iop == Iop_F32toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Sn */
   13590           || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
   13591           || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
   13592           || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,S */
   13593           /* F32toI64U */
   13594           || (iop == Iop_F32toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Sn */
   13595           || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
   13596           || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
   13597           || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,S */
   13598           /* F64toI32S */
   13599           || (iop == Iop_F64toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Dn */
   13600           || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
   13601           || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
   13602           || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */
   13603           /* F64toI32U */
   13604           || (iop == Iop_F64toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Dn */
   13605           || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
   13606           || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
   13607           || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,D */
   13608           /* F64toI64S */
   13609           || (iop == Iop_F64toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Dn */
   13610           || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
   13611           || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
   13612           || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */
   13613           /* F64toI64U */
   13614           || (iop == Iop_F64toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Dn */
   13615           || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
   13616           || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
   13617           || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,D */
   13618          ) {
   13619         /* validated */
   13620       } else {
   13621         return False;
   13622       }
   13623       IRType srcTy  = isF64 ? Ity_F64 : Ity_F32;
   13624       IRType dstTy  = isI64 ? Ity_I64 : Ity_I32;
   13625       IRTemp src    = newTemp(srcTy);
   13626       IRTemp dst    = newTemp(dstTy);
   13627       assign(src, getQRegLO(nn, srcTy));
   13628       assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
   13629       putIRegOrZR(isI64, dd, mkexpr(dst));
   13630       DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
   13631           nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
   13632       return True;
   13633    }
   13634 
   13635    // op = 010, 011
   13636    /* -------------- {S,U}CVTF (scalar, integer) -------------- */
   13637    /* (ix) sf  S 28    ty   rm op  15     9 4
   13638       0    0 0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Wn
   13639       1    0 0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Wn
   13640       2    1 0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Xn
   13641       3    1 0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Xn
   13642 
   13643       4    0 0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Wn
   13644       5    0 0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Wn
   13645       6    1 0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Xn
   13646       7    1 0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Xn
   13647 
   13648       These are signed/unsigned conversion from integer registers to
   13649       FP registers, all 4 32/64-bit combinations, rounded per FPCR.
   13650    */
   13651    if (ty <= X01 && rm == X00 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
   13652       Bool isI64 = bitSF == 1;
   13653       Bool isF64 = (ty & 1) == 1;
   13654       Bool isU   = (op & 1) == 1;
   13655       UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
   13656       const IROp ops[8]
   13657         = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
   13658             Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
   13659       IRExpr* src = getIRegOrZR(isI64, nn);
   13660       IRExpr* res = (isF64 && !isI64)
   13661                        ? unop(ops[ix], src)
   13662                        : binop(ops[ix],
   13663                                mkexpr(mk_get_IR_rounding_mode()), src);
   13664       putQReg128(dd, mkV128(0));
   13665       putQRegLO(dd, res);
   13666       DIP("%ccvtf %s, %s\n",
   13667           isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
   13668           nameIRegOrZR(isI64, nn));
   13669       return True;
   13670    }
   13671 
   13672    // op = 110, 111
   13673    /* -------- FMOV (general) -------- */
   13674    /* case sf  S       ty   rm op  15     9 4
   13675        (1) 0 0 0 11110 00 1 00 111 000000 n d     FMOV Sd,      Wn
   13676        (2) 1 0 0 11110 01 1 00 111 000000 n d     FMOV Dd,      Xn
   13677        (3) 1 0 0 11110 10 1 01 111 000000 n d     FMOV Vd.D[1], Xn
   13678 
   13679        (4) 0 0 0 11110 00 1 00 110 000000 n d     FMOV Wd, Sn
   13680        (5) 1 0 0 11110 01 1 00 110 000000 n d     FMOV Xd, Dn
   13681        (6) 1 0 0 11110 10 1 01 110 000000 n d     FMOV Xd, Vn.D[1]
   13682    */
   13683    if (1) {
   13684       UInt ix = 0; // case
   13685       if (bitSF == 0) {
   13686          if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
   13687             ix = 1;
   13688          else
   13689          if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
   13690             ix = 4;
   13691       } else {
   13692          vassert(bitSF == 1);
   13693          if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
   13694             ix = 2;
   13695          else
   13696          if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
   13697             ix = 5;
   13698          else
   13699          if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
   13700             ix = 3;
   13701          else
   13702          if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
   13703             ix = 6;
   13704       }
   13705       if (ix > 0) {
   13706          switch (ix) {
   13707             case 1:
   13708                putQReg128(dd, mkV128(0));
   13709                putQRegLO(dd, getIReg32orZR(nn));
   13710                DIP("fmov s%u, w%u\n", dd, nn);
   13711                break;
   13712             case 2:
   13713                putQReg128(dd, mkV128(0));
   13714                putQRegLO(dd, getIReg64orZR(nn));
   13715                DIP("fmov d%u, x%u\n", dd, nn);
   13716                break;
   13717             case 3:
   13718                putQRegHI64(dd, getIReg64orZR(nn));
   13719                DIP("fmov v%u.d[1], x%u\n", dd, nn);
   13720                break;
   13721             case 4:
   13722                putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
   13723                DIP("fmov w%u, s%u\n", dd, nn);
   13724                break;
   13725             case 5:
   13726                putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
   13727                DIP("fmov x%u, d%u\n", dd, nn);
   13728                break;
   13729             case 6:
   13730                putIReg64orZR(dd, getQRegHI64(nn));
   13731                DIP("fmov x%u, v%u.d[1]\n", dd, nn);
   13732                break;
   13733             default:
   13734                vassert(0);
   13735          }
   13736          return True;
   13737       }
   13738       /* undecodable; fall through */
   13739    }
   13740 
   13741    return False;
   13742 #  undef INSN
   13743 }
   13744 
   13745 
   13746 static
   13747 Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
   13748 {
   13749    Bool ok;
   13750    ok = dis_AdvSIMD_EXT(dres, insn);
   13751    if (UNLIKELY(ok)) return True;
   13752    ok = dis_AdvSIMD_TBL_TBX(dres, insn);
   13753    if (UNLIKELY(ok)) return True;
   13754    ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
   13755    if (UNLIKELY(ok)) return True;
   13756    ok = dis_AdvSIMD_across_lanes(dres, insn);
   13757    if (UNLIKELY(ok)) return True;
   13758    ok = dis_AdvSIMD_copy(dres, insn);
   13759    if (UNLIKELY(ok)) return True;
   13760    ok = dis_AdvSIMD_modified_immediate(dres, insn);
   13761    if (UNLIKELY(ok)) return True;
   13762    ok = dis_AdvSIMD_scalar_copy(dres, insn);
   13763    if (UNLIKELY(ok)) return True;
   13764    ok = dis_AdvSIMD_scalar_pairwise(dres, insn);
   13765    if (UNLIKELY(ok)) return True;
   13766    ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
   13767    if (UNLIKELY(ok)) return True;
   13768    ok = dis_AdvSIMD_scalar_three_different(dres, insn);
   13769    if (UNLIKELY(ok)) return True;
   13770    ok = dis_AdvSIMD_scalar_three_same(dres, insn);
   13771    if (UNLIKELY(ok)) return True;
   13772    ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
   13773    if (UNLIKELY(ok)) return True;
   13774    ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
   13775    if (UNLIKELY(ok)) return True;
   13776    ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
   13777    if (UNLIKELY(ok)) return True;
   13778    ok = dis_AdvSIMD_three_different(dres, insn);
   13779    if (UNLIKELY(ok)) return True;
   13780    ok = dis_AdvSIMD_three_same(dres, insn);
   13781    if (UNLIKELY(ok)) return True;
   13782    ok = dis_AdvSIMD_two_reg_misc(dres, insn);
   13783    if (UNLIKELY(ok)) return True;
   13784    ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
   13785    if (UNLIKELY(ok)) return True;
   13786    ok = dis_AdvSIMD_crypto_aes(dres, insn);
   13787    if (UNLIKELY(ok)) return True;
   13788    ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
   13789    if (UNLIKELY(ok)) return True;
   13790    ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
   13791    if (UNLIKELY(ok)) return True;
   13792    ok = dis_AdvSIMD_fp_compare(dres, insn);
   13793    if (UNLIKELY(ok)) return True;
   13794    ok = dis_AdvSIMD_fp_conditional_compare(dres, insn);
   13795    if (UNLIKELY(ok)) return True;
   13796    ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
   13797    if (UNLIKELY(ok)) return True;
   13798    ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
   13799    if (UNLIKELY(ok)) return True;
   13800    ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn);
   13801    if (UNLIKELY(ok)) return True;
   13802    ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
   13803    if (UNLIKELY(ok)) return True;
   13804    ok = dis_AdvSIMD_fp_immediate(dres, insn);
   13805    if (UNLIKELY(ok)) return True;
   13806    ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
   13807    if (UNLIKELY(ok)) return True;
   13808    ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
   13809    if (UNLIKELY(ok)) return True;
   13810    return False;
   13811 }
   13812 
   13813 
   13814 /*------------------------------------------------------------*/
   13815 /*--- Disassemble a single ARM64 instruction               ---*/
   13816 /*------------------------------------------------------------*/
   13817 
   13818 /* Disassemble a single ARM64 instruction into IR.  The instruction
   13819    is located at |guest_instr| and has guest IP of
   13820    |guest_PC_curr_instr|, which will have been set before the call
   13821    here.  Returns True iff the instruction was decoded, in which case
   13822    *dres will be set accordingly, or False, in which case *dres should
   13823    be ignored by the caller. */
   13824 
   13825 static
   13826 Bool disInstr_ARM64_WRK (
   13827         /*MB_OUT*/DisResult* dres,
   13828         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   13829         Bool         resteerCisOk,
   13830         void*        callback_opaque,
   13831         const UChar* guest_instr,
   13832         const VexArchInfo* archinfo,
   13833         const VexAbiInfo*  abiinfo
   13834      )
   13835 {
   13836    // A macro to fish bits out of 'insn'.
   13837 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   13838 
   13839 //ZZ    DisResult dres;
   13840 //ZZ    UInt      insn;
   13841 //ZZ    //Bool      allow_VFP = False;
   13842 //ZZ    //UInt      hwcaps = archinfo->hwcaps;
   13843 //ZZ    IRTemp    condT; /* :: Ity_I32 */
   13844 //ZZ    UInt      summary;
   13845 //ZZ    HChar     dis_buf[128];  // big enough to hold LDMIA etc text
   13846 //ZZ
   13847 //ZZ    /* What insn variants are we supporting today? */
   13848 //ZZ    //allow_VFP  = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
   13849 //ZZ    // etc etc
   13850 
   13851    /* Set result defaults. */
   13852    dres->whatNext    = Dis_Continue;
   13853    dres->len         = 4;
   13854    dres->continueAt  = 0;
   13855    dres->jk_StopHere = Ijk_INVALID;
   13856 
   13857    /* At least this is simple on ARM64: insns are all 4 bytes long, and
   13858       4-aligned.  So just fish the whole thing out of memory right now
   13859       and have done. */
   13860    UInt insn = getUIntLittleEndianly( guest_instr );
   13861 
   13862    if (0) vex_printf("insn: 0x%x\n", insn);
   13863 
   13864    DIP("\t(arm64) 0x%llx:  ", (ULong)guest_PC_curr_instr);
   13865 
   13866    vassert(0 == (guest_PC_curr_instr & 3ULL));
   13867 
   13868    /* ----------------------------------------------------------- */
   13869 
   13870    /* Spot "Special" instructions (see comment at top of file). */
   13871    {
   13872       const UChar* code = guest_instr;
   13873       /* Spot the 16-byte preamble:
   13874             93CC0D8C   ror x12, x12, #3
   13875             93CC358C   ror x12, x12, #13
   13876             93CCCD8C   ror x12, x12, #51
   13877             93CCF58C   ror x12, x12, #61
   13878       */
   13879       UInt word1 = 0x93CC0D8C;
   13880       UInt word2 = 0x93CC358C;
   13881       UInt word3 = 0x93CCCD8C;
   13882       UInt word4 = 0x93CCF58C;
   13883       if (getUIntLittleEndianly(code+ 0) == word1 &&
   13884           getUIntLittleEndianly(code+ 4) == word2 &&
   13885           getUIntLittleEndianly(code+ 8) == word3 &&
   13886           getUIntLittleEndianly(code+12) == word4) {
   13887          /* Got a "Special" instruction preamble.  Which one is it? */
   13888          if (getUIntLittleEndianly(code+16) == 0xAA0A014A
   13889                                                /* orr x10,x10,x10 */) {
   13890             /* X3 = client_request ( X4 ) */
   13891             DIP("x3 = client_request ( x4 )\n");
   13892             putPC(mkU64( guest_PC_curr_instr + 20 ));
   13893             dres->jk_StopHere = Ijk_ClientReq;
   13894             dres->whatNext    = Dis_StopHere;
   13895             return True;
   13896          }
   13897          else
   13898          if (getUIntLittleEndianly(code+16) == 0xAA0B016B
   13899                                                /* orr x11,x11,x11 */) {
   13900             /* X3 = guest_NRADDR */
   13901             DIP("x3 = guest_NRADDR\n");
   13902             dres->len = 20;
   13903             putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
   13904             return True;
   13905          }
   13906          else
   13907          if (getUIntLittleEndianly(code+16) == 0xAA0C018C
   13908                                                /* orr x12,x12,x12 */) {
   13909             /*  branch-and-link-to-noredir X8 */
   13910             DIP("branch-and-link-to-noredir x8\n");
   13911             putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20));
   13912             putPC(getIReg64orZR(8));
   13913             dres->jk_StopHere = Ijk_NoRedir;
   13914             dres->whatNext    = Dis_StopHere;
   13915             return True;
   13916          }
   13917          else
   13918          if (getUIntLittleEndianly(code+16) == 0xAA090129
   13919                                                /* orr x9,x9,x9 */) {
   13920             /* IR injection */
   13921             DIP("IR injection\n");
   13922             vex_inject_ir(irsb, Iend_LE);
   13923             // Invalidate the current insn. The reason is that the IRop we're
   13924             // injecting here can change. In which case the translation has to
   13925             // be redone. For ease of handling, we simply invalidate all the
   13926             // time.
   13927             stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr)));
   13928             stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(20)));
   13929             putPC(mkU64( guest_PC_curr_instr + 20 ));
   13930             dres->whatNext    = Dis_StopHere;
   13931             dres->jk_StopHere = Ijk_InvalICache;
   13932             return True;
   13933          }
   13934          /* We don't know what it is. */
   13935          return False;
   13936          /*NOTREACHED*/
   13937       }
   13938    }
   13939 
   13940    /* ----------------------------------------------------------- */
   13941 
   13942    /* Main ARM64 instruction decoder starts here. */
   13943 
   13944    Bool ok = False;
   13945 
   13946    /* insn[28:25] determines the top-level grouping, so let's start
   13947       off with that.
   13948 
   13949       For all of these dis_ARM64_ functions, we pass *dres with the
   13950       normal default results "insn OK, 4 bytes long, keep decoding" so
   13951       they don't need to change it.  However, decodes of control-flow
   13952       insns may cause *dres to change.
   13953    */
   13954    switch (INSN(28,25)) {
   13955       case BITS4(1,0,0,0): case BITS4(1,0,0,1):
   13956          // Data processing - immediate
   13957          ok = dis_ARM64_data_processing_immediate(dres, insn);
   13958          break;
   13959       case BITS4(1,0,1,0): case BITS4(1,0,1,1):
   13960          // Branch, exception generation and system instructions
   13961          ok = dis_ARM64_branch_etc(dres, insn, archinfo);
   13962          break;
   13963       case BITS4(0,1,0,0): case BITS4(0,1,1,0):
   13964       case BITS4(1,1,0,0): case BITS4(1,1,1,0):
   13965          // Loads and stores
   13966          ok = dis_ARM64_load_store(dres, insn);
   13967          break;
   13968       case BITS4(0,1,0,1): case BITS4(1,1,0,1):
   13969          // Data processing - register
   13970          ok = dis_ARM64_data_processing_register(dres, insn);
   13971          break;
   13972       case BITS4(0,1,1,1): case BITS4(1,1,1,1):
   13973          // Data processing - SIMD and floating point
   13974          ok = dis_ARM64_simd_and_fp(dres, insn);
   13975          break;
   13976       case BITS4(0,0,0,0): case BITS4(0,0,0,1):
   13977       case BITS4(0,0,1,0): case BITS4(0,0,1,1):
   13978          // UNALLOCATED
   13979          break;
   13980       default:
   13981          vassert(0); /* Can't happen */
   13982    }
   13983 
   13984    /* If the next-level down decoders failed, make sure |dres| didn't
   13985       get changed. */
   13986    if (!ok) {
   13987       vassert(dres->whatNext    == Dis_Continue);
   13988       vassert(dres->len         == 4);
   13989       vassert(dres->continueAt  == 0);
   13990       vassert(dres->jk_StopHere == Ijk_INVALID);
   13991    }
   13992 
   13993    return ok;
   13994 
   13995 #  undef INSN
   13996 }
   13997 
   13998 
   13999 /*------------------------------------------------------------*/
   14000 /*--- Top-level fn                                         ---*/
   14001 /*------------------------------------------------------------*/
   14002 
   14003 /* Disassemble a single instruction into IR.  The instruction
   14004    is located in host memory at &guest_code[delta]. */
   14005 
   14006 DisResult disInstr_ARM64 ( IRSB*        irsb_IN,
   14007                            Bool         (*resteerOkFn) ( void*, Addr ),
   14008                            Bool         resteerCisOk,
   14009                            void*        callback_opaque,
   14010                            const UChar* guest_code_IN,
   14011                            Long         delta_IN,
   14012                            Addr         guest_IP,
   14013                            VexArch      guest_arch,
   14014                            const VexArchInfo* archinfo,
   14015                            const VexAbiInfo*  abiinfo,
   14016                            VexEndness   host_endness_IN,
   14017                            Bool         sigill_diag_IN )
   14018 {
   14019    DisResult dres;
   14020    vex_bzero(&dres, sizeof(dres));
   14021 
   14022    /* Set globals (see top of this file) */
   14023    vassert(guest_arch == VexArchARM64);
   14024 
   14025    irsb                = irsb_IN;
   14026    host_endness        = host_endness_IN;
   14027    guest_PC_curr_instr = (Addr64)guest_IP;
   14028 
   14029    /* Sanity checks */
   14030    /* (x::UInt - 2) <= 15   ===   x >= 2 && x <= 17 (I hope) */
   14031    vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
   14032    vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);
   14033 
   14034    /* Try to decode */
   14035    Bool ok = disInstr_ARM64_WRK( &dres,
   14036                                  resteerOkFn, resteerCisOk, callback_opaque,
   14037                                  &guest_code_IN[delta_IN],
   14038                                  archinfo, abiinfo );
   14039    if (ok) {
   14040       /* All decode successes end up here. */
   14041       vassert(dres.len == 4 || dres.len == 20);
   14042       switch (dres.whatNext) {
   14043          case Dis_Continue:
   14044             putPC( mkU64(dres.len + guest_PC_curr_instr) );
   14045             break;
   14046          case Dis_ResteerU:
   14047          case Dis_ResteerC:
   14048             putPC(mkU64(dres.continueAt));
   14049             break;
   14050          case Dis_StopHere:
   14051             break;
   14052          default:
   14053             vassert(0);
   14054       }
   14055       DIP("\n");
   14056    } else {
   14057       /* All decode failures end up here. */
   14058       if (sigill_diag_IN) {
   14059          Int   i, j;
   14060          UChar buf[64];
   14061          UInt  insn
   14062                   = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
   14063          vex_bzero(buf, sizeof(buf));
   14064          for (i = j = 0; i < 32; i++) {
   14065             if (i > 0) {
   14066               if ((i & 7) == 0) buf[j++] = ' ';
   14067               else if ((i & 3) == 0) buf[j++] = '\'';
   14068             }
   14069             buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
   14070          }
   14071          vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
   14072          vex_printf("disInstr(arm64): %s\n", buf);
   14073       }
   14074 
   14075       /* Tell the dispatcher that this insn cannot be decoded, and so
   14076          has not been executed, and (is currently) the next to be
   14077          executed.  PC should be up-to-date since it is made so at the
   14078          start of each insn, but nevertheless be paranoid and update
   14079          it again right now. */
   14080       putPC( mkU64(guest_PC_curr_instr) );
   14081       dres.len         = 0;
   14082       dres.whatNext    = Dis_StopHere;
   14083       dres.jk_StopHere = Ijk_NoDecode;
   14084       dres.continueAt  = 0;
   14085    }
   14086    return dres;
   14087 }
   14088 
   14089 
   14090 /*--------------------------------------------------------------------*/
   14091 /*--- end                                       guest_arm64_toIR.c ---*/
   14092 /*--------------------------------------------------------------------*/
   14093