Home | History | Annotate | Download | only in generators
      1 # Copyright 2016 The Gemmlowp Authors. All rights reserved.
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #    http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing, software
     10 # distributed under the License is distributed on an "AS IS" BASIS,
     11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
     14 """64bit ARM/NEON assembly emitter.
     15 
     16 Used by code generators to produce ARM assembly with NEON simd code.
     17 Provides tools for easier register management: named register variable
     18 allocation/deallocation, and offers a more procedural/structured approach
     19 to generating assembly.
     20 
     21 """
     22 
# Maps each accepted narrow-type spelling to the spelling twice as wide,
# preserving signedness.  Types may be given as ints (8), digit strings
# ('8') or prefixed strings ('i8'/'u8'/'s8'); the widened result keeps the
# same spelling style as the key.
_WIDE_TYPES = {
    8: 16,
    16: 32,
    32: 64,
    '8': '16',
    '16': '32',
    '32': '64',
    'i8': 'i16',
    'i16': 'i32',
    'i32': 'i64',
    'u8': 'u16',
    'u16': 'u32',
    'u32': 'u64',
    's8': 's16',
    's16': 's32',
    's32': 's64'
}
     40 
# Inverse of _WIDE_TYPES: maps each wide-type spelling to the spelling half
# as wide, preserving signedness and spelling style.
_NARROW_TYPES = {
    64: 32,
    32: 16,
    16: 8,
    '64': '32',
    '32': '16',
    '16': '8',
    'i64': 'i32',
    'i32': 'i16',
    'i16': 'i8',
    'u64': 'u32',
    'u32': 'u16',
    'u16': 'u8',
    's64': 's32',
    's32': 's16',
    's16': 's8'
}
     58 
# Maps every accepted type spelling -- ints, digit strings, signed/unsigned/
# float prefixed strings, and the single-letter lane subtypes (b/h/s/d) --
# to its width in bits.
_TYPE_BITS = {
    8: 8,
    16: 16,
    32: 32,
    64: 64,
    '8': 8,
    '16': 16,
    '32': 32,
    '64': 64,
    'i8': 8,
    'i16': 16,
    'i32': 32,
    'i64': 64,
    'u8': 8,
    'u16': 16,
    'u32': 32,
    'u64': 64,
    's8': 8,
    's16': 16,
    's32': 32,
    's64': 64,
    'f32': 32,
    'f64': 64,
    'b': 8,
    'h': 16,
    's': 32,
    'd': 64
}
     87 
     88 
class Error(Exception):
  """Base error for this module; all module exceptions derive from it."""
     91 
     92 
class RegisterAllocationError(Error):
  """Cannot allocate registers."""
     95 
     96 
class LaneError(Error):
  """Lane index is outside the register's lane count."""
     99 
    100 
class RegisterSubtypeError(Error):
  """The register needs to be lane-typed."""
    103 
    104 
class ArgumentError(Error):
  """Wrong argument."""
    107 
    108 
    109 def _AppendType(type_name, register):
    110   """Calculates sizes and attaches the type information to the register."""
    111   if register.register_type is not 'v':
    112     raise ArgumentError('Only vector registers can have type appended.')
    113 
    114   if type_name in set([8, '8', 'i8', 's8', 'u8']):
    115     subtype = 'b'
    116     subtype_bits = 8
    117   elif type_name in set([16, '16', 'i16', 's16', 'u16']):
    118     subtype = 'h'
    119     subtype_bits = 16
    120   elif type_name in set([32, '32', 'i32', 's32', 'u32', 'f32']):
    121     subtype = 's'
    122     subtype_bits = 32
    123   elif type_name in set([64, '64', 'i64', 's64', 'u64', 'f64']):
    124     subtype = 'd'
    125     subtype_bits = 64
    126   else:
    127     raise ArgumentError('Unknown type: %s' % type_name)
    128 
    129   new_register = register.Copy()
    130   new_register.register_subtype = subtype
    131   new_register.register_subtype_count = register.register_bits / subtype_bits
    132   return new_register
    133 
    134 
    135 def _UnsignedType(type_name):
    136   return type_name in set(['u8', 'u16', 'u32', 'u64'])
    137 
    138 
    139 def _FloatType(type_name):
    140   return type_name in set(['f32', 'f64'])
    141 
    142 
    143 def _WideType(type_name):
    144   if type_name in _WIDE_TYPES.keys():
    145     return _WIDE_TYPES[type_name]
    146   else:
    147     raise ArgumentError('No wide type for: %s' % type_name)
    148 
    149 
    150 def _NarrowType(type_name):
    151   if type_name in _NARROW_TYPES.keys():
    152     return _NARROW_TYPES[type_name]
    153   else:
    154     raise ArgumentError('No narrow type for: %s' % type_name)
    155 
    156 
    157 def _LoadStoreSize(register):
    158   if register.lane is None:
    159     return register.register_bits
    160   else:
    161     return register.lane_bits
    162 
    163 
    164 def _MakeCompatibleDown(reg_1, reg_2, reg_3):
    165   bits = min([reg_1.register_bits, reg_2.register_bits, reg_3.register_bits])
    166   return (_Cast(bits, reg_1), _Cast(bits, reg_2), _Cast(bits, reg_3))
    167 
    168 
    169 def _MakeCompatibleUp(reg_1, reg_2, reg_3):
    170   bits = max([reg_1.register_bits, reg_2.register_bits, reg_3.register_bits])
    171   return (_Cast(bits, reg_1), _Cast(bits, reg_2), _Cast(bits, reg_3))
    172 
    173 
    174 def _Cast(bits, reg):
    175   if reg.register_bits is bits:
    176     return reg
    177   else:
    178     new_reg = reg.Copy()
    179     new_reg.register_bits = bits
    180     return new_reg
    181 
    182 
    183 def _TypeBits(type_name):
    184   if type_name in _TYPE_BITS.keys():
    185     return _TYPE_BITS[type_name]
    186   else:
    187     raise ArgumentError('Unknown type: %s' % type_name)
    188 
    189 
    190 def _RegisterList(list_type, registers):
    191   lanes = list(set([register.lane for register in registers]))
    192   if len(lanes) > 1:
    193     raise ArgumentError('Cannot mix lanes on a register list.')
    194   typed_registers = [_AppendType(list_type, register) for register in registers]
    195 
    196   if lanes[0] is None:
    197     return '{%s}' % ', '.join(map(str, typed_registers))
    198   elif lanes[0] is -1:
    199     raise ArgumentError('Cannot construct a list with all lane indexing.')
    200   else:
    201     typed_registers_nolane = [register.Copy() for register in typed_registers]
    202     for register in typed_registers_nolane:
    203       register.lane = None
    204       register.register_subtype_count = None
    205     return '{%s}[%d]' % (', '.join(map(str, typed_registers_nolane)), lanes[0])
    206 
    207 
    208 class _GeneralRegister(object):
    209   """Arm v8 general register: (x|w)n."""
    210 
    211   def __init__(self,
    212                register_bits,
    213                number,
    214                dereference=False,
    215                dereference_increment=False):
    216     self.register_type = 'r'
    217     self.register_bits = register_bits
    218     self.number = number
    219     self.dereference = dereference
    220     self.dereference_increment = dereference_increment
    221 
    222   def Copy(self):
    223     return _GeneralRegister(self.register_bits, self.number, self.dereference,
    224                             self.dereference_increment)
    225 
    226   def __repr__(self):
    227     if self.register_bits is 64:
    228       text = 'x%d' % self.number
    229     elif self.register_bits <= 32:
    230       text = 'w%d' % self.number
    231     else:
    232       raise RegisterSubtypeError('Wrong bits (%d) for general register: %d' %
    233                                  (self.register_bits, self.number))
    234     if self.dereference:
    235       return '[%s]' % text
    236     else:
    237       return text
    238 
    239 
    240 class _MappedParameter(object):
    241   """Object representing a C variable mapped to a register."""
    242 
    243   def __init__(self,
    244                name,
    245                register_bits=64,
    246                dereference=False,
    247                dereference_increment=False):
    248     self.name = name
    249     self.register_bits = register_bits
    250     self.dereference = dereference
    251     self.dereference_increment = dereference_increment
    252 
    253   def Copy(self):
    254     return _MappedParameter(self.name, self.register_bits, self.dereference,
    255                             self.dereference_increment)
    256 
    257   def __repr__(self):
    258     if self.register_bits is None:
    259       text = '%%[%s]' % self.name
    260     elif self.register_bits is 64:
    261       text = '%%x[%s]' % self.name
    262     elif self.register_bits <= 32:
    263       text = '%%w[%s]' % self.name
    264     else:
    265       raise RegisterSubtypeError('Wrong bits (%d) for mapped parameter: %s' %
    266                                  (self.register_bits, self.name))
    267     if self.dereference:
    268       return '[%s]' % text
    269     else:
    270       return text
    271 
    272 
    273 class _VectorRegister(object):
    274   """Arm v8 vector register Vn.TT."""
    275 
    276   def __init__(self,
    277                register_bits,
    278                number,
    279                register_subtype=None,
    280                register_subtype_count=None,
    281                lane=None,
    282                lane_bits=None):
    283     self.register_type = 'v'
    284     self.register_bits = register_bits
    285     self.number = number
    286     self.register_subtype = register_subtype
    287     self.register_subtype_count = register_subtype_count
    288     self.lane = lane
    289     self.lane_bits = lane_bits
    290 
    291   def Copy(self):
    292     return _VectorRegister(self.register_bits, self.number,
    293                            self.register_subtype, self.register_subtype_count,
    294                            self.lane, self.lane_bits)
    295 
    296   def __repr__(self):
    297     if self.register_subtype is None:
    298       raise RegisterSubtypeError('Register: %s%d has no lane types defined.' %
    299                                  (self.register_type, self.number))
    300     if (self.register_subtype_count is None or (self.lane is not None and
    301                                                 self.lane is not -1)):
    302       typed_name = '%s%d.%s' % (self.register_type, self.number,
    303                                 self.register_subtype)
    304     else:
    305       typed_name = '%s%d.%d%s' % (self.register_type, self.number,
    306                                   self.register_subtype_count,
    307                                   self.register_subtype)
    308 
    309     if self.lane is None or self.lane is -1:
    310       return typed_name
    311     elif self.lane >= 0 and self.lane < self.register_subtype_count:
    312       return '%s[%d]' % (typed_name, self.lane)
    313     else:
    314       raise LaneError('Wrong lane: %d for: %s' % (self.lane, typed_name))
    315 
    316 
    317 class _ImmediateConstant(object):
    318 
    319   def __init__(self, value):
    320     self.register_type = 'i'
    321     self.value = value
    322 
    323   def Copy(self):
    324     return _ImmediateConstant(self.value)
    325 
    326   def __repr__(self):
    327     return '#%d' % self.value
    328 
    329 
    330 class _NeonRegisters64Bit(object):
    331   """Utility that keeps track of used 32bit ARM/NEON registers."""
    332 
    333   def __init__(self):
    334     self.vector = set()
    335     self.vector_ever = set()
    336     self.general = set()
    337     self.general_ever = set()
    338     self.parameters = dict()
    339     self.output_parameters = dict()
    340 
    341   def MapParameter(self, parameter, parameter_value=None):
    342     if not parameter_value:
    343       parameter_value = parameter
    344     self.parameters[parameter] = (parameter_value, 'r')
    345     return _MappedParameter(parameter)
    346 
    347   def MapMemoryParameter(self, parameter, parameter_value=None):
    348     if not parameter_value:
    349       parameter_value = parameter
    350     self.parameters[parameter] = (parameter_value, 'm')
    351     return _MappedParameter(parameter)
    352 
    353   def MapOutputParameter(self, parameter, parameter_value=None):
    354     if not parameter_value:
    355       parameter_value = parameter
    356     self.output_parameters[parameter] = (parameter_value, '+r')
    357     return _MappedParameter(parameter)
    358 
    359   def _VectorRegisterNum(self, min_val=0):
    360     for i in range(min_val, 32):
    361       if i not in self.vector:
    362         self.vector.add(i)
    363         self.vector_ever.add(i)
    364         return i
    365     raise RegisterAllocationError('Not enough vector registers.')
    366 
    367   def DoubleRegister(self, min_val=0):
    368     return _VectorRegister(64, self._VectorRegisterNum(min_val))
    369 
    370   def QuadRegister(self, min_val=0):
    371     return _VectorRegister(128, self._VectorRegisterNum(min_val))
    372 
    373   def GeneralRegister(self):
    374     for i in range(0, 30):
    375       if i not in self.general:
    376         self.general.add(i)
    377         self.general_ever.add(i)
    378         return _GeneralRegister(64, i)
    379     raise RegisterAllocationError('Not enough general registers.')
    380 
    381   def MappedParameters(self):
    382     return [x for x in self.parameters.items()]
    383 
    384   def MappedOutputParameters(self):
    385     return [x for x in self.output_parameters.items()]
    386 
    387   def Clobbers(self):
    388     return (
    389         ['x%d' % i
    390          for i in self.general_ever] + ['v%d' % i for i in self.vector_ever])
    391 
    392   def FreeRegister(self, register):
    393     if isinstance(register, _MappedParameter):
    394       return
    395 
    396     if register.register_type == 'v':
    397       assert register.number in self.vector
    398       self.vector.remove(register.number)
    399     elif register.register_type == 'r':
    400       assert register.number in self.general
    401       self.general.remove(register.number)
    402     else:
    403       raise RegisterAllocationError('Register not allocated: %s%d' %
    404                                     (register.register_type, register.number))
    405 
    406   def FreeRegisters(self, registers):
    407     for register in registers:
    408       self.FreeRegister(register)
    409 
    410 
    411 class NeonEmitter64(object):
    412   """Emits ARM/NEON 64bit assembly opcodes."""
    413 
    414   def __init__(self, debug=False):
    415     self.ops = {}
    416     self.indent = ''
    417     self.debug = debug
    418 
    419   def PushIndent(self, delta_indent='  '):
    420     self.indent += delta_indent
    421 
    422   def PopIndent(self, delta=2):
    423     self.indent = self.indent[:-delta]
    424 
    425   def EmitIndented(self, what):
    426     print self.indent + what
    427 
    428   def PushOp(self, op):
    429     if op in self.ops.keys():
    430       self.ops[op] += 1
    431     else:
    432       self.ops[op] = 1
    433 
    434   def ClearCounters(self):
    435     self.ops.clear()
    436 
    437   def EmitNewline(self):
    438     print ''
    439 
    440   def EmitPreprocessor1(self, op, param):
    441     print '#%s %s' % (op, param)
    442 
    443   def EmitPreprocessor(self, op):
    444     print '#%s' % op
    445 
    446   def EmitInclude(self, include):
    447     self.EmitPreprocessor1('include', include)
    448 
    449   def EmitCall1(self, function, param):
    450     self.EmitIndented('%s(%s);' % (function, param))
    451 
    452   def EmitAssert(self, assert_expression):
    453     if self.debug:
    454       self.EmitCall1('assert', assert_expression)
    455 
    456   def EmitHeaderBegin(self, header_name, includes):
    457     self.EmitPreprocessor1('ifndef', (header_name + '_H_').upper())
    458     self.EmitPreprocessor1('define', (header_name + '_H_').upper())
    459     self.EmitNewline()
    460     if includes:
    461       for include in includes:
    462         self.EmitInclude(include)
    463       self.EmitNewline()
    464 
    465   def EmitHeaderEnd(self):
    466     self.EmitPreprocessor('endif')
    467 
    468   def EmitCode(self, code):
    469     self.EmitIndented('%s;' % code)
    470 
    471   def EmitFunctionBeginA(self, function_name, params, return_type):
    472     self.EmitIndented('%s %s(%s) {' %
    473                       (return_type, function_name,
    474                        ', '.join(['%s %s' % (t, n) for (t, n) in params])))
    475     self.PushIndent()
    476 
    477   def EmitFunctionEnd(self):
    478     self.PopIndent()
    479     self.EmitIndented('}')
    480 
    481   def EmitAsmBegin(self):
    482     self.EmitIndented('asm volatile(')
    483     self.PushIndent()
    484 
    485   def EmitAsmMapping(self, elements):
    486     if elements:
    487       self.EmitIndented(': ' + ', '.join(
    488           ['[%s] "%s"(%s)' % (k, v[1], v[0]) for (k, v) in elements]))
    489     else:
    490       self.EmitIndented(':')
    491 
    492   def EmitClobbers(self, elements):
    493     if elements:
    494       self.EmitIndented(': ' + ', '.join(['"%s"' % c for c in elements]))
    495     else:
    496       self.EmitIndented(':')
    497 
    498   def EmitAsmEnd(self, registers):
    499     self.EmitAsmMapping(registers.MappedOutputParameters())
    500     self.EmitAsmMapping(registers.MappedParameters())
    501     self.EmitClobbers(registers.Clobbers() + ['cc', 'memory'])
    502     self.PopIndent()
    503     self.EmitIndented(');')
    504 
    505   def EmitComment(self, comment):
    506     self.EmitIndented('// ' + comment)
    507 
    508   def EmitNumericalLabel(self, label):
    509     self.EmitIndented('"%d:"' % label)
    510 
  def EmitOp1(self, op, param1):
    """Emits a one-operand instruction as a quoted asm string."""
    self.PushOp(op)
    self.EmitIndented('"%s %s\\n"' % (op, param1))

  def EmitOp2(self, op, param1, param2):
    """Emits a two-operand instruction as a quoted asm string."""
    self.PushOp(op)
    self.EmitIndented('"%s %s, %s\\n"' % (op, param1, param2))

  def EmitOp3(self, op, param1, param2, param3):
    """Emits a three-operand instruction as a quoted asm string."""
    self.PushOp(op)
    self.EmitIndented('"%s %s, %s, %s\\n"' % (op, param1, param2, param3))

  def EmitAdd(self, destination, source, param):
    # destination = source + param
    self.EmitOp3('add', destination, source, param)

  def EmitSubs(self, destination, source, param):
    # destination = source - param, setting condition flags.
    self.EmitOp3('subs', destination, source, param)

  def EmitSub(self, destination, source, param):
    # destination = source - param
    self.EmitOp3('sub', destination, source, param)

  def EmitMul(self, destination, source, param):
    # destination = source * param
    self.EmitOp3('mul', destination, source, param)

  def EmitMov(self, param1, param2):
    # param1 = param2
    self.EmitOp2('mov', param1, param2)
    537 
    538   def EmitVMovl(self, mov_type, destination, source):
    539     wide_type = _WideType(mov_type)
    540     destination = _AppendType(wide_type, destination)
    541     source = _AppendType(mov_type, _Cast(source.register_bits / 2, source))
    542     if _UnsignedType(mov_type):
    543       self.EmitOp2('uxtl', destination, source)
    544     else:
    545       self.EmitOp2('sxtl', destination, source)
    546 
    547   def EmitVMovl2(self, mov_type, destination_1, destination_2, source):
    548     wide_type = _WideType(mov_type)
    549     if (destination_1.register_bits != source.register_bits or
    550         destination_2.register_bits != source.register_bits):
    551       raise ArgumentError('Register sizes do not match.')
    552     if _UnsignedType(mov_type):
    553       self.EmitOp2('uxtl2',
    554                    _AppendType(wide_type, destination_2),
    555                    _AppendType(mov_type, source))
    556       self.EmitOp2('uxtl',
    557                    _AppendType(wide_type, destination_1),
    558                    _AppendType(mov_type,
    559                                _Cast(source.register_bits / 2, source)))
    560     else:
    561       self.EmitOp2('sxtl2',
    562                    _AppendType(wide_type, destination_2),
    563                    _AppendType(mov_type, source))
    564       self.EmitOp2('sxtl',
    565                    _AppendType(wide_type, destination_1),
    566                    _AppendType(mov_type,
    567                                _Cast(source.register_bits / 2, source)))
    568 
    569   def EmitVMax(self, max_type, destination, source_1, source_2):
    570     if _UnsignedType(max_type):
    571       self.EmitOp3('umax',
    572                    _AppendType(max_type, destination),
    573                    _AppendType(max_type, source_1),
    574                    _AppendType(max_type, source_2))
    575     else:
    576       self.EmitOp3('smax',
    577                    _AppendType(max_type, destination),
    578                    _AppendType(max_type, source_1),
    579                    _AppendType(max_type, source_2))
    580 
    581   def EmitVMin(self, min_type, destination, source_1, source_2):
    582     if _UnsignedType(min_type):
    583       self.EmitOp3('umin',
    584                    _AppendType(min_type, destination),
    585                    _AppendType(min_type, source_1),
    586                    _AppendType(min_type, source_2))
    587     else:
    588       self.EmitOp3('smin',
    589                    _AppendType(min_type, destination),
    590                    _AppendType(min_type, source_1),
    591                    _AppendType(min_type, source_2))
    592 
  def EmitBeqBack(self, label):
    """Branch if equal to numeric label `label` behind this point."""
    self.EmitOp1('beq', '%db' % label)

  def EmitBeqFront(self, label):
    """Branch if equal to numeric label `label` ahead of this point."""
    self.EmitOp1('beq', '%df' % label)

  def EmitBgtBack(self, label):
    """Branch if greater-than to numeric label behind this point."""
    self.EmitOp1('bgt', '%db' % label)

  def EmitBgtFront(self, label):
    """Branch if greater-than to numeric label ahead of this point."""
    self.EmitOp1('bgt', '%df' % label)

  def EmitBleBack(self, label):
    """Branch if less-or-equal to numeric label behind this point."""
    self.EmitOp1('ble', '%db' % label)

  def EmitBleFront(self, label):
    """Branch if less-or-equal to numeric label ahead of this point."""
    self.EmitOp1('ble', '%df' % label)

  def EmitBneBack(self, label):
    """Branch if not-equal to numeric label behind this point."""
    self.EmitOp1('bne', '%db' % label)

  def EmitBneFront(self, label):
    """Branch if not-equal to numeric label ahead of this point."""
    self.EmitOp1('bne', '%df' % label)
    616 
    617   def EmitVAdd(self, add_type, destination, source_1, source_2):
    618     destination, source_1, source_2 = _MakeCompatibleDown(destination, source_1,
    619                                                           source_2)
    620     if _FloatType(add_type):
    621       self.EmitOp3('fadd',
    622                    _AppendType(add_type, destination),
    623                    _AppendType(add_type, source_1),
    624                    _AppendType(add_type, source_2))
    625     else:
    626       self.EmitOp3('add',
    627                    _AppendType(add_type, destination),
    628                    _AppendType(add_type, source_1),
    629                    _AppendType(add_type, source_2))
    630 
    631   def EmitVAddw(self, add_type, destination, source_1, source_2):
    632     wide_type = _WideType(add_type)
    633     destination = _AppendType(wide_type, destination)
    634     source_1 = _AppendType(wide_type, source_1)
    635     source_2 = _AppendType(add_type, source_2)
    636     if _UnsignedType(add_type):
    637       self.EmitOp3('uaddw', destination, source_1, source_2)
    638     else:
    639       self.EmitOp3('saddw', destination, source_1, source_2)
    640 
    641   def EmitVSub(self, sub_type, destination, source_1, source_2):
    642     destination, source_1, source_2 = _MakeCompatibleDown(destination, source_1,
    643                                                           source_2)
    644     if _FloatType(sub_type):
    645       self.EmitOp3('fsub',
    646                    _AppendType(sub_type, destination),
    647                    _AppendType(sub_type, source_1),
    648                    _AppendType(sub_type, source_2))
    649     else:
    650       self.EmitOp3('sub',
    651                    _AppendType(sub_type, destination),
    652                    _AppendType(sub_type, source_1),
    653                    _AppendType(sub_type, source_2))
    654 
    655   def EmitVCvt(self, cvt_to, cvt_from, destination, source):
    656     if cvt_to == 'f32' and cvt_from == 's32':
    657       self.EmitOp2('scvtf',
    658                    _AppendType('f32', destination), _AppendType('s32', source))
    659     elif cvt_to == 'f32' and cvt_from == 'u32':
    660       self.EmitOp2('ucvtf',
    661                    _AppendType('f32', destination), _AppendType('u32', source))
    662     elif cvt_to == 's32' and cvt_from == 'f32':
    663       self.EmitOp2('fcvtzs',
    664                    _AppendType('s32', destination), _AppendType('f32', source))
    665     else:
    666       raise ArgumentError('Convert not supported, to: %s from: %s' % (cvt_to,
    667                                                                       cvt_from))
    668 
    669   def EmitVDup(self, dup_type, destination, source):
    670     if (isinstance(source, _GeneralRegister) or
    671         isinstance(source, _MappedParameter)):
    672       self.EmitOp2('dup',
    673                    _AppendType(dup_type, destination),
    674                    _Cast(_TypeBits(dup_type), source))
    675     else:
    676       self.EmitOp2('dup',
    677                    _AppendType(dup_type, destination),
    678                    _AppendType(dup_type, source))
    679 
    680   def EmitVMov(self, mov_type, destination, source):
    681     if isinstance(source, _ImmediateConstant):
    682       self.EmitOp2('movi', _AppendType(mov_type, destination), source)
    683     elif (isinstance(source, _GeneralRegister) or
    684           isinstance(source, _MappedParameter)):
    685       self.EmitOp2('mov',
    686                    _AppendType(mov_type, destination),
    687                    _Cast(_TypeBits(mov_type), source))
    688     else:
    689       self.EmitOp2('mov', _AppendType(8, destination), _AppendType(8, source))
    690 
    691   def EmitVQmovn(self, mov_type, destination, source):
    692     narrow_type = _NarrowType(mov_type)
    693     if destination.register_bits * 2 == source.register_bits:
    694       self.EmitOp2('sqxtn',
    695                    _AppendType(narrow_type, destination),
    696                    _AppendType(mov_type, source))
    697     elif destination.register_bits == source.register_bits:
    698       self.EmitOp2('sqxtn',
    699                    _AppendType(narrow_type,
    700                                _Cast(destination.register_bits / 2,
    701                                      destination)),
    702                    _AppendType(mov_type, source))
    703 
    704   def EmitVQmovn2(self, mov_type, destination, source_1, source_2):
    705     narrow_type = _NarrowType(mov_type)
    706     if (destination.register_bits != source_1.register_bits or
    707         destination.register_bits != source_2.register_bits):
    708       raise ArgumentError('Register sizes do not match.')
    709     self.EmitOp2('sqxtn',
    710                  _AppendType(narrow_type,
    711                              _Cast(destination.register_bits / 2, destination)),
    712                  _AppendType(mov_type, source_1))
    713     self.EmitOp2('sqxtn2',
    714                  _AppendType(narrow_type, destination),
    715                  _AppendType(mov_type, source_2))
    716 
    717   def EmitVQmovun(self, mov_type, destination, source):
    718     narrow_type = _NarrowType(mov_type)
    719     if destination.register_bits * 2 == source.register_bits:
    720       self.EmitOp2('sqxtun',
    721                    _AppendType(narrow_type, destination),
    722                    _AppendType(mov_type, source))
    723     elif destination.register_bits == source.register_bits:
    724       self.EmitOp2('sqxtun',
    725                    _AppendType(narrow_type,
    726                                _Cast(destination.register_bits / 2,
    727                                      destination)),
    728                    _AppendType(mov_type, source))
    729 
    730   def EmitVQmovun2(self, mov_type, destination, source_1, source_2):
    731     narrow_type = _NarrowType(mov_type)
    732     if (destination.register_bits != source_1.register_bits or
    733         destination.register_bits != source_2.register_bits):
    734       raise ArgumentError('Register sizes do not match.')
    735     self.EmitOp2('sqxtun',
    736                  _AppendType(narrow_type,
    737                              _Cast(destination.register_bits / 2, destination)),
    738                  _AppendType(mov_type, source_1))
    739     self.EmitOp2('sqxtun2',
    740                  _AppendType(narrow_type, destination),
    741                  _AppendType(mov_type, source_2))
    742 
    743   def EmitVMul(self, mul_type, destination, source_1, source_2):
    744     destination, source_1, source_2 = _MakeCompatibleDown(destination, source_1,
    745                                                           source_2)
    746     if _FloatType(mul_type):
    747       self.EmitOp3('fmul',
    748                    _AppendType(mul_type, destination),
    749                    _AppendType(mul_type, source_1),
    750                    _AppendType(mul_type, source_2))
    751     else:
    752       self.EmitOp3('mul',
    753                    _AppendType(mul_type, destination),
    754                    _AppendType(mul_type, source_1),
    755                    _AppendType(mul_type, source_2))
    756 
    757   def EmitVMulScalar(self, mul_type, destination, source_1, source_2):
    758     self.EmitOp3('mul',
    759                  _AppendType(mul_type, destination),
    760                  _AppendType(mul_type, source_1),
    761                  _AppendType(mul_type, source_2))
    762 
    763   def EmitVMull(self, mul_type, destination, source_1, source_2):
    764     wide_type = _WideType(mul_type)
    765     if _UnsignedType(mul_type):
    766       self.EmitOp3('umull',
    767                    _AppendType(wide_type, destination),
    768                    _AppendType(mul_type, source_1),
    769                    _AppendType(mul_type, source_2))
    770     else:
    771       self.EmitOp3('smull',
    772                    _AppendType(wide_type, destination),
    773                    _AppendType(mul_type, source_1),
    774                    _AppendType(mul_type, source_2))
    775 
    776   def EmitVPadd(self, add_type, destination, source_1, source_2):
    777     self.EmitOp3('addp',
    778                  _AppendType(add_type, destination),
    779                  _AppendType(add_type, source_1),
    780                  _AppendType(add_type, source_2))
    781 
    782   def EmitVPaddl(self, add_type, destination, source):
    783     wide_type = _WideType(add_type)
    784     if _UnsignedType(add_type):
    785       self.EmitOp2('uaddlp',
    786                    _AppendType(wide_type, destination),
    787                    _AppendType(add_type, source))
    788     else:
    789       self.EmitOp2('saddlp',
    790                    _AppendType(wide_type, destination),
    791                    _AppendType(add_type, source))
    792 
    793   def EmitVPadal(self, add_type, destination, source):
    794     wide_type = _WideType(add_type)
    795     if _UnsignedType(add_type):
    796       self.EmitOp2('uadalp',
    797                    _AppendType(wide_type, destination),
    798                    _AppendType(add_type, source))
    799     else:
    800       self.EmitOp2('sadalp',
    801                    _AppendType(wide_type, destination),
    802                    _AppendType(add_type, source))
    803 
    804   def EmitLdr(self, register, value):
    805     self.EmitOp2('ldr', _Cast(32, register), _Cast(None, value))
    806 
    807   def EmitVLoad(self, load_no, load_type, destination, source):
    808     self.EmitVLoadA(load_no, load_type, [destination], source)
    809 
    810   def EmitVLoadA(self, load_no, load_type, destinations, source):
    811     if source.dereference_increment:
    812       increment = sum(
    813           [_LoadStoreSize(destination) for destination in destinations]) / 8
    814       self.EmitVLoadAPostIncrement(load_no, load_type, destinations, source,
    815                                    self.ImmediateConstant(increment))
    816     else:
    817       self.EmitVLoadAPostIncrement(load_no, load_type, destinations, source,
    818                                    None)
    819 
    820   def EmitVLoadAPostIncrement(self, load_no, load_type, destinations, source,
    821                               increment):
    822     """Generate assembly to load memory to registers and increment source."""
    823     if len(destinations) == 1 and destinations[0].lane is -1:
    824       destination = '{%s}' % _AppendType(load_type, destinations[0])
    825       if increment:
    826         self.EmitOp3('ld%dr' % load_no, destination, source, increment)
    827       else:
    828         self.EmitOp2('ld%dr' % load_no, destination, source)
    829       return
    830 
    831     destination_list = _RegisterList(load_type, destinations)
    832     if increment:
    833       self.EmitOp3('ld%d' % load_no, destination_list, source, increment)
    834     else:
    835       self.EmitOp2('ld%d' % load_no, destination_list, source)
    836 
    837   def EmitVLoadAE(self,
    838                   load_type,
    839                   elem_count,
    840                   destinations,
    841                   source,
    842                   alignment=None):
    843     """Generate assembly to load an array of elements of given size."""
    844     bits_to_load = load_type * elem_count
    845     min_bits = min([destination.register_bits for destination in destinations])
    846     max_bits = max([destination.register_bits for destination in destinations])
    847 
    848     if min_bits is not max_bits:
    849       raise ArgumentError('Cannot mix double and quad loads.')
    850 
    851     if len(destinations) * min_bits < bits_to_load:
    852       raise ArgumentError('To few destinations: %d to load %d bits.' %
    853                           (len(destinations), bits_to_load))
    854 
    855     leftover_loaded = 0
    856     while bits_to_load > 0:
    857       if bits_to_load >= 4 * min_bits:
    858         self.EmitVLoadA(1, 32, destinations[:4],
    859                         self.DereferenceIncrement(source, alignment))
    860         bits_to_load -= 4 * min_bits
    861         destinations = destinations[4:]
    862       elif bits_to_load >= 3 * min_bits:
    863         self.EmitVLoadA(1, 32, destinations[:3],
    864                         self.DereferenceIncrement(source, alignment))
    865         bits_to_load -= 3 * min_bits
    866         destinations = destinations[3:]
    867       elif bits_to_load >= 2 * min_bits:
    868         self.EmitVLoadA(1, 32, destinations[:2],
    869                         self.DereferenceIncrement(source, alignment))
    870         bits_to_load -= 2 * min_bits
    871         destinations = destinations[2:]
    872       elif bits_to_load >= min_bits:
    873         self.EmitVLoad(1, 32, destinations[0],
    874                        self.DereferenceIncrement(source, alignment))
    875         bits_to_load -= min_bits
    876         destinations = destinations[1:]
    877       elif bits_to_load >= 64:
    878         self.EmitVLoad(1, 32,
    879                        _Cast(64, destinations[0]),
    880                        self.DereferenceIncrement(source))
    881         bits_to_load -= 64
    882         leftover_loaded += 64
    883       elif bits_to_load >= 32:
    884         self.EmitVLoad(1, 32,
    885                        self.Lane(32, destinations[0], leftover_loaded / 32),
    886                        self.DereferenceIncrement(source))
    887         bits_to_load -= 32
    888         leftover_loaded += 32
    889       elif bits_to_load >= 16:
    890         self.EmitVLoad(1, 16,
    891                        self.Lane(16, destinations[0], leftover_loaded / 16),
    892                        self.DereferenceIncrement(source))
    893         bits_to_load -= 16
    894         leftover_loaded += 16
    895       elif bits_to_load is 8:
    896         self.EmitVLoad(1, 8,
    897                        self.Lane(8, destinations[0], leftover_loaded / 8),
    898                        self.DereferenceIncrement(source))
    899         bits_to_load -= 8
    900         leftover_loaded += 8
    901       else:
    902         raise ArgumentError('Wrong leftover: %d' % bits_to_load)
    903 
    904   def EmitVLoadE(self, load_type, count, destination, source, alignment=None):
    905     self.EmitVLoadAE(load_type, count, [destination], source, alignment)
    906 
    907   def EmitVLoadAllLanes(self, load_no, load_type, destination, source):
    908     new_destination = destination.Copy()
    909     new_destination.lane = -1
    910     new_destination.lane_bits = load_type
    911     self.EmitVLoad(load_no, load_type, new_destination, source)
    912 
    913   def EmitVLoadOffset(self, load_no, load_type, destination, source, offset):
    914     self.EmitVLoadOffsetA(load_no, load_type, [destination], source, offset)
    915 
    916   def EmitVLoadOffsetA(self, load_no, load_type, destinations, source, offset):
    917     assert len(destinations) <= 4
    918     self.EmitOp3('ld%d' % load_no,
    919                  _RegisterList(load_type, destinations), source, offset)
    920 
    921   def EmitPld(self, load_address_register):
    922     self.EmitOp2('prfm', 'pldl1keep', '[%s]' % load_address_register)
    923 
    924   def EmitPldOffset(self, load_address_register, offset):
    925     self.EmitOp2('prfm', 'pldl1keep',
    926                  '[%s, %s]' % (load_address_register, offset))
    927 
    928   def EmitVShl(self, shift_type, destination, source, shift):
    929     self.EmitOp3('sshl',
    930                  _AppendType(shift_type, destination),
    931                  _AppendType(shift_type, source), _AppendType('i32', shift))
    932 
    933   def EmitVStore(self, store_no, store_type, source, destination):
    934     self.EmitVStoreA(store_no, store_type, [source], destination)
    935 
    936   def EmitVStoreA(self, store_no, store_type, sources, destination):
    937     if destination.dereference_increment:
    938       increment = sum([_LoadStoreSize(source) for source in sources]) / 8
    939       self.EmitVStoreAPostIncrement(store_no, store_type, sources, destination,
    940                                     self.ImmediateConstant(increment))
    941     else:
    942       self.EmitVStoreAPostIncrement(store_no, store_type, sources, destination,
    943                                     None)
    944 
    945   def EmitVStoreAPostIncrement(self, store_no, store_type, sources, destination,
    946                                increment):
    947     source_list = _RegisterList(store_type, sources)
    948     if increment:
    949       self.EmitOp3('st%d' % store_no, source_list, destination, increment)
    950     else:
    951       self.EmitOp2('st%d' % store_no, source_list, destination)
    952 
    953   def EmitVStoreAE(self,
    954                    store_type,
    955                    elem_count,
    956                    sources,
    957                    destination,
    958                    alignment=None):
    959     """Generate assembly to store an array of elements of given size."""
    960     bits_to_store = store_type * elem_count
    961     min_bits = min([source.register_bits for source in sources])
    962     max_bits = max([source.register_bits for source in sources])
    963 
    964     if min_bits is not max_bits:
    965       raise ArgumentError('Cannot mix double and quad stores.')
    966 
    967     if len(sources) * min_bits < bits_to_store:
    968       raise ArgumentError('To few destinations: %d to store %d bits.' %
    969                           (len(sources), bits_to_store))
    970 
    971     leftover_stored = 0
    972     while bits_to_store > 0:
    973       if bits_to_store >= 4 * min_bits:
    974         self.EmitVStoreA(1, 32, sources[:4],
    975                          self.DereferenceIncrement(destination, alignment))
    976         bits_to_store -= 4 * min_bits
    977         sources = sources[4:]
    978       elif bits_to_store >= 3 * min_bits:
    979         self.EmitVStoreA(1, 32, sources[:3],
    980                          self.DereferenceIncrement(destination, alignment))
    981         bits_to_store -= 3 * min_bits
    982         sources = sources[3:]
    983       elif bits_to_store >= 2 * min_bits:
    984         self.EmitVStoreA(1, 32, sources[:2],
    985                          self.DereferenceIncrement(destination, alignment))
    986         bits_to_store -= 2 * min_bits
    987         sources = sources[2:]
    988       elif bits_to_store >= min_bits:
    989         self.EmitVStore(1, 32, sources[0],
    990                         self.DereferenceIncrement(destination, alignment))
    991         bits_to_store -= min_bits
    992         sources = sources[1:]
    993       elif bits_to_store >= 64:
    994         self.EmitVStore(1, 32,
    995                         _Cast(64, sources[0]),
    996                         self.DereferenceIncrement(destination, alignment))
    997         bits_to_store -= 64
    998         leftover_stored += 64
    999       elif bits_to_store >= 32:
   1000         self.EmitVStore(1, 32,
   1001                         self.Lane(32, sources[0], leftover_stored / 32),
   1002                         self.DereferenceIncrement(destination))
   1003         bits_to_store -= 32
   1004         leftover_stored += 32
   1005       elif bits_to_store >= 16:
   1006         self.EmitVStore(1, 16,
   1007                         self.Lane(16, sources[0], leftover_stored / 16),
   1008                         self.DereferenceIncrement(destination))
   1009         bits_to_store -= 16
   1010         leftover_stored += 16
   1011       elif bits_to_store >= 8:
   1012         self.EmitVStore(1, 8,
   1013                         self.Lane(8, sources[0], leftover_stored / 8),
   1014                         self.DereferenceIncrement(destination))
   1015         bits_to_store -= 8
   1016         leftover_stored += 8
   1017       else:
   1018         raise ArgumentError('Wrong leftover: %d' % bits_to_store)
   1019 
   1020   def EmitVStoreE(self, store_type, count, source, destination, alignment=None):
   1021     self.EmitVStoreAE(store_type, count, [source], destination, alignment)
   1022 
   1023   def EmitVStoreOffset(self, store_no, store_type, source, destination, offset):
   1024     self.EmitVStoreOffsetA(store_no, store_type, [source], destination, offset)
   1025 
   1026   def EmitVStoreOffsetA(self, store_no, store_type, sources, destination,
   1027                         offset):
   1028     self.EmitOp3('st%d' % store_no,
   1029                  _RegisterList(store_type, sources), destination, offset)
   1030 
   1031   def EmitVStoreOffsetE(self, store_type, count, source, destination, offset):
   1032     if store_type is not 32:
   1033       raise ArgumentError('Unsupported store_type: %d' % store_type)
   1034 
   1035     if count == 1:
   1036       self.EmitVStoreOffset(1, 32,
   1037                             self.Lane(32, source, 0),
   1038                             self.Dereference(destination, None), offset)
   1039     elif count == 2:
   1040       self.EmitVStoreOffset(1, 32,
   1041                             _Cast(64, source),
   1042                             self.Dereference(destination, None), offset)
   1043     elif count == 3:
   1044       self.EmitVStore(1, 32,
   1045                       _Cast(64, source),
   1046                       self.DereferenceIncrement(destination, None))
   1047       self.EmitVStoreOffset(1, 32,
   1048                             self.Lane(32, source, 2),
   1049                             self.Dereference(destination, None), offset)
   1050       self.EmitSub(destination, destination, self.ImmediateConstant(8))
   1051     elif count == 4:
   1052       self.EmitVStoreOffset(1, 32, source,
   1053                             self.Dereference(destination, None), offset)
   1054     else:
   1055       raise ArgumentError('To many elements: %d' % count)
   1056 
   1057   def EmitVSumReduce(self, reduce_type, elem_count, reduce_count, destinations,
   1058                      sources):
   1059     """Generate assembly to perform n-fold horizontal sum reduction."""
   1060     if reduce_type is not 'u32':
   1061       raise ArgumentError('Unsupported reduce: %s' % reduce_type)
   1062 
   1063     if (elem_count + 3) / 4 > len(destinations):
   1064       raise ArgumentError('To few destinations: %d (%d needed)' %
   1065                           (len(destinations), (elem_count + 3) / 4))
   1066 
   1067     if elem_count * reduce_count > len(sources) * 4:
   1068       raise ArgumentError('To few sources: %d' % len(sources))
   1069 
   1070     if reduce_count <= 1:
   1071       raise ArgumentError('Unsupported reduce_count: %d' % reduce_count)
   1072 
   1073     sources = [_Cast(128, source) for source in sources]
   1074     destinations = [_Cast(128, destination) for destination in destinations]
   1075 
   1076     while reduce_count > 1:
   1077       if len(sources) % 2 == 1:
   1078         sources.append(sources[-1])
   1079 
   1080       if reduce_count == 2:
   1081         for i in range(len(destinations)):
   1082           self.EmitVPadd(reduce_type, destinations[i], sources[2 * i],
   1083                          sources[2 * i + 1])
   1084         return
   1085       else:
   1086         sources_2 = []
   1087         for i in range(len(sources) / 2):
   1088           self.EmitVPadd(reduce_type, sources[2 * i], sources[2 * i],
   1089                          sources[2 * i + 1])
   1090           sources_2.append(sources[2 * i])
   1091         reduce_count /= 2
   1092         sources = sources_2
   1093 
   1094   def EmitVUzp1(self, uzp_type, destination, source_1, source_2):
   1095     self.EmitOp3('uzp1',
   1096                  _AppendType(uzp_type, destination),
   1097                  _AppendType(uzp_type, source_1),
   1098                  _AppendType(uzp_type, source_2))
   1099 
   1100   def EmitVUzp2(self, uzp_type, destination, source_1, source_2):
   1101     self.EmitOp3('uzp2',
   1102                  _AppendType(uzp_type, destination),
   1103                  _AppendType(uzp_type, source_1),
   1104                  _AppendType(uzp_type, source_2))
   1105 
   1106   def EmitVUzp(self, uzp_type, destination_1, destination_2, source_1,
   1107                source_2):
   1108     self.EmitVUzp1(uzp_type, destination_1, source_1, source_2)
   1109     self.EmitVUzp2(uzp_type, destination_2, source_1, source_2)
   1110 
   1111   def EmitVTrn1(self, trn_type, destination, source_1, source_2):
   1112     self.EmitOp3('trn1',
   1113                  _AppendType(trn_type, destination),
   1114                  _AppendType(trn_type, source_1),
   1115                  _AppendType(trn_type, source_2))
   1116 
   1117   def EmitVTrn2(self, trn_type, destination, source_1, source_2):
   1118     self.EmitOp3('trn2',
   1119                  _AppendType(trn_type, destination),
   1120                  _AppendType(trn_type, source_1),
   1121                  _AppendType(trn_type, source_2))
   1122 
   1123   def EmitVTrn(self, trn_type, destination_1, destination_2, source_1,
   1124                source_2):
   1125     self.EmitVTrn1(trn_type, destination_1, source_1, source_2)
   1126     self.EmitVTrn2(trn_type, destination_2, source_1, source_2)
   1127 
   1128   def EmitColBlockStride(self, cols, stride, new_stride):
   1129     assert cols in [1, 2, 3, 4, 5, 6, 7, 8]
   1130     if cols in [5, 6, 7]:
   1131       self.EmitSub(new_stride, stride, self.ImmediateConstant(4))
   1132 
   1133   def EmitLoadColBlock(self, registers, load_type, cols, elements, block,
   1134                        input_address, stride):
   1135     assert cols is len(block)
   1136     assert load_type is 8
   1137 
   1138     input_deref = self.Dereference(input_address, None)
   1139     input_deref_increment = self.DereferenceIncrement(input_address, None)
   1140 
   1141     if cols is 1:
   1142       for i in range(elements):
   1143         self.EmitVLoadOffset(1, 8,
   1144                              self.Lane(8, block[0], i), input_deref, stride)
   1145       self.EmitPld(input_address)
   1146       return block
   1147     elif cols is 2:
   1148       temp = [registers.DoubleRegister() for unused_i in range(2)]
   1149       for i in range(elements):
   1150         self.EmitVLoadOffset(1, 16,
   1151                              self.Lane(16, block[i / 4], i % 4), input_deref,
   1152                              stride)
   1153       self.EmitPld(input_address)
   1154       self.EmitVUzp(8, temp[0], temp[1], block[0], block[1])
   1155       registers.FreeRegisters(block)
   1156       return temp
   1157     elif cols is 3:
   1158       for i in range(elements):
   1159         self.EmitVLoadOffsetA(3, 8, [self.Lane(8, row, i) for row in block],
   1160                               input_deref, stride)
   1161       self.EmitPld(input_address)
   1162       return block
   1163     elif cols is 4:
   1164       temp = [registers.DoubleRegister() for unused_i in range(4)]
   1165       for i in range(elements):
   1166         self.EmitVLoadOffset(1, 32,
   1167                              self.Lane(32, block[i % 4], i / 4), input_deref,
   1168                              stride)
   1169       self.EmitPld(input_address)
   1170       self.EmitVTrn(16, temp[0], temp[2], block[0], block[2])
   1171       self.EmitVTrn(16, temp[1], temp[3], block[1], block[3])
   1172       self.EmitVTrn(8, block[0], block[1], temp[0], temp[1])
   1173       self.EmitVTrn(8, block[2], block[3], temp[2], temp[3])
   1174       registers.FreeRegisters(temp)
   1175       return block
   1176     elif cols is 5:
   1177       temp = [registers.DoubleRegister() for unused_i in range(4)]
   1178       for i in range(elements):
   1179         self.EmitVLoad(1, 32,
   1180                        self.Lane(32, block[i % 4], i / 4),
   1181                        input_deref_increment)
   1182         self.EmitVLoadOffset(1, 8,
   1183                              self.Lane(8, block[4], i), input_deref, stride)
   1184       self.EmitPld(input_address)
   1185       self.EmitVTrn(16, temp[0], temp[2], block[0], block[2])
   1186       self.EmitVTrn(16, temp[1], temp[3], block[1], block[3])
   1187       self.EmitVTrn(8, block[0], block[1], temp[0], temp[1])
   1188       self.EmitVTrn(8, block[2], block[3], temp[2], temp[3])
   1189       registers.FreeRegisters(temp)
   1190       return block
   1191     elif cols is 6:
   1192       temp = [registers.DoubleRegister() for unused_i in range(6)]
   1193       for i in range(elements):
   1194         self.EmitVLoad(1, 32,
   1195                        self.Lane(32, block[i % 4], i / 4),
   1196                        input_deref_increment)
   1197         self.EmitVLoadOffset(1, 16,
   1198                              self.Lane(16, block[4 + i / 4], i % 4),
   1199                              input_deref, stride)
   1200       self.EmitPld(input_address)
   1201       self.EmitVTrn(16, temp[0], temp[2], block[0], block[2])
   1202       self.EmitVTrn(16, temp[1], temp[3], block[1], block[3])
   1203       self.EmitVUzp(8, temp[4], temp[5], block[4], block[5])
   1204       self.EmitVTrn(8, block[0], block[1], temp[0], temp[1])
   1205       self.EmitVTrn(8, block[2], block[3], temp[2], temp[3])
   1206       registers.FreeRegisters(
   1207           [block[4], block[5], temp[0], temp[1], temp[2], temp[3]])
   1208       return [block[0], block[1], block[2], block[3], temp[4], temp[5]]
   1209     elif cols is 7:
   1210       temp = [registers.DoubleRegister() for unused_i in range(4)]
   1211       for i in range(elements):
   1212         self.EmitVLoad(1, 32,
   1213                        self.Lane(32, block[i % 4], i / 4),
   1214                        input_deref_increment)
   1215         self.EmitVLoadOffsetA(3, 8,
   1216                               [self.Lane(8, row, i) for row in block[4:]],
   1217                               input_deref, stride)
   1218       self.EmitPld(input_address)
   1219       self.EmitVTrn1(16, temp[0], block[0], block[2])
   1220       self.EmitVTrn2(16, temp[2], block[0], block[2])
   1221       self.EmitVTrn1(16, temp[1], block[1], block[3])
   1222       self.EmitVTrn2(16, temp[3], block[1], block[3])
   1223       self.EmitVTrn1(8, block[0], temp[0], temp[1])
   1224       self.EmitVTrn2(8, block[1], temp[0], temp[1])
   1225       self.EmitVTrn1(8, block[2], temp[2], temp[3])
   1226       self.EmitVTrn2(8, block[3], temp[2], temp[3])
   1227       registers.FreeRegisters(temp)
   1228       return block
   1229     elif cols is 8:
   1230       temp = [registers.DoubleRegister() for unused_i in range(8)]
   1231       for i in range(elements):
   1232         self.EmitVLoadOffset(1, 32, block[i], input_deref, stride)
   1233       self.EmitPld(input_address)
   1234       self.EmitVTrn(8, temp[0], temp[1], block[0], block[1])
   1235       self.EmitVTrn(8, temp[2], temp[3], block[2], block[3])
   1236       self.EmitVTrn(8, temp[4], temp[5], block[4], block[5])
   1237       self.EmitVTrn(8, temp[6], temp[7], block[6], block[7])
   1238       self.EmitVTrn(16, block[0], block[2], temp[0], temp[2])
   1239       self.EmitVTrn(16, block[1], block[3], temp[1], temp[3])
   1240       self.EmitVTrn(16, block[4], block[6], temp[4], temp[6])
   1241       self.EmitVTrn(16, block[5], block[7], temp[5], temp[7])
   1242       self.EmitVTrn(32, temp[0], temp[4], block[0], block[4])
   1243       self.EmitVTrn(32, temp[1], temp[5], block[1], block[5])
   1244       self.EmitVTrn(32, temp[2], temp[6], block[2], block[6])
   1245       self.EmitVTrn(32, temp[3], temp[7], block[3], block[7])
   1246       registers.FreeRegisters(block)
   1247       return temp
   1248     else:
   1249       assert False
   1250 
   1251   def Dereference(self, value, unused_alignment=None):
   1252     new_value = value.Copy()
   1253     new_value.dereference = True
   1254     return new_value
   1255 
   1256   def DereferenceIncrement(self, value, alignment=None):
   1257     new_value = self.Dereference(value, alignment).Copy()
   1258     new_value.dereference_increment = True
   1259     return new_value
   1260 
   1261   def ImmediateConstant(self, value):
   1262     return _ImmediateConstant(value)
   1263 
   1264   def AllLanes(self, value):
   1265     return '%s[]' % value
   1266 
   1267   def Lane(self, bits, value, lane):
   1268     new_value = value.Copy()
   1269     if bits * (lane + 1) > new_value.register_bits:
   1270       raise ArgumentError('Lane to big: (%d + 1) x %d > %d' %
   1271                           (lane, bits, new_value.register_bits))
   1272     new_value.lane = lane
   1273     new_value.lane_bits = bits
   1274     return new_value
   1275 
   1276   def CreateRegisters(self):
   1277     return _NeonRegisters64Bit()
   1278