# Copyright 2016 The Gemmlowp Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
     14 """32bit ARM/NEON assembly emitter.
     15 
     16 Used by code generators to produce ARM assembly with NEON simd code.
     17 Provides tools for easier register management: named register variable
     18 allocation/deallocation, and offers a more procedural/structured approach
     19 to generating assembly.
     20 
     21 TODO: right now neon emitter prints out assembly instructions immediately,
     22 it might be beneficial to keep the whole structure and emit the assembly after
     23 applying some optimizations like: instruction reordering or register reuse.
     24 
     25 TODO: NeonRegister object assigns explicit registers at allocation time.
     26 Similarily to emiting code, register mapping and reuse can be performed and
     27 optimized lazily.
     28 """


class Error(Exception):
  """Module-level error."""


class RegisterAllocationError(Error):
  """Cannot allocate registers."""


class RegisterDeallocationError(Error):
  """Cannot deallocate registers."""


class LaneError(Error):
  """Wrong lane number."""


class ArgumentError(Error):
  """Wrong argument."""


def _Low(register):
  assert register[0] == 'q'
  num = int(register[1:])
  return 'd%d' % (num * 2)


def _High(register):
  assert register[0] == 'q'
  num = int(register[1:])
  return 'd%d' % (num * 2 + 1)


def _ExpandQuads(registers):
  doubles = []
  for register in registers:
    if register[0] == 'q':
      doubles.append(_Low(register))
      doubles.append(_High(register))
    else:
      doubles.append(register)
  return doubles


def _MakeCompatible(op1, op2, op3):
  if op1[0] == 'd' or op2[0] == 'd' or op3[0] == 'd':
    if op1[0] == 'q':
      op1 = _Low(op1)
    if op2[0] == 'q':
      op2 = _Low(op2)
    if op3[0] == 'q':
      op3 = _Low(op3)
  return (op1, op2, op3)
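
# Helper behavior, for illustration (editorial note; the values follow
# directly from the helpers above):
#
#   _ExpandQuads(['q0', 'd5'])        -> ['d0', 'd1', 'd5']
#   _MakeCompatible('q1', 'd3', 'q2') -> ('d2', 'd3', 'd4')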


class _NeonRegisters32Bit(object):
  """Utility that keeps track of used 32-bit ARM/NEON registers."""

  def __init__(self):
    self.double = set()
    self.double_ever = set()
    self.general = set()
    self.general_ever = set()
    self.parameters = dict()
    self.output_parameters = dict()

  def MapParameter(self, parameter, parameter_value=None):
    if not parameter_value:
      parameter_value = parameter
    self.parameters[parameter] = (parameter_value, 'r')
    return '%%[%s]' % parameter

  def MapMemoryParameter(self, parameter, parameter_value=None):
    if not parameter_value:
      parameter_value = parameter
    self.parameters[parameter] = (parameter_value, 'm')
    return '%%[%s]' % parameter

  def MapOutputParameter(self, parameter, parameter_value=None):
    if not parameter_value:
      parameter_value = parameter
    self.output_parameters[parameter] = (parameter_value, '+r')
    return '%%[%s]' % parameter

  def DoubleRegister(self, min_val=0):
    for i in range(min_val, 32):
      if i not in self.double:
        self.double.add(i)
        self.double_ever.add(i)
        return 'd%d' % i
    raise RegisterAllocationError('Not enough double registers.')

  def QuadRegister(self, min_val=0):
    for i in range(min_val, 16):
      if ((i * 2) not in self.double) and ((i * 2 + 1) not in self.double):
        self.double.add(i * 2)
        self.double.add(i * 2 + 1)
        self.double_ever.add(i * 2)
        self.double_ever.add(i * 2 + 1)
        return 'q%d' % i
    raise RegisterAllocationError('Not enough quad registers.')

  def GeneralRegister(self):
    for i in range(0, 16):
      if i not in self.general:
        self.general.add(i)
        self.general_ever.add(i)
        return 'r%d' % i
    raise RegisterAllocationError('Not enough general registers.')
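
  # Allocation bookkeeping, for illustration (editorial note): QuadRegister()
  # reserves both underlying double registers, so
  #   registers.QuadRegister()    # -> 'q0', marks d0 and d1 as used
  #   registers.DoubleRegister()  # -> 'd2'
  # GeneralRegister() hands out r0..r15 in order, and the *_ever sets feed
  # the clobber list even after registers are freed.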

  def MappedParameters(self):
    return [(k, v) for (k, v) in self.parameters.items()]

  def MappedOutputParameters(self):
    return [(k, v) for (k, v) in self.output_parameters.items()]

  def Clobbers(self):
    return (['r%d' % i for i in self.general_ever] +
            ['d%d' % i for i in self.DoubleClobbers()])

  def DoubleClobbers(self):
    return sorted(self.double_ever)

  def FreeRegister(self, register):
    assert len(register) > 1
    if register[0] not in ['r', 'd', 'q']:
      return

    num = int(register[1:])

    if register[0] == 'r':
      assert num in self.general
      self.general.remove(num)
    elif register[0] == 'd':
      assert num in self.double
      self.double.remove(num)
    elif register[0] == 'q':
      assert num * 2 in self.double
      assert num * 2 + 1 in self.double
      self.double.remove(num * 2)
      self.double.remove(num * 2 + 1)
    else:
      raise RegisterDeallocationError('Register not allocated: %s' % register)

  def FreeRegisters(self, registers):
    for register in registers:
      self.FreeRegister(register)


class NeonEmitter(object):
  """Emits ARM/NEON assembly opcodes."""

  def __init__(self, debug=False):
    self.ops = {}
    self.indent = ''
    self.debug = debug

  def PushIndent(self, delta='  '):
    self.indent += delta

  def PopIndent(self, delta=2):
    self.indent = self.indent[:-delta]

  def EmitIndented(self, what):
    print(self.indent + what)

  def PushOp(self, op):
    if op in self.ops:
      self.ops[op] += 1
    else:
      self.ops[op] = 1

  def ClearCounters(self):
    self.ops.clear()

  def EmitNewline(self):
    print('')

  def EmitPreprocessor1(self, op, param):
    print('#%s %s' % (op, param))

  def EmitPreprocessor(self, op):
    print('#%s' % op)

  def EmitInclude(self, include):
    self.EmitPreprocessor1('include', include)

  def EmitCall1(self, function, param):
    self.EmitIndented('%s(%s);' % (function, param))

  def EmitAssert(self, assert_expression):
    if self.debug:
      self.EmitCall1('assert', assert_expression)

  def EmitHeaderBegin(self, header_name, includes):
    self.EmitPreprocessor1('ifndef', (header_name + '_H_').upper())
    self.EmitPreprocessor1('define', (header_name + '_H_').upper())
    self.EmitNewline()
    if includes:
      for include in includes:
        self.EmitInclude(include)
      self.EmitNewline()

  def EmitHeaderEnd(self):
    self.EmitPreprocessor('endif')

  def EmitCode(self, code):
    self.EmitIndented('%s;' % code)

  def EmitFunctionBeginA(self, function_name, params, return_type):
    self.EmitIndented('%s %s(%s) {' %
                      (return_type, function_name,
                       ', '.join(['%s %s' % (t, n) for (t, n) in params])))
    self.PushIndent()

  def EmitFunctionEnd(self):
    self.PopIndent()
    self.EmitIndented('}')

  def EmitAsmBegin(self):
    self.EmitIndented('asm volatile(')
    self.PushIndent()

  def EmitAsmMapping(self, elements):
    if elements:
      self.EmitIndented(': ' + ', '.join(
          ['[%s] "%s"(%s)' % (d, v[1], v[0]) for (d, v) in elements]))
    else:
      self.EmitIndented(':')

  def EmitClobbers(self, elements):
    if elements:
      self.EmitIndented(': ' + ', '.join(['"%s"' % c for c in elements]))
    else:
      self.EmitIndented(':')

  def EmitAsmEnd(self, registers):
    self.EmitAsmMapping(registers.MappedOutputParameters())
    self.EmitAsmMapping(registers.MappedParameters())
    self.EmitClobbers(registers.Clobbers() + ['cc', 'memory'])
    self.PopIndent()
    self.EmitIndented(');')
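
  # Rough shape of the emitted inline assembly, for illustration (editorial
  # note; the operand names are hypothetical):
  #
  #   asm volatile(
  #     "vadd.s32 q0, q0, q0\n"
  #     : [destination] "+r"(destination)
  #     : [count] "r"(count)
  #     : "d0", "d1", "cc", "memory"
  #   );
  #
  # EmitAsmEnd() emits the output-operand list, the input-operand list and
  # the clobber list built from every register that was ever allocated.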

  def EmitComment(self, comment):
    self.EmitIndented('// ' + comment)

  def EmitNumericalLabel(self, label):
    self.EmitIndented('"%d:"' % label)

  def EmitOp1(self, op, param1):
    self.PushOp(op)
    self.EmitIndented('"%s %s\\n"' % (op, param1))

  def EmitOp2(self, op, param1, param2):
    self.PushOp(op)
    self.EmitIndented('"%s %s, %s\\n"' % (op, param1, param2))

  def EmitOp3(self, op, param1, param2, param3):
    self.PushOp(op)
    self.EmitIndented('"%s %s, %s, %s\\n"' % (op, param1, param2, param3))

  def EmitAdd(self, destination, source, param):
    self.EmitOp3('add', destination, source, param)

  def EmitSubs(self, destination, source, param):
    self.EmitOp3('subs', destination, source, param)

  def EmitSub(self, destination, source, param):
    self.EmitOp3('sub', destination, source, param)

  def EmitMul(self, destination, source, param):
    self.EmitOp3('mul', destination, source, param)

  def EmitMov(self, param1, param2):
    self.EmitOp2('mov', param1, param2)

  def EmitBeqBack(self, label):
    self.EmitOp1('beq', '%db' % label)

  def EmitBeqFront(self, label):
    self.EmitOp1('beq', '%df' % label)

  def EmitBgtBack(self, label):
    self.EmitOp1('bgt', '%db' % label)

  def EmitBgtFront(self, label):
    self.EmitOp1('bgt', '%df' % label)

  def EmitBleBack(self, label):
    self.EmitOp1('ble', '%db' % label)

  def EmitBleFront(self, label):
    self.EmitOp1('ble', '%df' % label)

  def EmitBneBack(self, label):
    self.EmitOp1('bne', '%db' % label)

  def EmitBneFront(self, label):
    self.EmitOp1('bne', '%df' % label)

  def EmitVAdd(self, add_type, destination, source_1, source_2):
    destination, source_1, source_2 = _MakeCompatible(destination, source_1,
                                                      source_2)
    self.EmitOp3('vadd.%s' % add_type, destination, source_1, source_2)

  def EmitVAddw(self, add_type, destination, source_1, source_2):
    self.EmitOp3('vaddw.%s' % add_type, destination, source_1, source_2)

  def EmitVSub(self, sub_type, destination, source_1, source_2):
    destination, source_1, source_2 = _MakeCompatible(destination, source_1,
                                                      source_2)
    self.EmitOp3('vsub.%s' % sub_type, destination, source_1, source_2)

  def EmitVCvt(self, cvt_to, cvt_from, destination, source):
    self.EmitOp2('vcvt.%s.%s' % (cvt_to, cvt_from), destination, source)

  def EmitVDup(self, dup_type, destination, source):
    self.EmitOp2('vdup.%s' % dup_type, destination, source)

  def EmitVMax(self, size, destination, source_1, source_2):
    self.EmitOp3('vmax.%s' % size, destination, source_1, source_2)

  def EmitVMin(self, size, destination, source_1, source_2):
    self.EmitOp3('vmin.%s' % size, destination, source_1, source_2)

  def EmitVMov(self, mov_type, destination, source):
    self.EmitOp2('vmov.%s' % mov_type, destination, source)

  def EmitVMovl(self, mov_type, destination, source):
    if source[0] == 'q':
      source = _Low(source)
    self.EmitOp2('vmovl.%s' % mov_type, destination, source)

  def EmitVMovl2(self, mov_type, destination_1, destination_2, source):
    self.EmitVMovl(mov_type, destination_2, _High(source))
    self.EmitVMovl(mov_type, destination_1, _Low(source))

  def EmitVQmovn(self, mov_type, destination, source):
    if destination[0] == 'q':
      destination = _Low(destination)
    self.EmitOp2('vqmovn.%s' % mov_type, destination, source)

  def EmitVQmovn2(self, mov_type, destination, source_1, source_2):
    self.EmitVQmovn(mov_type, _Low(destination), source_1)
    self.EmitVQmovn(mov_type, _High(destination), source_2)

  def EmitVQmovun(self, mov_type, destination, source):
    if destination[0] == 'q':
      destination = _Low(destination)
    self.EmitOp2('vqmovun.%s' % mov_type, destination, source)

  def EmitVQmovun2(self, mov_type, destination, source_1, source_2):
    self.EmitVQmovun(mov_type, _Low(destination), source_1)
    self.EmitVQmovun(mov_type, _High(destination), source_2)

  def EmitVMul(self, mul_type, destination, source_1, source_2):
    destination, source_1, source_2 = _MakeCompatible(destination, source_1,
                                                      source_2)
    self.EmitOp3('vmul.%s' % mul_type, destination, source_1, source_2)

  def EmitVMulScalar(self, mul_type, destination, source_1, source_2):
    self.EmitOp3('vmul.%s' % mul_type, destination, source_1, source_2)

  def EmitVMull(self, mul_type, destination, source_1, source_2):
    self.EmitOp3('vmull.%s' % mul_type, destination, source_1, source_2)

  def EmitVPadd(self, add_type, destination, source_1, source_2):
    self.EmitOp3('vpadd.%s' % add_type, destination, source_1, source_2)

  def EmitVPaddl(self, add_type, destination, source):
    self.EmitOp2('vpaddl.%s' % add_type, destination, source)

  def EmitVPadal(self, add_type, destination, source):
    self.EmitOp2('vpadal.%s' % add_type, destination, source)

  def EmitLdr(self, register, value):
    self.EmitOp2('ldr', register, value)

  def EmitVLoad(self, load_no, load_type, destination, source):
    self.EmitVLoadA(load_no, load_type, [destination], source)

  def EmitVLoadA(self, load_no, load_type, destinations, source):
    self.EmitOp2('vld%d.%d' % (load_no, load_type),
                 '{%s}' % ', '.join(_ExpandQuads(destinations)), source)

  def EmitVLoadAE(self,
                  load_type,
                  elem_count,
                  destinations,
                  source,
                  alignment=None):
    """Emit loads for elem_count elements of load_type bits each."""
    bits_to_load = load_type * elem_count
    destinations = _ExpandQuads(destinations)
    if len(destinations) * 64 < bits_to_load:
      raise ArgumentError('Too few destinations: %d to load %d bits.' %
                          (len(destinations), bits_to_load))

    while bits_to_load > 0:
      if bits_to_load >= 256:
        self.EmitVLoadA(1, 32, destinations[:4],
                        self.DereferenceIncrement(source, alignment))
        bits_to_load -= 256
        destinations = destinations[4:]
      elif bits_to_load >= 192:
        self.EmitVLoadA(1, 32, destinations[:3],
                        self.DereferenceIncrement(source, alignment))
        bits_to_load -= 192
        destinations = destinations[3:]
      elif bits_to_load >= 128:
        self.EmitVLoadA(1, 32, destinations[:2],
                        self.DereferenceIncrement(source, alignment))
        bits_to_load -= 128
        destinations = destinations[2:]
      elif bits_to_load >= 64:
        self.EmitVLoad(1, 32, destinations[0],
                       self.DereferenceIncrement(source, alignment))
        bits_to_load -= 64
        destinations = destinations[1:]
      else:
        destination = destinations[0]
        if bits_to_load == 56:
          self.EmitVLoad(1, 32,
                         self.Lane(32, destination, 0),
                         self.DereferenceIncrement(source))
          self.EmitVLoad(1, 16,
                         self.Lane(16, destination, 2),
                         self.DereferenceIncrement(source))
          self.EmitVLoad(1, 8,
                         self.Lane(8, destination, 6),
                         self.DereferenceIncrement(source))
        elif bits_to_load == 48:
          self.EmitVLoad(1, 32,
                         self.Lane(32, destination, 0),
                         self.DereferenceIncrement(source))
          self.EmitVLoad(1, 16,
                         self.Lane(16, destination, 2),
                         self.DereferenceIncrement(source))
        elif bits_to_load == 40:
          self.EmitVLoad(1, 32,
                         self.Lane(32, destination, 0),
                         self.DereferenceIncrement(source))
          self.EmitVLoad(1, 8,
                         self.Lane(8, destination, 4),
                         self.DereferenceIncrement(source))
        elif bits_to_load == 32:
          self.EmitVLoad(1, 32,
                         self.Lane(32, destination, 0),
                         self.DereferenceIncrement(source))
        elif bits_to_load == 24:
          self.EmitVLoad(1, 16,
                         self.Lane(16, destination, 0),
                         self.DereferenceIncrement(source))
          self.EmitVLoad(1, 8,
                         self.Lane(8, destination, 2),
                         self.DereferenceIncrement(source))
        elif bits_to_load == 16:
          self.EmitVLoad(1, 16,
                         self.Lane(16, destination, 0),
                         self.DereferenceIncrement(source))
        elif bits_to_load == 8:
          self.EmitVLoad(1, 8,
                         self.Lane(8, destination, 0),
                         self.DereferenceIncrement(source))
        else:
          raise ArgumentError('Wrong leftover: %d bits.' % bits_to_load)
        return
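
  # Decomposition example, for illustration (editorial note; 'd0' and 'r1'
  # are hypothetical operands): EmitVLoadAE(8, 7, ['d0'], 'r1') needs 56 bits,
  # so it emits one 32-bit, one 16-bit and one 8-bit lane load, each with a
  # post-incremented address:
  #
  #   "vld1.32 {d0[0]}, [r1]!\n"
  #   "vld1.16 {d0[2]}, [r1]!\n"
  #   "vld1.8 {d0[6]}, [r1]!\n"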

  def EmitVLoadE(self, load_type, count, destination, source, alignment=None):
    self.EmitVLoadAE(load_type, count, [destination], source, alignment)

  def EmitVLoadAllLanes(self, load_no, load_type, destination, source):
    destinations = []
    if destination[0] == 'q':
      destinations.append(self.AllLanes(_Low(destination)))
      destinations.append(self.AllLanes(_High(destination)))
    else:
      destinations.append(self.AllLanes(destination))
    self.EmitVLoadA(load_no, load_type, destinations, source)

  def EmitVLoadOffset(self, load_no, load_type, destination, source, offset):
    self.EmitVLoadOffsetA(load_no, load_type, [destination], source, offset)

  def EmitVLoadOffsetA(self, load_no, load_type, destinations, source, offset):
    assert len(destinations) <= 4
    self.EmitOp3('vld%d.%d' % (load_no, load_type),
                 '{%s}' % ', '.join(_ExpandQuads(destinations)), source, offset)

  def EmitPld(self, load_address_register):
    self.EmitOp1('pld', '[%s]' % load_address_register)

  def EmitPldw(self, store_address_register):
    self.EmitOp1('pldw', '[%s]' % store_address_register)

  def EmitPldOffset(self, load_address_register, offset):
    self.EmitOp1('pld', '[%s, %s]' % (load_address_register, offset))

  def EmitPldwOffset(self, store_address_register, offset):
    self.EmitOp1('pldw', '[%s, %s]' % (store_address_register, offset))

  def EmitVShl(self, shift_type, destination, source, shift):
    self.EmitOp3('vshl.%s' % shift_type, destination, source, shift)

  def EmitVStore(self, store_no, store_type, source, destination):
    self.EmitVStoreA(store_no, store_type, [source], destination)

  def EmitVStoreA(self, store_no, store_type, sources, destination):
    self.EmitOp2('vst%d.%d' % (store_no, store_type),
                 '{%s}' % ', '.join(_ExpandQuads(sources)), destination)

  def EmitVStoreAE(self,
                   store_type,
                   elem_count,
                   sources,
                   destination,
                   alignment=None):
    """Emit stores for elem_count elements of store_type bits each."""
    bits_to_store = store_type * elem_count
    sources = _ExpandQuads(sources)
    if len(sources) * 64 < bits_to_store:
      raise ArgumentError('Too few sources: %d to store %d bits.' %
                          (len(sources), bits_to_store))

    while bits_to_store > 0:
      if bits_to_store >= 256:
        self.EmitVStoreA(1, 32, sources[:4],
                         self.DereferenceIncrement(destination, alignment))
        bits_to_store -= 256
        sources = sources[4:]
      elif bits_to_store >= 192:
        self.EmitVStoreA(1, 32, sources[:3],
                         self.DereferenceIncrement(destination, alignment))
        bits_to_store -= 192
        sources = sources[3:]
      elif bits_to_store >= 128:
        self.EmitVStoreA(1, 32, sources[:2],
                         self.DereferenceIncrement(destination, alignment))
        bits_to_store -= 128
        sources = sources[2:]
      elif bits_to_store >= 64:
        self.EmitVStore(1, 32, sources[0],
                        self.DereferenceIncrement(destination, alignment))
        bits_to_store -= 64
        sources = sources[1:]
      else:
        source = sources[0]
        if bits_to_store == 56:
          self.EmitVStore(1, 32,
                          self.Lane(32, source, 0),
                          self.DereferenceIncrement(destination))
          self.EmitVStore(1, 16,
                          self.Lane(16, source, 2),
                          self.DereferenceIncrement(destination))
          self.EmitVStore(1, 8,
                          self.Lane(8, source, 6),
                          self.DereferenceIncrement(destination))
        elif bits_to_store == 48:
          self.EmitVStore(1, 32,
                          self.Lane(32, source, 0),
                          self.DereferenceIncrement(destination))
          self.EmitVStore(1, 16,
                          self.Lane(16, source, 2),
                          self.DereferenceIncrement(destination))
        elif bits_to_store == 40:
          self.EmitVStore(1, 32,
                          self.Lane(32, source, 0),
                          self.DereferenceIncrement(destination))
          self.EmitVStore(1, 8,
                          self.Lane(8, source, 4),
                          self.DereferenceIncrement(destination))
        elif bits_to_store == 32:
          self.EmitVStore(1, 32,
                          self.Lane(32, source, 0),
                          self.DereferenceIncrement(destination))
        elif bits_to_store == 24:
          self.EmitVStore(1, 16,
                          self.Lane(16, source, 0),
                          self.DereferenceIncrement(destination))
          self.EmitVStore(1, 8,
                          self.Lane(8, source, 2),
                          self.DereferenceIncrement(destination))
        elif bits_to_store == 16:
          self.EmitVStore(1, 16,
                          self.Lane(16, source, 0),
                          self.DereferenceIncrement(destination))
        elif bits_to_store == 8:
          self.EmitVStore(1, 8,
                          self.Lane(8, source, 0),
                          self.DereferenceIncrement(destination))
        else:
          raise ArgumentError('Wrong leftover: %d bits.' % bits_to_store)
        return

  def EmitVStoreE(self, store_type, count, source, destination, alignment=None):
    self.EmitVStoreAE(store_type, count, [source], destination, alignment)

  def EmitVStoreOffset(self, store_no, store_type, source, destination, offset):
    self.EmitVStoreOffsetA(store_no, store_type, [source], destination, offset)

  def EmitVStoreOffsetA(self, store_no, store_type, sources, destination,
                        offset):
    self.EmitOp3('vst%d.%d' % (store_no, store_type),
                 '{%s}' % ', '.join(_ExpandQuads(sources)), destination, offset)

  def EmitVStoreOffsetE(self, store_type, count, source, destination, offset):
    """Emit assembly to store a number of elements from the source registers."""
    if store_type != 32:
      raise ArgumentError('Unsupported store_type: %d' % store_type)

    sources = []
    if source[0] == 'q':
      sources.append(_Low(source))
      sources.append(_High(source))
      if count * store_type > 128:
        raise ArgumentError('Too many %d-bit elements in a q register: %d' %
                            (store_type, count))
    else:
      sources.append(source)
      if count * store_type > 64:
        raise ArgumentError('Too many %d-bit elements in a d register: %d' %
                            (store_type, count))

    if count == 1:
      self.EmitVStoreOffset(1, store_type,
                            self.Lane(store_type, sources[0], 0),
                            self.Dereference(destination, None), offset)
    elif count == 2:
      self.EmitVStoreOffset(1, store_type, sources[0],
                            self.Dereference(destination, None), offset)
    elif count == 3:
      self.EmitVStore(1, store_type, sources[0],
                      self.DereferenceIncrement(destination, None))
      self.EmitVStoreOffset(1, store_type,
                            self.Lane(store_type, sources[1], 0),
                            self.Dereference(destination, None), offset)
      self.EmitSub(destination, destination, self.ImmediateConstant(8))
    elif count == 4:
      self.EmitVStoreOffsetA(1, store_type, sources,
                             self.Dereference(destination, None), offset)
    else:
      raise ArgumentError('Too many elements: %d' % count)

  def EmitVSumReduce(self, reduce_type, elem_count, reduce_count, destinations,
                     sources):
    """Emit assembly for an n-fold horizontal sum reduction."""
    if reduce_type != 'u32':
      raise ArgumentError('Unsupported reduce: %s' % reduce_type)

    sources = _ExpandQuads(sources)

    destinations = _ExpandQuads(destinations)

    if len(destinations) * 2 < elem_count:
      raise ArgumentError('Not enough space in destination: %d vs %d' %
                          (len(destinations) * 2, elem_count))

    if len(sources) * 2 != elem_count * reduce_count:
      raise ArgumentError('Wrong number of sources: %d vs %d' %
                          (len(sources) * 2, elem_count * reduce_count))

    if reduce_count <= 1:
      raise ArgumentError('Unsupported reduce_count: %d' % reduce_count)

    while reduce_count > 1:
      if len(sources) % 2 == 1:
        sources.append(sources[-1])

      if reduce_count == 2:
        for i in range(len(sources) // 2):
          self.EmitVPadd(reduce_type, destinations[i], sources[2 * i],
                         sources[2 * i + 1])
        return
      else:
        sources_2 = []
        for i in range(len(sources) // 2):
          self.EmitVPadd(reduce_type, sources[2 * i], sources[2 * i],
                         sources[2 * i + 1])
          sources_2.append(sources[2 * i])
        reduce_count //= 2
        sources = sources_2
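
  # Reduction example, for illustration (editorial note): with reduce_type
  # 'u32', elem_count=2, reduce_count=4, destinations ['d4'] and sources
  # ['d0', 'd1', 'd2', 'd3'], the first pass pairwise-adds into d0 and d2 and
  # the second pass combines them into d4, so each destination lane ends up
  # holding a 4-fold sum:
  #
  #   "vpadd.u32 d0, d0, d1\n"
  #   "vpadd.u32 d2, d2, d3\n"
  #   "vpadd.u32 d4, d0, d2\n"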

  def EmitVUzp(self, uzp_type, operand_1, operand_2):
    self.EmitOp2('vuzp.%d' % uzp_type, operand_1, operand_2)

  def EmitVTrn(self, trn_type, operand_1, operand_2):
    self.EmitOp2('vtrn.%d' % trn_type, operand_1, operand_2)

  def EmitColBlockStride(self, cols, stride, new_stride):
    assert cols in [1, 2, 3, 4, 5, 6, 7, 8]
    if cols in [5, 6, 7]:
      self.EmitSub(new_stride, stride, self.ImmediateConstant(4))

  def EmitLoadColBlock(self, unused_registers, load_type, cols, elements, block,
                       input_address, stride):
    """Load a block of column major data."""
    assert cols == len(block)
    assert load_type == 8

    input_deref = self.Dereference(input_address, None)
    input_deref_increment = self.DereferenceIncrement(input_address, None)

    if cols == 1:
      for i in range(elements):
        self.EmitVLoadOffset(1, 8,
                             self.Lane(8, block[0], i), input_deref, stride)
      self.EmitPld(input_address)
    elif cols == 2:
      for i in range(elements):
        self.EmitVLoadOffset(1, 16,
                             self.Lane(16, block[i // 4], i % 4), input_deref,
                             stride)
      self.EmitPld(input_address)
      self.EmitVUzp(8, block[0], block[1])
    elif cols == 3:
      for i in range(elements):
        self.EmitVLoadOffsetA(3, 8, [self.Lane(8, row, i) for row in block],
                              input_deref, stride)
    elif cols == 4:
      for i in range(elements):
        self.EmitVLoadOffset(1, 32,
                             self.Lane(32, block[i % 4], i // 4), input_deref,
                             stride)
      self.EmitPld(input_address)
      self.EmitVTrn(16, block[0], block[2])
      self.EmitVTrn(16, block[1], block[3])
      self.EmitVTrn(8, block[0], block[1])
      self.EmitVTrn(8, block[2], block[3])
    elif cols == 5:
      for i in range(elements):
        self.EmitVLoad(1, 32,
                       self.Lane(32, block[i % 4], i // 4),
                       input_deref_increment)
        self.EmitVLoadOffset(1, 8,
                             self.Lane(8, block[4], i), input_deref, stride)
      self.EmitPld(input_address)
      self.EmitVTrn(16, block[0], block[2])
      self.EmitVTrn(16, block[1], block[3])
      self.EmitVTrn(8, block[0], block[1])
      self.EmitVTrn(8, block[2], block[3])
    elif cols == 6:
      for i in range(elements):
        self.EmitVLoad(1, 32,
                       self.Lane(32, block[i % 4], i // 4),
                       input_deref_increment)
        self.EmitVLoadOffset(1, 16,
                             self.Lane(16, block[4 + i // 4], i % 4),
                             input_deref, stride)
      self.EmitPld(input_address)
      self.EmitVTrn(16, block[0], block[2])
      self.EmitVTrn(16, block[1], block[3])
      self.EmitVUzp(8, block[4], block[5])
      self.EmitVTrn(8, block[0], block[1])
      self.EmitVTrn(8, block[2], block[3])
    elif cols == 7:
      for i in range(elements):
        self.EmitVLoad(1, 32,
                       self.Lane(32, block[i % 4], i // 4),
                       input_deref_increment)
        self.EmitVLoadOffsetA(3, 8,
                              [self.Lane(8, row, i) for row in block[4:]],
                              input_deref, stride)
      self.EmitPld(input_address)
      self.EmitVTrn(16, block[0], block[2])
      self.EmitVTrn(16, block[1], block[3])
      self.EmitVTrn(8, block[0], block[1])
      self.EmitVTrn(8, block[2], block[3])
    elif cols == 8:
      for i in range(elements):
        self.EmitVLoadOffset(1, 32, block[i], input_deref, stride)
      self.EmitPld(input_address)
      self.EmitVTrn(8, block[0], block[1])
      self.EmitVTrn(8, block[2], block[3])
      self.EmitVTrn(8, block[4], block[5])
      self.EmitVTrn(8, block[6], block[7])
      self.EmitVTrn(16, block[0], block[2])
      self.EmitVTrn(16, block[1], block[3])
      self.EmitVTrn(16, block[4], block[6])
      self.EmitVTrn(16, block[5], block[7])
      self.EmitVTrn(32, block[0], block[4])
      self.EmitVTrn(32, block[1], block[5])
      self.EmitVTrn(32, block[2], block[6])
      self.EmitVTrn(32, block[3], block[7])
    else:
      assert False
    return block

  def Dereference(self, value, alignment=None):
    if alignment:
      return '[%s:%d]' % (value, alignment)
    else:
      return '[%s]' % value

  def DereferenceIncrement(self, value, alignment=None):
    return '%s!' % self.Dereference(value, alignment)

  def ImmediateConstant(self, value):
    return '#%d' % value

  def AllLanes(self, value):
    return '%s[]' % value

  def Lane(self, bits, value, lane):
    """Get the proper n-bit lane from the given register."""
    registers = []
    if value[0] == 'q':
      registers.append(_Low(value))
      registers.append(_High(value))
    else:
      registers.append(value)

    elems_per_register = 64 // bits
    register = lane // elems_per_register
    lane %= elems_per_register

    return '%s[%d]' % (registers[register], lane)
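
  # Lane addressing, for illustration (editorial note): a q register is viewed
  # as its two d halves, so Lane(16, 'q1', 5) selects 16-bit lane 5 of q1,
  # which is lane 1 of the high half d3, and returns 'd3[1]'.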

  def CreateRegisters(self):
    return _NeonRegisters32Bit()