Lines Matching refs:emitter
27 def GenerateZipLanes(emitter, registers, zip_lanes, input_address, stride):
31 emitter: ARM/NEON emitter.
50 emitter.EmitAdd(address_register, last_address_register, stride)
64 def GenerateClearAggregators(emitter, lanes):
66 emitter.EmitVMov('i16', lane.aggregator, emitter.ImmediateConstant(0))
69 def GenerateLoadAggregateStore(emitter, lanes, output_address, alignment):
71 emitter.EmitNewline()
72 emitter.EmitComment('Load Aggregate Store.')
75 emitter.EmitVLoad(
77 emitter.DereferenceIncrement(lane.input_address, alignment))
81 emitter.EmitVAddw('u8', lane.aggregator, lane.aggregator, lane.load)
84 emitter.EmitVStoreA('1.8', store_registers,
85 emitter.DereferenceIncrement(output_address, 64))
88 def GenerateLeftoverLoadAggregateStore(emitter, leftovers, lanes,
91 emitter.EmitNewline()
92 emitter.EmitComment('Leftover Load Aggregate Store.')
96 emitter.EmitVMov('i8', lane.load, emitter.ImmediateConstant(0))
101 emitter.EmitVLoad('1.8', emitter.Lane(lane.load, 0),
102 emitter.Dereference(lane.input_address, None))
106 emitter.EmitVLoad('1.16', emitter.Lane(lane.load, 0),
107 emitter.Dereference(lane.input_address, None))
111 emitter.EmitVLoad('1.16', emitter.Lane(lane.load, 0),
112 emitter.DereferenceIncrement(lane.input_address, None))
115 emitter.EmitVLoad('1.8', emitter.Lane(lane.load, 2),
116 emitter.Dereference(lane.input_address, None))
120 emitter.EmitVLoad('1.32', emitter.Lane(lane.load, 0),
121 emitter.Dereference(lane.input_address, None))
125 emitter.EmitVLoad('1.32', emitter.Lane(lane.load, 0),
126 emitter.DereferenceIncrement(lane.input_address, None))
129 emitter.EmitVLoad('1.8', emitter.Lane(lane.load, 4),
130 emitter.Dereference(lane.input_address, None))
134 emitter.EmitVLoad('1.32', emitter.Lane(lane.load, 0),
135 emitter.DereferenceIncrement(lane.input_address, None))
138 emitter.EmitVLoad('1.16', emitter.Lane(lane.load, 2),
139 emitter.Dereference(lane.input_address, None))
143 emitter.EmitVLoad('1.32', emitter.Lane(lane.load, 0),
144 emitter.DereferenceIncrement(lane.input_address, None))
147 emitter.EmitVLoad('1.16', emitter.Lane(lane.load, 2),
148 emitter.DereferenceIncrement(lane.input_address, None))
151 emitter.EmitVLoad('1.8', emitter.Lane(lane.load, 6),
152 emitter.Dereference(lane.input_address, None))
159 emitter.EmitVAddw('u8', lane.aggregator, lane.aggregator, lane.load)
163 emitter.EmitVStoreA('1.8', store_registers,
164 emitter.DereferenceIncrement(output_address, 64))
167 def GenerateAggregatorReduction(emitter, registers, lanes, output_address,
170 emitter.EmitNewline()
171 emitter.EmitComment('Aggregator Reduction.')
174 emitter.EmitVMov('32', emitter.Lane(multiplier, 0), multiplicative_offset)
176 emitter.EmitVDup('32', offset, additive_offset)
180 emitter.EmitVPaddl('u16', lane.aggregator, lane.aggregator)
185 emitter.EmitVPadd('u32', lane_temp, registers.Low(lane.aggregator),
193 emitter.EmitVPadd('u32', low, lane_temps[0], lane_temps[0])
195 emitter.EmitVPadd('u32', low, lane_temps[0], lane_temps[1])
197 emitter.EmitVPadd('u32', low, lane_temps[0], lane_temps[1])
198 emitter.EmitVPadd('u32', high, lane_temps[2], lane_temps[2])
200 emitter.EmitVPadd('u32', low, lane_temps[0], lane_temps[1])
201 emitter.EmitVPadd('u32', high, lane_temps[2], lane_temps[3])
206 emitter.EmitVMul('i32', temp, temp, emitter.Lane(multiplier, 0))
207 emitter.EmitVAdd('i32', temp, temp, offset)
210 emitter.EmitVStore('1.32', emitter.Lane(low, 0),
211 emitter.Dereference(output_address, None))
213 emitter.EmitVStore('1.32', low, emitter.Dereference(output_address, 64))
215 emitter.EmitVStore('1.32', low,
216 emitter.DereferenceIncrement(output_address, 64))
217 emitter.EmitVStore('1.32', emitter.Lane(high, 0),
218 emitter.Dereference(output_address, None))
220 emitter.EmitVStoreA('1.32', [low, high],
221 emitter.DereferenceIncrement(output_address, 64))
224 def GenerateZipNx8(emitter, zip_lanes, leftovers, aligned):
233 emitter.EmitFunctionBeginA(
238 emitter.EmitAssert('count %% 8 == %d' % leftovers)
239 emitter.EmitAssert('count <= 2048')
240 emitter.EmitAssert('count >= 8')
241 emitter.EmitAssert('reinterpret_cast<std::uintptr_t>(destination) % 8 == 0')
243 emitter.EmitAssert('reinterpret_cast<std::uintptr_t>(source) % 8 == 0')
245 emitter.EmitAssert('stride % 8 == 0')
246 emitter.EmitAsmBegin()
253 lanes = GenerateZipLanes(emitter, registers, zip_lanes,
258 emitter.EmitSub(count, count, emitter.ImmediateConstant(leftovers))
260 GenerateClearAggregators(emitter, lanes)
262 emitter.EmitNewline()
263 emitter.EmitNumericalLabel(1)
264 emitter.EmitSubs(count, count, emitter.ImmediateConstant(8))
266 GenerateLoadAggregateStore(emitter, lanes, output_address, 64 if aligned else
269 emitter.EmitNewline()
270 emitter.EmitBneBack(1)
273 GenerateLeftoverLoadAggregateStore(emitter, leftovers, lanes,
276 GenerateAggregatorReduction(emitter, registers, lanes, output_address,
280 emitter.EmitAsmEnd(registers.MappedParameters(), [],
282 emitter.EmitFunctionEnd()
285 def GenerateFunctions(emitter):
289 GenerateZipNx8(emitter, lanes, leftovers, aligned)
290 emitter.EmitNewline()