"""Mul primitive used by the GEMM function. The Mul primitive takes 1-3 zipped rows and 1-3 zipped columns and performs matrix multiplication on those resulting in a small 1x1 to 3x3 block of results. """ import neon_emitter class Error(Exception): """Module level error.""" class ConfigurationError(Error): """Unsupported configuration.""" class MulLanes(object): def __init__(self, input_address): self.input_address = input_address self.lanes = [] def AddLane(self, lane): self.lanes.append(lane) def FreeRegisters(self, registers): for i in range(0, len(self.lanes)): registers.FreeRegister(self.lanes[i]) self.lanes[i] = None def GenerateMulLanes(registers, lane_count, address): lanes = MulLanes(address) for unused_i in range(0, lane_count): lanes.AddLane(registers.DoubleRegister()) return lanes def Generate3MulLanes(quad_register, registers, address): lanes = MulLanes(address) lanes.AddLane(registers.Low(quad_register)) lanes.AddLane(registers.High(quad_register)) lanes.AddLane(registers.DoubleRegister()) return lanes def GenerateAndClearAggregators(emitter, registers, aggregator_count): """Prepare aggregators and emit aggregator clear code.""" emitter.EmitComment('Clear aggregators.') aggregators = [] for i in range(0, aggregator_count): aggregator = registers.QuadRegister() aggregators.append(aggregator) if i < 3: emitter.EmitVMov('i32', aggregator, emitter.ImmediateConstant(0)) else: emitter.EmitVMov('i32', aggregator, aggregators[i - 3]) emitter.EmitNewline() return aggregators def GenerateNxMLoadMultiplyAggregate(emitter, registers, left_lanes, right_lanes, aggregators, count): """Emit inner loop for N rows x M cols multiplication.""" emitter.EmitComment('General NxM lanes loop.') emitter.EmitNumericalLabel(1) emitter.EmitNewline() emitter.EmitComment('Subtract counter.') emitter.EmitSubs(count, count, emitter.ImmediateConstant(8)) emitter.EmitNewline() emitter.EmitVLoadA('1.8', left_lanes.lanes, emitter.DereferenceIncrement(left_lanes.input_address, 64)) emitter.EmitVLoadA( '1.8', right_lanes.lanes, emitter.DereferenceIncrement(right_lanes.input_address, 64)) emitter.EmitPldOffset(left_lanes.input_address, emitter.ImmediateConstant(64)) emitter.EmitPldOffset(right_lanes.input_address, emitter.ImmediateConstant(64)) rows = len(left_lanes.lanes) cols = len(right_lanes.lanes) multiply_results = [] for i in range(0, rows * cols): multiply_results.append(registers.QuadRegister()) for row in range(0, rows): for col in range(0, cols): index = row * cols + col emitter.EmitVMull('u8', multiply_results[index], right_lanes.lanes[col], left_lanes.lanes[row]) for i in range(0, rows * cols): emitter.EmitVPadal('u16', aggregators[i], multiply_results[i]) emitter.EmitNewline() emitter.EmitComment('Loop break.') emitter.EmitBneBack(1) emitter.EmitNewline() for register in multiply_results: registers.FreeRegister(register) def Generate3x3LoadMultiplyAggregate(emitter, registers, left_lanes, right_lanes, aggregators, count, backup_register): """Emit inner loop for 3 rows x 3 cols multiplication (register trick).""" emitter.EmitComment('3x3 lanes loop.') emitter.EmitNumericalLabel(1) emitter.EmitNewline() emitter.EmitComment('Subtract counter.') emitter.EmitSubs(count, count, emitter.ImmediateConstant(8)) emitter.EmitNewline() emitter.EmitVLoadA('1.8', left_lanes.lanes, emitter.DereferenceIncrement(left_lanes.input_address, 64)) emitter.EmitVLoadA( '1.8', right_lanes.lanes, emitter.DereferenceIncrement(right_lanes.input_address, 64)) emitter.EmitPldOffset(left_lanes.input_address, emitter.ImmediateConstant(64)) emitter.EmitPldOffset(right_lanes.input_address, emitter.ImmediateConstant(64)) temp = [] for unused_i in range(0, 4): temp.append(registers.QuadRegister()) emitter.EmitVMull('u8', temp[0], left_lanes.lanes[0], right_lanes.lanes[0]) emitter.EmitVMull('u8', temp[1], left_lanes.lanes[0], right_lanes.lanes[1]) emitter.EmitVMull('u8', temp[2], left_lanes.lanes[0], right_lanes.lanes[2]) emitter.EmitVMull('u8', temp[3], left_lanes.lanes[1], right_lanes.lanes[0]) emitter.EmitVPadal('u16', aggregators[0], temp[0]) emitter.EmitVPadal('u16', aggregators[1], temp[1]) emitter.EmitVPadal('u16', aggregators[2], temp[2]) emitter.EmitVPadal('u16', aggregators[3], temp[3]) emitter.EmitVMull('u8', temp[0], left_lanes.lanes[1], right_lanes.lanes[1]) emitter.EmitVMull('u8', temp[1], left_lanes.lanes[1], right_lanes.lanes[2]) emitter.EmitVMull('u8', temp[2], left_lanes.lanes[2], right_lanes.lanes[0]) emitter.EmitVMull('u8', temp[3], left_lanes.lanes[2], right_lanes.lanes[1]) emitter.EmitVMull('u8', backup_register, left_lanes.lanes[2], right_lanes.lanes[2]) emitter.EmitVPadal('u16', aggregators[4], temp[0]) emitter.EmitVPadal('u16', aggregators[5], temp[1]) emitter.EmitVPadal('u16', aggregators[6], temp[2]) emitter.EmitVPadal('u16', aggregators[7], temp[3]) emitter.EmitVPadal('u16', aggregators[8], backup_register) emitter.EmitNewline() emitter.EmitComment('Loop break.') emitter.EmitBneBack(1) emitter.EmitNewline() for register in temp: registers.FreeRegister(register) def ReadParams(emitter, registers, input_address, elements, min_reg): if elements == 1 or elements == 2: register = registers.DoubleRegister(min_reg * 2) emitter.EmitVLoad('1.32', register, emitter.Dereference(input_address, 64)) return register elif elements == 3 or elements == 4: register = registers.QuadRegister(min_reg) emitter.EmitVLoad('1.32', register, emitter.Dereference(input_address, 64)) return register else: raise ConfigurationError('Unsupported elements no: %d' % elements) def Duplicate(emitter, registers, rows, cols, min_register, values): """Populate a grid of registers duplicating provided values.""" duplicated = [] if cols == 1 or cols == 2: for unused_i in range(0, rows): duplicated.append(registers.DoubleRegister(min_register)) elif cols == 3 or cols == 4: for unused_i in range(0, rows): duplicated.append(registers.QuadRegister(min_register)) else: raise ConfigurationError('Unsupported duplicate amount: %d' % cols) if rows == 1: emitter.EmitVDup('32', duplicated[0], emitter.Lane(values, 0)) elif rows == 2: emitter.EmitVDup('32', duplicated[0], emitter.Lane(values, 0)) emitter.EmitVDup('32', duplicated[1], emitter.Lane(values, 1)) elif rows == 3: emitter.EmitVDup('32', duplicated[0], emitter.Lane( registers.Low(values), 0)) emitter.EmitVDup('32', duplicated[1], emitter.Lane( registers.Low(values), 1)) emitter.EmitVDup('32', duplicated[2], emitter.Lane( registers.High(values), 0)) elif rows == 4: emitter.EmitVDup('32', duplicated[0], emitter.Lane( registers.Low(values), 0)) emitter.EmitVDup('32', duplicated[1], emitter.Lane( registers.Low(values), 1)) emitter.EmitVDup('32', duplicated[2], emitter.Lane( registers.High(values), 0)) emitter.EmitVDup('32', duplicated[3], emitter.Lane( registers.High(values), 1)) return duplicated def DuplicateGeneralRegister(emitter, registers, cols, general_register, min_register): if cols == 1 or cols == 2: duplicated = registers.DoubleRegister(min_register) elif cols == 3 or cols == 4: duplicated = registers.QuadRegister(min_register) else: raise ConfigurationError('Unsupported duplicate amount: %d' % cols) emitter.EmitVDup('32', duplicated, general_register) return duplicated def ReduceAggregator(emitter, registers, aggregators, row, cols): if cols == 1: register = registers.Low(aggregators[row]) emitter.EmitVPadd('u32', register, register, register) return register elif cols == 2: register = registers.Low(aggregators[row * 2]) emitter.EmitVPadd('u32', register, register, registers.Low(aggregators[row * 2 + 1])) return register elif cols == 3: register = aggregators[row * 3] emitter.EmitVPadd('u32', registers.Low(register), registers.Low(register), registers.Low(aggregators[row * 3 + 1])) emitter.EmitVPadd('u32', registers.High(register), registers.Low(aggregators[row * 3 + 2]), registers.Low(aggregators[row * 3 + 2])) return register elif cols == 4: register = aggregators[row * 3] emitter.EmitVPadd('u32', registers.Low(register), registers.Low(register), registers.Low(aggregators[row * 3 + 1])) emitter.EmitVPadd('u32', registers.High(register), registers.Low(aggregators[row * 3 + 2]), registers.Low(aggregators[row * 3 + 3])) return register else: raise ConfigurationError('Unsupported columns no: %d' % cols) def StoreAggregator(emitter, registers, aggregator, cols, result_address, result_stride): if cols == 1: emitter.EmitVStoreOffset('1.32', emitter.Lane(aggregator, 0), emitter.Dereference(result_address, None), result_stride) elif cols == 2: emitter.EmitVStoreOffset('1.32', aggregator, emitter.Dereference(result_address, None), result_stride) elif cols == 3: emitter.EmitVStore('1.32', registers.Low(aggregator), emitter.DereferenceIncrement(result_address, None)) emitter.EmitVStoreOffset('1.32', emitter.Lane( registers.High(aggregator), 0), emitter.Dereference(result_address, None), result_stride) emitter.EmitNewline() elif cols == 4: emitter.EmitVStoreOffsetA( '1.32', [registers.Low(aggregator), registers.High(aggregator)], emitter.Dereference(result_address, None), result_stride) else: raise ConfigurationError('Unsupported columns no: %d' % cols) def GenerateAggregatorReduceStore(emitter, registers, aggregators, result_type, lhs_add, rhs_add, left_lanes, right_lanes, results, results_stride): """Emit code that reduces 4 lane aggregators to 1 value, and stores them.""" rows = len(left_lanes.lanes) cols = len(right_lanes.lanes) if lhs_add: left_offset = ReadParams(emitter, registers, left_lanes.input_address, rows, 4) left_offsets = Duplicate(emitter, registers, rows, cols, 4, left_offset) else: left_offsets = None if rhs_add: right_offset = ReadParams(emitter, registers, right_lanes.input_address, cols, 4) else: right_offset = None if result_type is 'float': result_scale = DuplicateGeneralRegister( emitter, registers, cols, registers.MapParameter('result_scale'), 4) else: result_scale = None if cols == 3: emitter.EmitNewline() emitter.EmitComment('Change stride because storing in two ops.') emitter.EmitSub(results_stride, results_stride, emitter.ImmediateConstant(8)) emitter.EmitNewline() emitter.EmitComment('Horizontal reduce aggregators.') for aggregator in aggregators: emitter.EmitVPadd('u32', registers.Low(aggregator), registers.Low(aggregator), registers.High(aggregator)) emitter.EmitNewline() emitter.EmitComment('Reduce rows.') row_temps = [] for i in range(0, rows): row_temps.append(ReduceAggregator(emitter, registers, aggregators, i, cols)) if lhs_add: emitter.EmitNewline() emitter.EmitComment('Add lhs offsets to aggregated rows.') for (row_temp, left_offset) in zip(row_temps, left_offsets): emitter.EmitVAdd('s32', row_temp, row_temp, left_offset) if rhs_add: emitter.EmitNewline() emitter.EmitComment('Add rhs offset to aggregated rows.') for row_temp in row_temps: emitter.EmitVAdd('s32', row_temp, row_temp, right_offset) if result_type is 'float': emitter.EmitNewline() emitter.EmitComment('Convert to float. Multiply by result scale.') for row_temp in row_temps: emitter.EmitVCvt('f32', 's32', row_temp, row_temp) for row_temp in row_temps: emitter.EmitVMul('f32', row_temp, row_temp, result_scale) emitter.EmitNewline() emitter.EmitComment('Store reduced rows.') for row_temp in row_temps: StoreAggregator(emitter, registers, row_temp, cols, results, results_stride) def BuildName(result_type, lhs_add, rhs_add, left, right): name = 'mul_%dx8_%dx8_%s' % (left, right, result_type) if lhs_add: name += '_lhsadd' if rhs_add: name += '_rhsadd' return name def CppResultType(result_type): if result_type is 'int32': return 'std::int32_t*' elif result_type is 'float': return 'float*' else: raise ConfigurationError('Unsupported result type: %s' % result_type) def GetParameters(result_type): params = [['const std::uint8_t*', 'lhs'], ['const std::uint8_t*', 'rhs'], ['std::int32_t', 'count'], [CppResultType(result_type), 'result'], ['std::int32_t', 'result_stride']] if result_type is 'float': params.append(['float', 'result_scale']) return params def GenerateMulNx8Mx8(emitter, result_type, lhs_add, rhs_add, left_lanes_count, right_lanes_count): """Emit the multiply code for given rows and cols counts.""" if left_lanes_count < 1 or left_lanes_count > 4: raise ConfigurationError('Left_lanes should be: 1, 2, 3 or 4.') if right_lanes_count < 1 or right_lanes_count > 4: raise ConfigurationError('Right_lanes should be: 1, 2, 3 or 4.') emitter.EmitFunctionBeginA( BuildName(result_type, lhs_add, rhs_add, left_lanes_count, right_lanes_count), GetParameters(result_type), 'inline void') emitter.EmitAssert('count % 8 == 0') emitter.EmitAssert('count >= 8') emitter.EmitAsmBegin() registers = neon_emitter.NeonRegisters() count = registers.MapParameter('count') size = left_lanes_count * right_lanes_count lhs = registers.MapParameter('lhs') rhs = registers.MapParameter('rhs') emitter.EmitPld(lhs) emitter.EmitPld(rhs) aggregators = GenerateAndClearAggregators(emitter, registers, size) if size < 9: left_lanes = GenerateMulLanes(registers, left_lanes_count, lhs) right_lanes = GenerateMulLanes(registers, right_lanes_count, rhs) GenerateNxMLoadMultiplyAggregate(emitter, registers, left_lanes, right_lanes, aggregators, count) else: # left == 3 and right == 3 backup_register = registers.QuadRegister() left_lanes = Generate3MulLanes(backup_register, registers, lhs) right_lanes = GenerateMulLanes(registers, right_lanes_count, rhs) Generate3x3LoadMultiplyAggregate(emitter, registers, left_lanes, right_lanes, aggregators, count, backup_register) left_lanes.FreeRegisters(registers) right_lanes.FreeRegisters(registers) GenerateAggregatorReduceStore(emitter, registers, aggregators, result_type, lhs_add, rhs_add, left_lanes, right_lanes, registers.MapParameter('result'), registers.MapParameter('result_stride')) emitter.EmitAsmEnd(registers.MappedParameters(), [], registers.Clobbers() + ['cc', 'memory']) emitter.EmitFunctionEnd() def GenerateFunctions(emitter, result_type, lhs_add, rhs_add): for left_lanes in range(1, 4): for right_lanes in range(1, 4): GenerateMulNx8Mx8(emitter, result_type, lhs_add, rhs_add, left_lanes, right_lanes) emitter.EmitNewline() GenerateMulNx8Mx8(emitter, result_type, lhs_add, rhs_add, 1, 4) emitter.EmitNewline() if __name__ == '__main__': GenerateFunctions(neon_emitter.NeonEmitter(), 'int32', True, True)