# Copyright 2016 The Gemmlowp Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""."""
import common
def _DuplicateGeneralRegister(size, emitter, registers, value, min_register):
  """Broadcast `value` into a freshly allocated quad register.

  Args:
    size: element size handed straight to emitter.EmitVDup.
    emitter: assembly emitter; only EmitVDup is used here.
    registers: register allocator; only QuadRegister is used here.
    value: source operand to duplicate.
    min_register: forwarded unchanged to registers.QuadRegister (presumably
      the lowest register index the allocator may pick -- confirm against
      the allocator implementation).

  Returns:
    The quad register that EmitVDup was asked to fill.
  """
  destination = registers.QuadRegister(min_register)
  emitter.EmitVDup(size, destination, value)
  return destination
def _DuplicateGeneralMemoryRegister(size, emitter, registers, value,
                                    min_register):
  """Load `value` through a scratch register and broadcast it.

  The quad destination is allocated first, then a general purpose scratch
  register. `value` is loaded into the scratch register with EmitLdr, the
  scratch register is duplicated into the quad register with EmitVDup, and
  the scratch register is released again.

  Args:
    size: element size handed straight to emitter.EmitVDup.
    emitter: assembly emitter; EmitLdr and EmitVDup are used.
    registers: register allocator; QuadRegister, GeneralRegister and
      FreeRegister are used.
    value: operand passed to EmitLdr as the load source.
    min_register: forwarded unchanged to registers.QuadRegister.

  Returns:
    The quad register that EmitVDup was asked to fill.
  """
  destination = registers.QuadRegister(min_register)
  scratch = registers.GeneralRegister()
  emitter.EmitLdr(scratch, value)
  emitter.EmitVDup(size, destination, scratch)
  # The scratch register is only needed for the load + dup pair.
  registers.FreeRegister(scratch)
  return destination
class MinMaxTransformation(object):
  """Transformation that bounds uint8 data by the 'min' and 'max' parameters.

  Prepare() broadcasts both parameters into vector registers; Transform()
  emits EmitVMax against the broadcast 'min' followed by EmitVMin against
  the broadcast 'max' for every loaded register.
  """

  def Check(self, in_type, out_type, kernel_size, leftovers):
    """Assert that the requested specialization is supported.

    Args:
      in_type: name of the input element type; must equal 'uint8_t'.
      out_type: name of the output element type; must equal 'uint8_t'.
      kernel_size: must equal 16.
      leftovers: must be smaller than 16.

    Raises:
      AssertionError: if any constraint above is violated.
    """
    # Compare by value: identity tests against str/int literals only pass
    # because of interpreter interning and warn on Python 3.8+.
    assert in_type == 'uint8_t'
    assert out_type == 'uint8_t'
    assert kernel_size == 16
    assert leftovers < 16

  def Prepare(self, emitter, registers, unused_kernel_size):
    """Broadcast the 'min' and 'max' parameters into quad registers.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator used to map parameters and allocate.
      unused_kernel_size: ignored.
    """
    emitter.EmitNewline()
    emitter.EmitComment('MinMax::Prepare')

    self.min = _DuplicateGeneralRegister(
        8, emitter, registers, registers.MapParameter('min', 'params.min'), 4)
    self.max = _DuplicateGeneralRegister(
        8, emitter, registers, registers.MapParameter('max', 'params.max'), 4)

  def Transform(self, emitter, registers, input_address, elements,
                output_address):
    """Generate the MinMax transform inner loop code.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator; the registers taken here are freed
        again before returning.
      input_address: operand the 8-bit elements are loaded from.
      elements: number of elements to process (one quad register is
        allocated per started group of 16).
      output_address: operand the results are stored to.
    """
    emitter.EmitNewline()
    emitter.EmitComment('MinMax::Transform')

    # Floor division keeps the count an int on Python 3 too; range() rejects
    # the float that true division would produce.
    register_count = (elements + 15) // 16
    load = [registers.QuadRegister() for unused_i in range(register_count)]
    emitter.EmitVLoadAE(8, elements, load, input_address, None)
    emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(16))

    for register in load:
      emitter.EmitVMax('u8', register, register, self.min)

    for register in load:
      emitter.EmitVMin('u8', register, register, self.max)

    emitter.EmitNewline()
    emitter.EmitVStoreAE(8, elements, load, output_address, None)
    emitter.EmitPld(output_address)
    registers.FreeRegisters(load)
class DequantizeTransformation(object):
  """Transformation that turns uint8 input into float output.

  For every element the emitted code evaluates, in this order,
  (value - range_offset) * range_scale + range_min in f32.
  """

  def Check(self, in_type, out_type, kernel_size, leftovers):
    """Assert that the requested specialization is supported.

    Args:
      in_type: name of the input element type; must equal 'uint8_t'.
      out_type: name of the output element type; must equal 'float'.
      kernel_size: must equal 16.
      leftovers: must be smaller than 16.

    Raises:
      AssertionError: if any constraint above is violated.
    """
    # Compare by value: identity tests against str/int literals only pass
    # because of interpreter interning and warn on Python 3.8+.
    assert in_type == 'uint8_t'
    assert out_type == 'float'
    assert kernel_size == 16
    assert leftovers < 16

  def Prepare(self, emitter, registers, unused_kernel_size):
    """Duplicate quantization offsets to vector registers.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator used to map parameters and allocate.
      unused_kernel_size: ignored.
    """
    emitter.EmitNewline()
    emitter.EmitComment('Dequantize::Prepare')

    self.range_min = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('range_min', 'params.range_min'), 4)
    self.range_offset = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('range_offset', 'params.range_offset'), 4)
    self.range_scale = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('range_scale', 'params.range_scale'), 4)

  def Transform(self, emitter, registers, input_address, elements,
                output_address):
    """Emit the dequantization inner loop.

    Loads 8-bit elements, widens them (u8 step, then s16 step) across the
    allocated registers, converts s32 to f32, applies the range parameters
    and stores 32-bit results.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator; the registers taken here are freed
        again before returning.
      input_address: operand the 8-bit elements are loaded from.
      elements: number of elements to process; one quad register is
        allocated per started group of 4, and 1 to 4 registers are supported.
      output_address: operand the 32-bit results are stored to.

    Raises:
      AssertionError: if the register count is outside 1..4.
    """
    emitter.EmitNewline()
    emitter.EmitComment('Dequantize::Transform')

    # Floor division keeps the count an int on Python 3 too; range() rejects
    # the float that true division would produce.
    register_count = (elements + 3) // 4
    load = [registers.QuadRegister() for unused_i in range(register_count)]
    emitter.EmitVLoadAE(8, elements, load, input_address, None)
    emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(32))

    count = len(load)
    if count == 1:
      emitter.EmitVMovl('u8', load[0], load[0])
      emitter.EmitVMovl('s16', load[0], load[0])
    elif count == 2:
      emitter.EmitVMovl('u8', load[0], load[0])
      emitter.EmitVMovl2('s16', load[0], load[1], load[0])
    elif count == 3:
      emitter.EmitVMovl2('u8', load[0], load[1], load[0])
      emitter.EmitVMovl('s16', load[2], load[1])
      emitter.EmitVMovl2('s16', load[0], load[1], load[0])
    elif count == 4:
      emitter.EmitVMovl2('u8', load[0], load[1], load[0])
      emitter.EmitVMovl2('s16', load[2], load[3], load[1])
      emitter.EmitVMovl2('s16', load[0], load[1], load[0])
    else:
      # Explicit raise so the guard is not stripped under `python -O`.
      raise AssertionError('Unsupported register count: %d' % count)

    for register in load:
      emitter.EmitVCvt('f32', 's32', register, register)

    for register in load:
      emitter.EmitVSub('f32', register, register, self.range_offset)

    for register in load:
      emitter.EmitVMul('f32', register, register, self.range_scale)

    for register in load:
      emitter.EmitVAdd('f32', register, register, self.range_min)

    emitter.EmitNewline()
    emitter.EmitVStoreAE(32, elements, load, output_address, None)
    emitter.EmitPld(output_address)
    registers.FreeRegisters(load)
class QuantizeTransformation(object):
  """Transformation that turns float input into uint8 output.

  For every element the emitted code evaluates, in this order,
  (value - range_min) * range_scale + range_offset in f32, converts the
  result to s32 and narrows it down to 8 bits.
  """

  def Check(self, in_type, out_type, kernel_size, leftovers):
    """Assert that the requested specialization is supported.

    Args:
      in_type: name of the input element type; must equal 'float'.
      out_type: name of the output element type; must equal 'uint8_t'.
      kernel_size: must equal 16.
      leftovers: must be smaller than 16.

    Raises:
      AssertionError: if any constraint above is violated.
    """
    # Compare by value: identity tests against str/int literals only pass
    # because of interpreter interning and warn on Python 3.8+.
    assert in_type == 'float'
    assert out_type == 'uint8_t'
    assert kernel_size == 16
    assert leftovers < 16

  def Prepare(self, emitter, registers, unused_kernel_size):
    """Duplicate quantization offsets to vector registers.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator used to map parameters and allocate.
      unused_kernel_size: ignored.
    """
    emitter.EmitNewline()
    emitter.EmitComment('Quantize::Prepare')

    self.range_min = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('range_min', 'params.range_min'), 4)
    self.range_offset = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('range_offset', 'params.range_offset'), 4)
    self.range_scale = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('range_scale', 'params.range_scale'), 4)

  def Transform(self, emitter, registers, input_address, elements,
                output_address):
    """Emit quantization inner loop code.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator; the registers taken here are freed
        again before returning.
      input_address: operand the 32-bit elements are loaded from.
      elements: number of elements to process; one quad register is
        allocated per started group of 4, and 1 to 4 registers are supported.
      output_address: operand the 8-bit results are stored to.

    Raises:
      AssertionError: if the register count is outside 1..4.
    """
    emitter.EmitNewline()
    emitter.EmitComment('Quantize::Transform')

    # Floor division keeps the count an int on Python 3 too; range() rejects
    # the float that true division would produce.
    register_count = (elements + 3) // 4
    load = [registers.QuadRegister() for unused_i in range(register_count)]
    emitter.EmitVLoadAE(32, elements, load, input_address, None)
    emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(64))

    for register in load:
      emitter.EmitVSub('f32', register, register, self.range_min)

    for register in load:
      emitter.EmitVMul('f32', register, register, self.range_scale)

    for register in load:
      emitter.EmitVAdd('f32', register, register, self.range_offset)

    for register in load:
      emitter.EmitVCvt('s32', 'f32', register, register)

    # Narrow s32 -> s16 -> 8 bit, packing everything into load[0].
    count = len(load)
    if count == 1:
      emitter.EmitVQmovn('s32', load[0], load[0])
      emitter.EmitVQmovun('s16', load[0], load[0])
    elif count == 2:
      emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
      emitter.EmitVQmovun('s16', load[0], load[0])
    elif count == 3:
      emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
      emitter.EmitVQmovn('s32', load[2], load[2])
      emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
    elif count == 4:
      emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
      emitter.EmitVQmovn2('s32', load[2], load[2], load[3])
      emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
    else:
      # Explicit raise so the guard is not stripped under `python -O`.
      raise AssertionError('Unsupported register count: %d' % count)

    emitter.EmitNewline()
    emitter.EmitVStoreAE(8, elements, load, output_address, None)
    emitter.EmitPld(output_address)
    registers.FreeRegisters(load)
class RequantizeTransformation(object):
  """Transformation that maps int32 input onto uint8 output.

  Prepare() broadcasts the range parameters and pre-computes
  range_min_delta = input_range_min - output_range_min. For every element
  Transform() then evaluates, in f32 and in this order,
  ((value - input_range_offset) * input_range_scale + range_min_delta)
  * one_over_output_range_scale, converts to s32 and narrows to 8 bits.
  """

  def Check(self, in_type, out_type, kernel_size, leftovers):
    """Assert that the requested specialization is supported.

    Args:
      in_type: name of the input element type; must equal 'int32_t'.
      out_type: name of the output element type; must equal 'uint8_t'.
      kernel_size: must equal 16.
      leftovers: must be smaller than 16.

    Raises:
      AssertionError: if any constraint above is violated.
    """
    # Compare by value: identity tests against str/int literals only pass
    # because of interpreter interning and warn on Python 3.8+.
    assert in_type == 'int32_t'
    assert out_type == 'uint8_t'
    assert kernel_size == 16
    assert leftovers < 16

  def Prepare(self, emitter, registers, unused_kernel_size):
    """Duplicate quantization parameters to vector registers.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator used to map parameters and allocate.
      unused_kernel_size: ignored.
    """
    emitter.EmitNewline()
    emitter.EmitComment('Requantize::Prepare')

    # Holds input_range_min until the EmitVSub at the end of this method.
    self.range_min_delta = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('input_range_min', 'params.input_range_min'), 4)
    self.output_range_min = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('output_range_min', 'params.output_range_min'),
        4)
    self.input_range_offset = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('input_range_offset',
                               'params.input_range_offset'), 4)
    self.input_range_scale = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('input_range_scale', 'params.input_range_scale'),
        4)
    self.one_over_output_range_scale = _DuplicateGeneralRegister(
        32, emitter, registers,
        registers.MapParameter('one_over_output_range_scale',
                               'params.one_over_output_range_scale'), 4)
    emitter.EmitVSub('f32', self.range_min_delta, self.range_min_delta,
                     self.output_range_min)

  def Transform(self, emitter, registers, input_address, elements,
                output_address):
    """Emit requantization inner loop code.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator; the registers taken here are freed
        again before returning.
      input_address: operand the 32-bit elements are loaded from.
      elements: number of elements to process; one quad register is
        allocated per started group of 4, and 1 to 4 registers are supported.
      output_address: operand the 8-bit results are stored to.

    Raises:
      AssertionError: if the register count is outside 1..4.
    """
    emitter.EmitNewline()
    emitter.EmitComment('Requantize::Transform')

    # Floor division keeps the count an int on Python 3 too; range() rejects
    # the float that true division would produce.
    register_count = (elements + 3) // 4
    load = [registers.QuadRegister() for unused_i in range(register_count)]
    emitter.EmitVLoadAE(32, elements, load, input_address, None)
    emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(64))

    for register in load:
      emitter.EmitVCvt('f32', 's32', register, register)

    for register in load:
      emitter.EmitVSub('f32', register, register, self.input_range_offset)

    for register in load:
      emitter.EmitVMul('f32', register, register, self.input_range_scale)

    for register in load:
      emitter.EmitVAdd('f32', register, register, self.range_min_delta)

    for register in load:
      emitter.EmitVMul('f32', register, register,
                       self.one_over_output_range_scale)

    for register in load:
      emitter.EmitVCvt('s32', 'f32', register, register)

    # Narrow s32 -> s16 -> 8 bit, packing everything into load[0].
    count = len(load)
    if count == 1:
      emitter.EmitVQmovn('s32', load[0], load[0])
      emitter.EmitVQmovun('s16', load[0], load[0])
    elif count == 2:
      emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
      emitter.EmitVQmovun('s16', load[0], load[0])
    elif count == 3:
      emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
      emitter.EmitVQmovn('s32', load[2], load[2])
      emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
    elif count == 4:
      emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
      emitter.EmitVQmovn2('s32', load[2], load[2], load[3])
      emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
    else:
      # Explicit raise so the guard is not stripped under `python -O`.
      raise AssertionError('Unsupported register count: %d' % count)

    emitter.EmitNewline()
    emitter.EmitVStoreAE(8, elements, load, output_address, None)
    emitter.EmitPld(output_address)
    registers.FreeRegisters(load)
class BaseTransform(common.Transform1DKernelGenerator):
  """1D transform kernel generator driven by a pluggable transformation.

  The transformation object supplies Check(), Prepare() and Transform();
  this class wraps them in the counting loop and the leftover handling.
  """

  def __init__(self, cc_emitter, kernel_name, asm_emitter, transformation):
    """Remember the assembly emitter and the transformation to apply.

    Args:
      cc_emitter: forwarded to the base class constructor.
      kernel_name: forwarded to the base class constructor.
      asm_emitter: emitter used for the inline assembly body.
      transformation: object providing Check, Prepare and Transform.
    """
    common.Transform1DKernelGenerator.__init__(self, cc_emitter, kernel_name)
    self.transformation = transformation
    self.asm_emitter = asm_emitter

  def EmitTransform(self, in_type, out_type, kernel_size, leftovers):
    """Emit one kernel specialization.

    Args:
      in_type: input element type name, validated by the transformation.
      out_type: output element type name, validated by the transformation.
      kernel_size: element count processed per iteration of the main loop.
      leftovers: element count of the trailing block; a falsy value skips
        the leftover code entirely.
    """
    asm = self.asm_emitter
    transformation = self.transformation

    transformation.Check(in_type, out_type, kernel_size, leftovers)

    registers = asm.CreateRegisters()

    self.emitter.EmitDeclare('int', 'params_count_copy', 'params.count')

    asm.PushIndent(self.emitter.indent)
    asm.EmitAsmBegin()

    count = registers.MapOutputParameter('count', 'params_count_copy')
    input_address = registers.MapOutputParameter('input')
    output_address = registers.MapOutputParameter('output')

    def _EmitBody(elements):
      # Both the main loop and the leftover tail share the same operands.
      transformation.Transform(asm, registers, input_address, elements,
                               output_address)

    transformation.Prepare(asm, registers, kernel_size)

    if leftovers:
      asm.EmitNewline()
      asm.EmitComment('Reduce count by leftovers.')
      asm.EmitSubs(count, count, asm.ImmediateConstant(leftovers))
      # Branch forward to label 2 when the subtraction yields zero.
      asm.EmitBeqFront(2)

    # Main loop: label 1, decrement, body, branch back.
    asm.EmitNewline()
    asm.EmitNumericalLabel(1)
    asm.EmitSubs(count, count, asm.ImmediateConstant(kernel_size))

    _EmitBody(kernel_size)

    asm.EmitNewline()
    asm.EmitBneBack(1)

    if leftovers:
      asm.EmitNumericalLabel(2)
      asm.EmitNewline()
      asm.EmitComment('Handle leftovers.')
      _EmitBody(leftovers)

    asm.EmitAsmEnd(registers)
    asm.PopIndent(len(self.emitter.indent))
class Requantize(BaseTransform):
  """BaseTransform named 'Requantize' using RequantizeTransformation."""

  def __init__(self, cc_emitter, asm_emitter):
    """Bind the emitters to a new RequantizeTransformation instance."""
    transformation = RequantizeTransformation()
    BaseTransform.__init__(self, cc_emitter, 'Requantize', asm_emitter,
                           transformation)
class Quantize(BaseTransform):
  """BaseTransform named 'Quantize' using QuantizeTransformation."""

  def __init__(self, cc_emitter, asm_emitter):
    """Bind the emitters to a new QuantizeTransformation instance."""
    transformation = QuantizeTransformation()
    BaseTransform.__init__(self, cc_emitter, 'Quantize', asm_emitter,
                           transformation)
class Dequantize(BaseTransform):
  """BaseTransform named 'Dequantize' using DequantizeTransformation."""

  def __init__(self, cc_emitter, asm_emitter):
    """Bind the emitters to a new DequantizeTransformation instance."""
    transformation = DequantizeTransformation()
    BaseTransform.__init__(self, cc_emitter, 'Dequantize', asm_emitter,
                           transformation)
class MinMax(BaseTransform):
  """BaseTransform named 'MinMax<type>' using MinMaxTransformation."""

  def __init__(self, numerical_type, cc_emitter, asm_emitter):
    """Bind the emitters to a new MinMaxTransformation instance.

    Args:
      numerical_type: type name substituted into the 'MinMax<%s>' kernel name.
      cc_emitter: forwarded to BaseTransform.
      asm_emitter: forwarded to BaseTransform.
    """
    kernel_name = 'MinMax<%s>' % numerical_type
    transformation = MinMaxTransformation()
    BaseTransform.__init__(self, cc_emitter, kernel_name, asm_emitter,
                           transformation)
class BiasAdd(common.Transform1DKernelGenerator):
  """Generator for the 'BiasAdd<bias_type>' 1D transform kernel.

  For every element the emitted code evaluates, in f32 and in this order:
    input * input_range_scale + input_range_min
    bias * bias_range_scale + bias_range_min
    (sum of both - output_range_min) * one_over_output_range_scale
        + output_range_offset
  and stores the result converted to s32.
  """

  def __init__(self, bias_type, cc_emitter, asm_emitter):
    """Set up the generator.

    Args:
      bias_type: type name substituted into the 'BiasAdd<%s>' kernel name.
      cc_emitter: forwarded to the base class constructor.
      asm_emitter: emitter used for the inline assembly body.
    """
    common.Transform1DKernelGenerator.__init__(self, cc_emitter,
                                               'BiasAdd<%s>' % bias_type)
    self.asm_emitter = asm_emitter

  def EmitTransform(self, in_type, out_type, kernel_size, leftovers):
    """Emit one kernel specialization: a row loop around _ProcessRow.

    Args:
      in_type: name of the input element type; must equal 'uint8_t'.
      out_type: name of the output element type; must equal 'int32_t'.
      kernel_size: must equal 16.
      leftovers: must be smaller than 16.

    Raises:
      AssertionError: if any constraint above is violated.
    """
    # Compare by value: identity tests against str/int literals only pass
    # because of interpreter interning and warn on Python 3.8+.
    assert in_type == 'uint8_t'
    assert out_type == 'int32_t'
    assert kernel_size == 16
    assert leftovers < 16

    registers = self.asm_emitter.CreateRegisters()

    self.emitter.EmitDeclare('int', 'params_rows_copy', 'params.rows')

    self.asm_emitter.PushIndent(self.emitter.indent)
    self.asm_emitter.EmitAsmBegin()

    self._Prepare(self.asm_emitter, registers)

    rows = registers.MapParameter('rows', 'params_rows_copy')

    self.asm_emitter.EmitNumericalLabel(1)

    self._ProcessRow(self.asm_emitter, registers, kernel_size, leftovers)

    self.asm_emitter.EmitSubs(rows, rows, self.asm_emitter.ImmediateConstant(1))
    self.asm_emitter.EmitBneBack(1)

    self.asm_emitter.EmitAsmEnd(registers)
    self.asm_emitter.PopIndent(len(self.emitter.indent))

  def _Prepare(self, emitter, registers):
    """Broadcast all range parameters from memory into quad registers."""

    def _Broadcast(name):
      # Every parameter is mapped as 'params.<name>' and duplicated as a
      # 32-bit value into a quad register allocated with min_register 8.
      return _DuplicateGeneralMemoryRegister(
          32, emitter, registers,
          registers.MapMemoryParameter(name, 'params.' + name), 8)

    self.input_range_min = _Broadcast('input_range_min')
    self.input_range_scale = _Broadcast('input_range_scale')
    self.bias_range_min = _Broadcast('bias_range_min')
    self.bias_range_scale = _Broadcast('bias_range_scale')
    self.output_range_min = _Broadcast('output_range_min')
    self.one_over_output_range_scale = _Broadcast(
        'one_over_output_range_scale')
    self.output_range_offset = _Broadcast('output_range_offset')

  def _ProcessRow(self, emitter, registers, kernel_size, leftovers):
    """Emit the element loop for one row plus its leftover tail."""
    const_count = registers.MapParameter('count', 'params.count')
    const_bias = registers.MapParameter('bias', 'params.bias')

    count = registers.GeneralRegister()
    bias = registers.GeneralRegister()

    input_address = registers.MapOutputParameter('input')
    output_address = registers.MapOutputParameter('output')

    # Work on copies so the mapped count/bias parameters stay intact for
    # the next row.
    emitter.EmitMov(count, const_count)
    emitter.EmitMov(bias, const_bias)

    if leftovers:
      emitter.EmitSubs(count, count, emitter.ImmediateConstant(leftovers))
      emitter.EmitBeqFront(3)

    emitter.EmitNumericalLabel(2)
    emitter.EmitSubs(count, count, emitter.ImmediateConstant(kernel_size))

    self._BiasAdd(emitter, registers, kernel_size, input_address, bias,
                  output_address)

    emitter.EmitBneBack(2)

    if leftovers:
      emitter.EmitNumericalLabel(3)
      self._BiasAdd(emitter, registers, leftovers, input_address, bias,
                    output_address)

  def _BiasAdd(self, emitter, registers, elements, input_address, bias,
               output_address):
    """Emit the bias-add computation for `elements` elements.

    Args:
      emitter: assembly emitter receiving the generated code.
      registers: register allocator; the registers taken here are freed
        again before returning.
      elements: number of elements to process; one quad register per
        started group of 4 is allocated for input and bias each, and 1 to 4
        registers per operand are supported.
      input_address: operand the 8-bit input elements are loaded from.
      bias: operand the 8-bit bias elements are loaded from.
      output_address: operand the 32-bit results are stored to.

    Raises:
      AssertionError: if the register count is outside 1..4.
    """
    emitter.EmitNewline()
    emitter.EmitComment('BiasAdd::Transform')

    # Floor division keeps the count an int on Python 3 too; range() rejects
    # the float that true division would produce.
    register_count = (elements + 3) // 4

    load_input = [
        registers.QuadRegister() for unused_i in range(register_count)
    ]
    load_bias = [registers.QuadRegister() for unused_i in range(register_count)]

    emitter.EmitVLoadAE(8, elements, load_input, input_address, None)
    emitter.EmitVLoadAE(8, elements, load_bias, bias, None)
    emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(32))

    # Widen both operands (u8 step, then s16 step), interleaving the input
    # and bias instructions.
    count = len(load_input)
    if count == 1:
      emitter.EmitVMovl('u8', load_input[0], load_input[0])
      emitter.EmitVMovl('u8', load_bias[0], load_bias[0])
      emitter.EmitVMovl('s16', load_input[0], load_input[0])
      emitter.EmitVMovl('s16', load_bias[0], load_bias[0])
    elif count == 2:
      emitter.EmitVMovl('u8', load_input[0], load_input[0])
      emitter.EmitVMovl('u8', load_bias[0], load_bias[0])
      emitter.EmitVMovl2('s16', load_input[0], load_input[1], load_input[0])
      emitter.EmitVMovl2('s16', load_bias[0], load_bias[1], load_bias[0])
    elif count == 3:
      emitter.EmitVMovl2('u8', load_input[0], load_input[1], load_input[0])
      emitter.EmitVMovl2('u8', load_bias[0], load_bias[1], load_bias[0])
      emitter.EmitVMovl('s16', load_input[2], load_input[1])
      emitter.EmitVMovl('s16', load_bias[2], load_bias[1])
      emitter.EmitVMovl2('s16', load_input[0], load_input[1], load_input[0])
      emitter.EmitVMovl2('s16', load_bias[0], load_bias[1], load_bias[0])
    elif count == 4:
      emitter.EmitVMovl2('u8', load_input[0], load_input[1], load_input[0])
      emitter.EmitVMovl2('u8', load_bias[0], load_bias[1], load_bias[0])
      emitter.EmitVMovl2('s16', load_input[2], load_input[3], load_input[1])
      emitter.EmitVMovl2('s16', load_bias[2], load_bias[3], load_bias[1])
      emitter.EmitVMovl2('s16', load_input[0], load_input[1], load_input[0])
      emitter.EmitVMovl2('s16', load_bias[0], load_bias[1], load_bias[0])
    else:
      # Explicit raise so the guard is not stripped under `python -O`.
      raise AssertionError('Unsupported register count: %d' % count)

    for register in load_input + load_bias:
      emitter.EmitVCvt('f32', 's32', register, register)

    for register in load_input:
      emitter.EmitVMul('f32', register, register, self.input_range_scale)

    for register in load_bias:
      emitter.EmitVMul('f32', register, register, self.bias_range_scale)

    for register in load_input:
      emitter.EmitVAdd('f32', register, register, self.input_range_min)

    for register in load_bias:
      emitter.EmitVAdd('f32', register, register, self.bias_range_min)

    # Accumulate the bias into the input registers.
    for (register_1, register_2) in zip(load_input, load_bias):
      emitter.EmitVAdd('f32', register_1, register_1, register_2)

    for register in load_input:
      emitter.EmitVSub('f32', register, register, self.output_range_min)

    for register in load_input:
      emitter.EmitVMul('f32', register, register,
                       self.one_over_output_range_scale)

    for register in load_input:
      emitter.EmitVAdd('f32', register, register, self.output_range_offset)

    for register in load_input:
      emitter.EmitVCvt('s32', 'f32', register, register)

    emitter.EmitNewline()
    emitter.EmitVStoreAE(32, elements, load_input, output_address, None)
    emitter.EmitPld(output_address)
    registers.FreeRegisters(load_input + load_bias)
def GenerateKernels(cc_emitter, asm_emitter, shapes):
  """Generate the quantization/dequantization/requantization kernels.

  Also generates the MinMax and BiasAdd kernels. Each generator is
  specialized for every entry of `shapes` before the next generator starts.

  Args:
    cc_emitter: emitter handed to every generator constructor.
    asm_emitter: assembly emitter handed to every generator constructor.
    shapes: iterable of indexable items; item[0] and item[1] are forwarded
      as the last two arguments of SpecializeTransform1DKernel (presumably
      kernel size and leftovers -- confirm against the common module).
  """
  # All generators are constructed up front, before any specialization.
  plan = [
      (Requantize(cc_emitter, asm_emitter), 'int32_t', 'uint8_t'),
      (Quantize(cc_emitter, asm_emitter), 'float', 'uint8_t'),
      (Dequantize(cc_emitter, asm_emitter), 'uint8_t', 'float'),
      (MinMax('uint8_t', cc_emitter, asm_emitter), 'uint8_t', 'uint8_t'),
      (BiasAdd('uint8_t', cc_emitter, asm_emitter), 'uint8_t', 'int32_t'),
  ]

  for generator, in_type, out_type in plan:
    for shape in shapes:
      generator.SpecializeTransform1DKernel(in_type, out_type, shape[0],
                                            shape[1])