// Copyright 2016, VIXL authors // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of ARM Limited nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "test-runner.h"
#include "test-utils-aarch64.h"

#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"

namespace vixl {
namespace aarch64 {

// Shorthand used by the generators below: `__ foo(...)` expands to
// `masm->foo(...)`, so every function using it needs a local named `masm`.
#define __ masm->
// Declares a test through TEST_ with `TRACE_` prepended to its name.
#define TEST(name) TEST_(TRACE_##name)
// Prefixes `name` with the trace reference directory. This relies on adjacent
// string literal concatenation, so `name` must itself be a string literal.
#define REF(name) "test/test-trace-reference/" name

// Emits a fixed sequence of general-purpose (W/X register) instructions into
// `masm`, listed alphabetically by mnemonic and followed by two small branch
// tests.
//
// The whole sequence is emitted through raw assembler calls inside a single
// ExactAssemblyScope whose size is the number of bytes remaining in the
// buffer, using the kMaximumSize policy.
//
// Memory operands follow one convention throughout this function: x0 is the
// base register for plain and immediate-offset accesses (and for the dc/ic
// operands), while x1 is the base register for every PreIndex/PostIndex
// access.
//
// NOTE(review): the emission order presumably has to stay in step with the
// reference traces under test/test-trace-reference/ - confirm before
// reordering, adding or removing instructions.
static void GenerateTestSequenceBase(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);

  __ adc(w3, w4, w5);
  __ adc(x6, x7, x8);
  __ adcs(w9, w10, w11);
  __ adcs(x12, x13, x14);
  __ add(w15, w16, w17);
  __ add(x18, x19, x20);
  __ adds(w21, w22, w23);
  __ adds(x24, x25, x26);
  __ and_(w27, w28, w29);
  __ and_(x2, x3, x4);
  __ ands(w5, w6, w7);
  __ ands(x8, x9, x10);
  __ asr(w11, w12, 0);
  __ asr(x13, x14, 1);
  __ asrv(w15, w16, w17);
  __ asrv(x18, x19, x20);
  __ bfm(w21, w22, 5, 6);
  __ bfm(x23, x24, 7, 8);
  __ bic(w25, w26, w27);
  __ bic(x28, x29, x2);
  __ bics(w3, w4, w5);
  __ bics(x6, x7, x8);
  // Conditional instructions are emitted once per condition code listed, with
  // otherwise identical operands.
  __ ccmn(w9, w10, NoFlag, al);
  __ ccmn(w9, w10, NoFlag, eq);
  __ ccmn(w9, w10, NoFlag, ne);
  __ ccmn(x11, x12, CFlag, al);
  __ ccmn(x11, x12, CFlag, cc);
  __ ccmn(x11, x12, CFlag, cs);
  __ ccmp(w13, w14, VFlag, al);
  __ ccmp(w13, w14, VFlag, hi);
  __ ccmp(w13, w14, VFlag, ls);
  __ ccmp(x15, x16, CVFlag, al);
  __ ccmp(x15, x16, CVFlag, eq);
  __ ccmp(x15, x16, CVFlag, ne);
  __ cinc(w17, w18, cc);
  __ cinc(w17, w18, cs);
  __ cinc(x19, x20, hi);
  __ cinc(x19, x20, ls);
  __ cinv(w21, w22, eq);
  __ cinv(w21, w22, ne);
  __ cinv(x23, x24, cc);
  __ cinv(x23, x24, cs);
  __ clrex();
  __ cls(w25, w26);
  __ cls(x27, x28);
  __ clz(w29, w2);
  __ clz(x3, x4);
  __ cmn(w5, w6);
  __ cmn(x7, x8);
  __ cmp(w9, w10);
  __ cmp(x11, x12);
  __ cneg(w13, w14, hi);
  __ cneg(w13, w14, ls);
  __ cneg(x15, x16, eq);
  __ cneg(x15, x16, ne);
  __ crc32b(w17, w18, w19);
  __ crc32cb(w20, w21, w22);
  __ crc32ch(w23, w24, w25);
  __ crc32cw(w26, w27, w28);
  __ crc32h(w4, w5, w6);
  __ crc32w(w7, w8, w9);
  __ csel(w13, w14, w15, cc);
  __ csel(w13, w14, w15, cs);
  __ csel(x16, x17, x18, hi);
  __ csel(x16, x17, x18, ls);
  __ cset(w19, eq);
  __ cset(w19, ne);
  __ cset(x20, cc);
  __ cset(x20, cs);
  __ csetm(w21, hi);
  __ csetm(w21, ls);
  __ csetm(x22, eq);
  __ csetm(x22, ne);
  __ csinc(w23, w24, w25, cc);
  __ csinc(w23, w24, w25, cs);
  __ csinc(x26, x27, x28, hi);
  __ csinc(x26, x27, x28, ls);
  __ csinv(w29, w2, w3, eq);
  __ csinv(w29, w2, w3, ne);
  __ csinv(x4, x5, x6, cc);
  __ csinv(x4, x5, x6, cs);
  __ csneg(w7, w8, w9, hi);
  __ csneg(w7, w8, w9, ls);
  __ csneg(x10, x11, x12, eq);
  __ csneg(x10, x11, x12, ne);
  __ dc(CVAC, x0);
  __ dmb(InnerShareable, BarrierAll);
  __ dsb(InnerShareable, BarrierAll);
  __ eon(w13, w14, w15);
  __ eon(x16, x17, x18);
  __ eor(w19, w20, w21);
  __ eor(x22, x23, x24);
  __ extr(w25, w26, w27, 9);
  __ extr(x28, x29, x2, 10);
  __ hint(NOP);
  __ ic(IVAU, x0);
  __ isb();
  // Loads: x0-based for plain and immediate-offset forms, x1-based for the
  // PostIndex/PreIndex forms.
  __ ldar(w3, MemOperand(x0));
  __ ldar(x4, MemOperand(x0));
  __ ldarb(w5, MemOperand(x0));
  __ ldarb(x6, MemOperand(x0));
  __ ldarh(w7, MemOperand(x0));
  __ ldarh(x8, MemOperand(x0));
  __ ldaxp(w9, w10, MemOperand(x0));
  __ ldaxp(x11, x12, MemOperand(x0));
  __ ldaxr(w13, MemOperand(x0));
  __ ldaxr(x14, MemOperand(x0));
  __ ldaxrb(w15, MemOperand(x0));
  __ ldaxrb(x16, MemOperand(x0));
  __ ldaxrh(w17, MemOperand(x0));
  __ ldaxrh(x18, MemOperand(x0));
  __ ldnp(w19, w20, MemOperand(x0));
  __ ldnp(x21, x22, MemOperand(x0));
  __ ldp(w23, w24, MemOperand(x0));
  __ ldp(w23, w24, MemOperand(x1, 8, PostIndex));
  __ ldp(w23, w24, MemOperand(x1, 8, PreIndex));
  __ ldp(x25, x26, MemOperand(x0));
  __ ldp(x25, x26, MemOperand(x1, 16, PostIndex));
  __ ldp(x25, x26, MemOperand(x1, 16, PreIndex));
  __ ldpsw(x27, x28, MemOperand(x0));
  __ ldpsw(x27, x28, MemOperand(x1, 8, PostIndex));
  __ ldpsw(x27, x28, MemOperand(x1, 8, PreIndex));
  __ ldr(w29, MemOperand(x0));
  __ ldr(w29, MemOperand(x1, 4, PostIndex));
  __ ldr(w29, MemOperand(x1, 4, PreIndex));
  __ ldr(x2, MemOperand(x0));
  __ ldr(x2, MemOperand(x1, 8, PostIndex));
  __ ldr(x2, MemOperand(x1, 8, PreIndex));
  __ ldrb(w3, MemOperand(x0));
  __ ldrb(w3, MemOperand(x1, 1, PostIndex));
  __ ldrb(w3, MemOperand(x1, 1, PreIndex));
  __ ldrb(x4, MemOperand(x0));
  __ ldrb(x4, MemOperand(x1, 1, PostIndex));
  __ ldrb(x4, MemOperand(x1, 1, PreIndex));
  __ ldrh(w5, MemOperand(x0));
  __ ldrh(w5, MemOperand(x1, 2, PostIndex));
  __ ldrh(w5, MemOperand(x1, 2, PreIndex));
  __ ldrh(x6, MemOperand(x0));
  __ ldrh(x6, MemOperand(x1, 2, PostIndex));
  __ ldrh(x6, MemOperand(x1, 2, PreIndex));
  __ ldrsb(w7, MemOperand(x0));
  __ ldrsb(w7, MemOperand(x1, 1, PostIndex));
  __ ldrsb(w7, MemOperand(x1, 1, PreIndex));
  __ ldrsb(x8, MemOperand(x0));
  __ ldrsb(x8, MemOperand(x1, 1, PostIndex));
  __ ldrsb(x8, MemOperand(x1, 1, PreIndex));
  __ ldrsh(w9, MemOperand(x0));
  __ ldrsh(w9, MemOperand(x1, 2, PostIndex));
  __ ldrsh(w9, MemOperand(x1, 2, PreIndex));
  __ ldrsh(x10, MemOperand(x0));
  __ ldrsh(x10, MemOperand(x1, 2, PostIndex));
  __ ldrsh(x10, MemOperand(x1, 2, PreIndex));
  __ ldrsw(x11, MemOperand(x0));
  __ ldrsw(x11, MemOperand(x1, 4, PostIndex));
  __ ldrsw(x11, MemOperand(x1, 4, PreIndex));
  __ ldur(w12, MemOperand(x0, 7));
  __ ldur(x13, MemOperand(x0, 15));
  __ ldurb(w14, MemOperand(x0, 1));
  __ ldurb(x15, MemOperand(x0, 1));
  __ ldurh(w16, MemOperand(x0, 3));
  __ ldurh(x17, MemOperand(x0, 3));
  __ ldursb(w18, MemOperand(x0, 1));
  __ ldursb(x19, MemOperand(x0, 1));
  __ ldursh(w20, MemOperand(x0, 3));
  __ ldursh(x21, MemOperand(x0, 3));
  __ ldursw(x22, MemOperand(x0, 7));
  __ ldxp(w23, w24, MemOperand(x0));
  __ ldxp(x25, x26, MemOperand(x0));
  __ ldxr(w27, MemOperand(x0));
  __ ldxr(x28, MemOperand(x0));
  __ ldxrb(w29, MemOperand(x0));
  __ ldxrb(x2, MemOperand(x0));
  __ ldxrh(w3, MemOperand(x0));
  __ ldxrh(x4, MemOperand(x0));
  __ lsl(w5, w6, 2);
  __ lsl(x7, x8, 3);
  __ lslv(w9, w10, w11);
  __ lslv(x12, x13, x14);
  __ lsr(w15, w16, 4);
  __ lsr(x17, x18, 5);
  __ lsrv(w19, w20, w21);
  __ lsrv(x22, x23, x24);
  __ madd(w25, w26, w27, w28);
  __ madd(x29, x2, x3, x4);
  __ mneg(w5, w6, w7);
  __ mneg(x8, x9, x10);
  __ mov(w11, w12);
  __ mov(x13, x14);
  __ movk(w15, 130);
  __ movk(x16, 131);
  __ movn(w17, 132);
  __ movn(x18, 133);
  __ movz(w19, 134);
  __ movz(x20, 135);
  __ msub(w22, w23, w24, w25);
  __ msub(x26, x27, x28, x29);
  __ mul(w2, w3, w4);
  __ mul(x5, x6, x7);
  __ mvn(w8, w9);
  __ mvn(x10, x11);
  __ neg(w12, w13);
  __ neg(x14, x15);
  __ negs(w16, w17);
  __ negs(x18, x19);
  __ ngc(w20, w21);
  __ ngc(x22, x23);
  __ ngcs(w24, w25);
  __ ngcs(x26, x27);
  __ nop();
  __ orn(w28, w29, w2);
  __ orn(x3, x4, x5);
  __ orr(w6, w7, w8);
  __ orr(x9, x10, x11);
  __ prfm(PLDL1KEEP, MemOperand(x0, 4));
  __ prfum(PLDL1KEEP, MemOperand(x0, 1));
  __ rbit(w12, w13);
  __ rbit(x14, x15);
  __ rev(w16, w17);
  __ rev(x18, x19);
  __ rev16(w20, w21);
  __ rev16(x22, x23);
  __ rev32(x24, x25);
  __ rorv(w26, w27, w28);
  __ rorv(x29, x2, x3);
  __ sbc(w4, w5, w6);
  __ sbc(x7, x8, x9);
  __ sbcs(w10, w11, w12);
  __ sbcs(x13, x14, x15);
  __ sbfiz(w16, w17, 2, 3);
  __ sbfiz(x18, x19, 4, 5);
  __ sbfx(w22, w23, 6, 7);
  __ sbfx(x24, x25, 8, 9);
  __ sdiv(w26, w27, w28);
  __ sdiv(x29, x2, x3);
  __ smulh(x12, x13, x14);
  // Stores: same base-register convention as the loads (x0 for plain and
  // immediate-offset forms, x1 for PostIndex/PreIndex forms).
  __ stlr(w18, MemOperand(x0));
  __ stlr(x19, MemOperand(x0));
  __ stlrb(w20, MemOperand(x0));
  __ stlrb(x21, MemOperand(x0));
  __ stlrh(w22, MemOperand(x0));
  __ stlrh(x23, MemOperand(x0));
  __ stlxp(w24, w25, w26, MemOperand(x0));
  __ stlxp(x27, x28, x29, MemOperand(x0));
  __ stlxr(w2, w3, MemOperand(x0));
  __ stlxr(x4, x5, MemOperand(x0));
  __ stlxrb(w6, w7, MemOperand(x0));
  __ stlxrb(x8, x9, MemOperand(x0));
  __ stlxrh(w10, w11, MemOperand(x0));
  __ stlxrh(x12, x13, MemOperand(x0));
  __ stnp(w14, w15, MemOperand(x0));
  __ stnp(x16, x17, MemOperand(x0));
  __ stp(w18, w19, MemOperand(x0));
  __ stp(w18, w19, MemOperand(x1, 8, PostIndex));
  __ stp(w18, w19, MemOperand(x1, 8, PreIndex));
  __ stp(x20, x21, MemOperand(x0));
  __ stp(x20, x21, MemOperand(x1, 16, PostIndex));
  __ stp(x20, x21, MemOperand(x1, 16, PreIndex));
  __ str(w22, MemOperand(x0));
  __ str(w22, MemOperand(x1, 4, PostIndex));
  __ str(w22, MemOperand(x1, 4, PreIndex));
  __ str(x23, MemOperand(x0));
  __ str(x23, MemOperand(x1, 8, PostIndex));
  __ str(x23, MemOperand(x1, 8, PreIndex));
  __ strb(w24, MemOperand(x0));
  __ strb(w24, MemOperand(x1, 1, PostIndex));
  __ strb(w24, MemOperand(x1, 1, PreIndex));
  __ strb(x25, MemOperand(x0));
  __ strb(x25, MemOperand(x1, 1, PostIndex));
  __ strb(x25, MemOperand(x1, 1, PreIndex));
  __ strh(w26, MemOperand(x0));
  __ strh(w26, MemOperand(x1, 2, PostIndex));
  __ strh(w26, MemOperand(x1, 2, PreIndex));
  __ strh(x27, MemOperand(x0));
  __ strh(x27, MemOperand(x1, 2, PostIndex));
  __ strh(x27, MemOperand(x1, 2, PreIndex));
  __ stur(w28, MemOperand(x0, 7));
  __ stur(x29, MemOperand(x0, 15));
  __ sturb(w2, MemOperand(x0, 1));
  __ sturb(x3, MemOperand(x0, 1));
  __ sturh(w4, MemOperand(x0, 3));
  __ sturh(x5, MemOperand(x0, 3));
  __ stxp(w6, w7, w8, MemOperand(x0));
  __ stxp(x9, x10, x11, MemOperand(x0));
  __ stxr(w12, w13, MemOperand(x0));
  __ stxr(x14, x15, MemOperand(x0));
  __ stxrb(w16, w17, MemOperand(x0));
  __ stxrb(x18, x19, MemOperand(x0));
  __ stxrh(w20, w21, MemOperand(x0));
  __ stxrh(x22, x23, MemOperand(x0));
  __ sub(w24, w25, w26);
  __ sub(x27, x28, x29);
  __ subs(w2, w3, w4);
  __ subs(x5, x6, x7);
  __ sxtb(w8, w9);
  __ sxtb(x10, x11);
  __ sxth(w12, w13);
  __ sxth(x14, x15);
  __ sxtw(w16, w17);
  __ sxtw(x18, x19);
  __ tst(w20, w21);
  __ tst(x22, x23);
  __ ubfiz(w24, w25, 10, 11);
  __ ubfiz(x26, x27, 12, 13);
  __ ubfm(w28, w29, 14, 15);
  __ ubfm(x2, x3, 1, 2);
  __ ubfx(w4, w5, 3, 4);
  __ ubfx(x6, x7, 5, 6);
  __ udiv(w8, w9, w10);
  __ udiv(x11, x12, x13);
  __ umulh(x22, x23, x24);
  __ uxtb(w28, w29);
  __ uxtb(x2, x3);
  __ uxth(w4, w5);
  __ uxth(x6, x7);
  __ uxtw(w8, w9);
  __ uxtw(x10, x11);

  // Branch tests.
  {
    Label end;
    // Branch to the next instruction.
    __ b(&end);
    __ bind(&end);
  }
  {
    Label loop, end;
    // x3 - x3 is zero, so the flags indicate equality and the `ne` branch
    // below falls through on the first pass.
    __ subs(x3, x3, x3);
    __ bind(&loop);
    // Not-taken branch (the first time).
    // Taken branch (the second time).
    __ b(&end, ne);
    // x3 is still zero here, so comparing it with 1 makes `ne` hold for the
    // second pass through the loop.
    __ cmp(x3, 1);
    // Backwards branch.
    __ b(&loop);
    __ bind(&end);
  }
}


// Emits a fixed sequence of scalar floating-point instructions into `masm`,
// listed alphabetically by mnemonic.
//
// As in GenerateTestSequenceBase, everything is emitted through raw assembler
// calls inside one ExactAssemblyScope sized to the bytes remaining in the
// buffer (kMaximumSize policy). Operands are mostly S and D registers; H
// registers appear in the fcvt forms, and W/X registers appear wherever a
// general-purpose source or destination is used. No memory operands are used.
//
// NOTE(review): as above, the order presumably has to match the reference
// traces - confirm before changing it.
static void GenerateTestSequenceFP(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);

  // Scalar floating point instructions.
  __ fabd(d13, d2, d19);
  __ fabd(s8, s10, s30);
  __ fabs(d1, d1);
  __ fabs(s25, s7);
  __ facge(d1, d23, d16);
  __ facge(s4, s17, s1);
  __ facgt(d2, d21, d24);
  __ facgt(s12, s26, s12);
  __ fadd(d13, d11, d22);
  __ fadd(s27, s19, s8);
  __ fccmp(d6, d10, NoFlag, hs);
  __ fccmp(s29, s20, NZVFlag, ne);
  __ fccmpe(d10, d2, NZCFlag, al);
  __ fccmpe(s3, s3, NZVFlag, pl);
  // The comparisons below are emitted in register-register form and, where
  // shown, against the literal 0.0.
  __ fcmeq(d19, d8, d10);
  __ fcmeq(d0, d18, 0.0);
  __ fcmeq(s1, s4, s30);
  __ fcmeq(s22, s29, 0.0);
  __ fcmge(d27, d18, d1);
  __ fcmge(d31, d28, 0.0);
  __ fcmge(s31, s19, s9);
  __ fcmge(s1, s25, 0.0);
  __ fcmgt(d18, d1, d15);
  __ fcmgt(d3, d31, 0.0);
  __ fcmgt(s11, s25, s2);
  __ fcmgt(s17, s16, 0.0);
  __ fcmle(d24, d17, 0.0);
  __ fcmle(s11, s8, 0.0);
  __ fcmlt(d5, d31, 0.0);
  __ fcmlt(s18, s23, 0.0);
  __ fcmp(d10, d24);
  __ fcmp(d13, 0.0);
  __ fcmp(s18, s6);
  __ fcmp(s16, 0.0);
  __ fcmpe(d9, d17);
  __ fcmpe(d29, 0.0);
  __ fcmpe(s16, s17);
  __ fcmpe(s22, 0.0);
  __ fcsel(d10, d14, d19, gt);
  __ fcsel(s22, s18, s2, ge);
  // fcvt covers each ordered pair of distinct H, S and D register types.
  __ fcvt(d4, h24);
  __ fcvt(d11, s2);
  __ fcvt(h8, d9);
  __ fcvt(h12, s1);
  __ fcvt(s12, d31);
  __ fcvt(s27, h25);
  // Each fcvt<x>s/fcvt<x>u mnemonic is emitted with six operand shapes:
  // D->D, S->S, D->W, S->W, D->X and S->X.
  __ fcvtas(d28, d16);
  __ fcvtas(s3, s5);
  __ fcvtas(w18, d31);
  __ fcvtas(w29, s24);
  __ fcvtas(x9, d1);
  __ fcvtas(x30, s2);
  __ fcvtau(d14, d0);
  __ fcvtau(s31, s14);
  __ fcvtau(w16, d2);
  __ fcvtau(w18, s0);
  __ fcvtau(x26, d7);
  __ fcvtau(x25, s19);
  __ fcvtms(d30, d25);
  __ fcvtms(s12, s15);
  __ fcvtms(w9, d7);
  __ fcvtms(w19, s6);
  __ fcvtms(x6, d6);
  __ fcvtms(x22, s7);
  __ fcvtmu(d27, d0);
  __ fcvtmu(s8, s22);
  __ fcvtmu(w29, d19);
  __ fcvtmu(w26, s0);
  __ fcvtmu(x13, d5);
  __ fcvtmu(x5, s18);
  __ fcvtns(d30, d15);
  __ fcvtns(s10, s11);
  __ fcvtns(w21, d15);
  __ fcvtns(w18, s10);
  __ fcvtns(x8, d17);
  __ fcvtns(x17, s12);
  __ fcvtnu(d0, d21);
  __ fcvtnu(s6, s25);
  __ fcvtnu(w29, d11);
  __ fcvtnu(w25, s31);
  __  // The method call for this `__` (masm->) is on the next source line.
fcvtnu(x30, d11);  // Completes the `__` (masm->) ending the previous line.
  __ fcvtnu(x27, s18);
  __ fcvtps(d11, d22);
  __ fcvtps(s29, s20);
  __ fcvtps(w15, d25);
  __ fcvtps(w16, s7);
  __ fcvtps(x13, d20);
  __ fcvtps(x3, s23);
  __ fcvtpu(d24, d1);
  __ fcvtpu(s14, s24);
  __ fcvtpu(w26, d29);
  __ fcvtpu(wzr, s26);
  __ fcvtpu(x27, d6);
  __ fcvtpu(x29, s14);
  __ fcvtxn(s12, d12);
  // fcvtzs/fcvtzu are emitted for every operand shape both without and with
  // a trailing integer immediate.
  // NOTE(review): presumably the immediate is the fixed-point fractional bit
  // count - confirm against the assembler API.
  __ fcvtzs(d15, d0);
  __ fcvtzs(d13, d4, 42);
  __ fcvtzs(s8, s11);
  __ fcvtzs(s31, s6, 25);
  __ fcvtzs(w6, d9);
  __ fcvtzs(w25, d10, 20);
  __ fcvtzs(w9, s1);
  __ fcvtzs(w17, s29, 30);
  __ fcvtzs(x19, d2);
  __ fcvtzs(x22, d14, 1);
  __ fcvtzs(x14, s20);
  __ fcvtzs(x3, s30, 33);
  __ fcvtzu(d28, d15);
  __ fcvtzu(d0, d4, 3);
  __ fcvtzu(s2, s5);
  __ fcvtzu(s4, s0, 30);
  __ fcvtzu(w11, d4);
  __ fcvtzu(w7, d24, 32);
  __ fcvtzu(w18, s24);
  __ fcvtzu(w14, s27, 4);
  __ fcvtzu(x22, d11);
  __ fcvtzu(x8, d27, 52);
  __ fcvtzu(x7, s20);
  __ fcvtzu(x22, s7, 44);
  __ fdiv(d6, d14, d15);
  __ fdiv(s26, s5, s25);
  __ fmadd(d18, d26, d12, d30);
  __ fmadd(s13, s9, s28, s4);
  __ fmax(d12, d5, d5);
  __ fmax(s12, s28, s6);
  __ fmaxnm(d28, d4, d2);
  __ fmaxnm(s6, s10, s8);
  __ fmin(d20, d20, d18);
  __ fmin(s7, s13, s16);
  __ fminnm(d19, d14, d30);
  __ fminnm(s0, s1, s1);
  // fmov is emitted FP-to-FP, between general-purpose and FP registers in
  // both directions, and with an immediate for each precision (a double
  // literal for d8, a float literal for s21).
  __ fmov(d13, d6);
  __ fmov(d2, x17);
  __ fmov(d8, -2.5000);
  __ fmov(s5, s3);
  __ fmov(s25, w20);
  __ fmov(s21, 2.8750f);
  __ fmov(w18, s24);
  __ fmov(x18, d2);
  __ fmsub(d20, d30, d3, d19);
  __ fmsub(s5, s19, s4, s12);
  __ fmul(d30, d27, d23);
  __ fmul(s25, s17, s15);
  __ fmulx(d4, d17, d1);
  __ fmulx(s14, s25, s4);
  __ fneg(d15, d0);
  __ fneg(s14, s15);
  __ fnmadd(d0, d16, d22, d31);
  __ fnmadd(s0, s18, s26, s18);
  __ fnmsub(d19, d12, d15, d21);
  __ fnmsub(s29, s0, s11, s26);
  __ fnmul(d31, d19, d1);
  __ fnmul(s18, s3, s17);
  __ frecpe(d7, d21);
  __ frecpe(s29, s17);
  __ frecps(d11, d26, d17);
  __ frecps(s18, s27, s1);
  __ frecpx(d15, d18);
  __ frecpx(s5, s10);
  __ frinta(d16, d30);
  __ frinta(s1, s22);
  __ frinti(d19, d29);
  __ frinti(s14, s21);
  __ frintm(d20, d30);
  __ frintm(s1, s16);
  __ frintn(d30, d1);
  __ frintn(s24, s10);
  __ frintp(d4, d20);
  __ frintp(s13, s3);
  __ frintx(d13, d20);
  __ frintx(s17, s7);
  __ frintz(d0, d8);
  __ frintz(s15, s29);
  __ frsqrte(d21, d10);
  __ frsqrte(s17, s25);
  __ frsqrts(d4, d29, d17);
  __ frsqrts(s14, s3, s24);
  __ fsqrt(d14, d17);
  __ fsqrt(s4, s14);
  __ fsub(d13, d19, d7);
  __ fsub(s3, s21, s27);
  // scvtf/ucvtf take FP, W and X sources for both D and S destinations, each
  // without and with a trailing integer immediate.
  __ scvtf(d31, d16);
  __ scvtf(d26, d31, 24);
  __ scvtf(d6, w16);
  __ scvtf(d5, w20, 6);
  __ scvtf(d16, x8);
  __ scvtf(d15, x8, 10);
  __ scvtf(s7, s4);
  __ scvtf(s8, s15, 14);
  __ scvtf(s29, w10);
  __ scvtf(s15, w21, 11);
  __ scvtf(s27, x26);
  __ scvtf(s26, x12, 38);
  __ ucvtf(d0, d9);
  __ ucvtf(d5, d22, 47);
  __ ucvtf(d30, w27);
  __ ucvtf(d3, w19, 1);
  __ ucvtf(d28, x21);
  __ ucvtf(d27, x30, 35);
  __ ucvtf(s11, s5);
  __ ucvtf(s0, s23, 14);
  __ ucvtf(s20, w19);
  __ ucvtf(s21, w22, 18);
  __ ucvtf(s6, x13);
  __ ucvtf(s7, x2, 21);
}


static void GenerateTestSequenceNEON(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);

  // NEON integer instructions.
  __ abs(d19, d0);
  __ abs(v16.V16B(), v11.V16B());
  __ abs(v0.V2D(), v31.V2D());
  __ abs(v27.V2S(), v25.V2S());
  __ abs(v21.V4H(), v27.V4H());
  __ abs(v16.V4S(), v1.V4S());
  __ abs(v31.V8B(), v5.V8B());
  __ abs(v29.V8H(), v13.V8H());
  __ add(d10, d5, d17);
  __ add(v31.V16B(), v15.V16B(), v23.V16B());
  __ add(v10.V2D(), v31.V2D(), v14.V2D());
  __ add(v15.V2S(), v14.V2S(), v19.V2S());
  __ add(v27.V4H(), v23.V4H(), v17.V4H());
  __ add(v25.V4S(), v28.V4S(), v29.V4S());
  __ add(v13.V8B(), v7.V8B(), v18.V8B());
  __ add(v4.V8H(), v2.V8H(), v1.V8H());
  __ addhn(v10.V2S(), v14.V2D(), v15.V2D());
  __ addhn(v10.V4H(), v30.V4S(), v26.V4S());
  __ addhn(v31.V8B(), v12.V8H(), v22.V8H());
  __ addhn2(v16.V16B(), v21.V8H(), v20.V8H());
  __ addhn2(v0.V4S(), v2.V2D(), v17.V2D());
  __ addhn2(v31.V8H(), v7.V4S(), v17.V4S());
  __ addp(d14, v19.V2D());
  __ addp(v3.V16B(), v8.V16B(), v28.V16B());
  __ addp(v8.V2D(), v5.V2D(), v17.V2D());
  __ addp(v22.V2S(), v30.V2S(), v26.V2S());
  __ addp(v29.V4H(), v24.V4H(), v14.V4H());
  __ addp(v30.V4S(), v26.V4S(), v24.V4S());
  __ addp(v12.V8B(),
v26.V8B(), v7.V8B()); __ addp(v17.V8H(), v8.V8H(), v12.V8H()); __ addv(b27, v23.V16B()); __ addv(b12, v20.V8B()); __ addv(h27, v30.V4H()); __ addv(h19, v14.V8H()); __ addv(s14, v27.V4S()); __ and_(v10.V16B(), v8.V16B(), v27.V16B()); __ and_(v5.V8B(), v1.V8B(), v16.V8B()); __ bic(v26.V16B(), v3.V16B(), v24.V16B()); __ bic(v7.V2S(), 0xe4, 16); __ bic(v28.V4H(), 0x23, 8); __ bic(v29.V4S(), 0xac); __ bic(v12.V8B(), v31.V8B(), v21.V8B()); __ bic(v18.V8H(), 0x98); __ bif(v12.V16B(), v26.V16B(), v8.V16B()); __ bif(v2.V8B(), v23.V8B(), v27.V8B()); __ bit(v8.V16B(), v3.V16B(), v13.V16B()); __ bit(v5.V8B(), v5.V8B(), v23.V8B()); __ bsl(v9.V16B(), v31.V16B(), v23.V16B()); __ bsl(v14.V8B(), v7.V8B(), v3.V8B()); __ cls(v29.V16B(), v5.V16B()); __ cls(v21.V2S(), v0.V2S()); __ cls(v1.V4H(), v12.V4H()); __ cls(v27.V4S(), v10.V4S()); __ cls(v19.V8B(), v4.V8B()); __ cls(v15.V8H(), v14.V8H()); __ clz(v1.V16B(), v4.V16B()); __ clz(v27.V2S(), v17.V2S()); __ clz(v9.V4H(), v9.V4H()); __ clz(v31.V4S(), v15.V4S()); __ clz(v14.V8B(), v19.V8B()); __ clz(v6.V8H(), v11.V8H()); __ cmeq(d18, d5, d29); __ cmeq(d14, d31, 0); __ cmeq(v19.V16B(), v3.V16B(), v22.V16B()); __ cmeq(v15.V16B(), v9.V16B(), 0); __ cmeq(v12.V2D(), v16.V2D(), v10.V2D()); __ cmeq(v8.V2D(), v22.V2D(), 0); __ cmeq(v2.V2S(), v3.V2S(), v9.V2S()); __ cmeq(v16.V2S(), v25.V2S(), 0); __ cmeq(v6.V4H(), v23.V4H(), v20.V4H()); __ cmeq(v16.V4H(), v13.V4H(), 0); __ cmeq(v21.V4S(), v17.V4S(), v2.V4S()); __ cmeq(v6.V4S(), v25.V4S(), 0); __ cmeq(v16.V8B(), v13.V8B(), v2.V8B()); __ cmeq(v21.V8B(), v16.V8B(), 0); __ cmeq(v20.V8H(), v7.V8H(), v25.V8H()); __ cmeq(v26.V8H(), v8.V8H(), 0); __ cmge(d16, d13, d31); __ cmge(d25, d24, 0); __ cmge(v17.V16B(), v19.V16B(), v17.V16B()); __ cmge(v22.V16B(), v30.V16B(), 0); __ cmge(v28.V2D(), v20.V2D(), v26.V2D()); __ cmge(v6.V2D(), v23.V2D(), 0); __ cmge(v25.V2S(), v22.V2S(), v3.V2S()); __ cmge(v21.V2S(), v11.V2S(), 0); __ cmge(v16.V4H(), v3.V4H(), v12.V4H()); __ cmge(v23.V4H(), v9.V4H(), 0); __ 
cmge(v7.V4S(), v2.V4S(), v11.V4S()); __ cmge(v0.V4S(), v22.V4S(), 0); __ cmge(v10.V8B(), v30.V8B(), v9.V8B()); __ cmge(v21.V8B(), v8.V8B(), 0); __ cmge(v2.V8H(), v7.V8H(), v26.V8H()); __ cmge(v19.V8H(), v10.V8H(), 0); __ cmgt(d6, d13, d1); __ cmgt(d30, d24, 0); __ cmgt(v20.V16B(), v25.V16B(), v27.V16B()); __ cmgt(v0.V16B(), v25.V16B(), 0); __ cmgt(v22.V2D(), v25.V2D(), v1.V2D()); __ cmgt(v16.V2D(), v16.V2D(), 0); __ cmgt(v5.V2S(), v9.V2S(), v15.V2S()); __ cmgt(v12.V2S(), v18.V2S(), 0); __ cmgt(v28.V4H(), v18.V4H(), v11.V4H()); __ cmgt(v22.V4H(), v3.V4H(), 0); __ cmgt(v5.V4S(), v11.V4S(), v27.V4S()); __ cmgt(v13.V4S(), v20.V4S(), 0); __ cmgt(v27.V8B(), v31.V8B(), v7.V8B()); __ cmgt(v5.V8B(), v0.V8B(), 0); __ cmgt(v22.V8H(), v28.V8H(), v13.V8H()); __ cmgt(v6.V8H(), v2.V8H(), 0); __ cmhi(d21, d8, d22); __ cmhi(v18.V16B(), v19.V16B(), v19.V16B()); __ cmhi(v7.V2D(), v0.V2D(), v21.V2D()); __ cmhi(v15.V2S(), v19.V2S(), v0.V2S()); __ cmhi(v31.V4H(), v7.V4H(), v12.V4H()); __ cmhi(v9.V4S(), v16.V4S(), v22.V4S()); __ cmhi(v7.V8B(), v24.V8B(), v28.V8B()); __ cmhi(v11.V8H(), v10.V8H(), v25.V8H()); __ cmhs(d1, d12, d17); __ cmhs(v21.V16B(), v25.V16B(), v30.V16B()); __ cmhs(v8.V2D(), v2.V2D(), v26.V2D()); __ cmhs(v1.V2S(), v22.V2S(), v29.V2S()); __ cmhs(v26.V4H(), v30.V4H(), v30.V4H()); __ cmhs(v19.V4S(), v20.V4S(), v16.V4S()); __ cmhs(v1.V8B(), v3.V8B(), v26.V8B()); __ cmhs(v20.V8H(), v28.V8H(), v8.V8H()); __ cmle(d30, d24, 0); __ cmle(v0.V16B(), v3.V16B(), 0); __ cmle(v2.V2D(), v30.V2D(), 0); __ cmle(v7.V2S(), v10.V2S(), 0); __ cmle(v9.V4H(), v31.V4H(), 0); __ cmle(v9.V4S(), v18.V4S(), 0); __ cmle(v21.V8B(), v31.V8B(), 0); __ cmle(v29.V8H(), v21.V8H(), 0); __ cmlt(d25, d23, 0); __ cmlt(v7.V16B(), v21.V16B(), 0); __ cmlt(v7.V2D(), v30.V2D(), 0); __ cmlt(v25.V2S(), v28.V2S(), 0); __ cmlt(v0.V4H(), v11.V4H(), 0); __ cmlt(v24.V4S(), v5.V4S(), 0); __ cmlt(v26.V8B(), v11.V8B(), 0); __ cmlt(v1.V8H(), v21.V8H(), 0); __ cmtst(d28, d23, d30); __ cmtst(v26.V16B(), v6.V16B(), v31.V16B()); 
__ cmtst(v1.V2D(), v21.V2D(), v4.V2D()); __ cmtst(v27.V2S(), v26.V2S(), v20.V2S()); __ cmtst(v26.V4H(), v0.V4H(), v18.V4H()); __ cmtst(v25.V4S(), v16.V4S(), v4.V4S()); __ cmtst(v11.V8B(), v10.V8B(), v9.V8B()); __ cmtst(v0.V8H(), v2.V8H(), v1.V8H()); __ cnt(v25.V16B(), v15.V16B()); __ cnt(v28.V8B(), v6.V8B()); __ dup(v6.V16B(), v7.B(), 7); __ dup(v9.V16B(), w20); __ dup(v12.V2D(), v13.D(), 1); __ dup(v9.V2D(), xzr); __ dup(v4.V2S(), v26.S(), 2); __ dup(v3.V2S(), w12); __ dup(v22.V4H(), v5.H(), 7); __ dup(v16.V4H(), w25); __ dup(v20.V4S(), v10.S(), 2); __ dup(v10.V4S(), w7); __ dup(v30.V8B(), v30.B(), 2); __ dup(v31.V8B(), w15); __ dup(v28.V8H(), v17.H(), 4); __ dup(v2.V8H(), w3); __ eor(v29.V16B(), v25.V16B(), v3.V16B()); __ eor(v3.V8B(), v16.V8B(), v28.V8B()); __ ext(v1.V16B(), v26.V16B(), v6.V16B(), 1); __ ext(v2.V8B(), v30.V8B(), v1.V8B(), 1); __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0)); __ ld1(v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, x2, PostIndex)); __ ld1(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), MemOperand(x1, 64, PostIndex)); __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), MemOperand(x0)); __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), MemOperand(x1, x2, PostIndex)); __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x1, 48, PostIndex)); __ ld1(v17.V16B(), v18.V16B(), MemOperand(x0)); __ ld1(v20.V16B(), v21.V16B(), MemOperand(x1, x2, PostIndex)); __ ld1(v28.V16B(), v29.V16B(), MemOperand(x1, 32, PostIndex)); __ ld1(v29.V16B(), MemOperand(x0)); __ ld1(v21.V16B(), MemOperand(x1, x2, PostIndex)); __ ld1(v4.V16B(), MemOperand(x1, 16, PostIndex)); __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), MemOperand(x0)); __ ld1(v17.V1D(), v18.V1D(), v19.V1D(), v20.V1D(), MemOperand(x1, x2, PostIndex)); __ ld1(v28.V1D(), v29.V1D(), v30.V1D(), v31.V1D(), MemOperand(x1, 32, PostIndex)); __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), MemOperand(x0)); __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), MemOperand(x1, x2, PostIndex)); __ ld1(v12.V1D(), 
v13.V1D(), v14.V1D(), MemOperand(x1, 24, PostIndex)); __ ld1(v29.V1D(), v30.V1D(), MemOperand(x0)); __ ld1(v31.V1D(), v0.V1D(), MemOperand(x1, x2, PostIndex)); __ ld1(v3.V1D(), v4.V1D(), MemOperand(x1, 16, PostIndex)); __ ld1(v28.V1D(), MemOperand(x0)); __ ld1(v11.V1D(), MemOperand(x1, x2, PostIndex)); __ ld1(v29.V1D(), MemOperand(x1, 8, PostIndex)); __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x0)); __ ld1(v8.V2D(), v9.V2D(), v10.V2D(), v11.V2D(), MemOperand(x1, x2, PostIndex)); __ ld1(v14.V2D(), v15.V2D(), v16.V2D(), v17.V2D(), MemOperand(x1, 64, PostIndex)); __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x0)); __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex)); __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x1, 48, PostIndex)); __ ld1(v18.V2D(), v19.V2D(), MemOperand(x0)); __ ld1(v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex)); __ ld1(v17.V2D(), v18.V2D(), MemOperand(x1, 32, PostIndex)); __ ld1(v5.V2D(), MemOperand(x0)); __ ld1(v6.V2D(), MemOperand(x1, x2, PostIndex)); __ ld1(v15.V2D(), MemOperand(x1, 16, PostIndex)); __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x0)); __ ld1(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), MemOperand(x1, x2, PostIndex)); __ ld1(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), MemOperand(x1, 32, PostIndex)); __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), MemOperand(x0)); __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), MemOperand(x1, x2, PostIndex)); __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x1, 24, PostIndex)); __ ld1(v0.V2S(), v1.V2S(), MemOperand(x0)); __ ld1(v13.V2S(), v14.V2S(), MemOperand(x1, x2, PostIndex)); __ ld1(v3.V2S(), v4.V2S(), MemOperand(x1, 16, PostIndex)); __ ld1(v26.V2S(), MemOperand(x0)); __ ld1(v0.V2S(), MemOperand(x1, x2, PostIndex)); __ ld1(v11.V2S(), MemOperand(x1, 8, PostIndex)); __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0)); __ ld1(v24.V4H(), v25.V4H(), v26.V4H(), v27.V4H(), MemOperand(x1, x2, PostIndex)); __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), 
v4.V4H(), MemOperand(x1, 32, PostIndex)); __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), MemOperand(x0)); __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), MemOperand(x1, x2, PostIndex)); __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 24, PostIndex)); __ ld1(v3.V4H(), v4.V4H(), MemOperand(x0)); __ ld1(v3.V4H(), v4.V4H(), MemOperand(x1, x2, PostIndex)); __ ld1(v23.V4H(), v24.V4H(), MemOperand(x1, 16, PostIndex)); __ ld1(v26.V4H(), MemOperand(x0)); __ ld1(v1.V4H(), MemOperand(x1, x2, PostIndex)); __ ld1(v14.V4H(), MemOperand(x1, 8, PostIndex)); __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), MemOperand(x0)); __ ld1(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex)); __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1, 64, PostIndex)); __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0)); __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), MemOperand(x1, x2, PostIndex)); __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), MemOperand(x1, 48, PostIndex)); __ ld1(v20.V4S(), v21.V4S(), MemOperand(x0)); __ ld1(v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex)); __ ld1(v11.V4S(), v12.V4S(), MemOperand(x1, 32, PostIndex)); __ ld1(v15.V4S(), MemOperand(x0)); __ ld1(v12.V4S(), MemOperand(x1, x2, PostIndex)); __ ld1(v0.V4S(), MemOperand(x1, 16, PostIndex)); __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), MemOperand(x0)); __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, x2, PostIndex)); __ ld1(v9.V8B(), v10.V8B(), v11.V8B(), v12.V8B(), MemOperand(x1, 32, PostIndex)); __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), MemOperand(x0)); __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x1, x2, PostIndex)); __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex)); __ ld1(v10.V8B(), v11.V8B(), MemOperand(x0)); __ ld1(v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex)); __ ld1(v27.V8B(), v28.V8B(), MemOperand(x1, 16, PostIndex)); __ ld1(v31.V8B(), MemOperand(x0)); __ ld1(v10.V8B(), MemOperand(x1, x2, PostIndex)); __ ld1(v28.V8B(), MemOperand(x1, 8, PostIndex)); __ 
ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0)); __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex)); __ ld1(v10.V8H(), v11.V8H(), v12.V8H(), v13.V8H(), MemOperand(x1, 64, PostIndex)); __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0)); __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex)); __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), MemOperand(x1, 48, PostIndex)); __ ld1(v4.V8H(), v5.V8H(), MemOperand(x0)); __ ld1(v21.V8H(), v22.V8H(), MemOperand(x1, x2, PostIndex)); __ ld1(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex)); __ ld1(v9.V8H(), MemOperand(x0)); __ ld1(v27.V8H(), MemOperand(x1, x2, PostIndex)); __ ld1(v26.V8H(), MemOperand(x1, 16, PostIndex)); __ ld1(v19.B(), 1, MemOperand(x0)); __ ld1(v12.B(), 3, MemOperand(x1, x2, PostIndex)); __ ld1(v27.B(), 12, MemOperand(x1, 1, PostIndex)); __ ld1(v10.D(), 1, MemOperand(x0)); __ ld1(v26.D(), 1, MemOperand(x1, x2, PostIndex)); __ ld1(v7.D(), 1, MemOperand(x1, 8, PostIndex)); __ ld1(v19.H(), 5, MemOperand(x0)); __ ld1(v10.H(), 1, MemOperand(x1, x2, PostIndex)); __ ld1(v5.H(), 4, MemOperand(x1, 2, PostIndex)); __ ld1(v21.S(), 2, MemOperand(x0)); __ ld1(v13.S(), 2, MemOperand(x1, x2, PostIndex)); __ ld1(v1.S(), 2, MemOperand(x1, 4, PostIndex)); __ ld1r(v2.V16B(), MemOperand(x0)); __ ld1r(v2.V16B(), MemOperand(x1, x2, PostIndex)); __ ld1r(v22.V16B(), MemOperand(x1, 1, PostIndex)); __ ld1r(v25.V1D(), MemOperand(x0)); __ ld1r(v9.V1D(), MemOperand(x1, x2, PostIndex)); __ ld1r(v23.V1D(), MemOperand(x1, 8, PostIndex)); __ ld1r(v19.V2D(), MemOperand(x0)); __ ld1r(v21.V2D(), MemOperand(x1, x2, PostIndex)); __ ld1r(v30.V2D(), MemOperand(x1, 8, PostIndex)); __ ld1r(v24.V2S(), MemOperand(x0)); __ ld1r(v26.V2S(), MemOperand(x1, x2, PostIndex)); __ ld1r(v28.V2S(), MemOperand(x1, 4, PostIndex)); __ ld1r(v19.V4H(), MemOperand(x0)); __ ld1r(v1.V4H(), MemOperand(x1, x2, PostIndex)); __ ld1r(v21.V4H(), MemOperand(x1, 2, PostIndex)); __ ld1r(v15.V4S(), MemOperand(x0)); __ 
ld1r(v21.V4S(), MemOperand(x1, x2, PostIndex)); __ ld1r(v23.V4S(), MemOperand(x1, 4, PostIndex)); __ ld1r(v26.V8B(), MemOperand(x0)); __ ld1r(v14.V8B(), MemOperand(x1, x2, PostIndex)); __ ld1r(v19.V8B(), MemOperand(x1, 1, PostIndex)); __ ld1r(v13.V8H(), MemOperand(x0)); __ ld1r(v30.V8H(), MemOperand(x1, x2, PostIndex)); __ ld1r(v27.V8H(), MemOperand(x1, 2, PostIndex)); __ ld2(v21.V16B(), v22.V16B(), MemOperand(x0)); __ ld2(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex)); __ ld2(v12.V16B(), v13.V16B(), MemOperand(x1, 32, PostIndex)); __ ld2(v14.V2D(), v15.V2D(), MemOperand(x0)); __ ld2(v0.V2D(), v1.V2D(), MemOperand(x1, x2, PostIndex)); __ ld2(v12.V2D(), v13.V2D(), MemOperand(x1, 32, PostIndex)); __ ld2(v27.V2S(), v28.V2S(), MemOperand(x0)); __ ld2(v2.V2S(), v3.V2S(), MemOperand(x1, x2, PostIndex)); __ ld2(v12.V2S(), v13.V2S(), MemOperand(x1, 16, PostIndex)); __ ld2(v9.V4H(), v10.V4H(), MemOperand(x0)); __ ld2(v23.V4H(), v24.V4H(), MemOperand(x1, x2, PostIndex)); __ ld2(v1.V4H(), v2.V4H(), MemOperand(x1, 16, PostIndex)); __ ld2(v20.V4S(), v21.V4S(), MemOperand(x0)); __ ld2(v10.V4S(), v11.V4S(), MemOperand(x1, x2, PostIndex)); __ ld2(v24.V4S(), v25.V4S(), MemOperand(x1, 32, PostIndex)); __ ld2(v17.V8B(), v18.V8B(), MemOperand(x0)); __ ld2(v13.V8B(), v14.V8B(), MemOperand(x1, x2, PostIndex)); __ ld2(v7.V8B(), v8.V8B(), MemOperand(x1, 16, PostIndex)); __ ld2(v30.V8H(), v31.V8H(), MemOperand(x0)); __ ld2(v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex)); __ ld2(v13.V8H(), v14.V8H(), MemOperand(x1, 32, PostIndex)); __ ld2(v5.B(), v6.B(), 12, MemOperand(x0)); __ ld2(v16.B(), v17.B(), 7, MemOperand(x1, x2, PostIndex)); __ ld2(v29.B(), v30.B(), 2, MemOperand(x1, 2, PostIndex)); __ ld2(v11.D(), v12.D(), 1, MemOperand(x0)); __ ld2(v26.D(), v27.D(), 0, MemOperand(x1, x2, PostIndex)); __ ld2(v25.D(), v26.D(), 0, MemOperand(x1, 16, PostIndex)); __ ld2(v18.H(), v19.H(), 7, MemOperand(x0)); __ ld2(v17.H(), v18.H(), 5, MemOperand(x1, x2, PostIndex)); __ ld2(v30.H(), 
v31.H(), 2, MemOperand(x1, 4, PostIndex)); __ ld2(v29.S(), v30.S(), 3, MemOperand(x0)); __ ld2(v28.S(), v29.S(), 0, MemOperand(x1, x2, PostIndex)); __ ld2(v6.S(), v7.S(), 1, MemOperand(x1, 8, PostIndex)); __ ld2r(v26.V16B(), v27.V16B(), MemOperand(x0)); __ ld2r(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex)); __ ld2r(v5.V16B(), v6.V16B(), MemOperand(x1, 2, PostIndex)); __ ld2r(v26.V1D(), v27.V1D(), MemOperand(x0)); __ ld2r(v14.V1D(), v15.V1D(), MemOperand(x1, x2, PostIndex)); __ ld2r(v23.V1D(), v24.V1D(), MemOperand(x1, 16, PostIndex)); __ ld2r(v11.V2D(), v12.V2D(), MemOperand(x0)); __ ld2r(v29.V2D(), v30.V2D(), MemOperand(x1, x2, PostIndex)); __ ld2r(v15.V2D(), v16.V2D(), MemOperand(x1, 16, PostIndex)); __ ld2r(v26.V2S(), v27.V2S(), MemOperand(x0)); __ ld2r(v22.V2S(), v23.V2S(), MemOperand(x1, x2, PostIndex)); __ ld2r(v2.V2S(), v3.V2S(), MemOperand(x1, 8, PostIndex)); __ ld2r(v2.V4H(), v3.V4H(), MemOperand(x0)); __ ld2r(v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex)); __ ld2r(v6.V4H(), v7.V4H(), MemOperand(x1, 4, PostIndex)); __ ld2r(v7.V4S(), v8.V4S(), MemOperand(x0)); __ ld2r(v19.V4S(), v20.V4S(), MemOperand(x1, x2, PostIndex)); __ ld2r(v21.V4S(), v22.V4S(), MemOperand(x1, 8, PostIndex)); __ ld2r(v26.V8B(), v27.V8B(), MemOperand(x0)); __ ld2r(v20.V8B(), v21.V8B(), MemOperand(x1, x2, PostIndex)); __ ld2r(v11.V8B(), v12.V8B(), MemOperand(x1, 2, PostIndex)); __ ld2r(v12.V8H(), v13.V8H(), MemOperand(x0)); __ ld2r(v6.V8H(), v7.V8H(), MemOperand(x1, x2, PostIndex)); __ ld2r(v25.V8H(), v26.V8H(), MemOperand(x1, 4, PostIndex)); __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x0)); __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), MemOperand(x1, x2, PostIndex)); __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x1, 48, PostIndex)); __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), MemOperand(x0)); __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), MemOperand(x1, x2, PostIndex)); __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), MemOperand(x1, 48, PostIndex)); __ ld3(v7.V2S(), v8.V2S(), 
v9.V2S(), MemOperand(x0)); __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), MemOperand(x1, x2, PostIndex)); __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), MemOperand(x1, 24, PostIndex)); __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), MemOperand(x0)); __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), MemOperand(x1, x2, PostIndex)); __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 24, PostIndex)); __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0)); __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), MemOperand(x1, x2, PostIndex)); __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), MemOperand(x1, 48, PostIndex)); __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x0)); __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex)); __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex)); __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), MemOperand(x0)); __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), MemOperand(x1, x2, PostIndex)); __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), MemOperand(x1, 48, PostIndex)); __ ld3(v21.B(), v22.B(), v23.B(), 11, MemOperand(x0)); __ ld3(v5.B(), v6.B(), v7.B(), 9, MemOperand(x1, x2, PostIndex)); __ ld3(v23.B(), v24.B(), v25.B(), 0, MemOperand(x1, 3, PostIndex)); __ ld3(v16.D(), v17.D(), v18.D(), 0, MemOperand(x0)); __ ld3(v30.D(), v31.D(), v0.D(), 0, MemOperand(x1, x2, PostIndex)); __ ld3(v28.D(), v29.D(), v30.D(), 1, MemOperand(x1, 24, PostIndex)); __ ld3(v13.H(), v14.H(), v15.H(), 2, MemOperand(x0)); __ ld3(v22.H(), v23.H(), v24.H(), 7, MemOperand(x1, x2, PostIndex)); __ ld3(v14.H(), v15.H(), v16.H(), 3, MemOperand(x1, 6, PostIndex)); __ ld3(v22.S(), v23.S(), v24.S(), 3, MemOperand(x0)); __ ld3(v30.S(), v31.S(), v0.S(), 2, MemOperand(x1, x2, PostIndex)); __ ld3(v12.S(), v13.S(), v14.S(), 1, MemOperand(x1, 12, PostIndex)); __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x0)); __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, x2, PostIndex)); __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), MemOperand(x1, 3, PostIndex)); __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), MemOperand(x0)); __ 
ld3r(v7.V1D(), v8.V1D(), v9.V1D(), MemOperand(x1, x2, PostIndex)); __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), MemOperand(x1, 24, PostIndex)); __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x0)); __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex)); __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), MemOperand(x1, 24, PostIndex)); __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), MemOperand(x0)); __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x1, x2, PostIndex)); __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, 12, PostIndex)); __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), MemOperand(x0)); __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), MemOperand(x1, x2, PostIndex)); __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 6, PostIndex)); __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x0)); __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), MemOperand(x1, x2, PostIndex)); __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, 12, PostIndex)); __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x0)); __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex)); __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, 3, PostIndex)); __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0)); __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x1, x2, PostIndex)); __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), MemOperand(x1, 6, PostIndex)); __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), MemOperand(x0)); __ ld4(v2.V16B(), v3.V16B(), v4.V16B(), v5.V16B(), MemOperand(x1, x2, PostIndex)); __ ld4(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), MemOperand(x1, 64, PostIndex)); __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), MemOperand(x0)); __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex)); __ ld4(v29.V2D(), v30.V2D(), v31.V2D(), v0.V2D(), MemOperand(x1, 64, PostIndex)); __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), MemOperand(x0)); __ ld4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), MemOperand(x1, x2, PostIndex)); __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), 
MemOperand(x1, 32, PostIndex)); __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0)); __ ld4(v23.V4H(), v24.V4H(), v25.V4H(), v26.V4H(), MemOperand(x1, x2, PostIndex)); __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 32, PostIndex)); __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), MemOperand(x0)); __ ld4(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex)); __ ld4(v29.V4S(), v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, 64, PostIndex)); __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), MemOperand(x0)); __ ld4(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, x2, PostIndex)); __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, 32, PostIndex)); __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0)); __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex)); __ ld4(v20.V8H(), v21.V8H(), v22.V8H(), v23.V8H(), MemOperand(x1, 64, PostIndex)); __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, MemOperand(x0)); __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, MemOperand(x1, x2, PostIndex)); __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, MemOperand(x1, 4, PostIndex)); __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, MemOperand(x0)); __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex)); __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, MemOperand(x1, 32, PostIndex)); __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, MemOperand(x0)); __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, MemOperand(x1, x2, PostIndex)); __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, MemOperand(x1, 8, PostIndex)); __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, MemOperand(x0)); __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, MemOperand(x1, x2, PostIndex)); __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, MemOperand(x1, 16, PostIndex)); __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), MemOperand(x0)); __ ld4r(v13.V16B(), v14.V16B(), v15.V16B(), v16.V16B(), MemOperand(x1, x2, PostIndex)); __ ld4r(v9.V16B(), 
v10.V16B(), v11.V16B(), v12.V16B(), MemOperand(x1, 4, PostIndex)); __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), MemOperand(x0)); __ ld4r(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), MemOperand(x1, x2, PostIndex)); __ ld4r(v26.V1D(), v27.V1D(), v28.V1D(), v29.V1D(), MemOperand(x1, 32, PostIndex)); __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x0)); __ ld4r(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x1, x2, PostIndex)); __ ld4r(v15.V2D(), v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x1, 32, PostIndex)); __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x0)); __ ld4r(v28.V2S(), v29.V2S(), v30.V2S(), v31.V2S(), MemOperand(x1, x2, PostIndex)); __ ld4r(v11.V2S(), v12.V2S(), v13.V2S(), v14.V2S(), MemOperand(x1, 16, PostIndex)); __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), MemOperand(x0)); __ ld4r(v22.V4H(), v23.V4H(), v24.V4H(), v25.V4H(), MemOperand(x1, x2, PostIndex)); __ ld4r(v20.V4H(), v21.V4H(), v22.V4H(), v23.V4H(), MemOperand(x1, 8, PostIndex)); __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x0)); __ ld4r(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x1, x2, PostIndex)); __ ld4r(v23.V4S(), v24.V4S(), v25.V4S(), v26.V4S(), MemOperand(x1, 16, PostIndex)); __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), MemOperand(x0)); __ ld4r(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, x2, PostIndex)); __ ld4r(v29.V8B(), v30.V8B(), v31.V8B(), v0.V8B(), MemOperand(x1, 4, PostIndex)); __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x0)); __ ld4r(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x1, x2, PostIndex)); __ ld4r(v22.V8H(), v23.V8H(), v24.V8H(), v25.V8H(), MemOperand(x1, 8, PostIndex)); __ mla(v29.V16B(), v7.V16B(), v26.V16B()); __ mla(v6.V2S(), v4.V2S(), v14.V2S()); __ mla(v9.V2S(), v11.V2S(), v0.S(), 2); __ mla(v5.V4H(), v17.V4H(), v25.V4H()); __ mla(v24.V4H(), v7.V4H(), v11.H(), 3); __ mla(v12.V4S(), v3.V4S(), v4.V4S()); __ mla(v10.V4S(), v7.V4S(), v7.S(), 3); __ 
mla(v3.V8B(), v16.V8B(), v9.V8B()); __ mla(v19.V8H(), v22.V8H(), v18.V8H()); __ mla(v6.V8H(), v2.V8H(), v0.H(), 0); __ mls(v23.V16B(), v10.V16B(), v11.V16B()); __ mls(v14.V2S(), v31.V2S(), v22.V2S()); __ mls(v28.V2S(), v13.V2S(), v1.S(), 3); __ mls(v2.V4H(), v19.V4H(), v13.V4H()); __ mls(v18.V4H(), v15.V4H(), v12.H(), 6); __ mls(v6.V4S(), v11.V4S(), v16.V4S()); __ mls(v23.V4S(), v16.V4S(), v10.S(), 2); __ mls(v26.V8B(), v13.V8B(), v23.V8B()); __ mls(v10.V8H(), v10.V8H(), v12.V8H()); __ mls(v14.V8H(), v0.V8H(), v14.H(), 7); __ mov(b22, v1.B(), 3); __ mov(d7, v13.D(), 1); __ mov(h26, v21.H(), 2); __ mov(s26, v19.S(), 0); __ mov(v26.V16B(), v11.V16B()); __ mov(v20.V8B(), v0.V8B()); __ mov(v19.B(), 13, v6.B(), 4); __ mov(v4.B(), 13, w19); __ mov(v11.D(), 1, v8.D(), 0); __ mov(v3.D(), 0, x30); __ mov(v29.H(), 4, v11.H(), 7); __ mov(v2.H(), 6, w6); __ mov(v22.S(), 0, v5.S(), 2); __ mov(v24.S(), 3, w8); __ mov(w18, v1.S(), 3); __ mov(x28, v21.D(), 0); __ movi(d24, 0xffff0000ffffff); __ movi(v29.V16B(), 0x80); __ movi(v12.V2D(), 0xffff00ff00ffff00); __ movi(v12.V2S(), 0xec, LSL, 24); __ movi(v10.V2S(), 0x4c, MSL, 16); __ movi(v26.V4H(), 0xc0, LSL); __ movi(v24.V4S(), 0x98, LSL, 16); __ movi(v1.V4S(), 0xde, MSL, 16); __ movi(v21.V8B(), 0x4d); __ movi(v29.V8H(), 0x69, LSL); __ mul(v1.V16B(), v15.V16B(), v17.V16B()); __ mul(v21.V2S(), v19.V2S(), v29.V2S()); __ mul(v19.V2S(), v5.V2S(), v3.S(), 0); __ mul(v29.V4H(), v11.V4H(), v2.V4H()); __ mul(v2.V4H(), v7.V4H(), v0.H(), 0); __ mul(v25.V4S(), v26.V4S(), v16.V4S()); __ mul(v26.V4S(), v6.V4S(), v15.S(), 2); __ mul(v11.V8B(), v15.V8B(), v31.V8B()); __ mul(v20.V8H(), v31.V8H(), v15.V8H()); __ mul(v29.V8H(), v5.V8H(), v9.H(), 4); __ mvn(v13.V16B(), v21.V16B()); __ mvn(v28.V8B(), v19.V8B()); __ mvni(v25.V2S(), 0xb8, LSL, 8); __ mvni(v17.V2S(), 0x6c, MSL, 16); __ mvni(v29.V4H(), 0x48, LSL); __ mvni(v20.V4S(), 0x7a, LSL, 16); __ mvni(v0.V4S(), 0x1e, MSL, 8); __ mvni(v31.V8H(), 0x3e, LSL); __ neg(d25, d11); __ neg(v4.V16B(), 
v9.V16B()); __ neg(v11.V2D(), v25.V2D()); __ neg(v7.V2S(), v18.V2S()); __ neg(v7.V4H(), v15.V4H()); __ neg(v17.V4S(), v18.V4S()); __ neg(v20.V8B(), v17.V8B()); __ neg(v0.V8H(), v11.V8H()); __ orn(v13.V16B(), v11.V16B(), v31.V16B()); __ orn(v22.V8B(), v16.V8B(), v22.V8B()); __ orr(v17.V16B(), v17.V16B(), v23.V16B()); __ orr(v8.V2S(), 0xe3); __ orr(v11.V4H(), 0x97, 8); __ orr(v7.V4S(), 0xab); __ orr(v8.V8B(), v4.V8B(), v3.V8B()); __ orr(v31.V8H(), 0xb0, 8); __ pmul(v11.V16B(), v18.V16B(), v23.V16B()); __ pmul(v8.V8B(), v24.V8B(), v5.V8B()); __ pmull(v24.V8H(), v18.V8B(), v22.V8B()); __ pmull2(v13.V8H(), v3.V16B(), v21.V16B()); __ raddhn(v22.V2S(), v10.V2D(), v21.V2D()); __ raddhn(v5.V4H(), v13.V4S(), v13.V4S()); __ raddhn(v10.V8B(), v17.V8H(), v26.V8H()); __ raddhn2(v9.V16B(), v29.V8H(), v13.V8H()); __ raddhn2(v27.V4S(), v23.V2D(), v26.V2D()); __ raddhn2(v0.V8H(), v29.V4S(), v7.V4S()); __ rbit(v22.V16B(), v15.V16B()); __ rbit(v30.V8B(), v3.V8B()); __ rev16(v31.V16B(), v27.V16B()); __ rev16(v12.V8B(), v26.V8B()); __ rev32(v5.V16B(), v4.V16B()); __ rev32(v16.V4H(), v26.V4H()); __ rev32(v20.V8B(), v3.V8B()); __ rev32(v20.V8H(), v28.V8H()); __ rev64(v9.V16B(), v19.V16B()); __ rev64(v5.V2S(), v16.V2S()); __ rev64(v7.V4H(), v31.V4H()); __ rev64(v15.V4S(), v26.V4S()); __ rev64(v25.V8B(), v9.V8B()); __ rev64(v11.V8H(), v5.V8H()); __ rshrn(v18.V2S(), v13.V2D(), 1); __ rshrn(v25.V4H(), v30.V4S(), 2); __ rshrn(v13.V8B(), v9.V8H(), 8); __ rshrn2(v3.V16B(), v6.V8H(), 8); __ rshrn2(v0.V4S(), v29.V2D(), 25); __ rshrn2(v27.V8H(), v26.V4S(), 15); __ rsubhn(v15.V2S(), v25.V2D(), v4.V2D()); __ rsubhn(v23.V4H(), v9.V4S(), v3.V4S()); __ rsubhn(v6.V8B(), v30.V8H(), v24.V8H()); __ rsubhn2(v4.V16B(), v24.V8H(), v20.V8H()); __ rsubhn2(v1.V4S(), v23.V2D(), v22.V2D()); __ rsubhn2(v19.V8H(), v2.V4S(), v20.V4S()); __ saba(v28.V16B(), v9.V16B(), v25.V16B()); __ saba(v9.V2S(), v28.V2S(), v20.V2S()); __ saba(v17.V4H(), v22.V4H(), v22.V4H()); __ saba(v29.V4S(), v5.V4S(), v27.V4S()); __ 
saba(v20.V8B(), v21.V8B(), v18.V8B()); __ saba(v27.V8H(), v17.V8H(), v30.V8H()); __ sabal(v20.V2D(), v13.V2S(), v7.V2S()); __ sabal(v4.V4S(), v12.V4H(), v4.V4H()); __ sabal(v23.V8H(), v24.V8B(), v20.V8B()); __ sabal2(v26.V2D(), v21.V4S(), v18.V4S()); __ sabal2(v27.V4S(), v28.V8H(), v8.V8H()); __ sabal2(v12.V8H(), v16.V16B(), v21.V16B()); __ sabd(v0.V16B(), v15.V16B(), v13.V16B()); __ sabd(v15.V2S(), v7.V2S(), v30.V2S()); __ sabd(v17.V4H(), v17.V4H(), v12.V4H()); __ sabd(v7.V4S(), v4.V4S(), v22.V4S()); __ sabd(v23.V8B(), v3.V8B(), v26.V8B()); __ sabd(v20.V8H(), v28.V8H(), v5.V8H()); __ sabdl(v27.V2D(), v22.V2S(), v20.V2S()); __ sabdl(v31.V4S(), v20.V4H(), v23.V4H()); __ sabdl(v0.V8H(), v20.V8B(), v27.V8B()); __ sabdl2(v31.V2D(), v11.V4S(), v3.V4S()); __ sabdl2(v26.V4S(), v11.V8H(), v27.V8H()); __ sabdl2(v6.V8H(), v8.V16B(), v18.V16B()); __ sadalp(v8.V1D(), v26.V2S()); __ sadalp(v12.V2D(), v26.V4S()); __ sadalp(v12.V2S(), v26.V4H()); __ sadalp(v4.V4H(), v1.V8B()); __ sadalp(v15.V4S(), v17.V8H()); __ sadalp(v21.V8H(), v25.V16B()); __ saddl(v5.V2D(), v10.V2S(), v14.V2S()); __ saddl(v18.V4S(), v3.V4H(), v15.V4H()); __ saddl(v15.V8H(), v2.V8B(), v23.V8B()); __ saddl2(v16.V2D(), v16.V4S(), v27.V4S()); __ saddl2(v6.V4S(), v24.V8H(), v0.V8H()); __ saddl2(v7.V8H(), v20.V16B(), v28.V16B()); __ saddlp(v10.V1D(), v25.V2S()); __ saddlp(v15.V2D(), v16.V4S()); __ saddlp(v18.V2S(), v10.V4H()); __ saddlp(v29.V4H(), v26.V8B()); __ saddlp(v10.V4S(), v1.V8H()); __ saddlp(v0.V8H(), v21.V16B()); __ saddlv(d12, v7.V4S()); __ saddlv(h14, v28.V16B()); __ saddlv(h30, v30.V8B()); __ saddlv(s27, v3.V4H()); __ saddlv(s16, v16.V8H()); __ saddw(v24.V2D(), v11.V2D(), v18.V2S()); __ saddw(v13.V4S(), v12.V4S(), v6.V4H()); __ saddw(v19.V8H(), v19.V8H(), v7.V8B()); __ saddw2(v27.V2D(), v9.V2D(), v26.V4S()); __ saddw2(v19.V4S(), v23.V4S(), v21.V8H()); __ saddw2(v15.V8H(), v25.V8H(), v30.V16B()); __ shadd(v7.V16B(), v4.V16B(), v9.V16B()); __ shadd(v29.V2S(), v25.V2S(), v24.V2S()); __ shadd(v31.V4H(), 
v10.V4H(), v13.V4H()); __ shadd(v21.V4S(), v16.V4S(), v8.V4S()); __ shadd(v14.V8B(), v29.V8B(), v22.V8B()); __ shadd(v19.V8H(), v24.V8H(), v20.V8H()); __ shl(d22, d25, 23); __ shl(v5.V16B(), v17.V16B(), 7); __ shl(v2.V2D(), v4.V2D(), 21); __ shl(v4.V2S(), v3.V2S(), 26); __ shl(v3.V4H(), v28.V4H(), 8); __ shl(v4.V4S(), v31.V4S(), 24); __ shl(v18.V8B(), v16.V8B(), 2); __ shl(v0.V8H(), v11.V8H(), 3); __ shll(v5.V2D(), v24.V2S(), 32); __ shll(v26.V4S(), v20.V4H(), 16); __ shll(v5.V8H(), v9.V8B(), 8); __ shll2(v21.V2D(), v28.V4S(), 32); __ shll2(v22.V4S(), v1.V8H(), 16); __ shll2(v30.V8H(), v25.V16B(), 8); __ shrn(v5.V2S(), v1.V2D(), 28); __ shrn(v29.V4H(), v18.V4S(), 7); __ shrn(v17.V8B(), v29.V8H(), 2); __ shrn2(v5.V16B(), v30.V8H(), 3); __ shrn2(v24.V4S(), v1.V2D(), 1); __ shrn2(v5.V8H(), v14.V4S(), 16); __ shsub(v30.V16B(), v22.V16B(), v23.V16B()); __ shsub(v22.V2S(), v27.V2S(), v25.V2S()); __ shsub(v13.V4H(), v22.V4H(), v1.V4H()); __ shsub(v10.V4S(), v8.V4S(), v23.V4S()); __ shsub(v6.V8B(), v9.V8B(), v31.V8B()); __ shsub(v8.V8H(), v31.V8H(), v8.V8H()); __ sli(d19, d29, 20); __ sli(v9.V16B(), v24.V16B(), 0); __ sli(v22.V2D(), v9.V2D(), 10); __ sli(v11.V2S(), v27.V2S(), 20); __ sli(v16.V4H(), v15.V4H(), 5); __ sli(v8.V4S(), v8.V4S(), 25); __ sli(v10.V8B(), v30.V8B(), 0); __ sli(v7.V8H(), v28.V8H(), 6); __ smax(v18.V16B(), v8.V16B(), v1.V16B()); __ smax(v30.V2S(), v5.V2S(), v1.V2S()); __ smax(v17.V4H(), v25.V4H(), v19.V4H()); __ smax(v1.V4S(), v24.V4S(), v31.V4S()); __ smax(v17.V8B(), v24.V8B(), v24.V8B()); __ smax(v11.V8H(), v26.V8H(), v10.V8H()); __ smaxp(v12.V16B(), v14.V16B(), v7.V16B()); __ smaxp(v31.V2S(), v24.V2S(), v6.V2S()); __ smaxp(v10.V4H(), v29.V4H(), v10.V4H()); __ smaxp(v18.V4S(), v11.V4S(), v7.V4S()); __ smaxp(v21.V8B(), v0.V8B(), v18.V8B()); __ smaxp(v26.V8H(), v8.V8H(), v15.V8H()); __ smaxv(b4, v5.V16B()); __ smaxv(b23, v0.V8B()); __ smaxv(h6, v0.V4H()); __ smaxv(h24, v8.V8H()); __ smaxv(s3, v16.V4S()); __ smin(v24.V16B(), v8.V16B(), v18.V16B()); __ 
smin(v29.V2S(), v8.V2S(), v23.V2S()); __ smin(v6.V4H(), v11.V4H(), v21.V4H()); __ smin(v24.V4S(), v23.V4S(), v15.V4S()); __ smin(v8.V8B(), v16.V8B(), v4.V8B()); __ smin(v12.V8H(), v1.V8H(), v10.V8H()); __ sminp(v13.V16B(), v18.V16B(), v28.V16B()); __ sminp(v22.V2S(), v28.V2S(), v16.V2S()); __ sminp(v15.V4H(), v12.V4H(), v5.V4H()); __ sminp(v15.V4S(), v17.V4S(), v8.V4S()); __ sminp(v21.V8B(), v2.V8B(), v6.V8B()); __ sminp(v21.V8H(), v12.V8H(), v6.V8H()); __ sminv(b8, v6.V16B()); __ sminv(b6, v18.V8B()); __ sminv(h20, v1.V4H()); __ sminv(h7, v17.V8H()); __ sminv(s21, v4.V4S()); __ smlal(v24.V2D(), v14.V2S(), v21.V2S()); __ smlal(v31.V2D(), v3.V2S(), v14.S(), 2); __ smlal(v7.V4S(), v20.V4H(), v21.V4H()); __ smlal(v19.V4S(), v16.V4H(), v9.H(), 3); __ smlal(v29.V8H(), v14.V8B(), v1.V8B()); __ smlal2(v30.V2D(), v26.V4S(), v16.V4S()); __ smlal2(v31.V2D(), v30.V4S(), v1.S(), 0); __ smlal2(v17.V4S(), v6.V8H(), v3.V8H()); __ smlal2(v11.V4S(), v31.V8H(), v5.H(), 7); __ smlal2(v30.V8H(), v16.V16B(), v29.V16B()); __ smlsl(v1.V2D(), v20.V2S(), v17.V2S()); __ smlsl(v29.V2D(), v12.V2S(), v5.S(), 3); __ smlsl(v0.V4S(), v26.V4H(), v1.V4H()); __ smlsl(v3.V4S(), v5.V4H(), v6.H(), 5); __ smlsl(v4.V8H(), v0.V8B(), v26.V8B()); __ smlsl2(v14.V2D(), v14.V4S(), v5.V4S()); __ smlsl2(v15.V2D(), v5.V4S(), v0.S(), 1); __ smlsl2(v29.V4S(), v17.V8H(), v31.V8H()); __ smlsl2(v6.V4S(), v15.V8H(), v9.H(), 6); __ smlsl2(v30.V8H(), v15.V16B(), v15.V16B()); __ smov(w21, v6.B(), 3); __ smov(w13, v26.H(), 7); __ smov(x24, v16.B(), 7); __ smov(x7, v4.H(), 3); __ smov(x29, v7.S(), 1); __ smull(v4.V2D(), v29.V2S(), v17.V2S()); __ smull(v30.V2D(), v21.V2S(), v6.S(), 2); __ smull(v23.V4S(), v5.V4H(), v23.V4H()); __ smull(v8.V4S(), v9.V4H(), v2.H(), 1); __ smull(v31.V8H(), v17.V8B(), v1.V8B()); __ smull2(v3.V2D(), v3.V4S(), v23.V4S()); __ smull2(v15.V2D(), v29.V4S(), v6.S(), 1); __ smull2(v19.V4S(), v20.V8H(), v30.V8H()); __ smull2(v6.V4S(), v10.V8H(), v7.H(), 4); __ smull2(v25.V8H(), v8.V16B(), v27.V16B()); __ 
sqabs(b3, b15); __ sqabs(d14, d9); __ sqabs(h31, h28); __ sqabs(s8, s0); __ sqabs(v14.V16B(), v7.V16B()); __ sqabs(v23.V2D(), v19.V2D()); __ sqabs(v10.V2S(), v24.V2S()); __ sqabs(v31.V4H(), v19.V4H()); __ sqabs(v23.V4S(), v0.V4S()); __ sqabs(v29.V8B(), v23.V8B()); __ sqabs(v17.V8H(), v21.V8H()); __ sqadd(b9, b23, b13); __ sqadd(d2, d25, d26); __ sqadd(h7, h29, h25); __ sqadd(s11, s7, s24); __ sqadd(v20.V16B(), v16.V16B(), v29.V16B()); __ sqadd(v23.V2D(), v30.V2D(), v28.V2D()); __ sqadd(v8.V2S(), v19.V2S(), v2.V2S()); __ sqadd(v20.V4H(), v12.V4H(), v31.V4H()); __ sqadd(v14.V4S(), v15.V4S(), v17.V4S()); __ sqadd(v2.V8B(), v29.V8B(), v13.V8B()); __ sqadd(v7.V8H(), v19.V8H(), v14.V8H()); __ sqdmlal(d15, s5, s30); __ sqdmlal(d24, s10, v2.S(), 3); __ sqdmlal(s9, h19, h8); __ sqdmlal(s14, h1, v12.H(), 3); __ sqdmlal(v30.V2D(), v5.V2S(), v31.V2S()); __ sqdmlal(v25.V2D(), v14.V2S(), v10.S(), 1); __ sqdmlal(v19.V4S(), v17.V4H(), v16.V4H()); __ sqdmlal(v8.V4S(), v5.V4H(), v8.H(), 1); __ sqdmlal2(v1.V2D(), v23.V4S(), v3.V4S()); __ sqdmlal2(v19.V2D(), v0.V4S(), v9.S(), 0); __ sqdmlal2(v26.V4S(), v22.V8H(), v11.V8H()); __ sqdmlal2(v6.V4S(), v28.V8H(), v13.H(), 4); __ sqdmlsl(d10, s29, s20); __ sqdmlsl(d10, s9, v10.S(), 1); __ sqdmlsl(s30, h9, h24); __ sqdmlsl(s13, h24, v6.H(), 1); __ sqdmlsl(v27.V2D(), v10.V2S(), v20.V2S()); __ sqdmlsl(v23.V2D(), v23.V2S(), v3.S(), 3); __ sqdmlsl(v7.V4S(), v17.V4H(), v29.V4H()); __ sqdmlsl(v22.V4S(), v21.V4H(), v3.H(), 4); __ sqdmlsl2(v12.V2D(), v7.V4S(), v22.V4S()); __ sqdmlsl2(v20.V2D(), v25.V4S(), v8.S(), 0); __ sqdmlsl2(v25.V4S(), v26.V8H(), v18.V8H()); __ sqdmlsl2(v25.V4S(), v19.V8H(), v5.H(), 0); __ sqdmulh(h17, h27, h12); __ sqdmulh(h16, h5, v11.H(), 0); __ sqdmulh(s1, s19, s16); __ sqdmulh(s1, s16, v2.S(), 0); __ sqdmulh(v28.V2S(), v1.V2S(), v8.V2S()); __ sqdmulh(v28.V2S(), v8.V2S(), v3.S(), 0); __ sqdmulh(v11.V4H(), v25.V4H(), v5.V4H()); __ sqdmulh(v30.V4H(), v14.V4H(), v8.H(), 5); __ sqdmulh(v25.V4S(), v21.V4S(), v13.V4S()); __ 
sqdmulh(v23.V4S(), v2.V4S(), v10.S(), 3); __ sqdmulh(v26.V8H(), v5.V8H(), v23.V8H()); __ sqdmulh(v4.V8H(), v22.V8H(), v4.H(), 3); __ sqdmull(d25, s2, s26); __ sqdmull(d30, s14, v5.S(), 1); __ sqdmull(s29, h18, h11); __ sqdmull(s11, h13, v7.H(), 6); __ sqdmull(v23.V2D(), v9.V2S(), v8.V2S()); __ sqdmull(v18.V2D(), v29.V2S(), v4.S(), 1); __ sqdmull(v17.V4S(), v24.V4H(), v7.V4H()); __ sqdmull(v8.V4S(), v15.V4H(), v5.H(), 1); __ sqdmull2(v28.V2D(), v14.V4S(), v2.V4S()); __ sqdmull2(v1.V2D(), v24.V4S(), v13.S(), 2); __ sqdmull2(v11.V4S(), v17.V8H(), v31.V8H()); __ sqdmull2(v1.V4S(), v20.V8H(), v11.H(), 3); __ sqneg(b2, b0); __ sqneg(d24, d2); __ sqneg(h29, h3); __ sqneg(s4, s9); __ sqneg(v14.V16B(), v29.V16B()); __ sqneg(v30.V2D(), v12.V2D()); __ sqneg(v28.V2S(), v26.V2S()); __ sqneg(v4.V4H(), v4.V4H()); __ sqneg(v9.V4S(), v8.V4S()); __ sqneg(v20.V8B(), v20.V8B()); __ sqneg(v27.V8H(), v10.V8H()); __ sqrdmulh(h7, h24, h0); __ sqrdmulh(h14, h3, v4.H(), 6); __ sqrdmulh(s27, s19, s24); __ sqrdmulh(s31, s21, v4.S(), 0); __ sqrdmulh(v18.V2S(), v25.V2S(), v1.V2S()); __ sqrdmulh(v22.V2S(), v5.V2S(), v13.S(), 0); __ sqrdmulh(v22.V4H(), v24.V4H(), v9.V4H()); __ sqrdmulh(v13.V4H(), v2.V4H(), v12.H(), 6); __ sqrdmulh(v9.V4S(), v27.V4S(), v2.V4S()); __ sqrdmulh(v3.V4S(), v23.V4S(), v7.S(), 1); __ sqrdmulh(v2.V8H(), v0.V8H(), v7.V8H()); __ sqrdmulh(v16.V8H(), v9.V8H(), v8.H(), 2); __ sqrshl(b8, b21, b13); __ sqrshl(d29, d7, d20); __ sqrshl(h28, h14, h10); __ sqrshl(s26, s18, s2); __ sqrshl(v18.V16B(), v31.V16B(), v26.V16B()); __ sqrshl(v28.V2D(), v4.V2D(), v0.V2D()); __ sqrshl(v3.V2S(), v6.V2S(), v0.V2S()); __ sqrshl(v1.V4H(), v18.V4H(), v22.V4H()); __ sqrshl(v16.V4S(), v25.V4S(), v7.V4S()); __ sqrshl(v0.V8B(), v21.V8B(), v5.V8B()); __ sqrshl(v30.V8H(), v19.V8H(), v8.V8H()); __ sqrshrn(b6, h21, 4); __ sqrshrn(h14, s17, 11); __ sqrshrn(s25, d27, 10); __ sqrshrn(v6.V2S(), v13.V2D(), 18); __ sqrshrn(v5.V4H(), v9.V4S(), 15); __ sqrshrn(v19.V8B(), v12.V8H(), 1); __ sqrshrn2(v19.V16B(), 
v21.V8H(), 7); __ sqrshrn2(v29.V4S(), v24.V2D(), 13); __ sqrshrn2(v12.V8H(), v2.V4S(), 10); __ sqrshrun(b16, h9, 5); __ sqrshrun(h3, s24, 15); __ sqrshrun(s16, d18, 8); __ sqrshrun(v28.V2S(), v23.V2D(), 8); __ sqrshrun(v31.V4H(), v25.V4S(), 10); __ sqrshrun(v19.V8B(), v23.V8H(), 2); __ sqrshrun2(v24.V16B(), v0.V8H(), 8); __ sqrshrun2(v22.V4S(), v1.V2D(), 23); __ sqrshrun2(v28.V8H(), v21.V4S(), 13); __ sqshl(b6, b21, b8); __ sqshl(b11, b26, 2); __ sqshl(d29, d0, d4); __ sqshl(d21, d7, 35); __ sqshl(h20, h25, h17); __ sqshl(h20, h0, 8); __ sqshl(s29, s13, s4); __ sqshl(s10, s11, 20); __ sqshl(v8.V16B(), v18.V16B(), v28.V16B()); __ sqshl(v29.V16B(), v29.V16B(), 2); __ sqshl(v8.V2D(), v31.V2D(), v16.V2D()); __ sqshl(v7.V2D(), v14.V2D(), 37); __ sqshl(v0.V2S(), v26.V2S(), v7.V2S()); __ sqshl(v5.V2S(), v11.V2S(), 19); __ sqshl(v11.V4H(), v30.V4H(), v0.V4H()); __ sqshl(v1.V4H(), v18.V4H(), 7); __ sqshl(v22.V4S(), v3.V4S(), v30.V4S()); __ sqshl(v16.V4S(), v15.V4S(), 28); __ sqshl(v6.V8B(), v28.V8B(), v25.V8B()); __ sqshl(v0.V8B(), v15.V8B(), 0); __ sqshl(v6.V8H(), v16.V8H(), v30.V8H()); __ sqshl(v3.V8H(), v20.V8H(), 14); __ sqshlu(b13, b14, 6); __ sqshlu(d0, d16, 44); __ sqshlu(h5, h29, 15); __ sqshlu(s29, s8, 13); __ sqshlu(v27.V16B(), v20.V16B(), 2); __ sqshlu(v24.V2D(), v12.V2D(), 11); __ sqshlu(v12.V2S(), v19.V2S(), 22); __ sqshlu(v8.V4H(), v12.V4H(), 11); __ sqshlu(v18.V4S(), v3.V4S(), 8); __ sqshlu(v3.V8B(), v10.V8B(), 1); __ sqshlu(v30.V8H(), v24.V8H(), 4); __ sqshrn(b1, h28, 1); __ sqshrn(h31, s7, 10); __ sqshrn(s4, d10, 24); __ sqshrn(v10.V2S(), v1.V2D(), 29); __ sqshrn(v3.V4H(), v13.V4S(), 14); __ sqshrn(v27.V8B(), v6.V8H(), 7); __ sqshrn2(v14.V16B(), v23.V8H(), 1); __ sqshrn2(v25.V4S(), v22.V2D(), 27); __ sqshrn2(v31.V8H(), v12.V4S(), 10); __ sqshrun(b9, h0, 1); __ sqshrun(h11, s6, 7); __ sqshrun(s13, d12, 13); __ sqshrun(v10.V2S(), v30.V2D(), 1); __ sqshrun(v31.V4H(), v3.V4S(), 11); __ sqshrun(v28.V8B(), v30.V8H(), 8); __ sqshrun2(v16.V16B(), v27.V8H(), 3); __ 
sqshrun2(v27.V4S(), v14.V2D(), 18); __ sqshrun2(v23.V8H(), v14.V4S(), 1); __ sqsub(b19, b29, b11); __ sqsub(d21, d31, d6); __ sqsub(h18, h10, h19); __ sqsub(s6, s5, s0); __ sqsub(v21.V16B(), v22.V16B(), v0.V16B()); __ sqsub(v22.V2D(), v10.V2D(), v17.V2D()); __ sqsub(v8.V2S(), v21.V2S(), v2.V2S()); __ sqsub(v18.V4H(), v25.V4H(), v27.V4H()); __ sqsub(v13.V4S(), v3.V4S(), v6.V4S()); __ sqsub(v28.V8B(), v29.V8B(), v16.V8B()); __ sqsub(v17.V8H(), v6.V8H(), v10.V8H()); __ sqxtn(b27, h26); __ sqxtn(h17, s11); __ sqxtn(s22, d31); __ sqxtn(v26.V2S(), v5.V2D()); __ sqxtn(v13.V4H(), v7.V4S()); __ sqxtn(v19.V8B(), v19.V8H()); __ sqxtn2(v19.V16B(), v3.V8H()); __ sqxtn2(v23.V4S(), v1.V2D()); __ sqxtn2(v13.V8H(), v3.V4S()); __ sqxtun(b26, h9); __ sqxtun(h19, s12); __ sqxtun(s3, d6); __ sqxtun(v29.V2S(), v26.V2D()); __ sqxtun(v26.V4H(), v10.V4S()); __ sqxtun(v7.V8B(), v29.V8H()); __ sqxtun2(v21.V16B(), v14.V8H()); __ sqxtun2(v24.V4S(), v15.V2D()); __ sqxtun2(v30.V8H(), v1.V4S()); __ srhadd(v21.V16B(), v17.V16B(), v15.V16B()); __ srhadd(v28.V2S(), v21.V2S(), v29.V2S()); __ srhadd(v9.V4H(), v1.V4H(), v30.V4H()); __ srhadd(v24.V4S(), v0.V4S(), v2.V4S()); __ srhadd(v6.V8B(), v17.V8B(), v15.V8B()); __ srhadd(v5.V8H(), v7.V8H(), v21.V8H()); __ sri(d14, d14, 49); __ sri(v23.V16B(), v8.V16B(), 4); __ sri(v20.V2D(), v13.V2D(), 20); __ sri(v16.V2S(), v2.V2S(), 24); __ sri(v5.V4H(), v23.V4H(), 11); __ sri(v27.V4S(), v15.V4S(), 23); __ sri(v19.V8B(), v29.V8B(), 4); __ sri(v7.V8H(), v29.V8H(), 3); __ srshl(d2, d9, d26); __ srshl(v29.V16B(), v17.V16B(), v11.V16B()); __ srshl(v8.V2D(), v15.V2D(), v4.V2D()); __ srshl(v25.V2S(), v17.V2S(), v8.V2S()); __ srshl(v19.V4H(), v7.V4H(), v7.V4H()); __ srshl(v13.V4S(), v2.V4S(), v17.V4S()); __ srshl(v22.V8B(), v6.V8B(), v21.V8B()); __ srshl(v10.V8H(), v17.V8H(), v4.V8H()); __ srshr(d21, d18, 45); __ srshr(v3.V16B(), v11.V16B(), 7); __ srshr(v21.V2D(), v26.V2D(), 53); __ srshr(v11.V2S(), v5.V2S(), 28); __ srshr(v7.V4H(), v18.V4H(), 12); __ srshr(v7.V4S(), 
v3.V4S(), 30); __ srshr(v14.V8B(), v2.V8B(), 6); __ srshr(v21.V8H(), v20.V8H(), 3); __ srsra(d21, d30, 63); __ srsra(v27.V16B(), v30.V16B(), 6); __ srsra(v20.V2D(), v12.V2D(), 27); __ srsra(v0.V2S(), v17.V2S(), 5); __ srsra(v14.V4H(), v16.V4H(), 15); __ srsra(v18.V4S(), v3.V4S(), 20); __ srsra(v21.V8B(), v1.V8B(), 1); __ srsra(v31.V8H(), v25.V8H(), 2); __ sshl(d1, d13, d9); __ sshl(v17.V16B(), v31.V16B(), v15.V16B()); __ sshl(v13.V2D(), v16.V2D(), v0.V2D()); __ sshl(v0.V2S(), v7.V2S(), v22.V2S()); __ sshl(v23.V4H(), v19.V4H(), v4.V4H()); __ sshl(v5.V4S(), v5.V4S(), v11.V4S()); __ sshl(v23.V8B(), v27.V8B(), v7.V8B()); __ sshl(v29.V8H(), v10.V8H(), v5.V8H()); __ sshll(v0.V2D(), v2.V2S(), 23); __ sshll(v11.V4S(), v8.V4H(), 8); __ sshll(v4.V8H(), v29.V8B(), 1); __ sshll2(v10.V2D(), v4.V4S(), 14); __ sshll2(v26.V4S(), v31.V8H(), 6); __ sshll2(v3.V8H(), v26.V16B(), 4); __ sshr(d19, d21, 20); __ sshr(v15.V16B(), v23.V16B(), 5); __ sshr(v17.V2D(), v14.V2D(), 38); __ sshr(v3.V2S(), v29.V2S(), 23); __ sshr(v23.V4H(), v27.V4H(), 4); __ sshr(v28.V4S(), v3.V4S(), 4); __ sshr(v14.V8B(), v2.V8B(), 6); __ sshr(v3.V8H(), v8.V8H(), 6); __ ssra(d12, d28, 44); __ ssra(v29.V16B(), v31.V16B(), 4); __ ssra(v3.V2D(), v0.V2D(), 24); __ ssra(v14.V2S(), v28.V2S(), 6); __ ssra(v18.V4H(), v8.V4H(), 7); __ ssra(v31.V4S(), v14.V4S(), 24); __ ssra(v28.V8B(), v26.V8B(), 5); __ ssra(v9.V8H(), v9.V8H(), 14); __ ssubl(v13.V2D(), v14.V2S(), v3.V2S()); __ ssubl(v5.V4S(), v16.V4H(), v8.V4H()); __ ssubl(v0.V8H(), v28.V8B(), v6.V8B()); __ ssubl2(v5.V2D(), v13.V4S(), v25.V4S()); __ ssubl2(v3.V4S(), v15.V8H(), v17.V8H()); __ ssubl2(v15.V8H(), v15.V16B(), v14.V16B()); __ ssubw(v25.V2D(), v23.V2D(), v26.V2S()); __ ssubw(v21.V4S(), v18.V4S(), v24.V4H()); __ ssubw(v30.V8H(), v22.V8H(), v3.V8B()); __ ssubw2(v16.V2D(), v24.V2D(), v28.V4S()); __ ssubw2(v31.V4S(), v11.V4S(), v15.V8H()); __ ssubw2(v4.V8H(), v8.V8H(), v16.V16B()); __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0)); __ 
st1(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B(), MemOperand(x1, x2, PostIndex)); __ st1(v27.V16B(), v28.V16B(), v29.V16B(), v30.V16B(), MemOperand(x1, 64, PostIndex)); __ st1(v16.V16B(), v17.V16B(), v18.V16B(), MemOperand(x0)); __ st1(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex)); __ st1(v9.V16B(), v10.V16B(), v11.V16B(), MemOperand(x1, 48, PostIndex)); __ st1(v7.V16B(), v8.V16B(), MemOperand(x0)); __ st1(v26.V16B(), v27.V16B(), MemOperand(x1, x2, PostIndex)); __ st1(v22.V16B(), v23.V16B(), MemOperand(x1, 32, PostIndex)); __ st1(v23.V16B(), MemOperand(x0)); __ st1(v28.V16B(), MemOperand(x1, x2, PostIndex)); __ st1(v2.V16B(), MemOperand(x1, 16, PostIndex)); __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), MemOperand(x0)); __ st1(v12.V1D(), v13.V1D(), v14.V1D(), v15.V1D(), MemOperand(x1, x2, PostIndex)); __ st1(v30.V1D(), v31.V1D(), v0.V1D(), v1.V1D(), MemOperand(x1, 32, PostIndex)); __ st1(v16.V1D(), v17.V1D(), v18.V1D(), MemOperand(x0)); __ st1(v3.V1D(), v4.V1D(), v5.V1D(), MemOperand(x1, x2, PostIndex)); __ st1(v14.V1D(), v15.V1D(), v16.V1D(), MemOperand(x1, 24, PostIndex)); __ st1(v18.V1D(), v19.V1D(), MemOperand(x0)); __ st1(v5.V1D(), v6.V1D(), MemOperand(x1, x2, PostIndex)); __ st1(v2.V1D(), v3.V1D(), MemOperand(x1, 16, PostIndex)); __ st1(v4.V1D(), MemOperand(x0)); __ st1(v27.V1D(), MemOperand(x1, x2, PostIndex)); __ st1(v23.V1D(), MemOperand(x1, 8, PostIndex)); __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), MemOperand(x0)); __ st1(v22.V2D(), v23.V2D(), v24.V2D(), v25.V2D(), MemOperand(x1, x2, PostIndex)); __ st1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x1, 64, PostIndex)); __ st1(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0)); __ st1(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x1, x2, PostIndex)); __ st1(v22.V2D(), v23.V2D(), v24.V2D(), MemOperand(x1, 48, PostIndex)); __ st1(v21.V2D(), v22.V2D(), MemOperand(x0)); __ st1(v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex)); __ st1(v27.V2D(), v28.V2D(), MemOperand(x1, 32, 
PostIndex)); __ st1(v21.V2D(), MemOperand(x0)); __ st1(v29.V2D(), MemOperand(x1, x2, PostIndex)); __ st1(v20.V2D(), MemOperand(x1, 16, PostIndex)); __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x0)); __ st1(v8.V2S(), v9.V2S(), v10.V2S(), v11.V2S(), MemOperand(x1, x2, PostIndex)); __ st1(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), MemOperand(x1, 32, PostIndex)); __ st1(v2.V2S(), v3.V2S(), v4.V2S(), MemOperand(x0)); __ st1(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, x2, PostIndex)); __ st1(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x1, 24, PostIndex)); __ st1(v28.V2S(), v29.V2S(), MemOperand(x0)); __ st1(v29.V2S(), v30.V2S(), MemOperand(x1, x2, PostIndex)); __ st1(v23.V2S(), v24.V2S(), MemOperand(x1, 16, PostIndex)); __ st1(v6.V2S(), MemOperand(x0)); __ st1(v11.V2S(), MemOperand(x1, x2, PostIndex)); __ st1(v17.V2S(), MemOperand(x1, 8, PostIndex)); __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x0)); __ st1(v9.V4H(), v10.V4H(), v11.V4H(), v12.V4H(), MemOperand(x1, x2, PostIndex)); __ st1(v25.V4H(), v26.V4H(), v27.V4H(), v28.V4H(), MemOperand(x1, 32, PostIndex)); __ st1(v11.V4H(), v12.V4H(), v13.V4H(), MemOperand(x0)); __ st1(v10.V4H(), v11.V4H(), v12.V4H(), MemOperand(x1, x2, PostIndex)); __ st1(v12.V4H(), v13.V4H(), v14.V4H(), MemOperand(x1, 24, PostIndex)); __ st1(v13.V4H(), v14.V4H(), MemOperand(x0)); __ st1(v15.V4H(), v16.V4H(), MemOperand(x1, x2, PostIndex)); __ st1(v21.V4H(), v22.V4H(), MemOperand(x1, 16, PostIndex)); __ st1(v16.V4H(), MemOperand(x0)); __ st1(v8.V4H(), MemOperand(x1, x2, PostIndex)); __ st1(v30.V4H(), MemOperand(x1, 8, PostIndex)); __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), MemOperand(x0)); __ st1(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x1, x2, PostIndex)); __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 64, PostIndex)); __ st1(v31.V4S(), v0.V4S(), v1.V4S(), MemOperand(x0)); __ st1(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex)); __ st1(v6.V4S(), v7.V4S(), v8.V4S(), 
MemOperand(x1, 48, PostIndex)); __ st1(v17.V4S(), v18.V4S(), MemOperand(x0)); __ st1(v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex)); __ st1(v1.V4S(), v2.V4S(), MemOperand(x1, 32, PostIndex)); __ st1(v26.V4S(), MemOperand(x0)); __ st1(v15.V4S(), MemOperand(x1, x2, PostIndex)); __ st1(v13.V4S(), MemOperand(x1, 16, PostIndex)); __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0)); __ st1(v10.V8B(), v11.V8B(), v12.V8B(), v13.V8B(), MemOperand(x1, x2, PostIndex)); __ st1(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), MemOperand(x1, 32, PostIndex)); __ st1(v19.V8B(), v20.V8B(), v21.V8B(), MemOperand(x0)); __ st1(v31.V8B(), v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex)); __ st1(v9.V8B(), v10.V8B(), v11.V8B(), MemOperand(x1, 24, PostIndex)); __ st1(v12.V8B(), v13.V8B(), MemOperand(x0)); __ st1(v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex)); __ st1(v0.V8B(), v1.V8B(), MemOperand(x1, 16, PostIndex)); __ st1(v16.V8B(), MemOperand(x0)); __ st1(v25.V8B(), MemOperand(x1, x2, PostIndex)); __ st1(v31.V8B(), MemOperand(x1, 8, PostIndex)); __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), MemOperand(x0)); __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), MemOperand(x1, x2, PostIndex)); __ st1(v26.V8H(), v27.V8H(), v28.V8H(), v29.V8H(), MemOperand(x1, 64, PostIndex)); __ st1(v10.V8H(), v11.V8H(), v12.V8H(), MemOperand(x0)); __ st1(v21.V8H(), v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex)); __ st1(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex)); __ st1(v26.V8H(), v27.V8H(), MemOperand(x0)); __ st1(v24.V8H(), v25.V8H(), MemOperand(x1, x2, PostIndex)); __ st1(v17.V8H(), v18.V8H(), MemOperand(x1, 32, PostIndex)); __ st1(v29.V8H(), MemOperand(x0)); __ st1(v19.V8H(), MemOperand(x1, x2, PostIndex)); __ st1(v23.V8H(), MemOperand(x1, 16, PostIndex)); __ st1(v19.B(), 15, MemOperand(x0)); __ st1(v25.B(), 9, MemOperand(x1, x2, PostIndex)); __ st1(v4.B(), 8, MemOperand(x1, 1, PostIndex)); __ st1(v13.D(), 0, MemOperand(x0)); __ st1(v30.D(), 0, MemOperand(x1, 
x2, PostIndex)); __ st1(v3.D(), 0, MemOperand(x1, 8, PostIndex)); __ st1(v22.H(), 0, MemOperand(x0)); __ st1(v31.H(), 7, MemOperand(x1, x2, PostIndex)); __ st1(v23.H(), 3, MemOperand(x1, 2, PostIndex)); __ st1(v0.S(), 0, MemOperand(x0)); __ st1(v11.S(), 3, MemOperand(x1, x2, PostIndex)); __ st1(v24.S(), 3, MemOperand(x1, 4, PostIndex)); __ st2(v7.V16B(), v8.V16B(), MemOperand(x0)); __ st2(v5.V16B(), v6.V16B(), MemOperand(x1, x2, PostIndex)); __ st2(v18.V16B(), v19.V16B(), MemOperand(x1, 32, PostIndex)); __ st2(v14.V2D(), v15.V2D(), MemOperand(x0)); __ st2(v7.V2D(), v8.V2D(), MemOperand(x1, x2, PostIndex)); __ st2(v24.V2D(), v25.V2D(), MemOperand(x1, 32, PostIndex)); __ st2(v22.V2S(), v23.V2S(), MemOperand(x0)); __ st2(v4.V2S(), v5.V2S(), MemOperand(x1, x2, PostIndex)); __ st2(v2.V2S(), v3.V2S(), MemOperand(x1, 16, PostIndex)); __ st2(v23.V4H(), v24.V4H(), MemOperand(x0)); __ st2(v8.V4H(), v9.V4H(), MemOperand(x1, x2, PostIndex)); __ st2(v7.V4H(), v8.V4H(), MemOperand(x1, 16, PostIndex)); __ st2(v17.V4S(), v18.V4S(), MemOperand(x0)); __ st2(v6.V4S(), v7.V4S(), MemOperand(x1, x2, PostIndex)); __ st2(v26.V4S(), v27.V4S(), MemOperand(x1, 32, PostIndex)); __ st2(v31.V8B(), v0.V8B(), MemOperand(x0)); __ st2(v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex)); __ st2(v21.V8B(), v22.V8B(), MemOperand(x1, 16, PostIndex)); __ st2(v7.V8H(), v8.V8H(), MemOperand(x0)); __ st2(v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex)); __ st2(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex)); __ st2(v8.B(), v9.B(), 15, MemOperand(x0)); __ st2(v8.B(), v9.B(), 15, MemOperand(x1, x2, PostIndex)); __ st2(v7.B(), v8.B(), 4, MemOperand(x1, 2, PostIndex)); __ st2(v25.D(), v26.D(), 0, MemOperand(x0)); __ st2(v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex)); __ st2(v3.D(), v4.D(), 1, MemOperand(x1, 16, PostIndex)); __ st2(v4.H(), v5.H(), 3, MemOperand(x0)); __ st2(v0.H(), v1.H(), 5, MemOperand(x1, x2, PostIndex)); __ st2(v22.H(), v23.H(), 2, MemOperand(x1, 4, PostIndex)); __ st2(v14.S(), 
v15.S(), 3, MemOperand(x0)); __ st2(v23.S(), v24.S(), 3, MemOperand(x1, x2, PostIndex)); __ st2(v0.S(), v1.S(), 2, MemOperand(x1, 8, PostIndex)); __ st3(v26.V16B(), v27.V16B(), v28.V16B(), MemOperand(x0)); __ st3(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex)); __ st3(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, 48, PostIndex)); __ st3(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0)); __ st3(v23.V2D(), v24.V2D(), v25.V2D(), MemOperand(x1, x2, PostIndex)); __ st3(v10.V2D(), v11.V2D(), v12.V2D(), MemOperand(x1, 48, PostIndex)); __ st3(v9.V2S(), v10.V2S(), v11.V2S(), MemOperand(x0)); __ st3(v13.V2S(), v14.V2S(), v15.V2S(), MemOperand(x1, x2, PostIndex)); __ st3(v22.V2S(), v23.V2S(), v24.V2S(), MemOperand(x1, 24, PostIndex)); __ st3(v31.V4H(), v0.V4H(), v1.V4H(), MemOperand(x0)); __ st3(v8.V4H(), v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex)); __ st3(v19.V4H(), v20.V4H(), v21.V4H(), MemOperand(x1, 24, PostIndex)); __ st3(v18.V4S(), v19.V4S(), v20.V4S(), MemOperand(x0)); __ st3(v25.V4S(), v26.V4S(), v27.V4S(), MemOperand(x1, x2, PostIndex)); __ st3(v16.V4S(), v17.V4S(), v18.V4S(), MemOperand(x1, 48, PostIndex)); __ st3(v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0)); __ st3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x1, x2, PostIndex)); __ st3(v30.V8B(), v31.V8B(), v0.V8B(), MemOperand(x1, 24, PostIndex)); __ st3(v8.V8H(), v9.V8H(), v10.V8H(), MemOperand(x0)); __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, x2, PostIndex)); __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex)); __ st3(v31.B(), v0.B(), v1.B(), 10, MemOperand(x0)); __ st3(v4.B(), v5.B(), v6.B(), 5, MemOperand(x1, x2, PostIndex)); __ st3(v5.B(), v6.B(), v7.B(), 1, MemOperand(x1, 3, PostIndex)); __ st3(v5.D(), v6.D(), v7.D(), 0, MemOperand(x0)); __ st3(v6.D(), v7.D(), v8.D(), 0, MemOperand(x1, x2, PostIndex)); __ st3(v0.D(), v1.D(), v2.D(), 0, MemOperand(x1, 24, PostIndex)); __ st3(v31.H(), v0.H(), v1.H(), 2, MemOperand(x0)); __ st3(v14.H(), v15.H(), 
v16.H(), 5, MemOperand(x1, x2, PostIndex)); __ st3(v21.H(), v22.H(), v23.H(), 6, MemOperand(x1, 6, PostIndex)); __ st3(v21.S(), v22.S(), v23.S(), 0, MemOperand(x0)); __ st3(v11.S(), v12.S(), v13.S(), 1, MemOperand(x1, x2, PostIndex)); __ st3(v15.S(), v16.S(), v17.S(), 0, MemOperand(x1, 12, PostIndex)); __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), MemOperand(x0)); __ st4(v24.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), MemOperand(x1, x2, PostIndex)); __ st4(v15.V16B(), v16.V16B(), v17.V16B(), v18.V16B(), MemOperand(x1, 64, PostIndex)); __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0)); __ st4(v17.V2D(), v18.V2D(), v19.V2D(), v20.V2D(), MemOperand(x1, x2, PostIndex)); __ st4(v9.V2D(), v10.V2D(), v11.V2D(), v12.V2D(), MemOperand(x1, 64, PostIndex)); __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), MemOperand(x0)); __ st4(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), MemOperand(x1, x2, PostIndex)); __ st4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), MemOperand(x1, 32, PostIndex)); __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), MemOperand(x0)); __ st4(v18.V4H(), v19.V4H(), v20.V4H(), v21.V4H(), MemOperand(x1, x2, PostIndex)); __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex)); __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), MemOperand(x0)); __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), MemOperand(x1, x2, PostIndex)); __ st4(v15.V4S(), v16.V4S(), v17.V4S(), v18.V4S(), MemOperand(x1, 64, PostIndex)); __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0)); __ st4(v25.V8B(), v26.V8B(), v27.V8B(), v28.V8B(), MemOperand(x1, x2, PostIndex)); __ st4(v19.V8B(), v20.V8B(), v21.V8B(), v22.V8B(), MemOperand(x1, 32, PostIndex)); __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), MemOperand(x0)); __ st4(v15.V8H(), v16.V8H(), v17.V8H(), v18.V8H(), MemOperand(x1, x2, PostIndex)); __ st4(v31.V8H(), v0.V8H(), v1.V8H(), v2.V8H(), MemOperand(x1, 64, PostIndex)); __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, 
MemOperand(x0)); __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, MemOperand(x1, x2, PostIndex)); __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, MemOperand(x1, 4, PostIndex)); __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, MemOperand(x0)); __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, MemOperand(x1, x2, PostIndex)); __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, MemOperand(x1, 32, PostIndex)); __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, MemOperand(x0)); __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, MemOperand(x1, x2, PostIndex)); __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, MemOperand(x1, 8, PostIndex)); __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, MemOperand(x0)); __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, MemOperand(x1, x2, PostIndex)); __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, MemOperand(x1, 16, PostIndex)); __ sub(d12, d17, d2); __ sub(v20.V16B(), v24.V16B(), v8.V16B()); __ sub(v8.V2D(), v29.V2D(), v5.V2D()); __ sub(v2.V2S(), v28.V2S(), v24.V2S()); __ sub(v24.V4H(), v10.V4H(), v4.V4H()); __ sub(v28.V4S(), v4.V4S(), v17.V4S()); __ sub(v16.V8B(), v27.V8B(), v2.V8B()); __ sub(v20.V8H(), v10.V8H(), v13.V8H()); __ subhn(v5.V2S(), v14.V2D(), v13.V2D()); __ subhn(v10.V4H(), v5.V4S(), v8.V4S()); __ subhn(v6.V8B(), v10.V8H(), v22.V8H()); __ subhn2(v11.V16B(), v6.V8H(), v9.V8H()); __ subhn2(v25.V4S(), v18.V2D(), v24.V2D()); __ subhn2(v20.V8H(), v21.V4S(), v1.V4S()); __ suqadd(b25, b11); __ suqadd(d13, d1); __ suqadd(h0, h9); __ suqadd(s22, s8); __ suqadd(v24.V16B(), v27.V16B()); __ suqadd(v26.V2D(), v14.V2D()); __ suqadd(v7.V2S(), v10.V2S()); __ suqadd(v25.V4H(), v12.V4H()); __ suqadd(v4.V4S(), v3.V4S()); __ suqadd(v14.V8B(), v18.V8B()); __ suqadd(v31.V8H(), v8.V8H()); __ sxtl(v16.V2D(), v20.V2S()); __ sxtl(v27.V4S(), v28.V4H()); __ sxtl(v0.V8H(), v22.V8B()); __ sxtl2(v6.V2D(), v7.V4S()); __ sxtl2(v9.V4S(), v27.V8H()); __ sxtl2(v16.V8H(), v16.V16B()); __ tbl(v25.V16B(), v17.V16B(), v18.V16B(), v19.V16B(), v20.V16B(), v22.V16B()); __ tbl(v28.V16B(), v13.V16B(), v14.V16B(), v15.V16B(), 
v4.V16B()); __ tbl(v3.V16B(), v0.V16B(), v1.V16B(), v2.V16B()); __ tbl(v20.V16B(), v15.V16B(), v4.V16B()); __ tbl(v7.V8B(), v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), v20.V8B()); __ tbl(v8.V8B(), v1.V16B(), v2.V16B(), v3.V16B(), v31.V8B()); __ tbl(v8.V8B(), v25.V16B(), v26.V16B(), v16.V8B()); __ tbl(v11.V8B(), v19.V16B(), v30.V8B()); __ tbx(v25.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), v28.V16B(), v5.V16B()); __ tbx(v21.V16B(), v29.V16B(), v30.V16B(), v31.V16B(), v24.V16B()); __ tbx(v6.V16B(), v16.V16B(), v17.V16B(), v1.V16B()); __ tbx(v13.V16B(), v3.V16B(), v20.V16B()); __ tbx(v24.V8B(), v29.V16B(), v30.V16B(), v31.V16B(), v0.V16B(), v9.V8B()); __ tbx(v17.V8B(), v9.V16B(), v10.V16B(), v11.V16B(), v26.V8B()); __ tbx(v5.V8B(), v3.V16B(), v4.V16B(), v21.V8B()); __ tbx(v16.V8B(), v11.V16B(), v29.V8B()); __ trn1(v19.V16B(), v24.V16B(), v12.V16B()); __ trn1(v2.V2D(), v7.V2D(), v10.V2D()); __ trn1(v22.V2S(), v0.V2S(), v21.V2S()); __ trn1(v12.V4H(), v15.V4H(), v20.V4H()); __ trn1(v30.V4S(), v17.V4S(), v9.V4S()); __ trn1(v12.V8B(), v19.V8B(), v29.V8B()); __ trn1(v23.V8H(), v8.V8H(), v9.V8H()); __ trn2(v28.V16B(), v30.V16B(), v25.V16B()); __ trn2(v7.V2D(), v27.V2D(), v7.V2D()); __ trn2(v30.V2S(), v16.V2S(), v19.V2S()); __ trn2(v24.V4H(), v6.V4H(), v25.V4H()); __ trn2(v2.V4S(), v19.V4S(), v11.V4S()); __ trn2(v25.V8B(), v27.V8B(), v18.V8B()); __ trn2(v12.V8H(), v4.V8H(), v15.V8H()); __ uaba(v31.V16B(), v12.V16B(), v28.V16B()); __ uaba(v18.V2S(), v5.V2S(), v14.V2S()); __ uaba(v9.V4H(), v20.V4H(), v21.V4H()); __ uaba(v6.V4S(), v20.V4S(), v2.V4S()); __ uaba(v16.V8B(), v12.V8B(), v5.V8B()); __ uaba(v15.V8H(), v26.V8H(), v30.V8H()); __ uabal(v10.V2D(), v18.V2S(), v15.V2S()); __ uabal(v30.V4S(), v19.V4H(), v7.V4H()); __ uabal(v4.V8H(), v27.V8B(), v0.V8B()); __ uabal2(v19.V2D(), v12.V4S(), v2.V4S()); __ uabal2(v26.V4S(), v5.V8H(), v12.V8H()); __ uabal2(v19.V8H(), v20.V16B(), v28.V16B()); __ uabd(v18.V16B(), v4.V16B(), v21.V16B()); __ uabd(v30.V2S(), v21.V2S(), v16.V2S()); __ 
uabd(v8.V4H(), v28.V4H(), v25.V4H()); __ uabd(v28.V4S(), v12.V4S(), v21.V4S()); __ uabd(v19.V8B(), v16.V8B(), v28.V8B()); __ uabd(v9.V8H(), v12.V8H(), v29.V8H()); __ uabdl(v26.V2D(), v0.V2S(), v8.V2S()); __ uabdl(v29.V4S(), v31.V4H(), v25.V4H()); __ uabdl(v27.V8H(), v29.V8B(), v14.V8B()); __ uabdl2(v20.V2D(), v20.V4S(), v8.V4S()); __ uabdl2(v22.V4S(), v15.V8H(), v18.V8H()); __ uabdl2(v9.V8H(), v18.V16B(), v23.V16B()); __ uadalp(v9.V1D(), v15.V2S()); __ uadalp(v14.V2D(), v12.V4S()); __ uadalp(v28.V2S(), v12.V4H()); __ uadalp(v0.V4H(), v17.V8B()); __ uadalp(v1.V4S(), v29.V8H()); __ uadalp(v15.V8H(), v22.V16B()); __ uaddl(v1.V2D(), v20.V2S(), v27.V2S()); __ uaddl(v31.V4S(), v25.V4H(), v5.V4H()); __ uaddl(v12.V8H(), v3.V8B(), v3.V8B()); __ uaddl2(v5.V2D(), v23.V4S(), v6.V4S()); __ uaddl2(v1.V4S(), v5.V8H(), v25.V8H()); __ uaddl2(v22.V8H(), v30.V16B(), v28.V16B()); __ uaddlp(v7.V1D(), v9.V2S()); __ uaddlp(v26.V2D(), v4.V4S()); __ uaddlp(v28.V2S(), v1.V4H()); __ uaddlp(v20.V4H(), v31.V8B()); __ uaddlp(v16.V4S(), v17.V8H()); __ uaddlp(v6.V8H(), v2.V16B()); __ uaddlv(d28, v22.V4S()); __ uaddlv(h0, v19.V16B()); __ uaddlv(h30, v30.V8B()); __ uaddlv(s24, v18.V4H()); __ uaddlv(s10, v0.V8H()); __ uaddw(v9.V2D(), v17.V2D(), v14.V2S()); __ uaddw(v9.V4S(), v25.V4S(), v3.V4H()); __ uaddw(v18.V8H(), v1.V8H(), v0.V8B()); __ uaddw2(v18.V2D(), v5.V2D(), v6.V4S()); __ uaddw2(v17.V4S(), v15.V4S(), v11.V8H()); __ uaddw2(v29.V8H(), v11.V8H(), v7.V16B()); __ uhadd(v13.V16B(), v9.V16B(), v3.V16B()); __ uhadd(v17.V2S(), v25.V2S(), v24.V2S()); __ uhadd(v25.V4H(), v23.V4H(), v13.V4H()); __ uhadd(v0.V4S(), v20.V4S(), v16.V4S()); __ uhadd(v5.V8B(), v5.V8B(), v25.V8B()); __ uhadd(v3.V8H(), v29.V8H(), v18.V8H()); __ uhsub(v1.V16B(), v22.V16B(), v13.V16B()); __ uhsub(v14.V2S(), v30.V2S(), v30.V2S()); __ uhsub(v29.V4H(), v14.V4H(), v17.V4H()); __ uhsub(v26.V4S(), v5.V4S(), v18.V4S()); __ uhsub(v3.V8B(), v7.V8B(), v12.V8B()); __ uhsub(v25.V8H(), v21.V8H(), v5.V8H()); __ umax(v28.V16B(), v12.V16B(), 
v6.V16B()); __ umax(v20.V2S(), v19.V2S(), v26.V2S()); __ umax(v0.V4H(), v31.V4H(), v18.V4H()); __ umax(v6.V4S(), v21.V4S(), v28.V4S()); __ umax(v0.V8B(), v2.V8B(), v20.V8B()); __ umax(v4.V8H(), v11.V8H(), v22.V8H()); __ umaxp(v1.V16B(), v6.V16B(), v29.V16B()); __ umaxp(v19.V2S(), v17.V2S(), v27.V2S()); __ umaxp(v21.V4H(), v16.V4H(), v7.V4H()); __ umaxp(v9.V4S(), v20.V4S(), v29.V4S()); __ umaxp(v13.V8B(), v1.V8B(), v16.V8B()); __ umaxp(v19.V8H(), v23.V8H(), v26.V8H()); __ umaxv(b17, v30.V16B()); __ umaxv(b23, v12.V8B()); __ umaxv(h31, v15.V4H()); __ umaxv(h15, v25.V8H()); __ umaxv(s18, v21.V4S()); __ umin(v22.V16B(), v0.V16B(), v18.V16B()); __ umin(v1.V2S(), v21.V2S(), v16.V2S()); __ umin(v17.V4H(), v4.V4H(), v25.V4H()); __ umin(v24.V4S(), v26.V4S(), v13.V4S()); __ umin(v20.V8B(), v1.V8B(), v5.V8B()); __ umin(v26.V8H(), v25.V8H(), v23.V8H()); __ uminp(v5.V16B(), v1.V16B(), v23.V16B()); __ uminp(v7.V2S(), v26.V2S(), v30.V2S()); __ uminp(v9.V4H(), v5.V4H(), v25.V4H()); __ uminp(v23.V4S(), v10.V4S(), v1.V4S()); __ uminp(v4.V8B(), v29.V8B(), v14.V8B()); __ uminp(v21.V8H(), v0.V8H(), v14.V8H()); __ uminv(b0, v17.V16B()); __ uminv(b0, v31.V8B()); __ uminv(h24, v0.V4H()); __ uminv(h29, v14.V8H()); __ uminv(s30, v3.V4S()); __ umlal(v11.V2D(), v11.V2S(), v24.V2S()); __ umlal(v30.V2D(), v16.V2S(), v11.S(), 3); __ umlal(v0.V4S(), v9.V4H(), v26.V4H()); __ umlal(v20.V4S(), v24.V4H(), v12.H(), 4); __ umlal(v16.V8H(), v21.V8B(), v6.V8B()); __ umlal2(v17.V2D(), v19.V4S(), v23.V4S()); __ umlal2(v5.V2D(), v30.V4S(), v8.S(), 0); __ umlal2(v16.V4S(), v8.V8H(), v15.V8H()); __ umlal2(v15.V4S(), v26.V8H(), v1.H(), 5); __ umlal2(v30.V8H(), v1.V16B(), v17.V16B()); __ umlsl(v18.V2D(), v19.V2S(), v28.V2S()); __ umlsl(v7.V2D(), v7.V2S(), v8.S(), 0); __ umlsl(v24.V4S(), v8.V4H(), v4.V4H()); __ umlsl(v18.V4S(), v22.V4H(), v12.H(), 4); __ umlsl(v28.V8H(), v14.V8B(), v20.V8B()); __ umlsl2(v11.V2D(), v0.V4S(), v9.V4S()); __ umlsl2(v26.V2D(), v16.V4S(), v9.S(), 2); __ umlsl2(v3.V4S(), v11.V8H(), 
v9.V8H()); __ umlsl2(v10.V4S(), v25.V8H(), v9.H(), 4); __ umlsl2(v24.V8H(), v16.V16B(), v28.V16B()); __ umov(x30, v25.D(), 1); __ umull(v12.V2D(), v10.V2S(), v29.V2S()); __ umull(v22.V2D(), v30.V2S(), v5.S(), 3); __ umull(v7.V4S(), v0.V4H(), v25.V4H()); __ umull(v11.V4S(), v13.V4H(), v3.H(), 2); __ umull(v25.V8H(), v16.V8B(), v10.V8B()); __ umull2(v17.V2D(), v3.V4S(), v26.V4S()); __ umull2(v26.V2D(), v11.V4S(), v2.S(), 3); __ umull2(v12.V4S(), v17.V8H(), v23.V8H()); __ umull2(v4.V4S(), v31.V8H(), v1.H(), 2); __ umull2(v5.V8H(), v12.V16B(), v17.V16B()); __ uqadd(b30, b4, b28); __ uqadd(d27, d20, d16); __ uqadd(h7, h14, h28); __ uqadd(s28, s17, s4); __ uqadd(v19.V16B(), v22.V16B(), v21.V16B()); __ uqadd(v16.V2D(), v4.V2D(), v11.V2D()); __ uqadd(v20.V2S(), v14.V2S(), v4.V2S()); __ uqadd(v5.V4H(), v0.V4H(), v16.V4H()); __ uqadd(v21.V4S(), v31.V4S(), v9.V4S()); __ uqadd(v23.V8B(), v24.V8B(), v3.V8B()); __ uqadd(v17.V8H(), v27.V8H(), v11.V8H()); __ uqrshl(b10, b22, b10); __ uqrshl(d29, d5, d11); __ uqrshl(h27, h24, h30); __ uqrshl(s10, s13, s8); __ uqrshl(v9.V16B(), v18.V16B(), v14.V16B()); __ uqrshl(v24.V2D(), v15.V2D(), v17.V2D()); __ uqrshl(v4.V2S(), v14.V2S(), v27.V2S()); __ uqrshl(v15.V4H(), v5.V4H(), v8.V4H()); __ uqrshl(v21.V4S(), v29.V4S(), v0.V4S()); __ uqrshl(v16.V8B(), v24.V8B(), v9.V8B()); __ uqrshl(v2.V8H(), v0.V8H(), v15.V8H()); __ uqrshrn(b11, h26, 4); __ uqrshrn(h7, s30, 5); __ uqrshrn(s10, d8, 21); __ uqrshrn(v15.V2S(), v6.V2D(), 11); __ uqrshrn(v5.V4H(), v26.V4S(), 12); __ uqrshrn(v28.V8B(), v25.V8H(), 5); __ uqrshrn2(v25.V16B(), v30.V8H(), 2); __ uqrshrn2(v21.V4S(), v14.V2D(), 32); __ uqrshrn2(v13.V8H(), v7.V4S(), 2); __ uqshl(b13, b0, b23); __ uqshl(b9, b17, 4); __ uqshl(d23, d6, d4); __ uqshl(d8, d11, 44); __ uqshl(h19, h13, h15); __ uqshl(h25, h26, 6); __ uqshl(s4, s24, s10); __ uqshl(s19, s14, 1); __ uqshl(v14.V16B(), v30.V16B(), v25.V16B()); __ uqshl(v6.V16B(), v10.V16B(), 5); __ uqshl(v18.V2D(), v8.V2D(), v7.V2D()); __ uqshl(v25.V2D(), v14.V2D(), 
18); __ uqshl(v25.V2S(), v16.V2S(), v23.V2S()); __ uqshl(v13.V2S(), v15.V2S(), 31); __ uqshl(v28.V4H(), v24.V4H(), v15.V4H()); __ uqshl(v4.V4H(), v17.V4H(), 1); __ uqshl(v9.V4S(), v31.V4S(), v23.V4S()); __ uqshl(v18.V4S(), v28.V4S(), 31); __ uqshl(v31.V8B(), v21.V8B(), v15.V8B()); __ uqshl(v6.V8B(), v21.V8B(), 1); __ uqshl(v28.V8H(), v2.V8H(), v17.V8H()); __ uqshl(v24.V8H(), v8.V8H(), 14); __ uqshrn(b21, h27, 7); __ uqshrn(h28, s26, 11); __ uqshrn(s13, d31, 17); __ uqshrn(v21.V2S(), v16.V2D(), 8); __ uqshrn(v24.V4H(), v24.V4S(), 2); __ uqshrn(v5.V8B(), v1.V8H(), 8); __ uqshrn2(v16.V16B(), v29.V8H(), 6); __ uqshrn2(v2.V4S(), v6.V2D(), 1); __ uqshrn2(v16.V8H(), v10.V4S(), 14); __ uqsub(b28, b20, b26); __ uqsub(d0, d7, d10); __ uqsub(h26, h24, h7); __ uqsub(s23, s23, s16); __ uqsub(v14.V16B(), v16.V16B(), v24.V16B()); __ uqsub(v11.V2D(), v17.V2D(), v6.V2D()); __ uqsub(v10.V2S(), v10.V2S(), v8.V2S()); __ uqsub(v9.V4H(), v15.V4H(), v12.V4H()); __ uqsub(v23.V4S(), v18.V4S(), v7.V4S()); __ uqsub(v9.V8B(), v19.V8B(), v17.V8B()); __ uqsub(v20.V8H(), v2.V8H(), v6.V8H()); __ uqxtn(b29, h19); __ uqxtn(h0, s13); __ uqxtn(s26, d22); __ uqxtn(v5.V2S(), v31.V2D()); __ uqxtn(v30.V4H(), v19.V4S()); __ uqxtn(v15.V8B(), v2.V8H()); __ uqxtn2(v29.V16B(), v3.V8H()); __ uqxtn2(v13.V4S(), v17.V2D()); __ uqxtn2(v28.V8H(), v11.V4S()); __ urecpe(v23.V2S(), v15.V2S()); __ urecpe(v27.V4S(), v7.V4S()); __ urhadd(v2.V16B(), v15.V16B(), v27.V16B()); __ urhadd(v15.V2S(), v1.V2S(), v18.V2S()); __ urhadd(v17.V4H(), v4.V4H(), v26.V4H()); __ urhadd(v2.V4S(), v27.V4S(), v14.V4S()); __ urhadd(v5.V8B(), v17.V8B(), v14.V8B()); __ urhadd(v30.V8H(), v2.V8H(), v25.V8H()); __ urshl(d4, d28, d30); __ urshl(v13.V16B(), v31.V16B(), v19.V16B()); __ urshl(v14.V2D(), v23.V2D(), v21.V2D()); __ urshl(v10.V2S(), v7.V2S(), v8.V2S()); __ urshl(v15.V4H(), v21.V4H(), v28.V4H()); __ urshl(v30.V4S(), v8.V4S(), v23.V4S()); __ urshl(v31.V8B(), v20.V8B(), v5.V8B()); __ urshl(v30.V8H(), v27.V8H(), v30.V8H()); __ urshr(d4, d13, 
49); __ urshr(v2.V16B(), v20.V16B(), 1); __ urshr(v13.V2D(), v11.V2D(), 51); __ urshr(v21.V2S(), v31.V2S(), 10); __ urshr(v21.V4H(), v17.V4H(), 11); __ urshr(v4.V4S(), v22.V4S(), 1); __ urshr(v0.V8B(), v1.V8B(), 7); __ urshr(v13.V8H(), v20.V8H(), 1); __ ursqrte(v20.V2S(), v16.V2S()); __ ursqrte(v28.V4S(), v8.V4S()); __ ursra(d27, d16, 45); __ ursra(v18.V16B(), v17.V16B(), 3); __ ursra(v26.V2D(), v28.V2D(), 58); __ ursra(v8.V2S(), v22.V2S(), 31); __ ursra(v31.V4H(), v4.V4H(), 7); __ ursra(v31.V4S(), v15.V4S(), 2); __ ursra(v3.V8B(), v1.V8B(), 5); __ ursra(v18.V8H(), v14.V8H(), 13); __ ushl(d31, d0, d16); __ ushl(v0.V16B(), v6.V16B(), v2.V16B()); __ ushl(v18.V2D(), v1.V2D(), v18.V2D()); __ ushl(v27.V2S(), v7.V2S(), v29.V2S()); __ ushl(v14.V4H(), v14.V4H(), v13.V4H()); __ ushl(v22.V4S(), v4.V4S(), v9.V4S()); __ ushl(v23.V8B(), v22.V8B(), v27.V8B()); __ ushl(v21.V8H(), v25.V8H(), v8.V8H()); __ ushll(v11.V2D(), v0.V2S(), 21); __ ushll(v2.V4S(), v17.V4H(), 8); __ ushll(v11.V8H(), v14.V8B(), 1); __ ushll2(v8.V2D(), v29.V4S(), 7); __ ushll2(v29.V4S(), v9.V8H(), 2); __ ushll2(v5.V8H(), v24.V16B(), 6); __ ushr(d28, d27, 53); __ ushr(v1.V16B(), v9.V16B(), 7); __ ushr(v2.V2D(), v24.V2D(), 43); __ ushr(v30.V2S(), v25.V2S(), 11); __ ushr(v10.V4H(), v26.V4H(), 12); __ ushr(v4.V4S(), v5.V4S(), 30); __ ushr(v30.V8B(), v2.V8B(), 1); __ ushr(v6.V8H(), v12.V8H(), 2); __ usqadd(b19, b5); __ usqadd(d9, d2); __ usqadd(h2, h16); __ usqadd(s16, s3); __ usqadd(v31.V16B(), v29.V16B()); __ usqadd(v8.V2D(), v10.V2D()); __ usqadd(v18.V2S(), v9.V2S()); __ usqadd(v24.V4H(), v14.V4H()); __ usqadd(v10.V4S(), v30.V4S()); __ usqadd(v16.V8B(), v20.V8B()); __ usqadd(v12.V8H(), v16.V8H()); __ usra(d28, d27, 37); __ usra(v5.V16B(), v22.V16B(), 5); __ usra(v2.V2D(), v19.V2D(), 33); __ usra(v0.V2S(), v0.V2S(), 21); __ usra(v7.V4H(), v6.V4H(), 12); __ usra(v4.V4S(), v17.V4S(), 9); __ usra(v9.V8B(), v12.V8B(), 7); __ usra(v3.V8H(), v27.V8H(), 14); __ usubl(v29.V2D(), v12.V2S(), v30.V2S()); __ 
usubl(v29.V4S(), v28.V4H(), v6.V4H()); __ usubl(v12.V8H(), v4.V8B(), v14.V8B()); __ usubl2(v1.V2D(), v24.V4S(), v17.V4S()); __ usubl2(v4.V4S(), v1.V8H(), v3.V8H()); __ usubl2(v23.V8H(), v4.V16B(), v7.V16B()); __ usubw(v9.V2D(), v20.V2D(), v30.V2S()); __ usubw(v20.V4S(), v16.V4S(), v23.V4H()); __ usubw(v25.V8H(), v8.V8H(), v29.V8B()); __ usubw2(v18.V2D(), v29.V2D(), v6.V4S()); __ usubw2(v6.V4S(), v6.V4S(), v20.V8H()); __ usubw2(v18.V8H(), v4.V8H(), v16.V16B()); __ uxtl(v27.V2D(), v21.V2S()); __ uxtl(v0.V4S(), v31.V4H()); __ uxtl(v27.V8H(), v10.V8B()); __ uxtl2(v6.V2D(), v16.V4S()); __ uxtl2(v22.V4S(), v20.V8H()); __ uxtl2(v20.V8H(), v21.V16B()); __ uzp1(v30.V16B(), v9.V16B(), v17.V16B()); __ uzp1(v7.V2D(), v26.V2D(), v28.V2D()); __ uzp1(v26.V2S(), v16.V2S(), v22.V2S()); __ uzp1(v14.V4H(), v19.V4H(), v6.V4H()); __ uzp1(v17.V4S(), v23.V4S(), v30.V4S()); __ uzp1(v28.V8B(), v27.V8B(), v13.V8B()); __ uzp1(v17.V8H(), v1.V8H(), v12.V8H()); __ uzp2(v8.V16B(), v18.V16B(), v26.V16B()); __ uzp2(v21.V2D(), v22.V2D(), v24.V2D()); __ uzp2(v20.V2S(), v21.V2S(), v2.V2S()); __ uzp2(v16.V4H(), v31.V4H(), v6.V4H()); __ uzp2(v25.V4S(), v11.V4S(), v8.V4S()); __ uzp2(v31.V8B(), v31.V8B(), v13.V8B()); __ uzp2(v8.V8H(), v17.V8H(), v1.V8H()); __ xtn(v17.V2S(), v26.V2D()); __ xtn(v3.V4H(), v0.V4S()); __ xtn(v18.V8B(), v8.V8H()); __ xtn2(v0.V16B(), v0.V8H()); __ xtn2(v15.V4S(), v4.V2D()); __ xtn2(v31.V8H(), v18.V4S()); __ zip1(v22.V16B(), v9.V16B(), v6.V16B()); __ zip1(v23.V2D(), v11.V2D(), v2.V2D()); __ zip1(v26.V2S(), v16.V2S(), v9.V2S()); __ zip1(v1.V4H(), v9.V4H(), v7.V4H()); __ zip1(v0.V4S(), v30.V4S(), v20.V4S()); __ zip1(v30.V8B(), v17.V8B(), v15.V8B()); __ zip1(v17.V8H(), v8.V8H(), v2.V8H()); __ zip2(v23.V16B(), v10.V16B(), v11.V16B()); __ zip2(v30.V2D(), v6.V2D(), v14.V2D()); __ zip2(v9.V2S(), v10.V2S(), v21.V2S()); __ zip2(v8.V4H(), v24.V4H(), v29.V4H()); __ zip2(v0.V4S(), v21.V4S(), v23.V4S()); __ zip2(v25.V8B(), v23.V8B(), v30.V8B()); __ zip2(v7.V8H(), v10.V8H(), v30.V8H()); } // 
NOLINT(readability/fn_size) static void GenerateTestSequenceNEONFP(MacroAssembler* masm) { ExactAssemblyScope guard(masm, masm->GetBuffer()->GetRemainingBytes(), ExactAssemblyScope::kMaximumSize); // NEON floating point instructions. __ fabd(v3.V2D(), v25.V2D(), v8.V2D()); __ fabd(v14.V2S(), v27.V2S(), v11.V2S()); __ fabd(v9.V4S(), v22.V4S(), v18.V4S()); __ fabs(v1.V2D(), v29.V2D()); __ fabs(v6.V2S(), v21.V2S()); __ fabs(v12.V4S(), v25.V4S()); __ facge(v18.V2D(), v5.V2D(), v0.V2D()); __ facge(v15.V2S(), v11.V2S(), v6.V2S()); __ facge(v30.V4S(), v10.V4S(), v25.V4S()); __ facgt(v28.V2D(), v16.V2D(), v31.V2D()); __ facgt(v15.V2S(), v1.V2S(), v4.V2S()); __ facgt(v22.V4S(), v3.V4S(), v10.V4S()); __ fadd(v7.V2D(), v10.V2D(), v24.V2D()); __ fadd(v10.V2S(), v23.V2S(), v7.V2S()); __ fadd(v16.V4S(), v22.V4S(), v11.V4S()); __ faddp(d27, v28.V2D()); __ faddp(s20, v23.V2S()); __ faddp(v21.V2D(), v4.V2D(), v11.V2D()); __ faddp(v31.V2S(), v26.V2S(), v1.V2S()); __ faddp(v13.V4S(), v27.V4S(), v28.V4S()); __ fcmeq(v17.V2D(), v13.V2D(), v20.V2D()); __ fcmeq(v24.V2D(), v16.V2D(), 0.0); __ fcmeq(v26.V2S(), v17.V2S(), v10.V2S()); __ fcmeq(v24.V2S(), v4.V2S(), 0.0); __ fcmeq(v8.V4S(), v4.V4S(), v14.V4S()); __ fcmeq(v26.V4S(), v25.V4S(), 0.0); __ fcmge(v27.V2D(), v0.V2D(), v0.V2D()); __ fcmge(v22.V2D(), v30.V2D(), 0.0); __ fcmge(v7.V2S(), v21.V2S(), v25.V2S()); __ fcmge(v15.V2S(), v15.V2S(), 0.0); __ fcmge(v29.V4S(), v4.V4S(), v27.V4S()); __ fcmge(v22.V4S(), v21.V4S(), 0.0); __ fcmgt(v1.V2D(), v26.V2D(), v15.V2D()); __ fcmgt(v15.V2D(), v23.V2D(), 0.0); __ fcmgt(v21.V2S(), v16.V2S(), v6.V2S()); __ fcmgt(v1.V2S(), v13.V2S(), 0.0); __ fcmgt(v14.V4S(), v0.V4S(), v25.V4S()); __ fcmgt(v13.V4S(), v8.V4S(), 0.0); __ fcmle(v4.V2D(), v6.V2D(), 0.0); __ fcmle(v24.V2S(), v31.V2S(), 0.0); __ fcmle(v8.V4S(), v23.V4S(), 0.0); __ fcmlt(v7.V2D(), v3.V2D(), 0.0); __ fcmlt(v15.V2S(), v21.V2S(), 0.0); __ fcmlt(v1.V4S(), v2.V4S(), 0.0); __ fcvtas(v6.V2D(), v8.V2D()); __ fcvtas(v1.V2S(), v9.V2S()); __ 
fcvtas(v8.V4S(), v19.V4S()); __ fcvtau(v5.V2D(), v31.V2D()); __ fcvtau(v28.V2S(), v29.V2S()); __ fcvtau(v11.V4S(), v26.V4S()); __ fcvtl(v8.V2D(), v25.V2S()); __ fcvtl(v27.V4S(), v14.V4H()); __ fcvtl2(v1.V2D(), v6.V4S()); __ fcvtl2(v24.V4S(), v9.V8H()); __ fcvtms(v9.V2D(), v24.V2D()); __ fcvtms(v7.V2S(), v11.V2S()); __ fcvtms(v23.V4S(), v21.V4S()); __ fcvtmu(v13.V2D(), v1.V2D()); __ fcvtmu(v26.V2S(), v12.V2S()); __ fcvtmu(v21.V4S(), v21.V4S()); __ fcvtn(v11.V2S(), v1.V2D()); __ fcvtn(v8.V4H(), v2.V4S()); __ fcvtn2(v24.V4S(), v29.V2D()); __ fcvtn2(v4.V8H(), v10.V4S()); __ fcvtns(v25.V2D(), v10.V2D()); __ fcvtns(v4.V2S(), v8.V2S()); __ fcvtns(v29.V4S(), v27.V4S()); __ fcvtnu(v18.V2D(), v27.V2D()); __ fcvtnu(v11.V2S(), v14.V2S()); __ fcvtnu(v27.V4S(), v21.V4S()); __ fcvtps(v23.V2D(), v5.V2D()); __ fcvtps(v24.V2S(), v15.V2S()); __ fcvtps(v5.V4S(), v19.V4S()); __ fcvtpu(v3.V2D(), v21.V2D()); __ fcvtpu(v3.V2S(), v21.V2S()); __ fcvtpu(v0.V4S(), v7.V4S()); __ fcvtxn(v29.V2S(), v11.V2D()); __ fcvtxn2(v31.V4S(), v25.V2D()); __ fcvtzs(v19.V2D(), v17.V2D()); __ fcvtzs(v12.V2D(), v24.V2D(), 64); __ fcvtzs(v9.V2S(), v2.V2S()); __ fcvtzs(v5.V2S(), v20.V2S(), 29); __ fcvtzs(v21.V4S(), v25.V4S()); __ fcvtzs(v26.V4S(), v1.V4S(), 6); __ fcvtzu(v13.V2D(), v25.V2D()); __ fcvtzu(v28.V2D(), v13.V2D(), 32); __ fcvtzu(v26.V2S(), v6.V2S()); __ fcvtzu(v9.V2S(), v10.V2S(), 15); __ fcvtzu(v30.V4S(), v6.V4S()); __ fcvtzu(v19.V4S(), v22.V4S(), 18); __ fdiv(v15.V2D(), v8.V2D(), v15.V2D()); __ fdiv(v12.V2S(), v9.V2S(), v26.V2S()); __ fdiv(v19.V4S(), v22.V4S(), v19.V4S()); __ fmax(v19.V2D(), v7.V2D(), v8.V2D()); __ fmax(v25.V2S(), v12.V2S(), v29.V2S()); __ fmax(v6.V4S(), v15.V4S(), v5.V4S()); __ fmaxnm(v16.V2D(), v8.V2D(), v20.V2D()); __ fmaxnm(v15.V2S(), v26.V2S(), v25.V2S()); __ fmaxnm(v23.V4S(), v14.V4S(), v16.V4S()); __ fmaxnmp(d6, v19.V2D()); __ fmaxnmp(s27, v26.V2S()); __ fmaxnmp(v8.V2D(), v12.V2D(), v23.V2D()); __ fmaxnmp(v13.V2S(), v25.V2S(), v22.V2S()); __ fmaxnmp(v15.V4S(), v11.V4S(), 
v17.V4S()); __ fmaxnmv(s27, v19.V4S()); __ fmaxp(d20, v14.V2D()); __ fmaxp(s18, v2.V2S()); __ fmaxp(v9.V2D(), v23.V2D(), v31.V2D()); __ fmaxp(v7.V2S(), v22.V2S(), v31.V2S()); __ fmaxp(v18.V4S(), v7.V4S(), v29.V4S()); __ fmaxv(s31, v29.V4S()); __ fmin(v2.V2D(), v5.V2D(), v2.V2D()); __ fmin(v31.V2S(), v17.V2S(), v10.V2S()); __ fmin(v10.V4S(), v4.V4S(), v16.V4S()); __ fminnm(v21.V2D(), v6.V2D(), v5.V2D()); __ fminnm(v22.V2S(), v18.V2S(), v14.V2S()); __ fminnm(v25.V4S(), v31.V4S(), v3.V4S()); __ fminnmp(d9, v1.V2D()); __ fminnmp(s21, v20.V2S()); __ fminnmp(v16.V2D(), v21.V2D(), v19.V2D()); __ fminnmp(v16.V2S(), v31.V2S(), v25.V2S()); __ fminnmp(v26.V4S(), v16.V4S(), v15.V4S()); __ fminnmv(s3, v4.V4S()); __ fminp(d24, v26.V2D()); __ fminp(s7, v17.V2S()); __ fminp(v23.V2D(), v19.V2D(), v3.V2D()); __ fminp(v29.V2S(), v21.V2S(), v9.V2S()); __ fminp(v0.V4S(), v24.V4S(), v21.V4S()); __ fminv(s25, v8.V4S()); __ fmla(d23, d0, v9.D(), 1); __ fmla(s23, s15, v7.S(), 0); __ fmla(v17.V2D(), v11.V2D(), v6.V2D()); __ fmla(v30.V2D(), v30.V2D(), v11.D(), 0); __ fmla(v19.V2S(), v12.V2S(), v6.V2S()); __ fmla(v24.V2S(), v17.V2S(), v9.S(), 0); __ fmla(v16.V4S(), v11.V4S(), v11.V4S()); __ fmla(v27.V4S(), v23.V4S(), v9.S(), 2); __ fmls(d27, d30, v6.D(), 0); __ fmls(s21, s16, v2.S(), 0); __ fmls(v5.V2D(), v19.V2D(), v21.V2D()); __ fmls(v18.V2D(), v30.V2D(), v12.D(), 0); __ fmls(v5.V2S(), v16.V2S(), v7.V2S()); __ fmls(v3.V2S(), v18.V2S(), v11.S(), 1); __ fmls(v27.V4S(), v5.V4S(), v30.V4S()); __ fmls(v26.V4S(), v20.V4S(), v4.S(), 3); __ fmov(v14.V2D(), -0.34375); __ fmov(v26.V2S(), 0.90625f); __ fmov(v31.V4S(), -5.0000f); __ fmov(v28.D(), 1, x25); __ fmov(x18, v2.D(), 1); __ fmul(d12, d4, v1.D(), 1); __ fmul(s30, s1, v15.S(), 3); __ fmul(v25.V2D(), v0.V2D(), v21.V2D()); __ fmul(v10.V2D(), v24.V2D(), v10.D(), 1); __ fmul(v7.V2S(), v24.V2S(), v16.V2S()); __ fmul(v1.V2S(), v16.V2S(), v4.S(), 2); __ fmul(v5.V4S(), v28.V4S(), v25.V4S()); __ fmul(v11.V4S(), v3.V4S(), v8.S(), 0); __ fmulx(d28, d9, 
v3.D(), 1); __ fmulx(s25, s21, v15.S(), 1); __ fmulx(v31.V2D(), v28.V2D(), v8.V2D()); __ fmulx(v3.V2D(), v21.V2D(), v6.D(), 0); __ fmulx(v9.V2S(), v1.V2S(), v0.V2S()); __ fmulx(v16.V2S(), v27.V2S(), v6.S(), 0); __ fmulx(v2.V4S(), v4.V4S(), v5.V4S()); __ fmulx(v18.V4S(), v7.V4S(), v4.S(), 0); __ fneg(v1.V2D(), v25.V2D()); __ fneg(v14.V2S(), v31.V2S()); __ fneg(v5.V4S(), v4.V4S()); __ frecpe(v18.V2D(), v12.V2D()); __ frecpe(v10.V2S(), v22.V2S()); __ frecpe(v5.V4S(), v6.V4S()); __ frecps(v22.V2D(), v7.V2D(), v26.V2D()); __ frecps(v31.V2S(), v27.V2S(), v2.V2S()); __ frecps(v18.V4S(), v6.V4S(), v27.V4S()); __ frinta(v26.V2D(), v13.V2D()); __ frinta(v15.V2S(), v26.V2S()); __ frinta(v13.V4S(), v16.V4S()); __ frinti(v9.V2D(), v12.V2D()); __ frinti(v5.V2S(), v19.V2S()); __ frinti(v15.V4S(), v11.V4S()); __ frintm(v17.V2D(), v29.V2D()); __ frintm(v30.V2S(), v11.V2S()); __ frintm(v1.V4S(), v20.V4S()); __ frintn(v24.V2D(), v6.V2D()); __ frintn(v12.V2S(), v17.V2S()); __ frintn(v29.V4S(), v11.V4S()); __ frintp(v10.V2D(), v7.V2D()); __ frintp(v12.V2S(), v18.V2S()); __ frintp(v26.V4S(), v31.V4S()); __ frintx(v24.V2D(), v13.V2D()); __ frintx(v7.V2S(), v9.V2S()); __ frintx(v18.V4S(), v21.V4S()); __ frintz(v19.V2D(), v25.V2D()); __ frintz(v15.V2S(), v8.V2S()); __ frintz(v20.V4S(), v3.V4S()); __ frsqrte(v23.V2D(), v5.V2D()); __ frsqrte(v9.V2S(), v7.V2S()); __ frsqrte(v3.V4S(), v9.V4S()); __ frsqrts(v25.V2D(), v28.V2D(), v15.V2D()); __ frsqrts(v9.V2S(), v26.V2S(), v10.V2S()); __ frsqrts(v5.V4S(), v1.V4S(), v10.V4S()); __ fsqrt(v6.V2D(), v18.V2D()); __ fsqrt(v6.V2S(), v18.V2S()); __ fsqrt(v0.V4S(), v31.V4S()); __ fsub(v31.V2D(), v30.V2D(), v31.V2D()); __ fsub(v11.V2S(), v8.V2S(), v6.V2S()); __ fsub(v16.V4S(), v0.V4S(), v31.V4S()); __ scvtf(v25.V2D(), v31.V2D()); __ scvtf(v10.V2D(), v13.V2D(), 45); __ scvtf(v10.V2S(), v15.V2S()); __ scvtf(v18.V2S(), v4.V2S(), 27); __ scvtf(v17.V4S(), v5.V4S()); __ scvtf(v11.V4S(), v25.V4S(), 24); __ ucvtf(v9.V2D(), v3.V2D()); __ ucvtf(v26.V2D(), 
v30.V2D(), 46); __ ucvtf(v11.V2S(), v4.V2S()); __ ucvtf(v29.V2S(), v3.V2S(), 25); __ ucvtf(v22.V4S(), v23.V4S()); __ ucvtf(v18.V4S(), v9.V4S(), 25); } static void MaskAddresses(const char* trace) { #ifdef __APPLE__ #define ESCAPE(c) "\\\\" #c const char* sed_options = "-i \"\" -E"; #else #define ESCAPE(c) "\\" #c const char* sed_options = "-i -E"; #endif #define COLOUR "(." ESCAPE([) "[01];([0-9][0-9])?m)?" struct { const char* search; const char* replace; } patterns[] = {// Mask registers that hold addresses that change from run to run. {"((x0|x1|x2|sp): " COLOUR "0x)[0-9a-f]{16}", ESCAPE(1) "~~~~~~~~~~~~~~~~"}, // Mask accessed memory addresses. {"((<-|->) " COLOUR "0x)[0-9a-f]{16}", ESCAPE(1) "~~~~~~~~~~~~~~~~"}, // Mask instruction addresses. {"^0x[0-9a-f]{16}", "0x~~~~~~~~~~~~~~~~"}, // Mask branch targets. {"(Branch" COLOUR " to 0x)[0-9a-f]{16}", ESCAPE(1) "~~~~~~~~~~~~~~~~"}, {"addr 0x[0-9a-f]+", "addr 0x~~~~~~~~~~~~~~~~"}}; const size_t patterns_length = sizeof(patterns) / sizeof(patterns[0]); // Rewrite `trace`, masking addresses and other values that legitimately vary // from run to run. char command[1024]; for (size_t i = 0; i < patterns_length; i++) { size_t length = snprintf(command, sizeof(command), "sed %s 's/%s/%s/' '%s'", sed_options, patterns[i].search, patterns[i].replace, trace); VIXL_CHECK(length < sizeof(command)); VIXL_CHECK(system(command) == 0); } } static bool CheckOrGenerateTrace(const char* filename, const char* ref_file) { bool trace_matched_reference; if (Test::generate_test_trace()) { // Copy trace_stream to stdout. FILE* trace_stream = fopen(filename, "r"); VIXL_ASSERT(trace_stream != NULL); fseek(trace_stream, 0, SEEK_SET); int c; while (1) { c = getc(trace_stream); if (c == EOF) break; putc(c, stdout); } fclose(trace_stream); trace_matched_reference = true; } else { // Check trace_stream against ref_file. 
char command[1024]; size_t length = snprintf(command, sizeof(command), "diff -u %s %s", ref_file, filename); VIXL_CHECK(length < sizeof(command)); trace_matched_reference = (system(command) == 0); } return trace_matched_reference; } // Trace tests can only work with the simulator. #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 static void TraceTestHelper(bool coloured_trace, TraceParameters trace_parameters, const char* ref_file) { MacroAssembler masm(12 * KBytes); char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX"; FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w"); Decoder decoder; Simulator simulator(&decoder, trace_stream); simulator.SetColouredTrace(coloured_trace); simulator.SetTraceParameters(trace_parameters); simulator.SilenceExclusiveAccessWarning(); // Set up a scratch buffer so we can test loads and stores. const int kScratchSize = 64 * KBytes; const int kScratchGuardSize = 128; char scratch_buffer[kScratchSize + kScratchGuardSize]; for (size_t i = 0; i < (sizeof(scratch_buffer) / sizeof(scratch_buffer[0])); i++) { scratch_buffer[i] = i & 0xff; } // Used for offset addressing. simulator.WriteRegister(0, scratch_buffer); // Used for pre-/post-index addressing. simulator.WriteRegister(1, scratch_buffer); const int kPostIndexRegisterStep = 13; // Arbitrary interesting value. // Used for post-index offsets. simulator.WriteRegister(2, kPostIndexRegisterStep); // Initialize the other registers with unique values. uint64_t initial_base_u64 = 0x0100001000100101; for (unsigned i = 3; i < kNumberOfRegisters; i++) { if (i == kLinkRegCode) continue; if (i == kZeroRegCode) continue; // NoRegLog suppresses the log now, but the registers will still be logged // before the first instruction is executed since they have been written but // not printed. 
simulator.WriteRegister(i, initial_base_u64 * i, Simulator::NoRegLog); } float initial_base_f32 = 1.2345f; double initial_base_f64 = 1.3456f; for (unsigned i = 0; i < kNumberOfVRegisters; i++) { // Try to initialise V registers with reasonable FP values. uint64_t low = (DoubleToRawbits(initial_base_f64 * i) & ~kSRegMask) | FloatToRawbits(initial_base_f32 * i); uint64_t high = low ^ 0x0005555500555555; LogicVRegister reg(simulator.ReadVRegister(i)); reg.SetUint(kFormat2D, 0, low); reg.SetUint(kFormat2D, 1, high); } GenerateTestSequenceBase(&masm); GenerateTestSequenceFP(&masm); GenerateTestSequenceNEON(&masm); GenerateTestSequenceNEONFP(&masm); masm.Ret(); masm.FinalizeCode(); simulator.RunFrom(masm.GetBuffer()->GetStartAddress<Instruction*>()); fclose(trace_stream); MaskAddresses(trace_stream_filename); bool trace_matched_reference = CheckOrGenerateTrace(trace_stream_filename, ref_file); remove(trace_stream_filename); // Clean up before checking the result. VIXL_CHECK(trace_matched_reference); uint64_t offset_base = simulator.ReadRegister<uint64_t>(0); uint64_t index_base = simulator.ReadRegister<uint64_t>(1); VIXL_CHECK(index_base >= offset_base); VIXL_CHECK((index_base - offset_base) <= kScratchSize); } // Test individual options. TEST(disasm) { TraceTestHelper(false, LOG_DISASM, REF("log-disasm")); } TEST(regs) { TraceTestHelper(false, LOG_REGS, REF("log-regs")); } TEST(vregs) { TraceTestHelper(false, LOG_VREGS, REF("log-vregs")); } TEST(sysregs) { TraceTestHelper(false, LOG_SYSREGS, REF("log-sysregs")); } TEST(write) { TraceTestHelper(false, LOG_WRITE, REF("log-write")); } TEST(branch) { TraceTestHelper(false, LOG_WRITE, REF("log-branch")); } // Test standard combinations. TEST(none) { TraceTestHelper(false, LOG_NONE, REF("log-none")); } TEST(state) { TraceTestHelper(false, LOG_STATE, REF("log-state")); } TEST(all) { TraceTestHelper(false, LOG_ALL, REF("log-all")); } // Test individual options (with colour). 
TEST(disasm_colour) { TraceTestHelper(true, LOG_DISASM, REF("log-disasm-colour")); } TEST(regs_colour) { TraceTestHelper(true, LOG_REGS, REF("log-regs-colour")); } TEST(vregs_colour) { TraceTestHelper(true, LOG_VREGS, REF("log-vregs-colour")); } TEST(sysregs_colour) { TraceTestHelper(true, LOG_SYSREGS, REF("log-sysregs-colour")); } TEST(write_colour) { TraceTestHelper(true, LOG_WRITE, REF("log-write-colour")); } TEST(branch_colour) { TraceTestHelper(true, LOG_WRITE, REF("log-branch-colour")); } // Test standard combinations (with colour). TEST(none_colour) { TraceTestHelper(true, LOG_NONE, REF("log-none-colour")); } TEST(state_colour) { TraceTestHelper(true, LOG_STATE, REF("log-state-colour")); } TEST(all_colour) { TraceTestHelper(true, LOG_ALL, REF("log-all-colour")); } #endif // VIXL_INCLUDE_SIMULATOR_AARCH64 static void PrintDisassemblerTestHelper(const char* prefix, const char* suffix, const char* ref_file) { MacroAssembler masm(12 * KBytes); char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX"; FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w"); // We don't need to execute this code so there's no need for the execution // environment setup from TraceTestHelper. 
GenerateTestSequenceBase(&masm); GenerateTestSequenceFP(&masm); GenerateTestSequenceNEON(&masm); GenerateTestSequenceNEONFP(&masm); masm.FinalizeCode(); Decoder decoder; CPUFeaturesAuditor auditor(&decoder); PrintDisassembler disasm(trace_stream); if (prefix != NULL) disasm.SetCPUFeaturesPrefix(prefix); if (suffix != NULL) disasm.SetCPUFeaturesSuffix(suffix); disasm.RegisterCPUFeaturesAuditor(&auditor); decoder.AppendVisitor(&disasm); Instruction* instruction = masm.GetBuffer()->GetStartAddress<Instruction*>(); Instruction* end = masm.GetCursorAddress<Instruction*>(); while (instruction != end) { decoder.Decode(instruction); instruction += kInstructionSize; } fclose(trace_stream); MaskAddresses(trace_stream_filename); bool trace_matched_reference = CheckOrGenerateTrace(trace_stream_filename, ref_file); remove(trace_stream_filename); // Clean up before checking the result. VIXL_CHECK(trace_matched_reference); } // Test CPUFeatures disassembly annotations. TEST(cpufeatures) { PrintDisassemblerTestHelper(NULL, NULL, REF("log-cpufeatures")); } TEST(cpufeatures_custom) { PrintDisassemblerTestHelper("### {", "} ###", REF("log-cpufeatures-custom")); } TEST(cpufeatures_colour) { // The colour chosen is arbitrary. PrintDisassemblerTestHelper("\033[1;35m", // Prefix: Bold magenta. "\033[0;m", // Suffix: Reset colour. REF("log-cpufeatures-colour")); } } // namespace aarch64 } // namespace vixl