/*
gcc -o v8crypto v8crypto.c -march=armv8-a -mfpu=crypto-neon-fp-armv8
gcc -o v8crypto v8crypto.c -mfpu=crypto-neon-fp-armv8
*/

#include <stdio.h>
#include <assert.h>
#include <malloc.h>  // memalign
#include <string.h>  // memset
#include "tests/malloc.h"
#include <math.h>    // isnormal

/* Short names for the basic integer and FP types used in this file. */
typedef  unsigned char           UChar;
typedef  unsigned short int      UShort;
typedef  unsigned int            UInt;
typedef  signed int              Int;
typedef  unsigned long long int  ULong;
typedef  signed long long int    Long;
typedef  double                  Double;
typedef  float                   Float;

typedef  unsigned char           Bool;
#define False ((Bool)0)
#define True  ((Bool)1)

/* Number of times each generated test body runs per call. */
#define ITERS 1

/* Lane-type tag taken by the generated test functions and by randV128. */
typedef
  enum { TyHF=1234, TySF, TyDF, TyB, TyH, TyS, TyD, TyNONE }
  LaneTy;

/* One 128-bit vector value, viewable as 16 bytes, 8 halfwords, 4 words,
   2 doublewords, 4 floats or 2 doubles. */
union _V128 {
   UChar   u8[16];
   UShort  u16[8];
   UInt    u32[4];
   ULong   u64[2];
   Float   f32[4];
   Double  f64[2];
};
typedef  union _V128   V128;

/* Returns the next byte of a pseudo-random sequence.  The generator is a
   linear congruential one with a fixed initial seed, so the sequence is
   the same on every run of the program. */
static inline UChar randUChar ( void )
{
   static UInt seed = 80021;
   seed = 1103515245 * seed + 12345;
   /* Use bits 17..24 of the state rather than the lowest bits. */
   return (seed >> 17) & 0xFF;
}

//static ULong randULong ( LaneTy ty )
//{
//   Int i;
//   ULong r = 0;
//   for (i = 0; i < 8; i++) {
//      r = (r << 8) | (ULong)(0xFF & randUChar());
//   }
//   return r;
//}

/* Generates a random V128.  Ensures that it contains normalised FP
   numbers when viewed as either F32x4 or F64x2, so that it is
   reasonable to use in FP test cases.
*/
static void randV128 ( /*OUT*/V128* v, LaneTy ty )
{
   static UInt nCalls = 0, nIters = 0;
   Int i;
   (void)ty;  /* not consulted: all 16 bytes are random for every lane type */
   nCalls++;
   /* Keep drawing 16 fresh random bytes until all four F32 lanes and
      both F64 lanes are normal numbers. */
   while (1) {
      nIters++;
      for (i = 0; i < 16; i++) {
         v->u8[i] = randUChar();
      }
      if (isnormal(v->f32[0]) && isnormal(v->f32[1]) && isnormal(v->f32[2])
          && isnormal(v->f32[3]) && isnormal(v->f64[0]) && isnormal(v->f64[1]))
        break;
   }
   /* Every 256th call, report the running totals of calls and attempts. */
   if (0 == (nCalls & 0xFF))
      printf("randV128: %u calls, %u iters\n", nCalls, nIters);
}

/* Prints the 16 bytes of *v as 32 hex digits, u8[15] first and u8[0]
   last, with no separator and no trailing newline. */
static void showV128 ( V128* v )
{
   Int i;
   for (i = 15; i >= 0; i--)
      printf("%02x", (Int)v->u8[i]);
}

//static void showBlock ( const char* msg, V128* block, Int nBlock )
//{
//   Int i;
//   printf("%s\n", msg);
//   for (i = 0; i < nBlock; i++) {
//      printf("  ");
//      showV128(&block[i]);
//      printf("\n");
//   }
//}


/* ---------------------------------------------------------------- */
/* -- Parameterisable test macros                                -- */
/* ---------------------------------------------------------------- */

/* Executes _action 50 times. */
#define DO50(_action) \
   do { \
      Int _qq; for (_qq = 0; _qq < 50; _qq++) { _action ; } \
   } while (0)


/* Generate a test that involves two vector regs, with no bias as to
   which is input or output.
It's OK to use r8 as scratch.*/ #define GEN_TWOVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO) \ __attribute__((noinline)) \ static void test_##TESTNAME ( LaneTy ty ) { \ Int i; \ for (i = 0; i < ITERS; i++) { \ V128 block[4+1]; \ memset(block, 0x55, sizeof(block)); \ randV128(&block[0], ty); \ randV128(&block[1], ty); \ randV128(&block[2], ty); \ randV128(&block[3], ty); \ __asm__ __volatile__( \ "mov r9, #0 ; vmsr fpscr, r9 ; " \ "add r9, %0, #0 ; vld1.8 { q"#VECREG1NO" }, [r9] ; " \ "add r9, %0, #16 ; vld1.8 { q"#VECREG2NO" }, [r9] ; " \ INSN " ; " \ "add r9, %0, #32 ; vst1.8 { q"#VECREG1NO" }, [r9] ; " \ "add r9, %0, #48 ; vst1.8 { q"#VECREG2NO" }, [r9] ; " \ "vmrs r9, fpscr ; str r9, [%0, #64] " \ : : "r"(&block[0]) \ : "cc", "memory", "q"#VECREG1NO, "q"#VECREG2NO, "r8", "r9" \ ); \ printf(INSN " "); \ UInt fpscr = 0xFFFFFFFF & block[4].u32[0]; \ showV128(&block[0]); printf(" "); \ showV128(&block[1]); printf(" "); \ showV128(&block[2]); printf(" "); \ showV128(&block[3]); printf(" fpscr=%08x\n", fpscr); \ } \ } /* Generate a test that involves three vector regs, with no bias as towards which is input or output. It's also OK to use r8 scratch. 
*/ #define GEN_THREEVEC_TEST(TESTNAME,INSN,VECREG1NO,VECREG2NO,VECREG3NO) \ __attribute__((noinline)) \ static void test_##TESTNAME ( LaneTy ty ) { \ Int i; \ for (i = 0; i < ITERS; i++) { \ V128 block[6+1]; \ memset(block, 0x55, sizeof(block)); \ randV128(&block[0], ty); \ randV128(&block[1], ty); \ randV128(&block[2], ty); \ randV128(&block[3], ty); \ randV128(&block[4], ty); \ randV128(&block[5], ty); \ __asm__ __volatile__( \ "mov r9, #0 ; vmsr fpscr, r9 ; " \ "add r9, %0, #0 ; vld1.8 { q"#VECREG1NO" }, [r9] ; " \ "add r9, %0, #16 ; vld1.8 { q"#VECREG2NO" }, [r9] ; " \ "add r9, %0, #32 ; vld1.8 { q"#VECREG3NO" }, [r9] ; " \ INSN " ; " \ "add r9, %0, #48 ; vst1.8 { q"#VECREG1NO" }, [r9] ; " \ "add r9, %0, #64 ; vst1.8 { q"#VECREG2NO" }, [r9] ; " \ "add r9, %0, #80 ; vst1.8 { q"#VECREG3NO" }, [r9] ; " \ "vmrs r9, fpscr ; str r9, [%0, #96] " \ : : "r"(&block[0]) \ : "cc", "memory", "q"#VECREG1NO, "q"#VECREG2NO, "q"#VECREG3NO, \ "r8", "r9" \ ); \ printf(INSN " "); \ UInt fpscr = 0xFFFFFFFF & block[6].u32[0]; \ showV128(&block[0]); printf(" "); \ showV128(&block[1]); printf(" "); \ showV128(&block[2]); printf(" "); \ showV128(&block[3]); printf(" "); \ showV128(&block[4]); printf(" "); \ showV128(&block[5]); printf(" fpscr=%08x\n", fpscr); \ } \ } // ======================== CRYPTO ======================== GEN_TWOVEC_TEST(aesd_q_q, "aesd.8 q3, q4", 3, 4) GEN_TWOVEC_TEST(aese_q_q, "aese.8 q12, q13", 12, 13) GEN_TWOVEC_TEST(aesimc_q_q, "aesimc.8 q15, q0", 15, 0) GEN_TWOVEC_TEST(aesmc_q_q, "aesmc.8 q1, q9", 1, 9) GEN_THREEVEC_TEST(sha1c_q_q_q, "sha1c.32 q11, q10, q2", 11, 10, 2) GEN_TWOVEC_TEST(sha1h_q_q, "sha1h.32 q6, q7", 6, 7) GEN_THREEVEC_TEST(sha1m_q_q_q, "sha1m.32 q2, q8, q13", 2, 8, 13) GEN_THREEVEC_TEST(sha1p_q_q_q, "sha1p.32 q3, q9, q14", 3, 9, 14) GEN_THREEVEC_TEST(sha1su0_q_q_q, "sha1su0.32 q4, q10, q15", 4, 10, 15) GEN_TWOVEC_TEST(sha1su1_q_q, "sha1su1.32 q11, q2", 11, 2) GEN_THREEVEC_TEST(sha256h2_q_q_q, "sha256h2.32 q9, q8, q7", 9, 8, 7) 
GEN_THREEVEC_TEST(sha256h_q_q_q, "sha256h.32 q10, q9, q8", 10, 9, 8)
GEN_TWOVEC_TEST(sha256su0_q_q, "sha256su0.32 q11, q10", 11, 10)
GEN_THREEVEC_TEST(sha256su1_q_q_q, "sha256su1.32 q12, q11, q10", 12, 11, 10)

// This is a bit complex.  This really mentions three registers, so it
// should really be a THREEVEC variant.  But the two source registers
// are D registers.  So we say it is just a TWOVEC insn, producing a Q
// and taking a single Q (q7); q7 is the d14-d15 register pair, which
// is why the insn itself mentions d14 and d15 whereas the
// numbers that follow mention q7.  The result (q13) is 128 bits wide and
// so is unaffected by these shenanigans.
GEN_TWOVEC_TEST(pmull_q_d_d, "vmull.p64 q13, d14, d15", 13, 7)

/* Runs every generated test 50 times, in a fixed order, and returns 0.
   Each run prints one line (see the GEN_*_TEST macros). */
int main ( void )
{
   // ======================== CRYPTO ========================

   // aesd.8    q_q (aes single round decryption)
   // aese.8    q_q (aes single round encryption)
   // aesimc.8  q_q (aes inverse mix columns)
   // aesmc.8   q_q (aes mix columns)
   if (1) DO50( test_aesd_q_q(TyNONE) );
   if (1) DO50( test_aese_q_q(TyNONE) );
   if (1) DO50( test_aesimc_q_q(TyNONE) );
   if (1) DO50( test_aesmc_q_q(TyNONE) );

   // sha1c.32    q_q_q
   // sha1h.32    q_q
   // sha1m.32    q_q_q
   // sha1p.32    q_q_q
   // sha1su0.32  q_q_q
   // sha1su1.32  q_q
   if (1) DO50( test_sha1c_q_q_q(TyNONE) );
   if (1) DO50( test_sha1h_q_q(TyNONE) );
   if (1) DO50( test_sha1m_q_q_q(TyNONE) );
   if (1) DO50( test_sha1p_q_q_q(TyNONE) );
   if (1) DO50( test_sha1su0_q_q_q(TyNONE) );
   if (1) DO50( test_sha1su1_q_q(TyNONE) );

   // sha256h2.32   q_q_q
   // sha256h.32    q_q_q
   // sha256su0.32  q_q
   // sha256su1.32  q_q_q
   if (1) DO50( test_sha256h2_q_q_q(TyNONE) );
   if (1) DO50( test_sha256h_q_q_q(TyNONE) );
   if (1) DO50( test_sha256su0_q_q(TyNONE) );
   if (1) DO50( test_sha256su1_q_q_q(TyNONE) );

   // vmull.p64 q_d_d
   if (1) DO50( test_pmull_q_d_d(TyD) );

   return 0;
}