/* * Copyright (C) 2007 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/time.h> #include <time.h> #include <unistd.h> #include <sched.h> #include <sys/resource.h> #include <sys/syscall.h> #include <sys/types.h> #include <sys/mman.h> #ifdef __ARM_NEON__ #include <arm_neon.h> #endif typedef long long nsecs_t; static nsecs_t gTime; float data_f[1024 * 128]; static nsecs_t system_time() { struct timespec t; t.tv_sec = t.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t); return nsecs_t(t.tv_sec)*1000000000LL + t.tv_nsec; } static void startTime() { gTime = system_time(); } static void endTime(const char *str, double ops) { nsecs_t t = system_time() - gTime; double ds = ((double)t) / 1e9; printf("Test: %s, %f Mops\n", str, ops / ds / 1e6); } static void test_mad() { for(int i=0; i<1020; i++) { data_f[i] = i; } startTime(); float total = 0; // Do ~1 billion ops for (int ct=0; ct < (1000 * (1000 / 20)); ct++) { for (int i=0; i < 1000; i++) { data_f[i] = (data_f[i] * 0.02f + data_f[i+1] * 0.04f + data_f[i+2] * 0.05f + data_f[i+3] * 0.1f + data_f[i+4] * 0.2f + data_f[i+5] * 0.2f + data_f[i+6] * 0.1f + data_f[i+7] * 0.05f + data_f[i+8] * 0.04f + data_f[i+9] * 0.02f + 1.f); } } endTime("scalar mad", 1e9); } #ifdef __ARM_NEON__ static void test_fma() { for(int i=0; i<1020 * 4; i++) { data_f[i] = i; } float32x4_t c0_02 = vdupq_n_f32(0.02f); float32x4_t c0_04 = vdupq_n_f32(0.04f); float32x4_t c0_05 = vdupq_n_f32(0.05f); float32x4_t c0_10 = vdupq_n_f32(0.1f); float32x4_t c0_20 = vdupq_n_f32(0.2f); float32x4_t c1_00 = vdupq_n_f32(1.0f); startTime(); float total = 0; // Do ~1 billion ops for (int ct=0; ct < (1000 * (1000 / 80)); ct++) { for (int i=0; i < 1000; i++) { float32x4_t t; t = vmulq_f32(vld1q_f32((float32_t *)&data_f[i]), c0_02); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+4]), c0_04); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+8]), c0_05); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+12]), c0_10); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+16]), c0_20); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+20]), c0_20); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+24]), c0_10); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+28]), c0_05); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+32]), c0_04); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+36]), c0_02); t = vaddq_f32(t, c1_00); vst1q_f32((float32_t *)&data_f[i], t); } } endTime("neon fma", 1e9); } #endif int fp_test(int argc, char** argv) { test_mad(); #ifdef __ARM_NEON__ test_fma(); #endif return 0; }