/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <android/log.h>
#include <math.h>
#include <stdlib.h>
#include <unistd.h>
#include "Bench.h"
// Builds an idle benchmark object: no buffers are allocated and every
// bookkeeping field has a defined value, so that accessors such as
// incTimeBucket() and getData() never read indeterminate members even when
// they are called before any test has run.
Bench::Bench()
{
    // Timeline histogram; storage is created later by allocateBuckets().
    mTimeBucket = NULL;
    mTimeBuckets = 0;
    // Kept non-zero because incTimeBucket() divides by it.
    mTimeBucketDivisor = 1;
    mTimeStartNanos = 0;
    mTimeEndNanos = 0;
    mTimeEndGroupNanos = 0;

    // Per-worker kernel buffers; populated by initIP().
    mIpKernel = NULL;
    mSrcBuf = NULL;
    mOutBuf = NULL;

    // Memory test state; buffers are created by startMemTests().
    mMemLatencyLastSize = 0;
    mMemDst = NULL;
    mMemSrc = NULL;
    mMemLoopCount = 0;

    // Convolution test description; filled in by runGFlopsTest().
    mGFlop.kernelXSize = 0;
    mGFlop.imageXSize = 0;
    mGFlop.srcBuffer = NULL;
    mGFlop.dstBuffer = NULL;
    mGFlop.kernelBuffer = NULL;
}
// Releases the storage this object is known to own: the timeline bucket
// array created by allocateBuckets() and, if endMemTests() was never called,
// the memory-test buffers created by startMemTests(). All of these pointers
// are NULL when unallocated, so the calls below are safe no-ops in that case.
//
// NOTE(review): the per-worker buffers created by initIP() are not released
// here; doing so safely needs a reliable "was allocated" marker and the
// worker count used at allocation time.
Bench::~Bench()
{
    delete[] mTimeBucket;
    mTimeBucket = NULL;
    mTimeBuckets = 0;

    free(mMemSrc);
    free(mMemDst);
    mMemSrc = NULL;
    mMemDst = NULL;
}
// Returns the current CLOCK_MONOTONIC reading expressed in nanoseconds.
uint64_t Bench::getTimeNanos() const
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    // Widen the seconds before scaling so the multiplication is done in
    // 64 bits.
    const uint64_t secondsAsNanos = (uint64_t)now.tv_sec * 1000 * 1000 * 1000;
    return now.tv_nsec + secondsAsNanos;
}
// Returns the monotonic time from getTimeNanos() truncated to whole
// milliseconds.
uint64_t Bench::getTimeMillis() const
{
    const uint64_t nanos = getTimeNanos();
    return nanos / 1000000;
}
void Bench::testWork(void *usr, uint32_t idx)
{
Bench *b = (Bench *)usr;
//__android_log_print(ANDROID_LOG_INFO, "bench", "test %i %p", idx, b);
float f1 = 0.f;
float f2 = 0.f;
float f3 = 0.f;
float f4 = 0.f;
float *ipk = b->mIpKernel[idx];
volatile float *src = b->mSrcBuf[idx];
volatile float *out = b->mOutBuf[idx];
//__android_log_print(ANDROID_LOG_INFO, "bench", "test %p %p %p", ipk, src, out);
do {
for (int i = 0; i < 1024; i++) {
f1 += src[i * 4] * ipk[i];
f2 += src[i * 4 + 1] * ipk[i];
f3 += src[i * 4 + 2] * ipk[i];
f4 += sqrtf(f1 + f2 + f3);
}
out[0] = f1;
out[1] = f2;
out[2] = f3;
out[3] = f4;
} while (b->incTimeBucket());
}
// Allocates the per-worker buffers used by testWork(): for every worker a
// 1024-float kernel, a 4096-float source buffer and a 4-float output buffer.
//
// All buffers are value-initialized (zero-filled). testWork() reads the
// kernel and source buffers without anything else writing them first, so
// leaving them uninitialized would make the benchmark read indeterminate
// floats.
//
// Returns false if the worker pool reports a negative worker count,
// true otherwise.
bool Bench::initIP() {
    int workers = mWorkers.getWorkerCount();
    if (workers < 0) {
        return false;
    }

    mIpKernel = new float *[workers]();
    mSrcBuf = new float *[workers]();
    mOutBuf = new float *[workers]();

    for (int i = 0; i < workers; i++) {
        mIpKernel[i] = new float[1024]();
        mSrcBuf[i] = new float[4096]();
        mOutBuf[i] = new float[4]();
    }
    return true;
}
// Runs the bursty-load test.
//
// Sets up a 2000-bucket timeline with one bucket per millisecond, sleeps for
// two seconds, then launches testWork on the worker pool in consecutive
// 8 ms groups until the group deadline passes the end of the timeline.
//
// The low five bits of options, when non-zero, are passed as the thread-count
// argument for every group; otherwise the argument alternates 0, 1, 0, 1, ...
// NOTE(review): what a thread-count argument of 0 versus 1 means is defined
// by the worker pool's launchWork(), which is not visible here.
//
// Always returns true.
bool Bench::runPowerManagementTest(uint64_t options) {
    mTimeBucketDivisor = 1000 * 1000;  // nanoseconds per bucket (1 ms)
    allocateBuckets(2 * 1000);
    usleep(2 * 1000 * 1000);

    mTimeStartNanos = getTimeNanos();
    mTimeEndNanos = mTimeStartNanos + mTimeBuckets * mTimeBucketDivisor;
    memset(mTimeBucket, 0, mTimeBuckets * sizeof(uint32_t));

    const int fixedThreads = (int)(options & 0x1f);
    bool toggle = false;

    mTimeEndGroupNanos = mTimeStartNanos;
    for (;;) {
        // Each group gets a further 8 ms of run time.
        mTimeEndGroupNanos += 8 * 1000 * 1000;

        int threads;
        if (fixedThreads != 0) {
            threads = fixedThreads;
        } else {
            threads = toggle ? 1 : 0;
        }
        toggle = !toggle;

        mWorkers.launchWork(testWork, this, threads);

        if (mTimeEndGroupNanos > mTimeEndNanos) {
            break;
        }
    }
    return true;
}
// Makes the timeline hold bucketCount buckets.
//
// When the count already matches nothing happens. Otherwise the old array is
// released and, for a non-zero count, a new one is allocated; the new array's
// contents are not cleared here. Always returns true.
bool Bench::allocateBuckets(size_t bucketCount) {
    if (bucketCount != mTimeBuckets) {
        // delete[] on a NULL pointer is a no-op, so no separate check is needed.
        delete[] mTimeBucket;
        mTimeBucket = NULL;

        mTimeBuckets = bucketCount;
        if (mTimeBuckets > 0) {
            mTimeBucket = new uint32_t[mTimeBuckets];
        }
    }
    return true;
}
// Initializes the worker pool and then the per-worker kernel buffers.
//
// Returns the result of initIP() instead of discarding it, so a failure to
// set up the buffers is visible to the caller.
// NOTE(review): the result of mWorkers.init(), if it has one, is still not
// checked because its return type is not visible here.
bool Bench::init() {
    mWorkers.init();
    return initIP();
}
// Records one completed unit of work on the timeline.
//
// The bucket is chosen from the time elapsed since mTimeStartNanos divided by
// mTimeBucketDivisor. Returns false when that bucket lies past the end of the
// timeline; otherwise atomically increments the bucket and returns whether
// the current group deadline (mTimeEndGroupNanos) is still in the future.
bool Bench::incTimeBucket() const {
    const uint64_t now = getTimeNanos();
    const uint64_t slot = (now - mTimeStartNanos) / mTimeBucketDivisor;

    if (slot < mTimeBuckets) {
        // Atomic add: the same bucket array is updated by every worker.
        __sync_fetch_and_add(&mTimeBucket[slot], 1);
        return now < mTimeEndGroupNanos;
    }
    return false;
}
// Copies timeline bucket counts into data as floats.
//
// At most count entries are written; if count exceeds the number of buckets
// only that many entries are written and the rest of data is left untouched.
void Bench::getData(float *data, size_t count) const {
    size_t limit = count;
    if (limit > mTimeBuckets) {
        limit = mTimeBuckets;
    }

    for (size_t i = 0; i != limit; ++i) {
        data[i] = (float)mTimeBucket[i];
    }
}
// Runs testWork on the worker pool for one continuous timeline of 1000
// one-millisecond buckets. The options argument is ignored.
// Always returns true.
bool Bench::runCPUHeatSoak(uint64_t /* options */)
{
    mTimeBucketDivisor = 1000 * 1000;  // nanoseconds per bucket (1 ms)
    allocateBuckets(1000);

    mTimeStartNanos = getTimeNanos();
    mTimeEndNanos = mTimeStartNanos + mTimeBuckets * mTimeBucketDivisor;
    memset(mTimeBucket, 0, mTimeBuckets * sizeof(uint32_t));

    // A single group spanning the whole timeline.
    mTimeEndGroupNanos = mTimeEndNanos;
    mWorkers.launchWork(testWork, this, 0);
    return true;
}
float Bench::runMemoryBandwidthTest(uint64_t size)
{
uint64_t t1 = getTimeMillis();
for (size_t ct = mMemLoopCount; ct > 0; ct--) {
memcpy(mMemDst, mMemSrc, size);
}
double dt = getTimeMillis() - t1;
dt /= 1000;
double bw = ((double)size) * mMemLoopCount / dt;
bw /= 1024 * 1024 * 1024;
float targetTime = 0.2f;
if (dt > targetTime) {
mMemLoopCount = (size_t)((double)mMemLoopCount / (dt / targetTime));
}
return (float)bw;
}
// Measures dependent-load latency by chasing a randomized chain of pointers
// stored in the first size bytes of the source buffer.
//
// The chain is (re)built whenever size differs from the previous call. It
// starts at slot 0, visits `loops` further randomly chosen slots and ends in
// a NULL link. `loops` is half the number of pointer slots rounded down to a
// multiple of four, so the chain length is even and the two-step unrolled
// walk below always lands exactly on the terminating NULL.
//
// Returns the average time per link in nanoseconds, or 0 when the buffer is
// not allocated or size is too small to build a chain. If the run took
// longer than 0.2 s the loop count is scaled down for the next call, never
// below one pass.
//
// NOTE(review): size must not exceed the buffer size allocated by
// startMemTests(); that size is not recorded, so it cannot be checked here.
float Bench::runMemoryLatencyTest(uint64_t size)
{
    void ** sp = (void **)mMemSrc;
    const size_t maxIndex = size / sizeof(void *);
    const size_t loops = ((maxIndex / 2) & (~(size_t)3));

    // Nothing to chase: avoid touching a NULL buffer and dividing by zero.
    if ((sp == NULL) || (loops == 0)) {
        return 0.f;
    }

    if (size != mMemLatencyLastSize) {
        __android_log_print(ANDROID_LOG_INFO, "bench", "latency build %i %i", (int)maxIndex, (int)loops);
        mMemLatencyLastSize = size;
        memset((void *)mMemSrc, 0, mMemLatencyLastSize);

        size_t lastIdx = 0;
        for (size_t ct = 0; ct < loops; ct++) {
            // Multiply as unsigned: the product of two ints can overflow.
            size_t ni = ((size_t)rand() * (size_t)rand()) % maxIndex;

            // Probe forward to a slot that is unused and is not the current
            // tail. Wrap to 1, not 0, because slot 0 is the chain head.
            while ((sp[ni] != NULL) || (ni == lastIdx)) {
                ni++;
                if (ni >= maxIndex) {
                    ni = 1;
                }
            }
            sp[lastIdx] = &sp[ni];
            lastIdx = ni;
        }
        sp[lastIdx] = 0;
    }

    if (mMemLoopCount < 1) {
        mMemLoopCount = 1;
    }
    const size_t passes = mMemLoopCount;

    uint64_t t1 = getTimeNanos();
    for (size_t ct = passes; ct > 0; ct--) {
        void *p = sp[0];
        while (p != NULL) {
            // Unroll once to minimize branching overhead. The loads go
            // through a volatile-qualified pointer so the compiler has to
            // perform every one of them; the walk has no other observable
            // effect and could otherwise be removed.
            void * volatile *node = (void * volatile *)p;
            p = node[0];
            node = (void * volatile *)p;
            p = node[0];
        }
    }
    const double dt = (double)(getTimeNanos() - t1);
    const double dts = dt / 1000000000;
    const double lat = dt / ((double)loops * (double)passes);
    __android_log_print(ANDROID_LOG_INFO, "bench", "latency ret %f", lat);

    const float targetTime = 0.2f;
    if (dts > targetTime) {
        mMemLoopCount = (size_t)((double)passes / (dts / targetTime));
        if (mMemLoopCount < 1) {
            mMemLoopCount = 1;
        }
    }
    return (float)lat;
}
// Allocates the two 64 MiB buffers used by the memory tests and calibrates
// mMemLoopCount by counting how many 1 KiB copies complete in 500 ms.
//
// Buffers left over from an earlier call without a matching endMemTests()
// are released first. Returns false, with no buffers held and a zero loop
// count, if either allocation fails; returns true otherwise.
bool Bench::startMemTests()
{
    const size_t bufferBytes = 1024 * 1024 * 64;
    const size_t touchBytes = 1024 * 1024 * 16;

    // Both pointers are NULL whenever no buffer is held, so this is safe on
    // first use and prevents a leak on repeated use.
    free(mMemSrc);
    free(mMemDst);
    mMemSrc = (uint8_t *)malloc(bufferBytes);
    mMemDst = (uint8_t *)malloc(bufferBytes);
    mMemLatencyLastSize = 0;

    if ((mMemSrc == NULL) || (mMemDst == NULL)) {
        free(mMemSrc);
        free(mMemDst);
        mMemSrc = NULL;
        mMemDst = NULL;
        mMemLoopCount = 0;
        return false;
    }

    // Only the first 16 MiB of each buffer is written up front.
    // NOTE(review): confirm whether the whole buffer was meant to be touched.
    memset(mMemSrc, 0, touchBytes);
    memset(mMemDst, 0, touchBytes);

    mMemLoopCount = 1;
    uint64_t start = getTimeMillis();
    while ((getTimeMillis() - start) < 500) {
        memcpy(mMemDst, mMemSrc, 1024);
        mMemLoopCount++;
    }
    return true;
}
// Releases both memory-test buffers and resets the cached latency-chain size
// so the next latency run rebuilds its chain. Calling this with no buffers
// allocated is harmless because free(NULL) does nothing.
void Bench::endMemTests()
{
    free(mMemSrc);
    mMemSrc = NULL;

    free(mMemDst);
    mMemDst = NULL;

    mMemLatencyLastSize = 0;
}
// Plain C++ one-dimensional convolution over the buffers described by mGFlop.
//
// For every x from kernelXSize / 2 up to, but excluding,
// imageXSize - kernelXSize / 2 - 1, writes to dstBuffer[x] the dot product of
// kernelBuffer with the kernelXSize source samples starting at
// srcBuffer[x - kernelXSize / 2]. Elements of dstBuffer outside that range
// are left untouched.
void Bench::GflopKernelC() {
    int halfKX = (mGFlop.kernelXSize / 2);

    // Loop-invariant buffer bases, read once instead of on every pixel.
    const float * const kernel = mGFlop.kernelBuffer;
    const float * const src = mGFlop.srcBuffer;
    float * const dst = mGFlop.dstBuffer;

    for (int x = halfKX; x < (mGFlop.imageXSize - halfKX - 1); x++) {
        const float * window = &src[x - halfKX];
        float sum = 0.f;
        for (int ix = 0; ix < mGFlop.kernelXSize; ix++) {
            sum += window[ix] * kernel[ix];
        }
        dst[x] = sum;
    }
}
// Placeholder for another convolution kernel variant. The body is empty, so
// calling it has no effect.
void Bench::GflopKernelC_y3() {
}
// Estimates floating-point throughput with a one-dimensional convolution.
//
// First runs testWork on the worker pool for a 1000-bucket, one-millisecond
// per bucket timeline, then times a single GflopKernelC() pass of a 27-tap
// kernel over a 1024 * 1024 element image. The options argument is ignored.
//
// The operation count is (2 * kernelXSize - 1) * imageXSize. Returns the
// resulting rate in GFLOPS, or 0 if the buffers could not be allocated or no
// measurable time elapsed.
//
// The working buffers are allocated here, filled with defined values so the
// kernel never reads uninitialized memory, and released before returning.
float Bench::runGFlopsTest(uint64_t /* options */)
{
    mTimeBucketDivisor = 1000 * 1000; // use ms
    allocateBuckets(1000);
    mTimeStartNanos = getTimeNanos();
    mTimeEndNanos = mTimeStartNanos + mTimeBuckets * mTimeBucketDivisor;
    memset(mTimeBucket, 0, sizeof(uint32_t) * mTimeBuckets);
    mTimeEndGroupNanos = mTimeEndNanos;
    mWorkers.launchWork(testWork, this, 0);

    // Simulate image convolve
    mGFlop.kernelXSize = 27;
    mGFlop.imageXSize = 1024 * 1024;
    mGFlop.srcBuffer = (float *)malloc(mGFlop.imageXSize * sizeof(float));
    mGFlop.dstBuffer = (float *)malloc(mGFlop.imageXSize * sizeof(float));
    mGFlop.kernelBuffer = (float *)malloc(mGFlop.kernelXSize * sizeof(float));

    float result = 0.f;
    if ((mGFlop.srcBuffer != NULL) && (mGFlop.dstBuffer != NULL) &&
            (mGFlop.kernelBuffer != NULL)) {
        // Give the inputs defined, finite contents (a box filter over a
        // constant image) before the timed pass.
        for (int i = 0; i < mGFlop.imageXSize; i++) {
            mGFlop.srcBuffer[i] = 1.f;
        }
        for (int i = 0; i < mGFlop.kernelXSize; i++) {
            mGFlop.kernelBuffer[i] = 1.f / 27.f;
        }

        double ops = mGFlop.kernelXSize;
        ops = ops * 2.f - 1.f;
        ops *= mGFlop.imageXSize;

        uint64_t t1 = getTimeNanos();
        GflopKernelC();
        double dt = (double)(getTimeNanos() - t1);
        dt /= 1000.0 * 1000.0 * 1000.0;

        // Read one output back through a volatile so the kernel's stores
        // stay observable now that the buffer is freed right afterwards.
        volatile float sink = mGFlop.dstBuffer[mGFlop.imageXSize / 2];
        (void)sink;

        double gflops = 0.0;
        if (dt > 0.0) {
            gflops = ops / dt / 1000000000.0;
        }
        __android_log_print(ANDROID_LOG_INFO, "bench", "v %f %f %f", dt, ops, gflops);
        result = (float)gflops;
    }

    // free(NULL) is a no-op, so this also cleans up after a partial failure.
    free(mGFlop.srcBuffer);
    free(mGFlop.dstBuffer);
    free(mGFlop.kernelBuffer);
    mGFlop.srcBuffer = NULL;
    mGFlop.dstBuffer = NULL;
    mGFlop.kernelBuffer = NULL;
    return result;
}