/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#ifndef SkChecksum_opts_DEFINED
#define SkChecksum_opts_DEFINED
#include "SkChecksum.h"
#include "SkTypes.h"
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
#include <immintrin.h>
#elif defined(SK_CPU_ARM64) && defined(SK_ARM_HAS_CRC32)
#include <arm_acle.h>
#endif
namespace SK_OPTS_NS {
template <typename T>
static inline T unaligned_load(const uint8_t* src) {
T val;
memcpy(&val, src, sizeof(val));
return val;
}
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
// This is not a CRC32. It's Just A Hash that uses those instructions because they're fast.
static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
auto data = (const uint8_t*)vdata;
// _mm_crc32_u64() operates on 64-bit registers, so we use uint64_t for a while.
uint64_t hash = seed;
if (bytes >= 24) {
// We'll create 3 independent hashes, each using _mm_crc32_u64()
// to hash 8 bytes per step. Both 3 and independent are important:
// we can execute 3 of these instructions in parallel on a single core.
uint64_t a = hash,
b = hash,
c = hash;
size_t steps = bytes/24;
while (steps --> 0) {
a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
data += 24;
}
bytes %= 24;
hash = a^b^c;
}
SkASSERT(bytes < 24);
if (bytes >= 16) {
hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
bytes -= 8;
data += 8;
}
SkASSERT(bytes < 16);
if (bytes & 8) {
hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
data += 8;
}
// The remainder of these _mm_crc32_u*() operate on a 32-bit register.
// We don't lose anything here: only the bottom 32-bits were populated.
auto hash32 = (uint32_t)hash;
if (bytes & 4) {
hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
data += 4;
}
if (bytes & 2) {
hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
data += 2;
}
if (bytes & 1) {
hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
}
return hash32;
}
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
// 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
auto data = (const uint8_t*)vdata;
if (bytes >= 12) {
// We'll create 3 independent hashes, each using _mm_crc32_u32()
// to hash 4 bytes per step. Both 3 and independent are important:
// we can execute 3 of these instructions in parallel on a single core.
uint32_t a = hash,
b = hash,
c = hash;
size_t steps = bytes/12;
while (steps --> 0) {
a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
data += 12;
}
bytes %= 12;
hash = a^b^c;
}
SkASSERT(bytes < 12);
if (bytes >= 8) {
hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
bytes -= 4;
data += 4;
}
SkASSERT(bytes < 8);
if (bytes & 4) {
hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
data += 4;
}
if (bytes & 2) {
hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
data += 2;
}
if (bytes & 1) {
hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
}
return hash;
}
#elif defined(SK_CPU_ARM64) && defined(SK_ARM_HAS_CRC32)
static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
auto data = (const uint8_t*)vdata;
if (bytes >= 24) {
uint32_t a = hash,
b = hash,
c = hash;
size_t steps = bytes/24;
while (steps --> 0) {
a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));
b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
c = __crc32d(c, unaligned_load<uint64_t>(data+16));
data += 24;
}
bytes %= 24;
hash = a^b^c;
}
SkASSERT(bytes < 24);
if (bytes >= 16) {
hash = __crc32d(hash, unaligned_load<uint64_t>(data));
bytes -= 8;
data += 8;
}
SkASSERT(bytes < 16);
if (bytes & 8) {
hash = __crc32d(hash, unaligned_load<uint64_t>(data));
data += 8;
}
if (bytes & 4) {
hash = __crc32w(hash, unaligned_load<uint32_t>(data));
data += 4;
}
if (bytes & 2) {
hash = __crc32h(hash, unaligned_load<uint16_t>(data));
data += 2;
}
if (bytes & 1) {
hash = __crc32b(hash, unaligned_load<uint8_t>(data));
}
return hash;
}
#else
// This is Murmur3.
static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
auto data = (const uint8_t*)vdata;
size_t original_bytes = bytes;
// Handle 4 bytes at a time while possible.
while (bytes >= 4) {
uint32_t k = unaligned_load<uint32_t>(data);
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
k *= 0x1b873593;
hash ^= k;
hash = (hash << 13) | (hash >> 19);
hash *= 5;
hash += 0xe6546b64;
bytes -= 4;
data += 4;
}
// Handle last 0-3 bytes.
uint32_t k = 0;
switch (bytes & 3) {
case 3: k ^= data[2] << 16;
case 2: k ^= data[1] << 8;
case 1: k ^= data[0] << 0;
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
k *= 0x1b873593;
hash ^= k;
}
hash ^= original_bytes;
return SkChecksum::Mix(hash);
}
#endif
} // namespace SK_OPTS_NS
#endif//SkChecksum_opts_DEFINED