// Copyright 2008 Google Inc. All Rights Reserved. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "adler32memcpy.h" // We are using (a modified form of) adler-32 checksum algorithm instead // of CRC since adler-32 is faster than CRC. // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/) // This form of adler is bit modified, instead of treating the data in // units of bytes, 32-bit data is taken as a unit and two 64-bit // checksums are done (we could have one checksum but two checksums // make the code run faster). // Adler-32 implementation: // Data is treated as 1-byte numbers and, // there are two 16-bit numbers a and b // Initialize a with 1 and b with 0. // for each data unit 'd' // a += d // b += a // checksum = a<<16 + b // This sum should never overflow. // // Adler-64+64 implementation: // (applied in this code) // Data is treated as 32-bit numbers and whole data is separated into two // streams, and hence the two checksums a1, a2, b1 and b2. // Initialize a1 and a2 with 1, b1 and b2 with 0 // add first dataunit to a1 // add a1 to b1 // add second dataunit to a1 // add a1 to b1 // add third dataunit to a2 // add a2 to b2 // add fourth dataunit to a2 // add a2 to b2 // ... // repeat the sequence back for next 4 dataunits // // variable A = XMM6 and variable B = XMM7. // (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7) // Assumptions // 1. size_in_bytes is a multiple of 16. // 2. srcmem and dstmem are 16 byte aligned. // 3. size_in_bytes is less than 2^19 bytes. // Assumption 3 ensures that there is no overflow when numbers are being // added (we can remove this assumption by doing modulus with a prime // number when it is just about to overflow but that would be a very costly // exercise) // Returns true if the checksums are equal. bool AdlerChecksum::Equals(const AdlerChecksum &other) const { return ( (a1_ == other.a1_) && (a2_ == other.a2_) && (b1_ == other.b1_) && (b2_ == other.b2_) ); } // Returns string representation of the Adler checksum. string AdlerChecksum::ToHexString() const { char buffer[128]; snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_); return string(buffer); } // Sets components of the Adler checksum. void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) { a1_ = a1; a2_ = a2; b1_ = b1; b2_ = b2; } // Calculates Adler checksum for supplied data. bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes, AdlerChecksum *checksum) { // Use this data wrapper to access memory with 64bit read/write. datacast_t data; unsigned int count = size_in_bytes / sizeof(data); if (count > (1U) << 19) { // Size is too large, must be strictly less than 512 KB. return false; } uint64 a1 = 1; uint64 a2 = 1; uint64 b1 = 0; uint64 b2 = 0; unsigned int i = 0; while (i < count) { // Process 64 bits at a time. data.l64 = data64[i]; a1 = a1 + data.l32.l; b1 = b1 + a1; a1 = a1 + data.l32.h; b1 = b1 + a1; i++; data.l64 = data64[i]; a2 = a2 + data.l32.l; b2 = b2 + a2; a2 = a2 + data.l32.h; b2 = b2 + a2; i++; } checksum->Set(a1, a2, b1, b2); return true; } // C implementation of Adler memory copy. bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64, unsigned int size_in_bytes, AdlerChecksum *checksum) { // Use this data wrapper to access memory with 64bit read/write. datacast_t data; unsigned int count = size_in_bytes / sizeof(data); if (count > ((1U) << 19)) { // Size is too large, must be strictly less than 512 KB. return false; } uint64 a1 = 1; uint64 a2 = 1; uint64 b1 = 0; uint64 b2 = 0; unsigned int i = 0; while (i < count) { // Process 64 bits at a time. data.l64 = srcmem64[i]; a1 = a1 + data.l32.l; b1 = b1 + a1; a1 = a1 + data.l32.h; b1 = b1 + a1; dstmem64[i] = data.l64; i++; data.l64 = srcmem64[i]; a2 = a2 + data.l32.l; b2 = b2 + a2; a2 = a2 + data.l32.h; b2 = b2 + a2; dstmem64[i] = data.l64; i++; } checksum->Set(a1, a2, b1, b2); return true; } // C implementation of Adler memory copy with some float point ops, // attempting to warm up the CPU. bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64, unsigned int size_in_bytes, AdlerChecksum *checksum) { // Use this data wrapper to access memory with 64bit read/write. datacast_t data; unsigned int count = size_in_bytes / sizeof(data); if (count > ((1U) << 19)) { // Size is too large, must be strictly less than 512 KB. return false; } uint64 a1 = 1; uint64 a2 = 1; uint64 b1 = 0; uint64 b2 = 0; double a = 2.0 * static_cast<double>(srcmem64[0]); double b = 5.0 * static_cast<double>(srcmem64[0]); double c = 7.0 * static_cast<double>(srcmem64[0]); double d = 9.0 * static_cast<double>(srcmem64[0]); unsigned int i = 0; while (i < count) { // Process 64 bits at a time. data.l64 = srcmem64[i]; a1 = a1 + data.l32.l; b1 = b1 + a1; a1 = a1 + data.l32.h; b1 = b1 + a1; dstmem64[i] = data.l64; i++; // Warm cpu up. a = a * b; b = b + c; data.l64 = srcmem64[i]; a2 = a2 + data.l32.l; b2 = b2 + a2; a2 = a2 + data.l32.h; b2 = b2 + a2; dstmem64[i] = data.l64; i++; // Warm cpu up. c = c * d; d = d + d; } // Warm cpu up. d = a + b + c + d; if (d == 1.0) { // Reference the result so that it can't be discarded by the compiler. printf("Log: This will probably never happen.\n"); } checksum->Set(a1, a2, b1, b2); return true; } // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy. bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, unsigned int size_in_bytes, AdlerChecksum *checksum) { // Use assembly implementation where supported. #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) // Pull a bit of tricky preprocessing to make the inline asm both // 32 bit and 64 bit. #ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86... #define rAX "%%eax" #define rCX "%%ecx" #define rDX "%%edx" #define rBX "%%ebx" #define rSP "%%esp" #define rBP "%%ebp" #define rSI "%%esi" #define rDI "%%edi" #endif #ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros. #define rAX "%%rax" #define rCX "%%rcx" #define rDX "%%rdx" #define rBX "%%rbx" #define rSP "%%rsp" #define rBP "%%rbp" #define rSI "%%rsi" #define rDI "%%rdi" #endif // Elements 0 to 3 are used for holding checksum terms a1, a2, // b1, b2 respectively. These elements are filled by asm code. // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are // setting word index 0 and word index 2 to zero). // Element 6 and 7 are used for setting a1 and a2 to 1. volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1}; if ((size_in_bytes >> 19) > 0) { // Size is too large. Must be less than 2^19 bytes = 512 KB. return false; } // Number of 32-bit words which are not added to a1/a2 in the main loop. uint32 remaining_words = (size_in_bytes % 48) / 4; // Since we are moving 48 bytes at a time number of iterations = total size/48 // is value of counter. uint32 num_of_48_byte_units = size_in_bytes / 48; asm volatile ( // Source address is in ESI (extended source index) // destination is in EDI (extended destination index) // and counter is already in ECX (extended counter // index). "cmp $0, " rCX ";" // Compare counter to zero. "jz END;" // XMM6 is initialized with 1 and XMM7 with 0. "prefetchnta 0(" rSI ");" "prefetchnta 64(" rSI ");" "movdqu 48(" rAX "), %%xmm6;" "xorps %%xmm7, %%xmm7;" // Start of the loop which copies 48 bytes from source to dst each time. "TOP:\n" // Make 6 moves each of 16 bytes from srcmem to XMM registers. // We are using 2 words out of 4 words in each XMM register, // word index 0 and word index 2 "movdqa 0(" rSI "), %%xmm0;" "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here. "movdqa 16(" rSI "), %%xmm2;" "movdqu 20(" rSI "), %%xmm3;" "movdqa 32(" rSI "), %%xmm4;" "movdqu 36(" rSI "), %%xmm5;" // Move 3 * 16 bytes from XMM registers to dstmem. // Note: this copy must be performed before pinsrw instructions since // they will modify the XMM registers. "movntdq %%xmm0, 0(" rDI ");" "movntdq %%xmm2, 16(" rDI ");" "movntdq %%xmm4, 32(" rDI ");" // Sets the word[1] and word[3] of XMM0 to XMM5 to zero. "andps 32(" rAX "), %%xmm0;" "andps 32(" rAX "), %%xmm1;" "andps 32(" rAX "), %%xmm2;" "andps 32(" rAX "), %%xmm3;" "andps 32(" rAX "), %%xmm4;" "andps 32(" rAX "), %%xmm5;" // Add XMM0 to XMM6 and then add XMM6 to XMM7. // Repeat this for XMM1, ..., XMM5. // Overflow(for XMM7) can occur only if there are more // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so // if size_in_bytes > 2^19 than overflow occurs. "paddq %%xmm0, %%xmm6;" "paddq %%xmm6, %%xmm7;" "paddq %%xmm1, %%xmm6;" "paddq %%xmm6, %%xmm7;" "paddq %%xmm2, %%xmm6;" "paddq %%xmm6, %%xmm7;" "paddq %%xmm3, %%xmm6;" "paddq %%xmm6, %%xmm7;" "paddq %%xmm4, %%xmm6;" "paddq %%xmm6, %%xmm7;" "paddq %%xmm5, %%xmm6;" "paddq %%xmm6, %%xmm7;" // Increment ESI and EDI by 48 bytes and decrement counter by 1. "add $48, " rSI ";" "add $48, " rDI ";" "prefetchnta 0(" rSI ");" "prefetchnta 64(" rSI ");" "dec " rCX ";" "jnz TOP;" // Now only remaining_words 32-bit words are left. // make a loop, add first two words to a1 and next two to a2 (just like // above loop, the only extra thing we are doing is rechecking // rDX (=remaining_words) everytime we add a number to a1/a2. "REM_IS_STILL_NOT_ZERO:\n" // Unless remaining_words becomes less than 4 words(16 bytes) // there is not much issue and remaining_words will always // be a multiple of four by assumption. "cmp $4, " rDX ";" // In case for some weird reasons if remaining_words becomes // less than 4 but not zero then also break the code and go off to END. "jl END;" // Otherwise just go on and copy data in chunks of 4-words at a time till // whole data (<48 bytes) is copied. "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1. "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary. "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination. "andps 32(" rAX "), %%xmm0;" "andps 32(" rAX "), %%xmm1;" "paddq %%xmm0, %%xmm6;" "paddq %%xmm6, %%xmm7;" "paddq %%xmm1, %%xmm6;" "paddq %%xmm6, %%xmm7;" "add $16, " rSI ";" "add $16, " rDI ";" "sub $4, " rDX ";" // Decrement %rDX by 4 since %rDX is number of 32-bit // words left after considering all 48-byte units. "jmp REM_IS_STILL_NOT_ZERO;" "END:\n" // Report checksum values A and B (both right now are two concatenated // 64 bit numbers and have to be converted to 64 bit numbers) // seems like Adler128 (since size of each part is 4 byte rather than // 1 byte). "movdqa %%xmm6, 0(" rAX ");" "movdqa %%xmm7, 16(" rAX ");" "sfence;" // No output registers. : // Input registers. : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr), "c" (num_of_48_byte_units), "d" (remaining_words) ); // asm. if (checksum != NULL) { checksum->Set(checksum_arr[0], checksum_arr[1], checksum_arr[2], checksum_arr[3]); } // Everything went fine, so return true (this does not mean // that there is no problem with memory this just mean that data was copied // from src to dst and checksum was calculated successfully). return true; #elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__) // Elements 0 to 3 are used for holding checksum terms a1, a2, // b1, b2 respectively. These elements are filled by asm code. // Checksum is seeded with the null checksum. volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = {1, 1, 0, 0}; if ((size_in_bytes >> 19) > 0) { // Size is too large. Must be less than 2^19 bytes = 512 KB. return false; } // Since we are moving 64 bytes at a time number of iterations = total size/64 uint32 blocks = size_in_bytes / 64; uint64 *dst = dstmem64; uint64 *src = srcmem64; #define src_r "r3" #define dst_r "r4" #define blocks_r "r5" #define crc_r "r6" asm volatile ( "mov " src_r ", %[src]; \n" "mov " dst_r ", %[dst]; \n" "mov " crc_r ", %[crc]; \n" "mov " blocks_r ", %[blocks]; \n" // Loop over block count. "cmp " blocks_r ", #0; \n" // Compare counter to zero. "ble END; \n" // Preload upcoming cacheline. "pld [" src_r ", #0x0]; \n" "pld [" src_r ", #0x20]; \n" // Init checksum "vldm " crc_r ", {q0}; \n" "vmov.i32 q1, #0; \n" // Start of the loop which copies 48 bytes from source to dst each time. "TOP: \n" // Make 3 moves each of 16 bytes from srcmem to qX registers. // We are using 2 words out of 4 words in each qX register, // word index 0 and word index 2. We'll swizzle them in a bit. // Copy it. "vldm " src_r "!, {q8, q9, q10, q11}; \n" "vstm " dst_r "!, {q8, q9, q10, q11}; \n" // Arrange it. "vmov.i64 q12, #0; \n" "vmov.i64 q13, #0; \n" "vmov.i64 q14, #0; \n" "vmov.i64 q15, #0; \n" // This exchenges words 1,3 in the filled registers with // words 0,2 in the empty registers. "vtrn.32 q8, q12; \n" "vtrn.32 q9, q13; \n" "vtrn.32 q10, q14; \n" "vtrn.32 q11, q15; \n" // Sum into q0, then into q1. // Repeat this for q8 - q13. // Overflow can occur only if there are more // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so // if size_in_bytes > 2^19 than overflow occurs. "vadd.i64 q0, q0, q8; \n" "vadd.i64 q1, q1, q0; \n" "vadd.i64 q0, q0, q12; \n" "vadd.i64 q1, q1, q0; \n" "vadd.i64 q0, q0, q9; \n" "vadd.i64 q1, q1, q0; \n" "vadd.i64 q0, q0, q13; \n" "vadd.i64 q1, q1, q0; \n" "vadd.i64 q0, q0, q10; \n" "vadd.i64 q1, q1, q0; \n" "vadd.i64 q0, q0, q14; \n" "vadd.i64 q1, q1, q0; \n" "vadd.i64 q0, q0, q11; \n" "vadd.i64 q1, q1, q0; \n" "vadd.i64 q0, q0, q15; \n" "vadd.i64 q1, q1, q0; \n" // Increment counter and loop. "sub " blocks_r ", " blocks_r ", #1; \n" "cmp " blocks_r ", #0; \n" // Compare counter to zero. "bgt TOP; \n" "END:\n" // Report checksum values A and B (both right now are two concatenated // 64 bit numbers and have to be converted to 64 bit numbers) // seems like Adler128 (since size of each part is 4 byte rather than // 1 byte). "vstm " crc_r ", {q0, q1}; \n" // Output registers. : // Input registers. : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr) : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15" ); // asm. if (checksum != NULL) { checksum->Set(checksum_arr[0], checksum_arr[1], checksum_arr[2], checksum_arr[3]); } // Everything went fine, so return true (this does not mean // that there is no problem with memory this just mean that data was copied // from src to dst and checksum was calculated successfully). return true; #else #warning "No vector copy defined for this architecture." // Fall back to C implementation for anything else. return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum); #endif }