/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/*
 * To build and test (fish shell syntax):
 *
 * ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench  --samples 300 --nompd --match LinearSrcOver -q
 */

#ifndef SkBlend_opts_DEFINED
#define SkBlend_opts_DEFINED

#include "SkNx.h"
#include "SkPM4fPriv.h"

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    #include <immintrin.h>
#endif

namespace SK_OPTS_NS {

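// Blend one sRGB-encoded src pixel over one sRGB-encoded dst pixel.
// A fully opaque src (alpha == 0xFF) is copied straight through; otherwise both
// pixels are decoded to linear floats and composited with the usual srcover
// equation, dst' = src + dst * (1 - srcAlpha), before re-encoding to sRGB.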
static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
    if (src >= 0xFF000000) {
        *dst = src;
        return;
    }
    auto d = Sk4f_fromS32(*dst),
         s = Sk4f_fromS32( src);
    *dst = Sk4f_toS32(s + d * (1.0f - s[3]));
}

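// Blend a group of four pixels; the vectorized loops below blend mixed groups
// four pixels at a time and hand any remainder to srcover_srgb_srgb_1.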
static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst++, *src++);
    srcover_srgb_srgb_1(dst  , *src  );
}

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

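    // Unaligned load/store of a group of four packed pixels.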
    static inline __m128i load(const uint32_t* p) {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
    }

    static inline void store(uint32_t* p, __m128i v) {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
    }

    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

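        // Classify each group of four source pixels by its alphas and take the fast
        // path when possible: an all-opaque group (_mm_testc_si128 against the alpha
        // mask) is stored as-is, an all-transparent group (_mm_testz_si128) leaves
        // dst untouched, and only mixed groups pay for the full per-pixel blend.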
        static void srcover_srgb_srgb(
            uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
            const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
            while (ndst > 0) {
                int count = SkTMin(ndst, nsrc);
                ndst -= count;
                const uint32_t* src = srcStart;
                const uint32_t* end = dst + (count & ~3);
                ptrdiff_t delta = src - dst;

                while (dst < end) {
                    __m128i pixels = load(src);
                    if (_mm_testc_si128(pixels, alphaMask)) {
                        uint32_t* start = dst;
                        do {
                            store(dst, pixels);
                            dst += 4;
                        } while (dst < end
                                 && _mm_testc_si128(pixels = load(dst + delta), alphaMask));
                        src += dst - start;
                    } else if (_mm_testz_si128(pixels, alphaMask)) {
                        do {
                            dst += 4;
                            src += 4;
                        } while (dst < end
                                 && _mm_testz_si128(pixels = load(src), alphaMask));
                    } else {
                        uint32_t* start = dst;
                        do {
                            srcover_srgb_srgb_4(dst, dst + delta);
                            dst += 4;
                        } while (dst < end
                                 && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
                        src += dst - start;
                    }
                }

                count = count & 3;
                while (count-- > 0) {
                    srcover_srgb_srgb_1(dst++, *src++);
                }
            }
        }
    #else
    // SSE2 versions

        // Note: In the next three comparisons a group of 4 pixels is converted to a group of
        // "signed" pixels because SSE2 has no unsigned comparison. XORing each pixel with
        // 0x80000000 makes the signed comparison operators usable by biasing 0x00xxxxxx,
        // the smallest unsigned values, to 0x80xxxxxx, the smallest signed values, and
        // biasing 0xFFxxxxxx, the largest unsigned values, to 0x7Fxxxxxx, the largest
        // signed values.
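        // For example, a transparent pixel 0x00xxxxxx biases to 0x80xxxxxx, the most
        // negative signed values, and an opaque pixel 0xFFxxxxxx biases to 0x7Fxxxxxx,
        // the most positive. check_opaque_alphas() below is then "no biased pixel
        // compares less than 0x7F000000", i.e. every alpha is 0xFF.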
        static inline bool check_opaque_alphas(__m128i pixels) {
            __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
            int mask =
                _mm_movemask_epi8(
                    _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)));
            return mask == 0;
        }

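        // True when all four alphas are 0x00: no biased pixel compares greater than
        // 0x80FFFFFF, the largest value a zero-alpha pixel can bias to.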
        static inline bool check_transparent_alphas(__m128i pixels) {
            __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
            int mask =
                _mm_movemask_epi8(
                    _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)));
            return mask == 0;
        }

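        // True only when every lane is partial (0x00 < alpha < 0xFF). The masks are
        // inverted: `opaque` flags lanes that are NOT opaque and `transparent` flags
        // lanes that are NOT transparent, so each lane's xor is zero exactly when the
        // lane is neither fully opaque nor fully transparent.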
        static inline bool check_partial_alphas(__m128i pixels) {
            __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
            __m128i opaque       = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000));
            __m128i transparent  = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF));
            int mask             = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent));
            return mask == 0;
        }

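        // Same group-of-four strategy as the SSE4.1 version above: copy all-opaque
        // groups, skip all-transparent groups, and blend only the mixed ones, using
        // the signed-comparison checks in place of _mm_testc/_mm_testz.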
        static void srcover_srgb_srgb(
            uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
            while (ndst > 0) {
                int count = SkTMin(ndst, nsrc);
                ndst -= count;
                const uint32_t* src = srcStart;
                const uint32_t* end = dst + (count & ~3);
                const ptrdiff_t delta = src - dst;

                while (dst < end) {
                    __m128i pixels = load(src);
                    if (check_opaque_alphas(pixels)) {
                        uint32_t* start = dst;
                        do {
                            store(dst, pixels);
                            dst += 4;
                        } while (dst < end && check_opaque_alphas((pixels = load(dst + delta))));
                        src += dst - start;
                    } else if (check_transparent_alphas(pixels)) {
                        const uint32_t* start = dst;
                        do {
                            dst += 4;
                        } while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
                        src += dst - start;
                    } else {
                        const uint32_t* start = dst;
                        do {
                            srcover_srgb_srgb_4(dst, dst + delta);
                            dst += 4;
                        } while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
                        src += dst - start;
                    }
                }

                count = count & 3;
                while (count-- > 0) {
                    srcover_srgb_srgb_1(dst++, *src++);
                }
            }
        }
    #endif
#else

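    // Portable fallback: blend one pixel at a time, restarting src every nsrc
    // pixels so the source span tiles across the destination.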
    static void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
        while (ndst > 0) {
            int n = SkTMin(ndst, nsrc);

            for (int i = 0; i < n; i++) {
                srcover_srgb_srgb_1(dst++, src[i]);
            }
            ndst -= n;
        }
    }

#endif

}  // namespace SK_OPTS_NS
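
/*
 * A minimal usage sketch (hypothetical buffers, not part of the original file):
 * the (dst, src, ndst, nsrc) signature tiles the nsrc source pixels repeatedly
 * across ndst destination pixels, compositing in place.
 *
 *     uint32_t dst[256];  // sRGB-encoded 8888 pixels, blended in place
 *     uint32_t src[64];   // sRGB-encoded 8888 pixels to composite on top
 *     // ... fill dst and src ...
 *     SK_OPTS_NS::srcover_srgb_srgb(dst, src, 256, 64);
 */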

#endif  // SkBlend_opts_DEFINED