/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkColorPriv.h"
#include "SkCpu.h"
#include "SkJumper.h"
#include "SkRasterPipeline.h"
#include "SkTemplates.h"

// A debugging mode that helps prioritize porting stages to SkJumper.
#if 0
    #include "SkOnce.h"
    #include <atomic>

    #define M(st) {0},
    static std::atomic<int> gMissing[] = { SK_RASTER_PIPELINE_STAGES(M) };
    #undef M

    #define M(st) #st,
    static const char* gNames[] = { SK_RASTER_PIPELINE_STAGES(M) };
    #undef M

    #define WHATS_NEXT
#endif

// We'll use __has_feature(memory_sanitizer) to detect MSAN.
// SkJumper_generated.S is not compiled with MSAN instrumentation, so jumping into it
// would make MSAN yell really loud about reads of (what it considers) uninitialized
// memory.
#if !defined(__has_feature)
    #define __has_feature(x) 0
#endif

// Stages expect these constants to be set to these values.
// It's fine to rearrange them or add new ones, as long as SkJumper_constants is
// updated to match.
using K = const SkJumper_constants;
static K kConstants = {
    {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
};
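// (The only constant so far is an iota vector, {0,1,...,7}; presumably seed_shader
// adds these per-lane offsets to x so each of up to 8 SIMD lanes shades its own pixel.)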

#define STAGES(M)         \
    M(seed_shader)        \
    M(constant_color)     \
    M(clear)              \
    M(plus_)              \
    M(srcover)            \
    M(dstover)            \
    M(clamp_0)            \
    M(clamp_1)            \
    M(clamp_a)            \
    M(set_rgb)            \
    M(swap_rb)            \
    M(swap)               \
    M(move_src_dst)       \
    M(move_dst_src)       \
    M(premul)             \
    M(unpremul)           \
    M(from_srgb)          \
    M(to_srgb)            \
    M(scale_1_float)      \
    M(scale_u8)           \
    M(lerp_1_float)       \
    M(lerp_u8)            \
    M(lerp_565)           \
    M(load_tables)        \
    M(load_a8)            \
    M(store_a8)           \
    M(load_565)           \
    M(store_565)          \
    M(load_8888)          \
    M(store_8888)         \
    M(load_f16)           \
    M(store_f16)          \
    M(store_f32)          \
    M(luminance_to_alpha) \
    M(matrix_2x3)         \
    M(matrix_3x4)         \
    M(matrix_4x5)         \
    M(matrix_perspective) \
    M(clamp_x)            \
    M(clamp_y)            \
    M(repeat_x)           \
    M(repeat_y)           \
    M(mirror_x)           \
    M(mirror_y)           \
    M(linear_gradient_2stops)
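
// STAGES is an X-macro: each backend below supplies its own M to stamp out one
// declaration or switch case per stage.  For example,
//     #define M(st) StageFn sk_##st;
//     STAGES(M)
// declares sk_seed_shader, sk_constant_color, ..., sk_linear_gradient_2stops.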

// We can't express the real types of most stage functions portably, so we use a stand-in.
// We'll only ever call start_pipeline(), which then chains into the rest for us.
using StageFn = void(void);
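// (Roughly: real stages pass pipeline state in vector registers, which has no
// portable C++ spelling, so calling through this stand-in type directly would be
// undefined.  start_pipeline() is the only function we call by its true signature.)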

// TODO: maybe we don't need this wrapper anymore.
#define ASM(name, suffix) sk_##name##_##suffix
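// e.g. ASM(clamp_0, hsw) pastes together the symbol sk_clamp_0_hsw.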

extern "C" {

#if __has_feature(memory_sanitizer)
    // We'll just run portable code.

#elif defined(__aarch64__)
    size_t ASM(start_pipeline,aarch64)(size_t, void**, K*, size_t);
    StageFn ASM(just_return,aarch64);
    #define M(st) StageFn ASM(st,aarch64);
        STAGES(M)
    #undef M

#elif defined(__arm__)
    size_t ASM(start_pipeline,vfp4)(size_t, void**, K*, size_t);
    StageFn ASM(just_return,vfp4);
    #define M(st) StageFn ASM(st,vfp4);
        STAGES(M)
    #undef M

#elif defined(__x86_64__) || defined(_M_X64)
    size_t ASM(start_pipeline,hsw  )(size_t, void**, K*, size_t);
    size_t ASM(start_pipeline,avx  )(size_t, void**, K*, size_t);
    size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t);
    size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t);

    StageFn ASM(just_return,hsw),
            ASM(just_return,avx),
            ASM(just_return,sse41),
            ASM(just_return,sse2);

    #define M(st) StageFn ASM(st,hsw);
        STAGES(M)
    #undef M
    #define M(st) StageFn ASM(st,avx);
        STAGES(M)
    #undef M
    #define M(st) StageFn ASM(st,sse41);
        STAGES(M)
    #undef M
    #define M(st) StageFn ASM(st,sse2);
        STAGES(M)
    #undef M
#endif

    // Portable, single-pixel stages.
    size_t sk_start_pipeline(size_t, void**, K*, size_t);
    StageFn sk_just_return;
    #define M(st) StageFn sk_##st;
        STAGES(M)
    #undef M
}

// Translate SkRasterPipeline's StockStage enum to StageFn function pointers.
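// Each lookup_* function below is a switch stamped out by STAGES, e.g. the portable
// table maps SkRasterPipeline::srcover to sk_srcover.  Any stage not in STAGES hits
// the default case and returns nullptr.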

#if __has_feature(memory_sanitizer)
    // We'll just run portable code.

#elif defined(__aarch64__)
    static StageFn* lookup_aarch64(SkRasterPipeline::StockStage st) {
        switch (st) {
            default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return ASM(st,aarch64);
            STAGES(M)
        #undef M
        }
    }

#elif defined(__arm__)
    static StageFn* lookup_vfp4(SkRasterPipeline::StockStage st) {
        switch (st) {
            default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return ASM(st,vfp4);
            STAGES(M)
        #undef M
        }
    }

#elif defined(__x86_64__) || defined(_M_X64)
    static StageFn* lookup_hsw(SkRasterPipeline::StockStage st) {
        switch (st) {
            default:
        #ifdef WHATS_NEXT
                gMissing[st]++;
        #endif
                return nullptr;
        #define M(st) case SkRasterPipeline::st: return ASM(st,hsw);
            STAGES(M)
        #undef M
        }
    }
    static StageFn* lookup_avx(SkRasterPipeline::StockStage st) {
        switch (st) {
            default:
        #ifdef WHATS_NEXT
                gMissing[st]++;
        #endif
                return nullptr;
        #define M(st) case SkRasterPipeline::st: return ASM(st,avx);
            STAGES(M)
        #undef M
        }
    }
    static StageFn* lookup_sse41(SkRasterPipeline::StockStage st) {
        switch (st) {
            default:
        #ifdef WHATS_NEXT
                gMissing[st]++;
        #endif
                return nullptr;
        #define M(st) case SkRasterPipeline::st: return ASM(st,sse41);
            STAGES(M)
        #undef M
        }
    }
    static StageFn* lookup_sse2(SkRasterPipeline::StockStage st) {
        switch (st) {
            default: return nullptr;
        #define M(st) case SkRasterPipeline::st: return ASM(st,sse2);
            STAGES(M)
        #undef M
        }
    }
#endif

static StageFn* lookup_portable(SkRasterPipeline::StockStage st) {
    switch (st) {
        default: return nullptr;
    #define M(st) case SkRasterPipeline::st: return sk_##st;
        STAGES(M)
    #undef M
    }
}

bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
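    // With WHATS_NEXT enabled, arrange (once) to print at exit how often each stage
    // was requested but missing, so we know which stages to port next.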
#ifdef WHATS_NEXT
    static SkOnce once;
    once([] {
        atexit([] {
            for (int i = 0; i < (int)SK_ARRAY_COUNT(gMissing); i++) {
                SkDebugf("%10d %s\n", gMissing[i].load(), gNames[i]);
            }
        });
    });
#endif

    SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
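    // The program is a flat array of pointers: each stage contributes its function
    // pointer and, if present, its context pointer, and the array ends with
    // just_return.  Worst case that's two pointers per stage plus one terminator.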
    const size_t limit = x+n;

    auto build_and_run = [&](size_t   min_stride,
                             StageFn* (*lookup)(SkRasterPipeline::StockStage),
                             StageFn* just_return,
                             size_t   (*start_pipeline)(size_t, void**, K*, size_t)) {
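        // Run this backend only if at least min_stride pixels remain; otherwise skip
        // it (still returning true) and leave the tail to a narrower backend or to
        // the portable one-pixel-at-a-time code.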
        if (x + min_stride <= limit) {
            void** ip = program.get();
            for (auto&& st : fStages) {
                auto fn = lookup(st.stage);
                if (!fn) {
                    return false;
                }
                *ip++ = (void*)fn;
                if (st.ctx) {
                    *ip++ = st.ctx;
                }
            }
            *ip = (void*)just_return;

            x = start_pipeline(x, program.get(), &kConstants, limit);
        }
        return true;
    };
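    // build_and_run() fails only when some stage has no entry in this backend's
    // lookup table; returning true means we either ran (advancing x) or skipped
    // because too few pixels remained.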

    // While there are enough pixels, build and run at full vector stride.
#if __has_feature(memory_sanitizer)
    // We'll just run portable code.

#elif defined(__aarch64__)
    if (!build_and_run(4, lookup_aarch64, ASM(just_return,aarch64), ASM(start_pipeline,aarch64))) {
        return false;
    }

#elif defined(__arm__)
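    // The vfp4 stages presumably rely on NEON, NEON FMA, and FP16 conversion
    // instructions, so require all three CPU features before using them.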
    if (1 && SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
        if (!build_and_run(2, lookup_vfp4, ASM(just_return,vfp4), ASM(start_pipeline,vfp4))) {
            return false;
        }
    }

#elif defined(__x86_64__) || defined(_M_X64)
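    // Try the widest instruction set first; each start_pipeline advances x as far as
    // it gets, and anything left over falls through to the narrower sets below.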
    if (1 && SkCpu::Supports(SkCpu::HSW)) {
        if (!build_and_run(1, lookup_hsw, ASM(just_return,hsw), ASM(start_pipeline,hsw))) {
            return false;
        }
    }
    if (1 && SkCpu::Supports(SkCpu::AVX)) {
        if (!build_and_run(1, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) {
            return false;
        }
    }
    if (1 && SkCpu::Supports(SkCpu::SSE41)) {
        if (!build_and_run(4, lookup_sse41, ASM(just_return,sse41), ASM(start_pipeline,sse41))) {
            return false;
        }
    }
    if (1 && SkCpu::Supports(SkCpu::SSE2)) {
        if (!build_and_run(4, lookup_sse2, ASM(just_return,sse2), ASM(start_pipeline,sse2))) {
            return false;
        }
    }
#endif

    // Finish up any leftover pixels with portable code, one pixel at a time.
    return build_and_run(1, lookup_portable, sk_just_return, sk_start_pipeline);
}