/*
 * Copyright (C) 2025 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>

#include <libavutil/avassert.h>
#include <libavutil/bswap.h>

#include "utils.h"
#include "ops.h"

/*
 * Template configuration. BIT_DEPTH selects which instance of the functions
 * below is generated; it defaults to 8 when the includer does not set it.
 *
 *   fn(name)    appends the per-depth suffix (_8bpc / _16bpc) to a name
 *   VALUE_BITS  number of magnitude bits used by value_t in the expand_* and
 *               compress_* helpers below
 *   VALUE_MAX   (1 << VALUE_BITS) - 1
 *   PIXEL_MAX   largest value representable in pixel_t
 *   pixel_t     unsigned storage type used by the read and write helpers
 *   value_t     signed intermediate type
 *   valueN_t    N-component vector aliases; not used in this part of the file
 *               (NOTE(review): presumably declared in utils.h / ops.h)
 */
#ifndef BIT_DEPTH
# define BIT_DEPTH 8
#endif

#if BIT_DEPTH > 8
/* High bit depth: 16 bit pixels, 19 bit values stored in int32_t */
#  define fn(name)   name ## _16bpc
#  define VALUE_BITS 19
#  define VALUE_MAX  0x7FFFF
#  define PIXEL_MAX  0xFFFF
#  define pixel_t    uint16_t
#  define value_t    int32_t
#  define value2_t   v2i32_t
#  define value3_t   v3i32_t
#  define value4_t   v4i32_t
#else
/* Low bit depth: 8 bit pixels, 15 bit values stored in int16_t */
#  define fn(name)   name ## _8bpc
#  define VALUE_BITS 15
#  define VALUE_MAX  0x7FFF
#  define PIXEL_MAX  0xFF
#  define pixel_t    uint8_t
#  define value_t    int16_t
#  define value2_t   v2i16_t
#  define value3_t   v3i16_t
#  define value4_t   v4i16_t
#endif

/**
 * One chunk of SWS_CHUNK_SIZE samples belonging to a single component.
 *
 * All members alias the same storage; which one is valid depends on the
 * processing stage (raw 8/16 bit samples, pixel_t samples, or expanded
 * value_t intermediates). The union is as large as its widest member.
 */
union fn(chunk_t) {
    /* Universal accessors */
    uint8_t  u8[SWS_CHUNK_SIZE];
    uint16_t u16[SWS_CHUNK_SIZE];
    /* Per-bit depth accessors */
    pixel_t  p[SWS_CHUNK_SIZE];
    value_t  v[SWS_CHUNK_SIZE];
};

/* Shorthand so the rest of the template can simply write `chunk_t` */
#define chunk_t union fn(chunk_t)

/**
 * Extend the border of a chunk: replicate the last valid pixel of each
 * component into the unused tail [pixels, SWS_CHUNK_SIZE).
 *
 * A chunk that is already full is left untouched. If there are no valid
 * pixels at all (pixels <= 0) there is no edge to replicate, so the whole
 * chunk is zero-filled instead of reading before the start of the array.
 *
 * @param x,y,z,w  component chunks to pad in place (accessed through ->p)
 * @param pixels   number of valid leading pixels in each chunk
 */
static av_always_inline void
fn(pad_input)(chunk_t *restrict x, chunk_t *restrict y,
              chunk_t *restrict z, chunk_t *restrict w,
              const int pixels)
{
    /* Nothing to pad; this also avoids indexing past the end of the chunk */
    if (pixels >= SWS_CHUNK_SIZE)
        return;

    /* Index of the first element to fill, clamped for pixels <= 0 */
    const int first = pixels > 0 ? pixels : 0;

    const pixel_t edge_x = first ? x->p[first - 1] : 0;
    const pixel_t edge_y = first ? y->p[first - 1] : 0;
    const pixel_t edge_z = first ? z->p[first - 1] : 0;
    const pixel_t edge_w = first ? w->p[first - 1] : 0;

    for (int i = first; i < SWS_CHUNK_SIZE; i++) {
        x->p[i] = edge_x;
        y->p[i] = edge_y;
        z->p[i] = edge_z;
        w->p[i] = edge_w;
    }
}

/**
 * Load `pixels` samples from up to four separate planes into the x/y/z/w
 * chunks, then pad the remainder of each chunk with the edge sample.
 *
 * Components that are not present (index >= comps) are synthesized:
 * y and z become 0, w becomes PIXEL_MAX. The corresponding input pointers
 * are not dereferenced in that case.
 */
static av_always_inline void
fn(read_planar)(const pixel_t *restrict in0, const pixel_t *restrict in1,
                const pixel_t *restrict in2, const pixel_t *restrict in3,
                chunk_t *restrict x, chunk_t *restrict y,
                chunk_t *restrict z, chunk_t *restrict w,
                const int pixels, const int comps)
{
    const bool has_y = comps > 1;
    const bool has_z = comps > 2;
    const bool has_w = comps > 3;

    for (int n = 0; n < pixels; n++) {
        x->p[n] = in0[n];

        if (has_y)
            y->p[n] = in1[n];
        else
            y->p[n] = 0;

        if (has_z)
            z->p[n] = in2[n];
        else
            z->p[n] = 0;

        if (has_w)
            w->p[n] = in3[n];
        else
            w->p[n] = PIXEL_MAX;
    }

    fn(pad_input)(x, y, z, w, pixels);
}

/**
 * Load `pixels` interleaved samples from a single packed buffer into the
 * x/y/z/w chunks, then pad the remainder of each chunk with the edge sample.
 *
 * Each pixel occupies `comps` consecutive pixel_t elements of in0. Components
 * that are not present (index >= comps) are synthesized: y and z become 0,
 * w becomes PIXEL_MAX, matching read_planar.
 *
 * @param in0          packed input buffer
 * @param in1,in2,in3  unused; kept so the signature mirrors read_planar
 */
static av_always_inline void
fn(read_packed)(const pixel_t *restrict in0, const pixel_t *restrict in1,
                const pixel_t *restrict in2, const pixel_t *restrict in3,
                chunk_t *restrict x, chunk_t *restrict y,
                chunk_t *restrict z, chunk_t *restrict w,
                const int pixels, const int comps)
{
    for (int i = 0; i < pixels; i++) {
        x->p[i] =               in0[comps * i + 0];
        y->p[i] = (comps > 1) ? in0[comps * i + 1] : 0;
        z->p[i] = (comps > 2) ? in0[comps * i + 2] : 0;
        /* PIXEL_MAX rather than a literal 0xFF, so that the default is the
         * maximum pixel value for 16 bit pixels as well */
        w->p[i] = (comps > 3) ? in0[comps * i + 3] : PIXEL_MAX;
    }

    fn(pad_input)(x, y, z, w, pixels);
}

/**
 * Store the first `pixels` samples of the x/y/z/w chunks into up to four
 * separate output planes.
 *
 * out0 is always written; out1..out3 are only written (and only
 * dereferenced) when comps exceeds 1, 2 and 3 respectively.
 */
static av_always_inline void
fn(write_planar)(pixel_t *restrict out0, pixel_t *restrict out1,
                 pixel_t *restrict out2, pixel_t *restrict out3,
                 const chunk_t *restrict x, const chunk_t *restrict y,
                 const chunk_t *restrict z, const chunk_t *restrict w,
                 const int pixels, const int comps)
{
    const bool store_y = comps > 1;
    const bool store_z = comps > 2;
    const bool store_w = comps > 3;

    for (int n = 0; n < pixels; n++) {
        out0[n] = x->p[n];
        if (store_y) {
            out1[n] = y->p[n];
        }
        if (store_z) {
            out2[n] = z->p[n];
        }
        if (store_w) {
            out3[n] = w->p[n];
        }
    }
}

/**
 * Store the first `pixels` samples of the x/y/z/w chunks interleaved into a
 * single packed buffer, `comps` elements per pixel.
 *
 * Only out0 is written; out1..out3 are unused. The first component is always
 * stored, the others only when comps exceeds 1, 2 and 3 respectively.
 */
static av_always_inline void
fn(write_packed)(pixel_t *restrict out0, pixel_t *restrict out1,
                 pixel_t *restrict out2, pixel_t *restrict out3,
                 const chunk_t *restrict x, const chunk_t *restrict y,
                 const chunk_t *restrict z, const chunk_t *restrict w,
                 const int pixels, const int comps)
{
    /* Walk the destination one packed pixel at a time */
    pixel_t *dst = out0;

    for (int n = 0; n < pixels; n++, dst += comps) {
        dst[0] = x->p[n];
        if (comps > 1) {
            dst[1] = y->p[n];
        }
        if (comps > 2) {
            dst[2] = z->p[n];
        }
        if (comps > 3) {
            dst[3] = w->p[n];
        }
    }
}

/**
 * Permute the four component chunks in place.
 *
 * Each of swizzle.x/y/z/w is the index (0 = x, 1 = y, 2 = z, 3 = w) of the
 * input chunk that ends up in the corresponding output. All four inputs are
 * snapshotted first, so any permutation or duplication is handled.
 */
static av_always_inline void
fn(swizzle)(chunk_t *restrict x, chunk_t *restrict y,
            chunk_t *restrict z, chunk_t *restrict w,
            const SwsSwizzleOp swizzle)
{
    const chunk_t saved[4] = { *x, *y, *z, *w };
    chunk_t *const dst[4]  = { x, y, z, w };
    const int src_idx[4]   = { swizzle.x, swizzle.y, swizzle.z, swizzle.w };

    for (int c = 0; c < 4; c++)
        *dst[c] = saved[src_idx[c]];
}

/**
 * We need all 2x2 combinations of (input depth, output depth) for these
 * functions, so we template them by value depth and make the pixel depth
 * non-generic; e.g. expand_chunk_8_to_16bpc expands 8 bit input to HBD
 * value_t.
 *
 * Expand one chunk of 8 bit samples (in->u8) to VALUE_BITS wide values
 * (out->v) by shifting them up into the most significant value bits.
 *
 * @param full  if true, the input bits are additionally replicated into the
 *              low bits, so that 0xFF maps to a value with all VALUE_BITS
 *              bits set; if false the low bits are left at zero
 */
static av_always_inline void
fn(expand_chunk_8_to)(chunk_t *restrict out, const chunk_t *restrict in,
                      const bool full)
{
    /* Up to three 8 bit copies (x, x >> 8, x >> 16) cover at most 24 bits */
    static_assert(VALUE_BITS >= 8 && VALUE_BITS <= 24, "VALUE_BITS out of range");
    const int shift = VALUE_BITS - 8;

    for (int i = 0; i < SWS_CHUNK_SIZE; i++) {
        const value_t x = in->u8[i] << shift;
        out->v[i] = full ? x | (x >> 8) | (x >> 16) : x;
    }
}

/**
 * Expand one chunk of 16 bit samples (in->u16) to VALUE_BITS wide values
 * (out->v).
 *
 * @param depth  number of significant bits in each input sample
 * @param msb    if true the significant bits sit at the top of the 16 bit
 *               word, otherwise they occupy the low `depth` bits
 * @param full   if true, OR in a copy of the value shifted right by `depth`
 *               to fill the low bits; one copy suffices because
 *               VALUE_BITS <= 2 * depth is asserted
 */
static av_always_inline void
fn(expand_chunk_16_to)(chunk_t *restrict out, const chunk_t *restrict in,
                       const int depth, const bool msb, const bool full)
{
    const int src_bits = msb ? 16 : depth;
    const int shift    = VALUE_BITS - src_bits;

    av_assert2(VALUE_BITS <= 2 * depth);

    for (int k = 0; k < SWS_CHUNK_SIZE; k++) {
        const uint16_t raw = in->u16[k];
        value_t val;

        /* A negative shift means the input is wider than VALUE_BITS */
        if (shift >= 0)
            val = raw << shift;
        else
            val = raw >> -shift;

        if (full)
            val |= val >> depth;

        out->v[k] = val;
    }
}

/**
 * Expand all four component chunks from 8 bit samples to value_t.
 *
 * x uses the luma range flag, y and z share the chroma range flag, and
 * w is always expanded as full range. op.depth must be 8.
 */
static av_always_inline void
fn(expand_8_to)(chunk_t *restrict out_x, chunk_t *restrict out_y,
                chunk_t *restrict out_z, chunk_t *restrict out_w,
                const chunk_t *restrict in_x, const chunk_t *restrict in_y,
                const chunk_t *restrict in_z, const chunk_t *restrict in_w,
                const SwsExpandOp op)
{
    const bool luma_full   = op.full.luma;
    const bool chroma_full = op.full.chroma;

    av_assert2(op.depth == 8);

    fn(expand_chunk_8_to)(out_x, in_x, luma_full);
    fn(expand_chunk_8_to)(out_y, in_y, chroma_full);
    fn(expand_chunk_8_to)(out_z, in_z, chroma_full);
    fn(expand_chunk_8_to)(out_w, in_w, true);
}

/**
 * Expand all four component chunks from 16 bit samples to value_t.
 *
 * All components share op.depth and op.msb. x uses the luma range flag,
 * y and z share the chroma range flag, and w is always expanded as full
 * range.
 */
static av_always_inline void
fn(expand_16_to)(chunk_t *restrict out_x, chunk_t *restrict out_y,
                 chunk_t *restrict out_z, chunk_t *restrict out_w,
                 const chunk_t *restrict in_x, const chunk_t *restrict in_y,
                 const chunk_t *restrict in_z, const chunk_t *restrict in_w,
                 const SwsExpandOp op)
{
    const int  depth       = op.depth;
    const bool msb         = op.msb;
    const bool luma_full   = op.full.luma;
    const bool chroma_full = op.full.chroma;

    fn(expand_chunk_16_to)(out_x, in_x, depth, msb, luma_full);
    fn(expand_chunk_16_to)(out_y, in_y, depth, msb, chroma_full);
    fn(expand_chunk_16_to)(out_z, in_z, depth, msb, chroma_full);
    fn(expand_chunk_16_to)(out_w, in_w, depth, msb, true);
}

/**
 * Reduce all four component chunks from value_t (in->v) to 8 bit samples
 * (out->u8): drop the low VALUE_BITS - 8 bits and clip the result to
 * [0, 255]. op.depth must be 8.
 */
static av_always_inline void
fn(compress_8_from)(chunk_t *restrict out_x, chunk_t *restrict out_y,
                    chunk_t *restrict out_z, chunk_t *restrict out_w,
                    const chunk_t *restrict in_x, const chunk_t *restrict in_y,
                    const chunk_t *restrict in_z, const chunk_t *restrict in_w,
                    const SwsCompressOp op)
{
    av_assert2(op.depth == 8);

    const int drop_bits = VALUE_BITS - 8;
    const chunk_t *const src[4] = { in_x,  in_y,  in_z,  in_w  };
    chunk_t       *const dst[4] = { out_x, out_y, out_z, out_w };

    /* Sample-major order: x, y, z, w of one index before the next index */
    for (int n = 0; n < SWS_CHUNK_SIZE; n++) {
        for (int c = 0; c < 4; c++)
            dst[c]->u8[n] = av_clip_uint8(src[c]->v[n] >> drop_bits);
    }
}

#if BIT_DEPTH > 8 /* Never output 16 bit from low bit depth intermediates */
/**
 * Reduce all four component chunks from value_t (in->v) to 16 bit samples
 * (out->u16) carrying op.depth significant bits.
 *
 * Every value is first brought down to 16 bits and clipped to [0, 65535].
 * With op.msb set, the top op.depth bits are kept in place and the low bits
 * are cleared; otherwise the sample is shifted right so that the significant
 * bits end up in the low op.depth bits.
 */
static av_always_inline void
fn(compress_16_from)(chunk_t *restrict out_x, chunk_t *restrict out_y,
                     chunk_t *restrict out_z, chunk_t *restrict out_w,
                     const chunk_t *restrict in_x, const chunk_t *restrict in_y,
                     const chunk_t *restrict in_z, const chunk_t *restrict in_w,
                     const SwsCompressOp op)
{
    const int to_16    = VALUE_BITS - 16;
    const int low_bits = 16 - op.depth;
    const int msb_mask = ((1 << op.depth) - 1) << low_bits;

    const chunk_t *const src[4] = { in_x,  in_y,  in_z,  in_w  };
    chunk_t       *const dst[4] = { out_x, out_y, out_z, out_w };

    for (int n = 0; n < SWS_CHUNK_SIZE; n++) {
        uint16_t clipped[4];

        /* Read and clip all four components before storing any of them */
        for (int c = 0; c < 4; c++)
            clipped[c] = av_clip_uint16(src[c]->v[n] >> to_16);

        for (int c = 0; c < 4; c++) {
            if (op.msb)
                dst[c]->u16[n] = clipped[c] & msb_mask;
            else
                dst[c]->u16[n] = clipped[c] >> low_bits;
        }
    }
}
#endif

/*
 * Disabled sketch of per-op wrapper functions and the op table; nothing in
 * this #if 0 region is compiled.
 *
 * NOTE(review): this cannot be enabled as written:
 *  - DEF_READ_WRAPPER declares six parameters but the invocation below
 *    passes only five arguments;
 *  - the macro parameters `swizzle` and `decode` share their names with the
 *    functions called in the body, so the argument text would be substituted
 *    into fn(...) as well;
 *  - chunk4_t, fn(read_<planar>_<bits>) and fn(decode) are not defined in the
 *    visible part of this file.
 */
#if 0
#define DEF_READ_WRAPPER(name, planar, bits, comps, swizzle, decode)            \
static void fn(op_##name)(chunk4_t *restrict out, int pixels,                   \
                          const uint8_t *restrict in0,                          \
                          const uint8_t *restrict in1,                          \
                          const uint8_t *restrict in2,                          \
                          const uint8_t *restrict in3)                          \
{                                                                               \
    fn(read_##planar##_##bits)(out, pixels, in0, in1, in2, in3, comps);         \
    fn(swizzle)(out, swizzle);                                                  \
    fn(decode)(out, decode);                                                    \
}

DEF_READ_WRAPPER(test, planar, 4, SWS_FROM_RGBA, SWS_DEC_RGB8)

static const SwsOpEntry fn(sws_op_table_c)[] = {
    /* Atomic read wrappers */
    //{{ { SWS_OP_READ_BYTES, .rw_bytes = { 1, 1, false }} }, .read = op_read_planar_1 },

    /* Atomic write wrappers */
};
#endif

/*
 * Drop every per-depth helper macro defined at the top of this template.
 * NOTE(review): presumably this allows the file to be included again with a
 * different BIT_DEPTH; BIT_DEPTH itself is not undefined here, so the
 * includer has to reset it.
 */
#undef fn
#undef VALUE_BITS
#undef VALUE_MAX
#undef PIXEL_MAX
#undef pixel_t
#undef value_t
#undef value2_t
#undef value3_t
#undef value4_t
#undef chunk_t