#include #include #include #include #include #include "libswscale/ops.h" typedef void (*sws_read_t)(void *, int, const uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *); typedef void (*sws_write_t)(const void *, int, uint8_t *, uint8_t *, uint8_t *, uint8_t *); typedef void (*sws_post_op_t)(void *, const void *priv); typedef struct SwsOpEntry { #define SWS_OP_MAX_ENTRY 4 SwsOp ops[SWS_OP_MAX_ENTRY]; union { sws_read_t read; sws_write_t write; sws_post_op_t post_op; }; } SwsOpEntry; static const SwsOpEntry sws_op_table_c[]; static const SwsOpEntry sws_op_table_c_hbd[] = {}; #define BIT_DEPTH 8 #include "libswscale/ops_template.c" #undef BIT_DEPTH #define BIT_DEPTH 16 #include "libswscale/ops_template.c" #undef BIT_DEPTH //#define TEST //#define DO_LUT //#define DO_MAT //#define DO_MAT2 enum { /* For the benchmark */ WIDTH = 1920, HEIGHT = 1080, ITERS = 1000, }; static av_always_inline void merged(const uint8_t *restrict in0, const uint8_t *restrict in1, const uint8_t *restrict in2, const uint8_t *restrict in3, uint8_t *restrict out0, uint8_t *restrict out1, uint8_t *restrict out2, uint8_t *restrict out3, const int pixels) { union chunk_t_8bpc x8, y8, z8, w8; union chunk_t_16bpc x16, y16, z16, w16; union chunk_t_16bpc x, y, z, w; read_planar_8bpc(in0, in1, in2, in3, &x8, &y8, &z8, &w8, pixels, 3); expand_8_to_16bpc(&x, &y, &z, &w, (void *) &x8, (void *) &y8, (void *) &z8, (void *) &w8, (SwsExpandOp) { .depth = 8 }); compress_16_from_16bpc(&x16, &y16, &z16, &w16, &x, &y, &z, &w, (SwsCompressOp) { .depth = 10 }); //swap_bytes_16bpc(&x16, &y16, &z16, &w16); write_planar_16bpc((void *) out0, (void *) out1, (void *) out2, (void *) out3, &x16, &y16, &z16, &w16, pixels, 3); } void run_direct(const AVFrame *dst, const AVFrame *src, const AVFrame *inter) { const int src_w = src->width; const int src_h = src->height; const int dst_w = dst->width; const int dst_h = dst->height; const int base_w = dst_w & ~(SWS_CHUNK_SIZE - 1); const int rest_w = dst_w - base_w; for (int y = 0; y < src_h; y++) { const uint8_t *in0 = src->data[0] + y * src->linesize[0]; const uint8_t *in1 = src->data[1] + y * src->linesize[1]; const uint8_t *in2 = src->data[2] + y * src->linesize[2]; const uint8_t *in3 = src->data[3] + y * src->linesize[3]; uint8_t *out0 = dst->data[0] + y * dst->linesize[0]; uint8_t *out1 = dst->data[1] + y * dst->linesize[1]; uint8_t *out2 = dst->data[2] + y * dst->linesize[2]; uint8_t *out3 = dst->data[3] + y * dst->linesize[3]; for (int x = 0; x < base_w; x += SWS_CHUNK_SIZE) { merged(in0, in1, in2, in3, out0, out1, out2, out3, SWS_CHUNK_SIZE); in0 += SWS_CHUNK_SIZE * sizeof(uint8_t[1]); in1 += SWS_CHUNK_SIZE * sizeof(uint8_t[1]); in2 += SWS_CHUNK_SIZE * sizeof(uint8_t[1]); in3 += SWS_CHUNK_SIZE * sizeof(uint8_t[1]); out0 += SWS_CHUNK_SIZE * sizeof(uint16_t[1]); out1 += SWS_CHUNK_SIZE * sizeof(uint16_t[1]); out2 += SWS_CHUNK_SIZE * sizeof(uint16_t[1]); out3 += SWS_CHUNK_SIZE * sizeof(uint16_t[1]); } if (rest_w) merged(in0, in1, in2, in3, out0, out1, out2, out3, rest_w); } } #include #include #include #include "libswscale/swscale.h" void bench(const AVFrame *dst, const AVFrame *src, const AVFrame *inter, const char *const name, void (*run)(const AVFrame *, const AVFrame *, const AVFrame *)) { for (int p = 0; p < 3; p++) { if (dst->data[p]) memset(dst->data[p], 0x00, dst->linesize[p] * dst->height); } int64_t time = av_gettime_relative(); for (int i = 0; i < ITERS; i++) { run(dst, src, inter); #ifdef TEST for (int p = 0; p < 3; p++) { if (memcmp(src->data[p], dst->data[p], src->linesize[p] * src->height)) exit(1); } #endif } time = av_gettime_relative() - time; printf("%15s: %"PRId64" us\n", name, time / ITERS); } void bench_swscale(const AVFrame *src, AVFrame *dst) { SwsContext *sws = sws_alloc_context(); int ret; sws->flags |= SWS_PRINT_INFO; for (int p = 0; p < 3; p++) { if (dst->data[p]) memset(dst->data[p], 0x00, dst->linesize[p] * dst->height); } ret = sws_frame_setup(sws, dst, src); if (ret) exit(ret); int64_t time = av_gettime_relative(); for (int i = 0; i < ITERS; i++) sws_scale_frame(sws, dst, src); time = av_gettime_relative() - time; printf("%15s: %"PRId64" us\n", "swscale", time / ITERS); sws_free_context(&sws); } int main() { AVFrame *src, *inter, *dst; src = av_frame_alloc(); inter = av_frame_alloc(); dst = av_frame_alloc(); src->width = WIDTH; src->height = HEIGHT; src->format = AV_PIX_FMT_YUV444P; inter->width = FFALIGN(WIDTH, SWS_CHUNK_SIZE); inter->height = HEIGHT; inter->format = AV_PIX_FMT_RGBA64; dst->width = WIDTH; dst->height = HEIGHT; dst->format = AV_PIX_FMT_YUV444P10LE; av_frame_get_buffer(src, 0); av_frame_get_buffer(dst, 0); av_frame_get_buffer(inter, 0); for (int p = 0; p < 3; p++) memset(src->data[p], 0xAA, src->linesize[p] * src->height); bench(dst, src, inter, "direct", run_direct); bench_swscale(src, dst); av_frame_free(&src); av_frame_free(&dst); return 0; } static const m3i16_t identity_mat3 = {{ { 16384, 0, 0 }, { 0, 16384, 0 }, { 0, 0, 16384 }, }}; static const uint16_t identity_lut3[256] = { 0x0000, 0x0101, 0x0202, 0x0303, 0x0404, 0x0505, 0x0606, 0x0707, 0x0808, 0x0909, 0x0A0A, 0x0B0B, 0x0C0C, 0x0D0D, 0x0E0E, 0x0F0F, 0x1010, 0x1111, 0x1212, 0x1313, 0x1414, 0x1515, 0x1616, 0x1717, 0x1818, 0x1919, 0x1A1A, 0x1B1B, 0x1C1C, 0x1D1D, 0x1E1E, 0x1F1F, 0x2020, 0x2121, 0x2222, 0x2323, 0x2424, 0x2525, 0x2626, 0x2727, 0x2828, 0x2929, 0x2A2A, 0x2B2B, 0x2C2C, 0x2D2D, 0x2E2E, 0x2F2F, 0x3030, 0x3131, 0x3232, 0x3333, 0x3434, 0x3535, 0x3636, 0x3737, 0x3838, 0x3939, 0x3A3A, 0x3B3B, 0x3C3C, 0x3D3D, 0x3E3E, 0x3F3F, 0x4040, 0x4141, 0x4242, 0x4343, 0x4444, 0x4545, 0x4646, 0x4747, 0x4848, 0x4949, 0x4A4A, 0x4B4B, 0x4C4C, 0x4D4D, 0x4E4E, 0x4F4F, 0x5050, 0x5151, 0x5252, 0x5353, 0x5454, 0x5555, 0x5656, 0x5757, 0x5858, 0x5959, 0x5A5A, 0x5B5B, 0x5C5C, 0x5D5D, 0x5E5E, 0x5F5F, 0x6060, 0x6161, 0x6262, 0x6363, 0x6464, 0x6565, 0x6666, 0x6767, 0x6868, 0x6969, 0x6A6A, 0x6B6B, 0x6C6C, 0x6D6D, 0x6E6E, 0x6F6F, 0x7070, 0x7171, 0x7272, 0x7373, 0x7474, 0x7575, 0x7676, 0x7777, 0x7878, 0x7979, 0x7A7A, 0x7B7B, 0x7C7C, 0x7D7D, 0x7E7E, 0x7F7F, 0x8080, 0x8181, 0x8282, 0x8383, 0x8484, 0x8585, 0x8686, 0x8787, 0x8888, 0x8989, 0x8A8A, 0x8B8B, 0x8C8C, 0x8D8D, 0x8E8E, 0x8F8F, 0x9090, 0x9191, 0x9292, 0x9393, 0x9494, 0x9595, 0x9696, 0x9797, 0x9898, 0x9999, 0x9A9A, 0x9B9B, 0x9C9C, 0x9D9D, 0x9E9E, 0x9F9F, 0xA0A0, 0xA1A1, 0xA2A2, 0xA3A3, 0xA4A4, 0xA5A5, 0xA6A6, 0xA7A7, 0xA8A8, 0xA9A9, 0xAAAA, 0xABAB, 0xACAC, 0xADAD, 0xAEAE, 0xAFAF, 0xB0B0, 0xB1B1, 0xB2B2, 0xB3B3, 0xB4B4, 0xB5B5, 0xB6B6, 0xB7B7, 0xB8B8, 0xB9B9, 0xBABA, 0xBBBB, 0xBCBC, 0xBDBD, 0xBEBE, 0xBFBF, 0xC0C0, 0xC1C1, 0xC2C2, 0xC3C3, 0xC4C4, 0xC5C5, 0xC6C6, 0xC7C7, 0xC8C8, 0xC9C9, 0xCACA, 0xCBCB, 0xCCCC, 0xCDCD, 0xCECE, 0xCFCF, 0xD0D0, 0xD1D1, 0xD2D2, 0xD3D3, 0xD4D4, 0xD5D5, 0xD6D6, 0xD7D7, 0xD8D8, 0xD9D9, 0xDADA, 0xDBDB, 0xDCDC, 0xDDDD, 0xDEDE, 0xDFDF, 0xE0E0, 0xE1E1, 0xE2E2, 0xE3E3, 0xE4E4, 0xE5E5, 0xE6E6, 0xE7E7, 0xE8E8, 0xE9E9, 0xEAEA, 0xEBEB, 0xECEC, 0xEDED, 0xEEEE, 0xEFEF, 0xF0F0, 0xF1F1, 0xF2F2, 0xF3F3, 0xF4F4, 0xF5F5, 0xF6F6, 0xF7F7, 0xF8F8, 0xF9F9, 0xFAFA, 0xFBFB, 0xFCFC, 0xFDFD, 0xFEFE, 0xFFFF, };