From d0df8c271d9fb960e4a64ad07b08e9ab89e03b65 Mon Sep 17 00:00:00 2001 From: Holy Wu Date: Sat, 4 Jul 2020 22:42:20 +0800 Subject: [PATCH] Reduce code duplication --- CAS/CAS.cpp | 4 +- CAS/CAS_AVX2.cpp | 193 ++++++++++++++++----------------------------- CAS/CAS_AVX512.cpp | 193 ++++++++++++++++----------------------------- CAS/CAS_SSE2.cpp | 193 ++++++++++++++++----------------------------- 4 files changed, 206 insertions(+), 377 deletions(-) diff --git a/CAS/CAS.cpp b/CAS/CAS.cpp index aa24be9..2cfc592 100644 --- a/CAS/CAS.cpp +++ b/CAS/CAS.cpp @@ -181,10 +181,10 @@ static void VS_CC casCreate(const VSMap * in, VSMap * out, void * userData, VSCo for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->vi->width >> (plane ? d->vi->format->subSamplingW : 0) < 3) - throw "every plane's width must be greater than or equal to 3"; + throw "plane's width must be greater than or equal to 3"; if (d->vi->height >> (plane ? d->vi->format->subSamplingH : 0) < 3) - throw "every plane's height must be greater than or equal to 3"; + throw "plane's height must be greater than or equal to 3"; } d->sharpness = static_cast(vsapi->propGetFloat(in, "sharpness", 0, &err)); diff --git a/CAS/CAS_AVX2.cpp b/CAS/CAS_AVX2.cpp index 5f7c0c2..8fccd99 100644 --- a/CAS/CAS_AVX2.cpp +++ b/CAS/CAS_AVX2.cpp @@ -3,42 +3,45 @@ template void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept { - auto load_8u = [](const void * srcp) noexcept { + using var_t = std::conditional_t, int, float>; + using vec_t = std::conditional_t, Vec8i, Vec8f>; + + const vec_t limit = std::any_cast(data->limit); + + auto load = [](const pixel_t * srcp) noexcept { if constexpr (std::is_same_v) - return Vec8i().load_8uc(srcp); + return vec_t().load_8uc(srcp); + else if constexpr (std::is_same_v) + return vec_t().load_8us(srcp); else - return Vec8i().load_8us(srcp); + return vec_t().load(srcp); }; - auto store_8u = [&](const Vec8f __result, void * dstp) noexcept { - const Vec8i _result = truncatei(__result + 0.5f); - + auto store = [&](const Vec8f srcp, pixel_t * dstp) noexcept { if constexpr (std::is_same_v) { - const auto result = compress_saturated_s2u(compress_saturated(_result, zero_si256()), zero_si256()).get_low(); + const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si256()), zero_si256()).get_low(); result.storel(dstp); - } else { - const auto result = compress_saturated_s2u(_result, zero_si256()).get_low(); + } else if constexpr (std::is_same_v) { + const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si256()).get_low(); min(result, data->peak).store_nt(dstp); + } else { + srcp.store_nt(dstp); } }; - using var_t = std::conditional_t, Vec8i, Vec8f>; - - const var_t limit = std::any_cast, int, float>>(data->limit); - - auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i, + auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i, const Vec8f chromaOffset) noexcept { // Soft min and max. // a b c b // d e f * 0.5 + d e f * 0.5 // g h i h // These are 2.0x bigger (factored out the extra multiply). - var_t mn = min(min(min(d, e), min(f, b)), h); - const var_t mn2 = min(min(min(mn, a), min(c, g)), i); + vec_t mn = min(min(min(d, e), min(f, b)), h); + const vec_t mn2 = min(min(min(mn, a), min(c, g)), i); mn += mn2; - var_t mx = max(max(max(d, e), max(f, b)), h); - const var_t mx2 = max(max(max(mx, a), max(c, g)), i); + vec_t mx = max(max(max(d, e), max(f, b)), h); + const vec_t mx2 = max(max(max(mx, a), max(c, g)), i); mx += mx2; if constexpr (std::is_floating_point_v) { @@ -77,128 +80,68 @@ void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const const Vec8f chromaOffset = plane ? 1.0f : 0.0f; - const int regularPart = (width - 1) & ~(Vec8i().size() - 1); + const int regularPart = (width - 1) & ~(vec_t().size() - 1); for (int y = 0; y < height; y++) { const pixel_t * above = srcp + (y == 0 ? stride : -stride); const pixel_t * below = srcp + (y == height - 1 ? -stride : stride); - if constexpr (std::is_integral_v) { - { - const Vec8i b = load_8u(above + 0); - const Vec8i e = load_8u(srcp + 0); - const Vec8i h = load_8u(below + 0); - - const Vec8i a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b); - const Vec8i d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e); - const Vec8i g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h); - - Vec8i c, f, i; - if (width > Vec8i().size()) { - c = load_8u(above + 1); - f = load_8u(srcp + 1); - i = load_8u(below + 1); - } else { - c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); - f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); - i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); - } - - const Vec8f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - store_8u(result, dstp + 0); - } - - for (int x = Vec8i().size(); x < regularPart; x += Vec8i().size()) { - const Vec8f result = filtering(load_8u(above + x - 1), load_8u(above + x), load_8u(above + x + 1), - load_8u(srcp + x - 1), load_8u(srcp + x), load_8u(srcp + x + 1), - load_8u(below + x - 1), load_8u(below + x), load_8u(below + x + 1), - chromaOffset); - - store_8u(result, dstp + x); + { + const vec_t b = load(above + 0); + const vec_t e = load(srcp + 0); + const vec_t h = load(below + 0); + + const vec_t a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b); + const vec_t d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e); + const vec_t g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h); + + vec_t c, f, i; + if (width > vec_t().size()) { + c = load(above + 1); + f = load(srcp + 1); + i = load(below + 1); + } else { + c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); + f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); + i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); } - if (regularPart >= Vec8i().size()) { - const Vec8i a = load_8u(above + regularPart - 1); - const Vec8i d = load_8u(srcp + regularPart - 1); - const Vec8i g = load_8u(below + regularPart - 1); + const Vec8f result = filtering(a, b, c, + d, e, f, + g, h, i, + chromaOffset); - const Vec8i b = load_8u(above + regularPart); - const Vec8i e = load_8u(srcp + regularPart); - const Vec8i h = load_8u(below + regularPart); - - const Vec8i c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); - const Vec8i f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); - const Vec8i i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); - - const Vec8f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - store_8u(result, dstp + regularPart); - } - } else { - { - const Vec8f b = Vec8f().load_a(above + 0); - const Vec8f e = Vec8f().load_a(srcp + 0); - const Vec8f h = Vec8f().load_a(below + 0); - - const Vec8f a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b); - const Vec8f d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e); - const Vec8f g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h); - - Vec8f c, f, i; - if (width > Vec8f().size()) { - c = Vec8f().load(above + 1); - f = Vec8f().load(srcp + 1); - i = Vec8f().load(below + 1); - } else { - c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); - f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); - i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); - } - - const Vec8f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - result.store_nt(dstp + 0); - } + store(result, dstp + 0); + } - for (int x = Vec8f().size(); x < regularPart; x += Vec8f().size()) { - const Vec8f result = filtering(Vec8f().load(above + x - 1), Vec8f().load_a(above + x), Vec8f().load(above + x + 1), - Vec8f().load(srcp + x - 1), Vec8f().load_a(srcp + x), Vec8f().load(srcp + x + 1), - Vec8f().load(below + x - 1), Vec8f().load_a(below + x), Vec8f().load(below + x + 1), - chromaOffset); + for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) { + const Vec8f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1), + load(srcp + x - 1), load(srcp + x), load(srcp + x + 1), + load(below + x - 1), load(below + x), load(below + x + 1), + chromaOffset); - result.store_nt(dstp + x); - } + store(result, dstp + x); + } - if (regularPart >= Vec8f().size()) { - const Vec8f a = Vec8f().load(above + regularPart - 1); - const Vec8f d = Vec8f().load(srcp + regularPart - 1); - const Vec8f g = Vec8f().load(below + regularPart - 1); + if (regularPart >= vec_t().size()) { + const vec_t a = load(above + regularPart - 1); + const vec_t d = load(srcp + regularPart - 1); + const vec_t g = load(below + regularPart - 1); - const Vec8f b = Vec8f().load_a(above + regularPart); - const Vec8f e = Vec8f().load_a(srcp + regularPart); - const Vec8f h = Vec8f().load_a(below + regularPart); + const vec_t b = load(above + regularPart); + const vec_t e = load(srcp + regularPart); + const vec_t h = load(below + regularPart); - const Vec8f c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); - const Vec8f f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); - const Vec8f i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); + const vec_t c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); + const vec_t f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); + const vec_t i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); - const Vec8f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); + const Vec8f result = filtering(a, b, c, + d, e, f, + g, h, i, + chromaOffset); - result.store_nt(dstp + regularPart); - } + store(result, dstp + regularPart); } srcp += stride; diff --git a/CAS/CAS_AVX512.cpp b/CAS/CAS_AVX512.cpp index 0e83a7e..21d8083 100644 --- a/CAS/CAS_AVX512.cpp +++ b/CAS/CAS_AVX512.cpp @@ -3,42 +3,45 @@ template void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept { - auto load_16u = [](const void * srcp) noexcept { + using var_t = std::conditional_t, int, float>; + using vec_t = std::conditional_t, Vec16i, Vec16f>; + + const vec_t limit = std::any_cast(data->limit); + + auto load = [](const pixel_t * srcp) noexcept { if constexpr (std::is_same_v) - return Vec16i().load_16uc(srcp); + return vec_t().load_16uc(srcp); + else if constexpr (std::is_same_v) + return vec_t().load_16us(srcp); else - return Vec16i().load_16us(srcp); + return vec_t().load(srcp); }; - auto store_16u = [&](const Vec16f __result, void * dstp) noexcept { - const Vec16i _result = truncatei(__result + 0.5f); - + auto store = [&](const Vec16f srcp, pixel_t * dstp) noexcept { if constexpr (std::is_same_v) { - const auto result = compress_saturated_s2u(compress_saturated(_result, zero_si512()), zero_si512()).get_low().get_low(); + const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si512()), zero_si512()).get_low().get_low(); result.store_nt(dstp); - } else { - const auto result = compress_saturated_s2u(_result, zero_si512()).get_low(); + } else if constexpr (std::is_same_v) { + const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si512()).get_low(); min(result, data->peak).store_nt(dstp); + } else { + srcp.store_nt(dstp); } }; - using var_t = std::conditional_t, Vec16i, Vec16f>; - - const var_t limit = std::any_cast, int, float>>(data->limit); - - auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i, + auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i, const Vec16f chromaOffset) noexcept { // Soft min and max. // a b c b // d e f * 0.5 + d e f * 0.5 // g h i h // These are 2.0x bigger (factored out the extra multiply). - var_t mn = min(min(min(d, e), min(f, b)), h); - const var_t mn2 = min(min(min(mn, a), min(c, g)), i); + vec_t mn = min(min(min(d, e), min(f, b)), h); + const vec_t mn2 = min(min(min(mn, a), min(c, g)), i); mn += mn2; - var_t mx = max(max(max(d, e), max(f, b)), h); - const var_t mx2 = max(max(max(mx, a), max(c, g)), i); + vec_t mx = max(max(max(d, e), max(f, b)), h); + const vec_t mx2 = max(max(max(mx, a), max(c, g)), i); mx += mx2; if constexpr (std::is_floating_point_v) { @@ -77,128 +80,68 @@ void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * con const Vec16f chromaOffset = plane ? 1.0f : 0.0f; - const int regularPart = (width - 1) & ~(Vec16i().size() - 1); + const int regularPart = (width - 1) & ~(vec_t().size() - 1); for (int y = 0; y < height; y++) { const pixel_t * above = srcp + (y == 0 ? stride : -stride); const pixel_t * below = srcp + (y == height - 1 ? -stride : stride); - if constexpr (std::is_integral_v) { - { - const Vec16i b = load_16u(above + 0); - const Vec16i e = load_16u(srcp + 0); - const Vec16i h = load_16u(below + 0); - - const Vec16i a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b); - const Vec16i d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e); - const Vec16i g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h); - - Vec16i c, f, i; - if (width > Vec16i().size()) { - c = load_16u(above + 1); - f = load_16u(srcp + 1); - i = load_16u(below + 1); - } else { - c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); - f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); - i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); - } - - const Vec16f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - store_16u(result, dstp + 0); - } - - for (int x = Vec16i().size(); x < regularPart; x += Vec16i().size()) { - const Vec16f result = filtering(load_16u(above + x - 1), load_16u(above + x), load_16u(above + x + 1), - load_16u(srcp + x - 1), load_16u(srcp + x), load_16u(srcp + x + 1), - load_16u(below + x - 1), load_16u(below + x), load_16u(below + x + 1), - chromaOffset); - - store_16u(result, dstp + x); + { + const vec_t b = load(above + 0); + const vec_t e = load(srcp + 0); + const vec_t h = load(below + 0); + + const vec_t a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b); + const vec_t d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e); + const vec_t g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h); + + vec_t c, f, i; + if (width > vec_t().size()) { + c = load(above + 1); + f = load(srcp + 1); + i = load(below + 1); + } else { + c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); + f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); + i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); } - if (regularPart >= Vec16i().size()) { - const Vec16i a = load_16u(above + regularPart - 1); - const Vec16i d = load_16u(srcp + regularPart - 1); - const Vec16i g = load_16u(below + regularPart - 1); + const Vec16f result = filtering(a, b, c, + d, e, f, + g, h, i, + chromaOffset); - const Vec16i b = load_16u(above + regularPart); - const Vec16i e = load_16u(srcp + regularPart); - const Vec16i h = load_16u(below + regularPart); - - const Vec16i c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); - const Vec16i f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); - const Vec16i i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); - - const Vec16f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - store_16u(result, dstp + regularPart); - } - } else { - { - const Vec16f b = Vec16f().load_a(above + 0); - const Vec16f e = Vec16f().load_a(srcp + 0); - const Vec16f h = Vec16f().load_a(below + 0); - - const Vec16f a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b); - const Vec16f d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e); - const Vec16f g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h); - - Vec16f c, f, i; - if (width > Vec16f().size()) { - c = Vec16f().load(above + 1); - f = Vec16f().load(srcp + 1); - i = Vec16f().load(below + 1); - } else { - c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); - f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); - i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); - } - - const Vec16f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - result.store_nt(dstp + 0); - } + store(result, dstp + 0); + } - for (int x = Vec16f().size(); x < regularPart; x += Vec16f().size()) { - const Vec16f result = filtering(Vec16f().load(above + x - 1), Vec16f().load_a(above + x), Vec16f().load(above + x + 1), - Vec16f().load(srcp + x - 1), Vec16f().load_a(srcp + x), Vec16f().load(srcp + x + 1), - Vec16f().load(below + x - 1), Vec16f().load_a(below + x), Vec16f().load(below + x + 1), - chromaOffset); + for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) { + const Vec16f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1), + load(srcp + x - 1), load(srcp + x), load(srcp + x + 1), + load(below + x - 1), load(below + x), load(below + x + 1), + chromaOffset); - result.store_nt(dstp + x); - } + store(result, dstp + x); + } - if (regularPart >= Vec16f().size()) { - const Vec16f a = Vec16f().load(above + regularPart - 1); - const Vec16f d = Vec16f().load(srcp + regularPart - 1); - const Vec16f g = Vec16f().load(below + regularPart - 1); + if (regularPart >= vec_t().size()) { + const vec_t a = load(above + regularPart - 1); + const vec_t d = load(srcp + regularPart - 1); + const vec_t g = load(below + regularPart - 1); - const Vec16f b = Vec16f().load_a(above + regularPart); - const Vec16f e = Vec16f().load_a(srcp + regularPart); - const Vec16f h = Vec16f().load_a(below + regularPart); + const vec_t b = load(above + regularPart); + const vec_t e = load(srcp + regularPart); + const vec_t h = load(below + regularPart); - const Vec16f c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); - const Vec16f f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); - const Vec16f i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); + const vec_t c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); + const vec_t f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); + const vec_t i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); - const Vec16f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); + const Vec16f result = filtering(a, b, c, + d, e, f, + g, h, i, + chromaOffset); - result.store_nt(dstp + regularPart); - } + store(result, dstp + regularPart); } srcp += stride; diff --git a/CAS/CAS_SSE2.cpp b/CAS/CAS_SSE2.cpp index a3a3c0b..ebd4096 100644 --- a/CAS/CAS_SSE2.cpp +++ b/CAS/CAS_SSE2.cpp @@ -3,42 +3,45 @@ template void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept { - auto load_4u = [](const void * srcp) noexcept { + using var_t = std::conditional_t, int, float>; + using vec_t = std::conditional_t, Vec4i, Vec4f>; + + const vec_t limit = std::any_cast(data->limit); + + auto load = [](const pixel_t * srcp) noexcept { if constexpr (std::is_same_v) - return Vec4i().load_4uc(srcp); + return vec_t().load_4uc(srcp); + else if constexpr (std::is_same_v) + return vec_t().load_4us(srcp); else - return Vec4i().load_4us(srcp); + return vec_t().load(srcp); }; - auto store_4u = [&](const Vec4f __result, void * dstp) noexcept { - const Vec4i _result = truncatei(__result + 0.5f); - + auto store = [&](const Vec4f srcp, pixel_t * dstp) noexcept { if constexpr (std::is_same_v) { - const auto result = compress_saturated_s2u(compress_saturated(_result, zero_si128()), zero_si128()); + const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si128()), zero_si128()); result.store_si32(dstp); - } else { - const auto result = compress_saturated_s2u(_result, zero_si128()); + } else if constexpr (std::is_same_v) { + const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si128()); min(result, data->peak).storel(dstp); + } else { + srcp.store_nt(dstp); } }; - using var_t = std::conditional_t, Vec4i, Vec4f>; - - const var_t limit = std::any_cast, int, float>>(data->limit); - - auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i, + auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i, const Vec4f chromaOffset) noexcept { // Soft min and max. // a b c b // d e f * 0.5 + d e f * 0.5 // g h i h // These are 2.0x bigger (factored out the extra multiply). - var_t mn = min(min(min(d, e), min(f, b)), h); - const var_t mn2 = min(min(min(mn, a), min(c, g)), i); + vec_t mn = min(min(min(d, e), min(f, b)), h); + const vec_t mn2 = min(min(min(mn, a), min(c, g)), i); mn += mn2; - var_t mx = max(max(max(d, e), max(f, b)), h); - const var_t mx2 = max(max(max(mx, a), max(c, g)), i); + vec_t mx = max(max(max(d, e), max(f, b)), h); + const vec_t mx2 = max(max(max(mx, a), max(c, g)), i); mx += mx2; if constexpr (std::is_floating_point_v) { @@ -77,128 +80,68 @@ void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const const Vec4f chromaOffset = plane ? 1.0f : 0.0f; - const int regularPart = (width - 1) & ~(Vec4i().size() - 1); + const int regularPart = (width - 1) & ~(vec_t().size() - 1); for (int y = 0; y < height; y++) { const pixel_t * above = srcp + (y == 0 ? stride : -stride); const pixel_t * below = srcp + (y == height - 1 ? -stride : stride); - if constexpr (std::is_integral_v) { - { - const Vec4i b = load_4u(above + 0); - const Vec4i e = load_4u(srcp + 0); - const Vec4i h = load_4u(below + 0); - - const Vec4i a = permute4<1, 0, 1, 2>(b); - const Vec4i d = permute4<1, 0, 1, 2>(e); - const Vec4i g = permute4<1, 0, 1, 2>(h); - - Vec4i c, f, i; - if (width > Vec4i().size()) { - c = load_4u(above + 1); - f = load_4u(srcp + 1); - i = load_4u(below + 1); - } else { - c = permute4<1, 2, 3, 2>(b); - f = permute4<1, 2, 3, 2>(e); - i = permute4<1, 2, 3, 2>(h); - } - - const Vec4f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - store_4u(result, dstp + 0); - } - - for (int x = Vec4i().size(); x < regularPart; x += Vec4i().size()) { - const Vec4f result = filtering(load_4u(above + x - 1), load_4u(above + x), load_4u(above + x + 1), - load_4u(srcp + x - 1), load_4u(srcp + x), load_4u(srcp + x + 1), - load_4u(below + x - 1), load_4u(below + x), load_4u(below + x + 1), - chromaOffset); - - store_4u(result, dstp + x); + { + const vec_t b = load(above + 0); + const vec_t e = load(srcp + 0); + const vec_t h = load(below + 0); + + const vec_t a = permute4<1, 0, 1, 2>(b); + const vec_t d = permute4<1, 0, 1, 2>(e); + const vec_t g = permute4<1, 0, 1, 2>(h); + + vec_t c, f, i; + if (width > vec_t().size()) { + c = load(above + 1); + f = load(srcp + 1); + i = load(below + 1); + } else { + c = permute4<1, 2, 3, 2>(b); + f = permute4<1, 2, 3, 2>(e); + i = permute4<1, 2, 3, 2>(h); } - if (regularPart >= Vec4i().size()) { - const Vec4i a = load_4u(above + regularPart - 1); - const Vec4i d = load_4u(srcp + regularPart - 1); - const Vec4i g = load_4u(below + regularPart - 1); + const Vec4f result = filtering(a, b, c, + d, e, f, + g, h, i, + chromaOffset); - const Vec4i b = load_4u(above + regularPart); - const Vec4i e = load_4u(srcp + regularPart); - const Vec4i h = load_4u(below + regularPart); - - const Vec4i c = permute4<1, 2, 3, 2>(b); - const Vec4i f = permute4<1, 2, 3, 2>(e); - const Vec4i i = permute4<1, 2, 3, 2>(h); - - const Vec4f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - store_4u(result, dstp + regularPart); - } - } else { - { - const Vec4f b = Vec4f().load_a(above + 0); - const Vec4f e = Vec4f().load_a(srcp + 0); - const Vec4f h = Vec4f().load_a(below + 0); - - const Vec4f a = permute4<1, 0, 1, 2>(b); - const Vec4f d = permute4<1, 0, 1, 2>(e); - const Vec4f g = permute4<1, 0, 1, 2>(h); - - Vec4f c, f, i; - if (width > Vec4f().size()) { - c = Vec4f().load(above + 1); - f = Vec4f().load(srcp + 1); - i = Vec4f().load(below + 1); - } else { - c = permute4<1, 2, 3, 2>(b); - f = permute4<1, 2, 3, 2>(e); - i = permute4<1, 2, 3, 2>(h); - } - - const Vec4f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); - - result.store_nt(dstp + 0); - } + store(result, dstp + 0); + } - for (int x = Vec4f().size(); x < regularPart; x += Vec4f().size()) { - const Vec4f result = filtering(Vec4f().load(above + x - 1), Vec4f().load_a(above + x), Vec4f().load(above + x + 1), - Vec4f().load(srcp + x - 1), Vec4f().load_a(srcp + x), Vec4f().load(srcp + x + 1), - Vec4f().load(below + x - 1), Vec4f().load_a(below + x), Vec4f().load(below + x + 1), - chromaOffset); + for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) { + const Vec4f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1), + load(srcp + x - 1), load(srcp + x), load(srcp + x + 1), + load(below + x - 1), load(below + x), load(below + x + 1), + chromaOffset); - result.store_nt(dstp + x); - } + store(result, dstp + x); + } - if (regularPart >= Vec4f().size()) { - const Vec4f a = Vec4f().load(above + regularPart - 1); - const Vec4f d = Vec4f().load(srcp + regularPart - 1); - const Vec4f g = Vec4f().load(below + regularPart - 1); + if (regularPart >= vec_t().size()) { + const vec_t a = load(above + regularPart - 1); + const vec_t d = load(srcp + regularPart - 1); + const vec_t g = load(below + regularPart - 1); - const Vec4f b = Vec4f().load_a(above + regularPart); - const Vec4f e = Vec4f().load_a(srcp + regularPart); - const Vec4f h = Vec4f().load_a(below + regularPart); + const vec_t b = load(above + regularPart); + const vec_t e = load(srcp + regularPart); + const vec_t h = load(below + regularPart); - const Vec4f c = permute4<1, 2, 3, 2>(b); - const Vec4f f = permute4<1, 2, 3, 2>(e); - const Vec4f i = permute4<1, 2, 3, 2>(h); + const vec_t c = permute4<1, 2, 3, 2>(b); + const vec_t f = permute4<1, 2, 3, 2>(e); + const vec_t i = permute4<1, 2, 3, 2>(h); - const Vec4f result = filtering(a, b, c, - d, e, f, - g, h, i, - chromaOffset); + const Vec4f result = filtering(a, b, c, + d, e, f, + g, h, i, + chromaOffset); - result.store_nt(dstp + regularPart); - } + store(result, dstp + regularPart); } srcp += stride;