From d0df8c271d9fb960e4a64ad07b08e9ab89e03b65 Mon Sep 17 00:00:00 2001
From: Holy Wu <holywu@gmail.com>
Date: Sat, 4 Jul 2020 22:42:20 +0800
Subject: [PATCH] Reduce code duplication

---
 CAS/CAS.cpp        |   4 +-
 CAS/CAS_AVX2.cpp   | 193 ++++++++++++++++-----------------------------
 CAS/CAS_AVX512.cpp | 193 ++++++++++++++++-----------------------------
 CAS/CAS_SSE2.cpp   | 193 ++++++++++++++++-----------------------------
 4 files changed, 206 insertions(+), 377 deletions(-)
diff --git a/CAS/CAS.cpp b/CAS/CAS.cpp
index aa24be9..2cfc592 100644
--- a/CAS/CAS.cpp
+++ b/CAS/CAS.cpp
@@ -181,10 +181,10 @@ static void VS_CC casCreate(const VSMap * in, VSMap * out, void * userData, VSCo
 
         for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
             if (d->vi->width >> (plane ? d->vi->format->subSamplingW : 0) < 3)
-                throw "every plane's width must be greater than or equal to 3";
+                throw "plane's width must be greater than or equal to 3";
 
             if (d->vi->height >> (plane ? d->vi->format->subSamplingH : 0) < 3)
-                throw "every plane's height must be greater than or equal to 3";
+                throw "plane's height must be greater than or equal to 3";
         }
 
         d->sharpness = static_cast<float>(vsapi->propGetFloat(in, "sharpness", 0, &err));
diff --git a/CAS/CAS_AVX2.cpp b/CAS/CAS_AVX2.cpp
index 5f7c0c2..8fccd99 100644
--- a/CAS/CAS_AVX2.cpp
+++ b/CAS/CAS_AVX2.cpp
@@ -3,42 +3,45 @@
 
 template<typename pixel_t>
 void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
-    auto load_8u = [](const void * srcp) noexcept {
+    using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
+    using vec_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec8i, Vec8f>;
+
+    const vec_t limit = std::any_cast<var_t>(data->limit);
+
+    auto load = [](const pixel_t * srcp) noexcept {
         if constexpr (std::is_same_v<pixel_t, uint8_t>)
-            return Vec8i().load_8uc(srcp);
+            return vec_t().load_8uc(srcp);
+        else if constexpr (std::is_same_v<pixel_t, uint16_t>)
+            return vec_t().load_8us(srcp);
         else
-            return Vec8i().load_8us(srcp);
+            return vec_t().load(srcp);
     };
 
-    auto store_8u = [&](const Vec8f __result, void * dstp) noexcept {
-        const Vec8i _result = truncatei(__result + 0.5f);
-
+    auto store = [&](const Vec8f srcp, pixel_t * dstp) noexcept {
         if constexpr (std::is_same_v<pixel_t, uint8_t>) {
-            const auto result = compress_saturated_s2u(compress_saturated(_result, zero_si256()), zero_si256()).get_low();
+            const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si256()), zero_si256()).get_low();
             result.storel(dstp);
-        } else {
-            const auto result = compress_saturated_s2u(_result, zero_si256()).get_low();
+        } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+            const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si256()).get_low();
             min(result, data->peak).store_nt(dstp);
+        } else {
+            srcp.store_nt(dstp);
         }
     };
 
-    using var_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec8i, Vec8f>;
-
-    const var_t limit = std::any_cast<std::conditional_t<std::is_integral_v<pixel_t>, int, float>>(data->limit);
-
-    auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i,
+    auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i,
                          const Vec8f chromaOffset) noexcept {
         // Soft min and max.
         //  a b c             b
         //  d e f * 0.5  +  d e f * 0.5
         //  g h i             h
         // These are 2.0x bigger (factored out the extra multiply).
-        var_t mn = min(min(min(d, e), min(f, b)), h);
-        const var_t mn2 = min(min(min(mn, a), min(c, g)), i);
+        vec_t mn = min(min(min(d, e), min(f, b)), h);
+        const vec_t mn2 = min(min(min(mn, a), min(c, g)), i);
         mn += mn2;
 
-        var_t mx = max(max(max(d, e), max(f, b)), h);
-        const var_t mx2 = max(max(max(mx, a), max(c, g)), i);
+        vec_t mx = max(max(max(d, e), max(f, b)), h);
+        const vec_t mx2 = max(max(max(mx, a), max(c, g)), i);
         mx += mx2;
 
         if constexpr (std::is_floating_point_v<pixel_t>) {
@@ -77,128 +80,68 @@ void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const
 
             const Vec8f chromaOffset = plane ? 1.0f : 0.0f;
 
-            const int regularPart = (width - 1) & ~(Vec8i().size() - 1);
+            const int regularPart = (width - 1) & ~(vec_t().size() - 1);
 
             for (int y = 0; y < height; y++) {
                 const pixel_t * above = srcp + (y == 0 ? stride : -stride);
                 const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);
 
-                if constexpr (std::is_integral_v<pixel_t>) {
-                    {
-                        const Vec8i b = load_8u(above + 0);
-                        const Vec8i e = load_8u(srcp + 0);
-                        const Vec8i h = load_8u(below + 0);
-
-                        const Vec8i a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b);
-                        const Vec8i d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e);
-                        const Vec8i g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h);
-
-                        Vec8i c, f, i;
-                        if (width > Vec8i().size()) {
-                            c = load_8u(above + 1);
-                            f = load_8u(srcp + 1);
-                            i = load_8u(below + 1);
-                        } else {
-                            c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
-                            f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
-                            i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
-                        }
-
-                        const Vec8f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
-
-                        store_8u(result, dstp + 0);
-                    }
-
-                    for (int x = Vec8i().size(); x < regularPart; x += Vec8i().size()) {
-                        const Vec8f result = filtering(load_8u(above + x - 1), load_8u(above + x), load_8u(above + x + 1),
-                                                       load_8u(srcp + x - 1), load_8u(srcp + x), load_8u(srcp + x + 1),
-                                                       load_8u(below + x - 1), load_8u(below + x), load_8u(below + x + 1),
-                                                       chromaOffset);
-
-                        store_8u(result, dstp + x);
+                {
+                    const vec_t b = load(above + 0);
+                    const vec_t e = load(srcp + 0);
+                    const vec_t h = load(below + 0);
+
+                    const vec_t a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b);
+                    const vec_t d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e);
+                    const vec_t g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h);
+
+                    vec_t c, f, i;
+                    if (width > vec_t().size()) {
+                        c = load(above + 1);
+                        f = load(srcp + 1);
+                        i = load(below + 1);
+                    } else {
+                        c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
+                        f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
+                        i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
                     }
 
-                    if (regularPart >= Vec8i().size()) {
-                        const Vec8i a = load_8u(above + regularPart - 1);
-                        const Vec8i d = load_8u(srcp + regularPart - 1);
-                        const Vec8i g = load_8u(below + regularPart - 1);
+                    const Vec8f result = filtering(a, b, c,
+                                                   d, e, f,
+                                                   g, h, i,
+                                                   chromaOffset);
 
-                        const Vec8i b = load_8u(above + regularPart);
-                        const Vec8i e = load_8u(srcp + regularPart);
-                        const Vec8i h = load_8u(below + regularPart);
-
-                        const Vec8i c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
-                        const Vec8i f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
-                        const Vec8i i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
-
-                        const Vec8f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
-
-                        store_8u(result, dstp + regularPart);
-                    }
-                } else {
-                    {
-                        const Vec8f b = Vec8f().load_a(above + 0);
-                        const Vec8f e = Vec8f().load_a(srcp + 0);
-                        const Vec8f h = Vec8f().load_a(below + 0);
-
-                        const Vec8f a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b);
-                        const Vec8f d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e);
-                        const Vec8f g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h);
-
-                        Vec8f c, f, i;
-                        if (width > Vec8f().size()) {
-                            c = Vec8f().load(above + 1);
-                            f = Vec8f().load(srcp + 1);
-                            i = Vec8f().load(below + 1);
-                        } else {
-                            c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
-                            f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
-                            i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
-                        }
-
-                        const Vec8f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
-
-                        result.store_nt(dstp + 0);
-                    }
+                    store(result, dstp + 0);
+                }
 
-                    for (int x = Vec8f().size(); x < regularPart; x += Vec8f().size()) {
-                        const Vec8f result = filtering(Vec8f().load(above + x - 1), Vec8f().load_a(above + x), Vec8f().load(above + x + 1),
-                                                       Vec8f().load(srcp + x - 1), Vec8f().load_a(srcp + x), Vec8f().load(srcp + x + 1),
-                                                       Vec8f().load(below + x - 1), Vec8f().load_a(below + x), Vec8f().load(below + x + 1),
-                                                       chromaOffset);
+                for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) {
+                    const Vec8f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1),
+                                                   load(srcp + x - 1), load(srcp + x), load(srcp + x + 1),
+                                                   load(below + x - 1), load(below + x), load(below + x + 1),
+                                                   chromaOffset);
 
-                        result.store_nt(dstp + x);
-                    }
+                    store(result, dstp + x);
+                }
 
-                    if (regularPart >= Vec8f().size()) {
-                        const Vec8f a = Vec8f().load(above + regularPart - 1);
-                        const Vec8f d = Vec8f().load(srcp + regularPart - 1);
-                        const Vec8f g = Vec8f().load(below + regularPart - 1);
+                if (regularPart >= vec_t().size()) {
+                    const vec_t a = load(above + regularPart - 1);
+                    const vec_t d = load(srcp + regularPart - 1);
+                    const vec_t g = load(below + regularPart - 1);
 
-                        const Vec8f b = Vec8f().load_a(above + regularPart);
-                        const Vec8f e = Vec8f().load_a(srcp + regularPart);
-                        const Vec8f h = Vec8f().load_a(below + regularPart);
+                    const vec_t b = load(above + regularPart);
+                    const vec_t e = load(srcp + regularPart);
+                    const vec_t h = load(below + regularPart);
 
-                        const Vec8f c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
-                        const Vec8f f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
-                        const Vec8f i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
+                    const vec_t c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
+                    const vec_t f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
+                    const vec_t i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
 
-                        const Vec8f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
+                    const Vec8f result = filtering(a, b, c,
+                                                   d, e, f,
+                                                   g, h, i,
+                                                   chromaOffset);
 
-                        result.store_nt(dstp + regularPart);
-                    }
+                    store(result, dstp + regularPart);
                 }
 
                 srcp += stride;
diff --git a/CAS/CAS_AVX512.cpp b/CAS/CAS_AVX512.cpp
index 0e83a7e..21d8083 100644
--- a/CAS/CAS_AVX512.cpp
+++ b/CAS/CAS_AVX512.cpp
@@ -3,42 +3,45 @@
 
 template<typename pixel_t>
 void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
-    auto load_16u = [](const void * srcp) noexcept {
+    using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
+    using vec_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec16i, Vec16f>;
+
+    const vec_t limit = std::any_cast<var_t>(data->limit);
+
+    auto load = [](const pixel_t * srcp) noexcept {
         if constexpr (std::is_same_v<pixel_t, uint8_t>)
-            return Vec16i().load_16uc(srcp);
+            return vec_t().load_16uc(srcp);
+        else if constexpr (std::is_same_v<pixel_t, uint16_t>)
+            return vec_t().load_16us(srcp);
         else
-            return Vec16i().load_16us(srcp);
+            return vec_t().load(srcp);
     };
 
-    auto store_16u = [&](const Vec16f __result, void * dstp) noexcept {
-        const Vec16i _result = truncatei(__result + 0.5f);
-
+    auto store = [&](const Vec16f srcp, pixel_t * dstp) noexcept {
         if constexpr (std::is_same_v<pixel_t, uint8_t>) {
-            const auto result = compress_saturated_s2u(compress_saturated(_result, zero_si512()), zero_si512()).get_low().get_low();
+            const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si512()), zero_si512()).get_low().get_low();
             result.store_nt(dstp);
-        } else {
-            const auto result = compress_saturated_s2u(_result, zero_si512()).get_low();
+        } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+            const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si512()).get_low();
             min(result, data->peak).store_nt(dstp);
+        } else {
+            srcp.store_nt(dstp);
         }
     };
 
-    using var_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec16i, Vec16f>;
-
-    const var_t limit = std::any_cast<std::conditional_t<std::is_integral_v<pixel_t>, int, float>>(data->limit);
-
-    auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i,
+    auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i,
                          const Vec16f chromaOffset) noexcept {
         // Soft min and max.
         //  a b c             b
         //  d e f * 0.5  +  d e f * 0.5
         //  g h i             h
         // These are 2.0x bigger (factored out the extra multiply).
-        var_t mn = min(min(min(d, e), min(f, b)), h);
-        const var_t mn2 = min(min(min(mn, a), min(c, g)), i);
+        vec_t mn = min(min(min(d, e), min(f, b)), h);
+        const vec_t mn2 = min(min(min(mn, a), min(c, g)), i);
         mn += mn2;
 
-        var_t mx = max(max(max(d, e), max(f, b)), h);
-        const var_t mx2 = max(max(max(mx, a), max(c, g)), i);
+        vec_t mx = max(max(max(d, e), max(f, b)), h);
+        const vec_t mx2 = max(max(max(mx, a), max(c, g)), i);
         mx += mx2;
 
         if constexpr (std::is_floating_point_v<pixel_t>) {
@@ -77,128 +80,68 @@ void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * con
 
             const Vec16f chromaOffset = plane ? 1.0f : 0.0f;
 
-            const int regularPart = (width - 1) & ~(Vec16i().size() - 1);
+            const int regularPart = (width - 1) & ~(vec_t().size() - 1);
 
             for (int y = 0; y < height; y++) {
                 const pixel_t * above = srcp + (y == 0 ? stride : -stride);
                 const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);
 
-                if constexpr (std::is_integral_v<pixel_t>) {
-                    {
-                        const Vec16i b = load_16u(above + 0);
-                        const Vec16i e = load_16u(srcp + 0);
-                        const Vec16i h = load_16u(below + 0);
-
-                        const Vec16i a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b);
-                        const Vec16i d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e);
-                        const Vec16i g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h);
-
-                        Vec16i c, f, i;
-                        if (width > Vec16i().size()) {
-                            c = load_16u(above + 1);
-                            f = load_16u(srcp + 1);
-                            i = load_16u(below + 1);
-                        } else {
-                            c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
-                            f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
-                            i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
-                        }
-
-                        const Vec16f result = filtering(a, b, c,
-                                                        d, e, f,
-                                                        g, h, i,
-                                                        chromaOffset);
-
-                        store_16u(result, dstp + 0);
-                    }
-
-                    for (int x = Vec16i().size(); x < regularPart; x += Vec16i().size()) {
-                        const Vec16f result = filtering(load_16u(above + x - 1), load_16u(above + x), load_16u(above + x + 1),
-                                                        load_16u(srcp + x - 1), load_16u(srcp + x), load_16u(srcp + x + 1),
-                                                        load_16u(below + x - 1), load_16u(below + x), load_16u(below + x + 1),
-                                                        chromaOffset);
-
-                        store_16u(result, dstp + x);
+                {
+                    const vec_t b = load(above + 0);
+                    const vec_t e = load(srcp + 0);
+                    const vec_t h = load(below + 0);
+
+                    const vec_t a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b);
+                    const vec_t d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e);
+                    const vec_t g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h);
+
+                    vec_t c, f, i;
+                    if (width > vec_t().size()) {
+                        c = load(above + 1);
+                        f = load(srcp + 1);
+                        i = load(below + 1);
+                    } else {
+                        c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
+                        f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
+                        i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
                     }
 
-                    if (regularPart >= Vec16i().size()) {
-                        const Vec16i a = load_16u(above + regularPart - 1);
-                        const Vec16i d = load_16u(srcp + regularPart - 1);
-                        const Vec16i g = load_16u(below + regularPart - 1);
+                    const Vec16f result = filtering(a, b, c,
+                                                    d, e, f,
+                                                    g, h, i,
+                                                    chromaOffset);
 
-                        const Vec16i b = load_16u(above + regularPart);
-                        const Vec16i e = load_16u(srcp + regularPart);
-                        const Vec16i h = load_16u(below + regularPart);
-
-                        const Vec16i c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
-                        const Vec16i f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
-                        const Vec16i i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
-
-                        const Vec16f result = filtering(a, b, c,
-                                                        d, e, f,
-                                                        g, h, i,
-                                                        chromaOffset);
-
-                        store_16u(result, dstp + regularPart);
-                    }
-                } else {
-                    {
-                        const Vec16f b = Vec16f().load_a(above + 0);
-                        const Vec16f e = Vec16f().load_a(srcp + 0);
-                        const Vec16f h = Vec16f().load_a(below + 0);
-
-                        const Vec16f a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b);
-                        const Vec16f d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e);
-                        const Vec16f g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h);
-
-                        Vec16f c, f, i;
-                        if (width > Vec16f().size()) {
-                            c = Vec16f().load(above + 1);
-                            f = Vec16f().load(srcp + 1);
-                            i = Vec16f().load(below + 1);
-                        } else {
-                            c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
-                            f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
-                            i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
-                        }
-
-                        const Vec16f result = filtering(a, b, c,
-                                                        d, e, f,
-                                                        g, h, i,
-                                                        chromaOffset);
-
-                        result.store_nt(dstp + 0);
-                    }
+                    store(result, dstp + 0);
+                }
 
-                    for (int x = Vec16f().size(); x < regularPart; x += Vec16f().size()) {
-                        const Vec16f result = filtering(Vec16f().load(above + x - 1), Vec16f().load_a(above + x), Vec16f().load(above + x + 1),
-                                                        Vec16f().load(srcp + x - 1), Vec16f().load_a(srcp + x), Vec16f().load(srcp + x + 1),
-                                                        Vec16f().load(below + x - 1), Vec16f().load_a(below + x), Vec16f().load(below + x + 1),
-                                                        chromaOffset);
+                for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) {
+                    const Vec16f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1),
+                                                    load(srcp + x - 1), load(srcp + x), load(srcp + x + 1),
+                                                    load(below + x - 1), load(below + x), load(below + x + 1),
+                                                    chromaOffset);
 
-                        result.store_nt(dstp + x);
-                    }
+                    store(result, dstp + x);
+                }
 
-                    if (regularPart >= Vec16f().size()) {
-                        const Vec16f a = Vec16f().load(above + regularPart - 1);
-                        const Vec16f d = Vec16f().load(srcp + regularPart - 1);
-                        const Vec16f g = Vec16f().load(below + regularPart - 1);
+                if (regularPart >= vec_t().size()) {
+                    const vec_t a = load(above + regularPart - 1);
+                    const vec_t d = load(srcp + regularPart - 1);
+                    const vec_t g = load(below + regularPart - 1);
 
-                        const Vec16f b = Vec16f().load_a(above + regularPart);
-                        const Vec16f e = Vec16f().load_a(srcp + regularPart);
-                        const Vec16f h = Vec16f().load_a(below + regularPart);
+                    const vec_t b = load(above + regularPart);
+                    const vec_t e = load(srcp + regularPart);
+                    const vec_t h = load(below + regularPart);
 
-                        const Vec16f c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
-                        const Vec16f f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
-                        const Vec16f i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
+                    const vec_t c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
+                    const vec_t f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
+                    const vec_t i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
 
-                        const Vec16f result = filtering(a, b, c,
-                                                        d, e, f,
-                                                        g, h, i,
-                                                        chromaOffset);
+                    const Vec16f result = filtering(a, b, c,
+                                                    d, e, f,
+                                                    g, h, i,
+                                                    chromaOffset);
 
-                        result.store_nt(dstp + regularPart);
-                    }
+                    store(result, dstp + regularPart);
                 }
 
                 srcp += stride;
diff --git a/CAS/CAS_SSE2.cpp b/CAS/CAS_SSE2.cpp
index a3a3c0b..ebd4096 100644
--- a/CAS/CAS_SSE2.cpp
+++ b/CAS/CAS_SSE2.cpp
@@ -3,42 +3,45 @@
 
 template<typename pixel_t>
 void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
-    auto load_4u = [](const void * srcp) noexcept {
+    using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
+    using vec_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec4i, Vec4f>;
+
+    const vec_t limit = std::any_cast<var_t>(data->limit);
+
+    auto load = [](const pixel_t * srcp) noexcept {
         if constexpr (std::is_same_v<pixel_t, uint8_t>)
-            return Vec4i().load_4uc(srcp);
+            return vec_t().load_4uc(srcp);
+        else if constexpr (std::is_same_v<pixel_t, uint16_t>)
+            return vec_t().load_4us(srcp);
         else
-            return Vec4i().load_4us(srcp);
+            return vec_t().load(srcp);
     };
 
-    auto store_4u = [&](const Vec4f __result, void * dstp) noexcept {
-        const Vec4i _result = truncatei(__result + 0.5f);
-
+    auto store = [&](const Vec4f srcp, pixel_t * dstp) noexcept {
         if constexpr (std::is_same_v<pixel_t, uint8_t>) {
-            const auto result = compress_saturated_s2u(compress_saturated(_result, zero_si128()), zero_si128());
+            const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si128()), zero_si128());
             result.store_si32(dstp);
-        } else {
-            const auto result = compress_saturated_s2u(_result, zero_si128());
+        } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+            const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si128());
             min(result, data->peak).storel(dstp);
+        } else {
+            srcp.store_nt(dstp);
         }
     };
 
-    using var_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec4i, Vec4f>;
-
-    const var_t limit = std::any_cast<std::conditional_t<std::is_integral_v<pixel_t>, int, float>>(data->limit);
-
-    auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i,
+    auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i,
                          const Vec4f chromaOffset) noexcept {
         // Soft min and max.
         //  a b c             b
         //  d e f * 0.5  +  d e f * 0.5
         //  g h i             h
         // These are 2.0x bigger (factored out the extra multiply).
-        var_t mn = min(min(min(d, e), min(f, b)), h);
-        const var_t mn2 = min(min(min(mn, a), min(c, g)), i);
+        vec_t mn = min(min(min(d, e), min(f, b)), h);
+        const vec_t mn2 = min(min(min(mn, a), min(c, g)), i);
         mn += mn2;
 
-        var_t mx = max(max(max(d, e), max(f, b)), h);
-        const var_t mx2 = max(max(max(mx, a), max(c, g)), i);
+        vec_t mx = max(max(max(d, e), max(f, b)), h);
+        const vec_t mx2 = max(max(max(mx, a), max(c, g)), i);
         mx += mx2;
 
         if constexpr (std::is_floating_point_v<pixel_t>) {
@@ -77,128 +80,68 @@ void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const
 
             const Vec4f chromaOffset = plane ? 1.0f : 0.0f;
 
-            const int regularPart = (width - 1) & ~(Vec4i().size() - 1);
+            const int regularPart = (width - 1) & ~(vec_t().size() - 1);
 
             for (int y = 0; y < height; y++) {
                 const pixel_t * above = srcp + (y == 0 ? stride : -stride);
                 const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);
 
-                if constexpr (std::is_integral_v<pixel_t>) {
-                    {
-                        const Vec4i b = load_4u(above + 0);
-                        const Vec4i e = load_4u(srcp + 0);
-                        const Vec4i h = load_4u(below + 0);
-
-                        const Vec4i a = permute4<1, 0, 1, 2>(b);
-                        const Vec4i d = permute4<1, 0, 1, 2>(e);
-                        const Vec4i g = permute4<1, 0, 1, 2>(h);
-
-                        Vec4i c, f, i;
-                        if (width > Vec4i().size()) {
-                            c = load_4u(above + 1);
-                            f = load_4u(srcp + 1);
-                            i = load_4u(below + 1);
-                        } else {
-                            c = permute4<1, 2, 3, 2>(b);
-                            f = permute4<1, 2, 3, 2>(e);
-                            i = permute4<1, 2, 3, 2>(h);
-                        }
-
-                        const Vec4f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
-
-                        store_4u(result, dstp + 0);
-                    }
-
-                    for (int x = Vec4i().size(); x < regularPart; x += Vec4i().size()) {
-                        const Vec4f result = filtering(load_4u(above + x - 1), load_4u(above + x), load_4u(above + x + 1),
-                                                       load_4u(srcp + x - 1), load_4u(srcp + x), load_4u(srcp + x + 1),
-                                                       load_4u(below + x - 1), load_4u(below + x), load_4u(below + x + 1),
-                                                       chromaOffset);
-
-                        store_4u(result, dstp + x);
+                {
+                    const vec_t b = load(above + 0);
+                    const vec_t e = load(srcp + 0);
+                    const vec_t h = load(below + 0);
+
+                    const vec_t a = permute4<1, 0, 1, 2>(b);
+                    const vec_t d = permute4<1, 0, 1, 2>(e);
+                    const vec_t g = permute4<1, 0, 1, 2>(h);
+
+                    vec_t c, f, i;
+                    if (width > vec_t().size()) {
+                        c = load(above + 1);
+                        f = load(srcp + 1);
+                        i = load(below + 1);
+                    } else {
+                        c = permute4<1, 2, 3, 2>(b);
+                        f = permute4<1, 2, 3, 2>(e);
+                        i = permute4<1, 2, 3, 2>(h);
                     }
 
-                    if (regularPart >= Vec4i().size()) {
-                        const Vec4i a = load_4u(above + regularPart - 1);
-                        const Vec4i d = load_4u(srcp + regularPart - 1);
-                        const Vec4i g = load_4u(below + regularPart - 1);
+                    const Vec4f result = filtering(a, b, c,
+                                                   d, e, f,
+                                                   g, h, i,
+                                                   chromaOffset);
 
-                        const Vec4i b = load_4u(above + regularPart);
-                        const Vec4i e = load_4u(srcp + regularPart);
-                        const Vec4i h = load_4u(below + regularPart);
-
-                        const Vec4i c = permute4<1, 2, 3, 2>(b);
-                        const Vec4i f = permute4<1, 2, 3, 2>(e);
-                        const Vec4i i = permute4<1, 2, 3, 2>(h);
-
-                        const Vec4f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
-
-                        store_4u(result, dstp + regularPart);
-                    }
-                } else {
-                    {
-                        const Vec4f b = Vec4f().load_a(above + 0);
-                        const Vec4f e = Vec4f().load_a(srcp + 0);
-                        const Vec4f h = Vec4f().load_a(below + 0);
-
-                        const Vec4f a = permute4<1, 0, 1, 2>(b);
-                        const Vec4f d = permute4<1, 0, 1, 2>(e);
-                        const Vec4f g = permute4<1, 0, 1, 2>(h);
-
-                        Vec4f c, f, i;
-                        if (width > Vec4f().size()) {
-                            c = Vec4f().load(above + 1);
-                            f = Vec4f().load(srcp + 1);
-                            i = Vec4f().load(below + 1);
-                        } else {
-                            c = permute4<1, 2, 3, 2>(b);
-                            f = permute4<1, 2, 3, 2>(e);
-                            i = permute4<1, 2, 3, 2>(h);
-                        }
-
-                        const Vec4f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
-
-                        result.store_nt(dstp + 0);
-                    }
+                    store(result, dstp + 0);
+                }
 
-                    for (int x = Vec4f().size(); x < regularPart; x += Vec4f().size()) {
-                        const Vec4f result = filtering(Vec4f().load(above + x - 1), Vec4f().load_a(above + x), Vec4f().load(above + x + 1),
-                                                       Vec4f().load(srcp + x - 1), Vec4f().load_a(srcp + x), Vec4f().load(srcp + x + 1),
-                                                       Vec4f().load(below + x - 1), Vec4f().load_a(below + x), Vec4f().load(below + x + 1),
-                                                       chromaOffset);
+                for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) {
+                    const Vec4f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1),
+                                                   load(srcp + x - 1), load(srcp + x), load(srcp + x + 1),
+                                                   load(below + x - 1), load(below + x), load(below + x + 1),
+                                                   chromaOffset);
 
-                        result.store_nt(dstp + x);
-                    }
+                    store(result, dstp + x);
+                }
 
-                    if (regularPart >= Vec4f().size()) {
-                        const Vec4f a = Vec4f().load(above + regularPart - 1);
-                        const Vec4f d = Vec4f().load(srcp + regularPart - 1);
-                        const Vec4f g = Vec4f().load(below + regularPart - 1);
+                if (regularPart >= vec_t().size()) {
+                    const vec_t a = load(above + regularPart - 1);
+                    const vec_t d = load(srcp + regularPart - 1);
+                    const vec_t g = load(below + regularPart - 1);
 
-                        const Vec4f b = Vec4f().load_a(above + regularPart);
-                        const Vec4f e = Vec4f().load_a(srcp + regularPart);
-                        const Vec4f h = Vec4f().load_a(below + regularPart);
+                    const vec_t b = load(above + regularPart);
+                    const vec_t e = load(srcp + regularPart);
+                    const vec_t h = load(below + regularPart);
 
-                        const Vec4f c = permute4<1, 2, 3, 2>(b);
-                        const Vec4f f = permute4<1, 2, 3, 2>(e);
-                        const Vec4f i = permute4<1, 2, 3, 2>(h);
+                    const vec_t c = permute4<1, 2, 3, 2>(b);
+                    const vec_t f = permute4<1, 2, 3, 2>(e);
+                    const vec_t i = permute4<1, 2, 3, 2>(h);
 
-                        const Vec4f result = filtering(a, b, c,
-                                                       d, e, f,
-                                                       g, h, i,
-                                                       chromaOffset);
+                    const Vec4f result = filtering(a, b, c,
+                                                   d, e, f,
+                                                   g, h, i,
+                                                   chromaOffset);
 
-                        result.store_nt(dstp + regularPart);
-                    }
+                    store(result, dstp + regularPart);
                 }
 
                 srcp += stride;