From 450aa8ae00dbfb28141db7783fc9222fe4a36015 Mon Sep 17 00:00:00 2001
From: HolyWu <holywu@gmail.com>
Date: Tue, 24 Apr 2018 18:52:22 +0800
Subject: [PATCH] Fix pixel overflow and add SSE4.1 code path

---
 DeblockPP7/DeblockPP7.cpp             |  26 ++-
 DeblockPP7/DeblockPP7.hpp             |   2 +-
 DeblockPP7/DeblockPP7.vcxproj         |   1 +
 DeblockPP7/DeblockPP7.vcxproj.filters |   3 +
 DeblockPP7/DeblockPP7_SSE2.cpp        | 100 ++++++++---
 DeblockPP7/DeblockPP7_SSE4.cpp        | 239 ++++++++++++++++++++++++++
 Makefile.am                           |   7 +
 README.md                             |   1 +
 configure.ac                          |   2 +-
 9 files changed, 346 insertions(+), 35 deletions(-)
 create mode 100644 DeblockPP7/DeblockPP7_SSE4.cpp
diff --git a/DeblockPP7/DeblockPP7.cpp b/DeblockPP7/DeblockPP7.cpp
index a1ba026..8e3354c 100644
--- a/DeblockPP7/DeblockPP7.cpp
+++ b/DeblockPP7/DeblockPP7.cpp
@@ -24,6 +24,7 @@
 
 #ifdef VS_TARGET_CPU_X86
 template<typename T> extern void pp7Filter_sse2(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept;
+template<typename T> extern void pp7Filter_sse4(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept;
 #endif
 
 template<typename T, int scale>
@@ -132,8 +133,11 @@ static void pp7Filter_c(const VSFrameRef * src, VSFrameRef * dst, const DeblockP
                             }
                         }
                     }
+                    v = (v + (1 << 17)) >> 18; // rounding right shift by 18 bits
+                    if (static_cast<unsigned>(v) > d->peak) // a negative v converts to a huge unsigned, so both ends are caught
+                        v = (v < 0) ? 0u : d->peak; // saturate explicitly; static_cast<T>(-1) exceeds peak for 9-15 bit samples
 
-                    dstp[srcStride * y + x] = static_cast<T>((v + (1 << 17)) >> 18);
+                    dstp[srcStride * y + x] = static_cast<T>(v);
                 }
             }
         }
@@ -219,21 +223,27 @@ static void selectFunctions(const unsigned opt, DeblockPP7Data * d) noexcept {
         d->pp7Filter = pp7Filter_c<uint8_t>;
 
 #ifdef VS_TARGET_CPU_X86
-        if ((opt == 0 && iset >= 2) || opt == 2)
+        if ((opt == 0 && iset >= 5) || opt == 3)
+            d->pp7Filter = pp7Filter_sse4<uint8_t>;
+        else if ((opt == 0 && iset >= 2) || opt == 2)
             d->pp7Filter = pp7Filter_sse2<uint8_t>;
 #endif
     } else if (d->vi->format->bytesPerSample == 2) {
         d->pp7Filter = pp7Filter_c<uint16_t>;
 
 #ifdef VS_TARGET_CPU_X86
-        if ((opt == 0 && iset >= 2) || opt == 2)
+        if ((opt == 0 && iset >= 5) || opt == 3)
+            d->pp7Filter = pp7Filter_sse4<uint16_t>;
+        else if ((opt == 0 && iset >= 2) || opt == 2)
             d->pp7Filter = pp7Filter_sse2<uint16_t>;
 #endif
     } else {
         d->pp7Filter = pp7Filter_c<float>;
 
 #ifdef VS_TARGET_CPU_X86
-        if ((opt == 0 && iset >= 2) || opt == 2)
+        if ((opt == 0 && iset >= 5) || opt == 3)
+            d->pp7Filter = pp7Filter_sse4<float>;
+        else if ((opt == 0 && iset >= 2) || opt == 2)
             d->pp7Filter = pp7Filter_sse2<float>;
 #endif
     }
@@ -330,8 +340,8 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore
         if (qp < 1 || qp > 63)
             throw std::string{ "qp must be between 1 and 63 (inclusive)" };
 
-        if (opt < 0 || opt > 2)
-            throw std::string{ "opt must be 0, 1 or 2" };
+        if (opt < 0 || opt > 3)
+            throw std::string{ "opt must be 0, 1, 2 or 3" };
 
         if (padWidth || padHeight) {
             VSMap * args = vsapi->createMap();
@@ -361,7 +371,7 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore
         const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
         d->buffer.reserve(numThreads);
 
-        const int peak = (d->vi->format->sampleType == stInteger) ? (1 << d->vi->format->bitsPerSample) - 1 : 255;
+        d->peak = (d->vi->format->sampleType == stInteger) ? (1 << d->vi->format->bitsPerSample) - 1 : 255;
 
         for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
             const int width = d->vi->width >> (plane ? d->vi->format->subSamplingW : 0);
@@ -369,7 +379,7 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore
         }
 
         for (int i = 0; i < 16; i++)
-            d->thresh[i] = static_cast<unsigned>((((i & 1) ? SN2 : SN0) * ((i & 4) ? SN2 : SN0) * qp * (1 << 2) - 1) * peak / 255);
+            d->thresh[i] = static_cast<unsigned>((((i & 1) ? SN2 : SN0) * ((i & 4) ? SN2 : SN0) * qp * (1 << 2) - 1) * d->peak / 255);
     } catch (const std::string & error) {
         vsapi->setError(out, ("DeblockPP7: " + error).c_str());
         vsapi->freeNode(d->node);
diff --git a/DeblockPP7/DeblockPP7.hpp b/DeblockPP7/DeblockPP7.hpp
index 642b939..4e86b5e 100644
--- a/DeblockPP7/DeblockPP7.hpp
+++ b/DeblockPP7/DeblockPP7.hpp
@@ -24,7 +24,7 @@ struct DeblockPP7Data {
     const VSVideoInfo * vi;
     bool process[3];
     int stride[3];
-    unsigned thresh[16];
+    unsigned thresh[16], peak;
     std::unordered_map<std::thread::id, int *> buffer;
     const int16_t factor[16] = {
         N / (N0 * N0), N / (N0 * N1), N / (N0 * N0), N / (N0 * N2),
diff --git a/DeblockPP7/DeblockPP7.vcxproj b/DeblockPP7/DeblockPP7.vcxproj
index 5cd627b..8f1de58 100644
--- a/DeblockPP7/DeblockPP7.vcxproj
+++ b/DeblockPP7/DeblockPP7.vcxproj
@@ -86,6 +86,7 @@
   <ItemGroup>
     <ClCompile Include="DeblockPP7.cpp" />
     <ClCompile Include="DeblockPP7_SSE2.cpp" />
+    <ClCompile Include="DeblockPP7_SSE4.cpp" />
     <ClCompile Include="vectorclass\instrset_detect.cpp" />
   </ItemGroup>
   <ItemGroup>
diff --git a/DeblockPP7/DeblockPP7.vcxproj.filters b/DeblockPP7/DeblockPP7.vcxproj.filters
index d79354f..fdb3cdb 100644
--- a/DeblockPP7/DeblockPP7.vcxproj.filters
+++ b/DeblockPP7/DeblockPP7.vcxproj.filters
@@ -21,6 +21,9 @@
     <ClCompile Include="DeblockPP7_SSE2.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="DeblockPP7_SSE4.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="vectorclass\instrset_detect.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/DeblockPP7/DeblockPP7_SSE2.cpp b/DeblockPP7/DeblockPP7_SSE2.cpp
index 40d0f3f..c42d9b4 100644
--- a/DeblockPP7/DeblockPP7_SSE2.cpp
+++ b/DeblockPP7/DeblockPP7_SSE2.cpp
@@ -1,26 +1,73 @@
 #ifdef VS_TARGET_CPU_X86
 #include "DeblockPP7.hpp"
 
-template<typename T, int scale>
-static inline void dctA(const T * srcp, T * VS_RESTRICT dstp, const int stride) noexcept {
-    for (int i = 0; i < 4; i++) {
-        T s0 = (srcp[0 * stride] + srcp[6 * stride]) * scale;
-        T s1 = (srcp[1 * stride] + srcp[5 * stride]) * scale;
-        T s2 = (srcp[2 * stride] + srcp[4 * stride]) * scale;
-        T s3 = srcp[3 * stride] * scale;
-        T s = s3 + s3;
-        s3 = s - s0;
-        s0 = s + s0;
-        s = s2 + s1;
-        s2 = s2 - s1;
-        dstp[0] = s0 + s;
-        dstp[2] = s0 - s;
-        dstp[1] = 2 * s3 + s2;
-        dstp[3] = s3 - 2 * s2;
-
-        srcp++;
-        dstp += 4;
-    }
+template<typename T>
+static inline void dctA(const T * srcp, T * dstp, const int stride) noexcept;
+// dctA (int): 7-row vertical pass over four adjacent columns; the four outputs of column k land at dstp[4k..4k+3].
+template<>
+inline void dctA(const int * srcp, int * dstp, const int stride) noexcept {
+    const Vec4i outer = Vec4i().load(srcp) + Vec4i().load(srcp + 6 * stride);
+    const Vec4i middle = Vec4i().load(srcp + stride) + Vec4i().load(srcp + 5 * stride);
+    const Vec4i inner = Vec4i().load(srcp + 2 * stride) + Vec4i().load(srcp + 4 * stride);
+    const Vec4i centre = Vec4i().load(srcp + 3 * stride);
+    const Vec4i centre2 = centre + centre;
+    const Vec4i diffOuter = centre2 - outer;
+    const Vec4i sumOuter = centre2 + outer;
+    const Vec4i sumInner = inner + middle;
+    const Vec4i diffInner = inner - middle;
+    const Vec4i out0 = sumOuter + sumInner;
+    const Vec4i out2 = sumOuter - sumInner;
+    const Vec4i out1 = 2 * diffOuter + diffInner;
+    const Vec4i out3 = diffOuter - 2 * diffInner;
+    // 4x4 transpose: lane k of out0..out3 is gathered into one vector per source column.
+    Vec4i lo = blend4i<0, 4, -256, -256>(out0, out1);
+    Vec4i hi = blend4i<-256, -256, 0, 4>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp);
+
+    lo = blend4i<1, 5, -256, -256>(out0, out1);
+    hi = blend4i<-256, -256, 1, 5>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp + 4);
+
+    lo = blend4i<2, 6, -256, -256>(out0, out1);
+    hi = blend4i<-256, -256, 2, 6>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp + 8);
+
+    lo = blend4i<3, 7, -256, -256>(out0, out1);
+    hi = blend4i<-256, -256, 3, 7>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp + 12);
+}
+// dctA (float): 7-row vertical pass over four adjacent columns, inputs scaled by 255; column k's outputs go to dstp[4k..4k+3].
+template<>
+inline void dctA(const float * srcp, float * dstp, const int stride) noexcept {
+    const Vec4f outer = (Vec4f().load(srcp) + Vec4f().load(srcp + 6 * stride)) * 255.f;
+    const Vec4f middle = (Vec4f().load(srcp + stride) + Vec4f().load(srcp + 5 * stride)) * 255.f;
+    const Vec4f inner = (Vec4f().load(srcp + 2 * stride) + Vec4f().load(srcp + 4 * stride)) * 255.f;
+    const Vec4f centre = Vec4f().load(srcp + 3 * stride) * 255.f;
+    const Vec4f centre2 = centre + centre;
+    const Vec4f diffOuter = centre2 - outer;
+    const Vec4f sumOuter = centre2 + outer;
+    const Vec4f sumInner = inner + middle;
+    const Vec4f diffInner = inner - middle;
+    const Vec4f out0 = sumOuter + sumInner;
+    const Vec4f out2 = sumOuter - sumInner;
+    const Vec4f out1 = 2 * diffOuter + diffInner;
+    const Vec4f out3 = diffOuter - 2 * diffInner;
+    // 4x4 transpose: lane k of out0..out3 is gathered into one vector per source column.
+    Vec4f lo = blend4f<0, 4, -256, -256>(out0, out1);
+    Vec4f hi = blend4f<-256, -256, 0, 4>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp);
+
+    lo = blend4f<1, 5, -256, -256>(out0, out1);
+    hi = blend4f<-256, -256, 1, 5>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp + 4);
+
+    lo = blend4f<2, 6, -256, -256>(out0, out1);
+    hi = blend4f<-256, -256, 2, 6>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp + 8);
+
+    lo = blend4f<3, 7, -256, -256>(out0, out1);
+    hi = blend4f<-256, -256, 3, 7>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp + 12);
 }
 
 template<typename T1, typename T2>
@@ -76,7 +123,7 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da
                     const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
                     int * VS_RESTRICT tp = temp + 4 * x;
 
-                    dctA<int, 1>(p_src + index, tp + 4 * 8, stride);
+                    dctA(p_src + index, tp + 4 * 8, stride);
                 }
 
                 for (int x = 0; x < width; x++) {
@@ -84,7 +131,7 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da
                     int * VS_RESTRICT tp = temp + 4 * x;
 
                     if (!(x & 3))
-                        dctA<int, 1>(p_src + index, tp + 4 * 8, stride);
+                        dctA(p_src + index, tp + 4 * 8, stride);
                     dctB<int, Vec4i>(tp, block);
 
                     int64_t v = static_cast<int64_t>(block[0]) * d->factor[0];
@@ -102,8 +149,11 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da
                             }
                         }
                     }
+                    v = (v + (1 << 17)) >> 18; // rounding right shift by 18 bits
+                    if (static_cast<unsigned>(v) > d->peak) // a negative v converts to a huge unsigned, so both ends are caught
+                        v = (v < 0) ? 0u : d->peak; // saturate explicitly; static_cast<T>(-1) exceeds peak for 9-15 bit samples
 
-                    dstp[srcStride * y + x] = static_cast<T>((v + (1 << 17)) >> 18);
+                    dstp[srcStride * y + x] = static_cast<T>(v);
                 }
             }
         }
@@ -149,7 +199,7 @@ void pp7Filter_sse2<float>(const VSFrameRef * src, VSFrameRef * dst, const Deblo
                     const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
                     float * VS_RESTRICT tp = temp + 4 * x;
 
-                    dctA<float, 255>(p_src + index, tp + 4 * 8, stride);
+                    dctA(p_src + index, tp + 4 * 8, stride);
                 }
 
                 for (int x = 0; x < width; x++) {
@@ -157,7 +207,7 @@ void pp7Filter_sse2<float>(const VSFrameRef * src, VSFrameRef * dst, const Deblo
                     float * VS_RESTRICT tp = temp + 4 * x;
 
                     if (!(x & 3))
-                        dctA<float, 255>(p_src + index, tp + 4 * 8, stride);
+                        dctA(p_src + index, tp + 4 * 8, stride);
                     dctB<float, Vec4f>(tp, block);
 
                     float v = block[0] * d->factor[0];
diff --git a/DeblockPP7/DeblockPP7_SSE4.cpp b/DeblockPP7/DeblockPP7_SSE4.cpp
new file mode 100644
index 0000000..8bd9df3
--- /dev/null
+++ b/DeblockPP7/DeblockPP7_SSE4.cpp
@@ -0,0 +1,239 @@
+#ifdef VS_TARGET_CPU_X86
+#ifndef __SSE4_1__
+#define __SSE4_1__
+#endif
+
+#include "DeblockPP7.hpp"
+
+template<typename T>
+static inline void dctA(const T * srcp, T * dstp, const int stride) noexcept;
+// dctA (int): 7-row vertical pass over four adjacent columns; the four outputs of column k land at dstp[4k..4k+3].
+template<>
+inline void dctA(const int * srcp, int * dstp, const int stride) noexcept {
+    const Vec4i outer = Vec4i().load(srcp) + Vec4i().load(srcp + 6 * stride);
+    const Vec4i middle = Vec4i().load(srcp + stride) + Vec4i().load(srcp + 5 * stride);
+    const Vec4i inner = Vec4i().load(srcp + 2 * stride) + Vec4i().load(srcp + 4 * stride);
+    const Vec4i centre = Vec4i().load(srcp + 3 * stride);
+    const Vec4i centre2 = centre + centre;
+    const Vec4i diffOuter = centre2 - outer;
+    const Vec4i sumOuter = centre2 + outer;
+    const Vec4i sumInner = inner + middle;
+    const Vec4i diffInner = inner - middle;
+    const Vec4i out0 = sumOuter + sumInner;
+    const Vec4i out2 = sumOuter - sumInner;
+    const Vec4i out1 = 2 * diffOuter + diffInner;
+    const Vec4i out3 = diffOuter - 2 * diffInner;
+    // 4x4 transpose: lane k of out0..out3 is gathered into one vector per source column.
+    Vec4i lo = blend4i<0, 4, -256, -256>(out0, out1);
+    Vec4i hi = blend4i<-256, -256, 0, 4>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp);
+
+    lo = blend4i<1, 5, -256, -256>(out0, out1);
+    hi = blend4i<-256, -256, 1, 5>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp + 4);
+
+    lo = blend4i<2, 6, -256, -256>(out0, out1);
+    hi = blend4i<-256, -256, 2, 6>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp + 8);
+
+    lo = blend4i<3, 7, -256, -256>(out0, out1);
+    hi = blend4i<-256, -256, 3, 7>(out2, out3);
+    blend4i<0, 1, 6, 7>(lo, hi).store_a(dstp + 12);
+}
+// dctA (float): 7-row vertical pass over four adjacent columns, inputs scaled by 255; column k's outputs go to dstp[4k..4k+3].
+template<>
+inline void dctA(const float * srcp, float * dstp, const int stride) noexcept {
+    const Vec4f outer = (Vec4f().load(srcp) + Vec4f().load(srcp + 6 * stride)) * 255.f;
+    const Vec4f middle = (Vec4f().load(srcp + stride) + Vec4f().load(srcp + 5 * stride)) * 255.f;
+    const Vec4f inner = (Vec4f().load(srcp + 2 * stride) + Vec4f().load(srcp + 4 * stride)) * 255.f;
+    const Vec4f centre = Vec4f().load(srcp + 3 * stride) * 255.f;
+    const Vec4f centre2 = centre + centre;
+    const Vec4f diffOuter = centre2 - outer;
+    const Vec4f sumOuter = centre2 + outer;
+    const Vec4f sumInner = inner + middle;
+    const Vec4f diffInner = inner - middle;
+    const Vec4f out0 = sumOuter + sumInner;
+    const Vec4f out2 = sumOuter - sumInner;
+    const Vec4f out1 = 2 * diffOuter + diffInner;
+    const Vec4f out3 = diffOuter - 2 * diffInner;
+    // 4x4 transpose: lane k of out0..out3 is gathered into one vector per source column.
+    Vec4f lo = blend4f<0, 4, -256, -256>(out0, out1);
+    Vec4f hi = blend4f<-256, -256, 0, 4>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp);
+
+    lo = blend4f<1, 5, -256, -256>(out0, out1);
+    hi = blend4f<-256, -256, 1, 5>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp + 4);
+
+    lo = blend4f<2, 6, -256, -256>(out0, out1);
+    hi = blend4f<-256, -256, 2, 6>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp + 8);
+
+    lo = blend4f<3, 7, -256, -256>(out0, out1);
+    hi = blend4f<-256, -256, 3, 7>(out2, out3);
+    blend4f<0, 1, 6, 7>(lo, hi).store_a(dstp + 12);
+}
+// dctB: second pass over seven consecutive 4-element groups of srcp (aligned loads); stores 16 values to dstp (aligned).
+template<typename T1, typename T2>
+static inline void dctB(const T1 * srcp, T1 * dstp) noexcept {
+    const T2 outer = T2().load_a(srcp) + T2().load_a(srcp + 6 * 4);
+    const T2 middle = T2().load_a(srcp + 1 * 4) + T2().load_a(srcp + 5 * 4);
+    const T2 inner = T2().load_a(srcp + 2 * 4) + T2().load_a(srcp + 4 * 4);
+    const T2 centre = T2().load_a(srcp + 3 * 4);
+    const T2 centre2 = centre + centre;
+    const T2 diffOuter = centre2 - outer;
+    const T2 sumOuter = centre2 + outer;
+    const T2 sumInner = inner + middle;
+    const T2 diffInner = inner - middle;
+    (sumOuter + sumInner).store_a(dstp);
+    (sumOuter - sumInner).store_a(dstp + 8);
+    (2 * diffOuter + diffInner).store_a(dstp + 4);
+    (diffOuter - 2 * diffInner).store_a(dstp + 12);
+}
+// Integer sample path (instantiated for uint8_t and uint16_t): filters every enabled plane of src into dst using the calling thread's scratch buffer.
+template<typename T>
+void pp7Filter_sse4(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+    const auto threadId = std::this_thread::get_id();
+    int * buffer = d->buffer.at(threadId); // scratch memory is looked up by thread id
+
+    for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
+        if (d->process[plane]) {
+            const int width = vsapi->getFrameWidth(src, plane);
+            const int height = vsapi->getFrameHeight(src, plane);
+            const int srcStride = vsapi->getStride(src, plane) / sizeof(T); // frame stride in samples, not bytes
+            const int stride = d->stride[plane]; // row pitch of the padded int copy
+            const T * srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(src, plane));
+            T * VS_RESTRICT dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane));
+
+            int * VS_RESTRICT p_src = buffer + stride * 8; // padded copy of the plane
+            int * VS_RESTRICT block = buffer; // 16 coefficients of the current pixel
+            int * VS_RESTRICT temp = buffer + 16; // dctA results for the current row
+
+            for (int y = 0; y < height; y++) { // widen each row to int and mirror 8 samples past both row ends
+                const int index = stride * (8 + y) + 8;
+                std::copy_n(srcp + srcStride * y, width, p_src + index);
+                for (int x = 0; x < 8; x++) {
+                    p_src[index - 1 - x] = p_src[index + x];
+                    p_src[index + width + x] = p_src[index + width - 1 - x];
+                }
+            }
+            for (int y = 0; y < 8; y++) { // mirror 8 rows above the first row and below the last row
+                memcpy(p_src + stride * (7 - y), p_src + stride * (8 + y), stride * sizeof(int));
+                memcpy(p_src + stride * (height + 8 + y), p_src + stride * (height + 7 - y), stride * sizeof(int));
+            }
+
+            for (int y = 0; y < height; y++) {
+                for (int x = -8; x < 0; x += 4) { // prime temp with the first eight column results of this row
+                    const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
+                    int * VS_RESTRICT tp = temp + 4 * x;
+
+                    dctA(p_src + index, tp + 4 * 8, stride);
+                }
+
+                for (int x = 0; x < width; x++) {
+                    const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
+                    int * VS_RESTRICT tp = temp + 4 * x;
+
+                    if (!(x & 3)) // dctA yields four columns per call, so run it on every fourth x only
+                        dctA(p_src + index, tp + 4 * 8, stride);
+                    dctB<int, Vec4i>(tp, block);
+
+                    int64_t v = static_cast<int64_t>(block[0]) * d->factor[0]; // coefficient 0 is never thresholded
+                    for (int i = 1; i < 16; i++) {
+                        const unsigned threshold1 = d->thresh[i];
+                        const unsigned threshold2 = threshold1 * 2;
+                        if (block[i] + threshold1 > threshold2) { // unsigned wrap-around test: |block[i]| > threshold1
+                            if (block[i] + threshold2 > threshold2 * 2) { // |block[i]| > threshold2: keep unchanged
+                                v += static_cast<int64_t>(block[i]) * d->factor[i];
+                            } else { // in between: pull the magnitude towards zero by threshold1, then double it
+                                if (block[i] > 0)
+                                    v += 2LL * (block[i] - static_cast<int>(threshold1)) * d->factor[i];
+                                else
+                                    v += 2LL * (block[i] + static_cast<int>(threshold1)) * d->factor[i];
+                            }
+                        }
+                    }
+                    v = (v + (1 << 17)) >> 18; // rounding right shift by 18 bits
+                    if (static_cast<unsigned>(v) > d->peak) // a negative v converts to a huge unsigned, so both ends are caught
+                        v = (v < 0) ? 0u : d->peak; // saturate explicitly; static_cast<T>(-1) exceeds peak for 9-15 bit samples
+
+                    dstp[srcStride * y + x] = static_cast<T>(v);
+                }
+            }
+        }
+    }
+}
+
+template void pp7Filter_sse4<uint8_t>(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept;
+template void pp7Filter_sse4<uint16_t>(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept;
+// Float sample path: same per-plane flow as the integer template; samples are scaled by 255 in dctA and scaled back on output.
+template<>
+void pp7Filter_sse4<float>(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+    const auto threadId = std::this_thread::get_id();
+    float * buffer = reinterpret_cast<float *>(d->buffer.at(threadId)); // the int scratch buffer is reused as float storage
+
+    for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
+        if (d->process[plane]) {
+            const int width = vsapi->getFrameWidth(src, plane);
+            const int height = vsapi->getFrameHeight(src, plane);
+            const int srcStride = vsapi->getStride(src, plane) / sizeof(float); // frame stride in samples, not bytes
+            const int stride = d->stride[plane]; // row pitch of the padded copy
+            const float * srcp = reinterpret_cast<const float *>(vsapi->getReadPtr(src, plane));
+            float * VS_RESTRICT dstp = reinterpret_cast<float *>(vsapi->getWritePtr(dst, plane));
+
+            float * VS_RESTRICT p_src = buffer + stride * 8; // padded copy of the plane
+            float * VS_RESTRICT block = buffer; // 16 coefficients of the current pixel
+            float * VS_RESTRICT temp = buffer + 16; // dctA results for the current row
+
+            for (int y = 0; y < height; y++) { // copy each row and mirror 8 samples past both row ends
+                const int index = stride * (8 + y) + 8;
+                std::copy_n(srcp + srcStride * y, width, p_src + index);
+                for (int x = 0; x < 8; x++) {
+                    p_src[index - 1 - x] = p_src[index + x];
+                    p_src[index + width + x] = p_src[index + width - 1 - x];
+                }
+            }
+            for (int y = 0; y < 8; y++) { // mirror 8 rows above the first row and below the last row
+                memcpy(p_src + stride * (7 - y), p_src + stride * (8 + y), stride * sizeof(float));
+                memcpy(p_src + stride * (height + 8 + y), p_src + stride * (height + 7 - y), stride * sizeof(float));
+            }
+
+            for (int y = 0; y < height; y++) {
+                for (int x = -8; x < 0; x += 4) { // prime temp with the first eight column results of this row
+                    const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
+                    float * VS_RESTRICT tp = temp + 4 * x;
+
+                    dctA(p_src + index, tp + 4 * 8, stride);
+                }
+
+                for (int x = 0; x < width; x++) {
+                    const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
+                    float * VS_RESTRICT tp = temp + 4 * x;
+
+                    if (!(x & 3)) // dctA yields four columns per call, so run it on every fourth x only
+                        dctA(p_src + index, tp + 4 * 8, stride);
+                    dctB<float, Vec4f>(tp, block);
+
+                    float v = block[0] * d->factor[0]; // coefficient 0 is never thresholded
+                    for (int i = 1; i < 16; i++) {
+                        const unsigned threshold1 = d->thresh[i];
+                        const unsigned threshold2 = threshold1 * 2;
+                        if (static_cast<unsigned>(static_cast<int>(block[i])) + threshold1 > threshold2) { // via int: negative float -> unsigned is UB
+                            if (static_cast<unsigned>(static_cast<int>(block[i])) + threshold2 > threshold2 * 2) { // truncated magnitude > threshold2
+                                v += block[i] * d->factor[i];
+                            } else { // in between: pull the magnitude towards zero by threshold1, then double it
+                                if (block[i] > 0.f)
+                                    v += 2.f * (block[i] - threshold1) * d->factor[i];
+                                else
+                                    v += 2.f * (block[i] + threshold1) * d->factor[i];
+                            }
+                        }
+                    }
+
+                    dstp[srcStride * y + x] = v * (1.f / (1 << 18)) * (1.f / 255.f); // undo the 2^18 factor scale and the 255 input scale
+                }
+            }
+        }
+    }
+}
+#endif
diff --git a/Makefile.am b/Makefile.am
index e1371fc..9498d9c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,6 +16,13 @@ libdeblockpp7_la_SOURCES += DeblockPP7/DeblockPP7_SSE2.cpp \
                             DeblockPP7/vectorclass/vectorclass.h \
                             DeblockPP7/vectorclass/vectorf128.h \
                             DeblockPP7/vectorclass/vectori128.h
+
+noinst_LTLIBRARIES = libsse4.la
+
+libsse4_la_SOURCES = DeblockPP7/DeblockPP7_SSE4.cpp
+libsse4_la_CXXFLAGS = $(AM_CXXFLAGS) -msse4.1
+
+libdeblockpp7_la_LIBADD = libsse4.la
 endif
 
 libdeblockpp7_la_LDFLAGS = -no-undefined -avoid-version $(PLUGINLDFLAGS)
diff --git a/README.md b/README.md
index 74a9376..2c4f438 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ Usage
   * 0 = auto detect
   * 1 = use c
   * 2 = use sse2
+  * 3 = use sse4.1
 
 * planes: A list of the planes to process. By default all planes are processed.
 
diff --git a/configure.ac b/configure.ac
index b2a972e..0ed2525 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([DeblockPP7], [2], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/issues], [DeblockPP7], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/])
+AC_INIT([DeblockPP7], [3], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/issues], [DeblockPP7], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/])
 
 : ${CXXFLAGS=""}