From 450aa8ae00dbfb28141db7783fc9222fe4a36015 Mon Sep 17 00:00:00 2001
From: HolyWu
Date: Tue, 24 Apr 2018 18:52:22 +0800
Subject: [PATCH] Fix pixel overflow and add SSE4.1 code path

Integer paths (pp7Filter_c, pp7Filter_sse2, pp7Filter_sse4): the rounded
result is range-checked against d->peak before it is stored. Negative
results become 0 and results above d->peak become d->peak. Clamping to
d->peak, rather than to an all-ones value, keeps the stored sample in
range when peak = (1 << bitsPerSample) - 1 is smaller than the largest
value of the sample type, and it avoids right-shifting a negative value.
The float path is not clamped.

peak moves from a local in pp7Create into DeblockPP7Data so that the
filter functions can read it.

SSE4.1: DeblockPP7_SSE4.cpp is a new translation unit compiled with
-msse4.1. It is selected with opt=3, or when opt=0 and iset >= 5;
otherwise the existing SSE2 / C selection applies. dctA in the SSE2 and
SSE4.1 files is written with Vec4i / Vec4f loads and blend4i / blend4f
instead of the scalar loop.

--- DeblockPP7/DeblockPP7.cpp | 26 ++- DeblockPP7/DeblockPP7.hpp | 2 +- DeblockPP7/DeblockPP7.vcxproj | 1 + DeblockPP7/DeblockPP7.vcxproj.filters | 3 + DeblockPP7/DeblockPP7_SSE2.cpp | 100 ++++++++--- DeblockPP7/DeblockPP7_SSE4.cpp | 239 ++++++++++++++++++++++++++ Makefile.am | 7 + README.md | 1 + configure.ac | 2 +- 9 files changed, 346 insertions(+), 35 deletions(-) create mode 100644 DeblockPP7/DeblockPP7_SSE4.cpp diff --git a/DeblockPP7/DeblockPP7.cpp b/DeblockPP7/DeblockPP7.cpp index a1ba026..8e3354c 100644 --- a/DeblockPP7/DeblockPP7.cpp +++ b/DeblockPP7/DeblockPP7.cpp @@ -24,6 +24,7 @@ #ifdef VS_TARGET_CPU_X86 template extern void pp7Filter_sse2(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept; +template extern void pp7Filter_sse4(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept; #endif template @@ -132,8 +133,11 @@ static void pp7Filter_c(const VSFrameRef * src, VSFrameRef * dst, const DeblockP } } } + v = (v + (1 << 17)) >> 18; + if (static_cast(v) > d->peak) + v = (v < 0) ? 0 : d->peak; - dstp[srcStride * y + x] = static_cast((v + (1 << 17)) >> 18); + dstp[srcStride * y + x] = static_cast(v); } } } @@ -219,21 +223,27 @@ static void selectFunctions(const unsigned opt, DeblockPP7Data * d) noexcept { d->pp7Filter = pp7Filter_c; #ifdef VS_TARGET_CPU_X86 - if ((opt == 0 && iset >= 2) || opt == 2) + if ((opt == 0 && iset >= 5) || opt == 3) + d->pp7Filter = pp7Filter_sse4; + else if ((opt == 0 && iset >= 2) || opt == 2) d->pp7Filter = pp7Filter_sse2; #endif } else if (d->vi->format->bytesPerSample == 2) { d->pp7Filter = pp7Filter_c; #ifdef VS_TARGET_CPU_X86 - if ((opt == 0 && iset >= 2) || opt == 2) + if ((opt == 0 && iset >= 5) || opt == 3) + d->pp7Filter = pp7Filter_sse4; + else if ((opt == 0 
&& iset >= 2) || opt == 2) d->pp7Filter = pp7Filter_sse2; #endif } else { d->pp7Filter = pp7Filter_c; #ifdef VS_TARGET_CPU_X86 - if ((opt == 0 && iset >= 2) || opt == 2) + if ((opt == 0 && iset >= 5) || opt == 3) + d->pp7Filter = pp7Filter_sse4; + else if ((opt == 0 && iset >= 2) || opt == 2) d->pp7Filter = pp7Filter_sse2; #endif } @@ -330,8 +340,8 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore if (qp < 1 || qp > 63) throw std::string{ "qp must be between 1 and 63 (inclusive)" }; - if (opt < 0 || opt > 2) - throw std::string{ "opt must be 0, 1 or 2" }; + if (opt < 0 || opt > 3) + throw std::string{ "opt must be 0, 1, 2 or 3" }; if (padWidth || padHeight) { VSMap * args = vsapi->createMap(); @@ -361,7 +371,7 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads; d->buffer.reserve(numThreads); - const int peak = (d->vi->format->sampleType == stInteger) ? (1 << d->vi->format->bitsPerSample) - 1 : 255; + d->peak = (d->vi->format->sampleType == stInteger) ? (1 << d->vi->format->bitsPerSample) - 1 : 255; for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { const int width = d->vi->width >> (plane ? d->vi->format->subSamplingW : 0); @@ -369,7 +379,7 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore } for (int i = 0; i < 16; i++) - d->thresh[i] = static_cast((((i & 1) ? SN2 : SN0) * ((i & 4) ? SN2 : SN0) * qp * (1 << 2) - 1) * peak / 255); + d->thresh[i] = static_cast((((i & 1) ? SN2 : SN0) * ((i & 4) ? 
SN2 : SN0) * qp * (1 << 2) - 1) * d->peak / 255); } catch (const std::string & error) { vsapi->setError(out, ("DeblockPP7: " + error).c_str()); vsapi->freeNode(d->node); diff --git a/DeblockPP7/DeblockPP7.hpp b/DeblockPP7/DeblockPP7.hpp index 642b939..4e86b5e 100644 --- a/DeblockPP7/DeblockPP7.hpp +++ b/DeblockPP7/DeblockPP7.hpp @@ -24,7 +24,7 @@ struct DeblockPP7Data { const VSVideoInfo * vi; bool process[3]; int stride[3]; - unsigned thresh[16]; + unsigned thresh[16], peak; std::unordered_map buffer; const int16_t factor[16] = { N / (N0 * N0), N / (N0 * N1), N / (N0 * N0), N / (N0 * N2), diff --git a/DeblockPP7/DeblockPP7.vcxproj b/DeblockPP7/DeblockPP7.vcxproj index 5cd627b..8f1de58 100644 --- a/DeblockPP7/DeblockPP7.vcxproj +++ b/DeblockPP7/DeblockPP7.vcxproj @@ -86,6 +86,7 @@ + diff --git a/DeblockPP7/DeblockPP7.vcxproj.filters b/DeblockPP7/DeblockPP7.vcxproj.filters index d79354f..fdb3cdb 100644 --- a/DeblockPP7/DeblockPP7.vcxproj.filters +++ b/DeblockPP7/DeblockPP7.vcxproj.filters @@ -21,6 +21,9 @@ Source Files + + Source Files + Source Files diff --git a/DeblockPP7/DeblockPP7_SSE2.cpp b/DeblockPP7/DeblockPP7_SSE2.cpp index 40d0f3f..c42d9b4 100644 --- a/DeblockPP7/DeblockPP7_SSE2.cpp +++ b/DeblockPP7/DeblockPP7_SSE2.cpp @@ -1,26 +1,73 @@ #ifdef VS_TARGET_CPU_X86 #include "DeblockPP7.hpp" -template -static inline void dctA(const T * srcp, T * VS_RESTRICT dstp, const int stride) noexcept { - for (int i = 0; i < 4; i++) { - T s0 = (srcp[0 * stride] + srcp[6 * stride]) * scale; - T s1 = (srcp[1 * stride] + srcp[5 * stride]) * scale; - T s2 = (srcp[2 * stride] + srcp[4 * stride]) * scale; - T s3 = srcp[3 * stride] * scale; - T s = s3 + s3; - s3 = s - s0; - s0 = s + s0; - s = s2 + s1; - s2 = s2 - s1; - dstp[0] = s0 + s; - dstp[2] = s0 - s; - dstp[1] = 2 * s3 + s2; - dstp[3] = s3 - 2 * s2; - - srcp++; - dstp += 4; - } +template +static inline void dctA(const T * srcp, T * dstp, const int stride) noexcept; + +template<> +inline void dctA(const int * srcp, int * 
dstp, const int stride) noexcept { + Vec4i s0 = Vec4i().load(srcp + 0 * stride) + Vec4i().load(srcp + 6 * stride); + Vec4i s1 = Vec4i().load(srcp + 1 * stride) + Vec4i().load(srcp + 5 * stride); + Vec4i s2 = Vec4i().load(srcp + 2 * stride) + Vec4i().load(srcp + 4 * stride); + Vec4i s3 = Vec4i().load(srcp + 3 * stride); + Vec4i s = s3 + s3; + s3 = s - s0; + s0 = s + s0; + s = s2 + s1; + s2 = s2 - s1; + const Vec4i temp0 = s0 + s; + const Vec4i temp2 = s0 - s; + const Vec4i temp1 = 2 * s3 + s2; + const Vec4i temp3 = s3 - 2 * s2; + + Vec4i r1 = blend4i<0, 4, -256, -256>(temp0, temp1); + Vec4i r2 = blend4i<-256, -256, 0, 4>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 0 * 4); + + r1 = blend4i<1, 5, -256, -256>(temp0, temp1); + r2 = blend4i<-256, -256, 1, 5>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 1 * 4); + + r1 = blend4i<2, 6, -256, -256>(temp0, temp1); + r2 = blend4i<-256, -256, 2, 6>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 2 * 4); + + r1 = blend4i<3, 7, -256, -256>(temp0, temp1); + r2 = blend4i<-256, -256, 3, 7>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 3 * 4); +} + +template<> +inline void dctA(const float * srcp, float * dstp, const int stride) noexcept { + Vec4f s0 = (Vec4f().load(srcp + 0 * stride) + Vec4f().load(srcp + 6 * stride)) * 255.f; + Vec4f s1 = (Vec4f().load(srcp + 1 * stride) + Vec4f().load(srcp + 5 * stride)) * 255.f; + Vec4f s2 = (Vec4f().load(srcp + 2 * stride) + Vec4f().load(srcp + 4 * stride)) * 255.f; + Vec4f s3 = Vec4f().load(srcp + 3 * stride) * 255.f; + Vec4f s = s3 + s3; + s3 = s - s0; + s0 = s + s0; + s = s2 + s1; + s2 = s2 - s1; + const Vec4f temp0 = s0 + s; + const Vec4f temp2 = s0 - s; + const Vec4f temp1 = 2 * s3 + s2; + const Vec4f temp3 = s3 - 2 * s2; + + Vec4f r1 = blend4f<0, 4, -256, -256>(temp0, temp1); + Vec4f r2 = blend4f<-256, -256, 0, 4>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 0 * 4); + + r1 = blend4f<1, 5, -256, -256>(temp0, temp1); + 
r2 = blend4f<-256, -256, 1, 5>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 1 * 4); + + r1 = blend4f<2, 6, -256, -256>(temp0, temp1); + r2 = blend4f<-256, -256, 2, 6>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 2 * 4); + + r1 = blend4f<3, 7, -256, -256>(temp0, temp1); + r2 = blend4f<-256, -256, 3, 7>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 3 * 4); } template @@ -76,7 +123,7 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x; int * VS_RESTRICT tp = temp + 4 * x; - dctA(p_src + index, tp + 4 * 8, stride); + dctA(p_src + index, tp + 4 * 8, stride); } for (int x = 0; x < width; x++) { @@ -84,7 +131,7 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da int * VS_RESTRICT tp = temp + 4 * x; if (!(x & 3)) - dctA(p_src + index, tp + 4 * 8, stride); + dctA(p_src + index, tp + 4 * 8, stride); dctB(tp, block); int64_t v = static_cast(block[0]) * d->factor[0]; @@ -102,8 +149,11 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da } } } + v = (v + (1 << 17)) >> 18; + if (static_cast(v) > d->peak) + v = (v < 0) ? 0 : d->peak; - dstp[srcStride * y + x] = static_cast((v + (1 << 17)) >> 18); + dstp[srcStride * y + x] = static_cast(v); } } } @@ -149,7 +199,7 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const Deblo const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x; float * VS_RESTRICT tp = temp + 4 * x; - dctA(p_src + index, tp + 4 * 8, stride); + dctA(p_src + index, tp + 4 * 8, stride); } for (int x = 0; x < width; x++) { @@ -157,7 +207,7 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const Deblo float * VS_RESTRICT tp = temp + 4 * x; if (!(x & 3)) - dctA(p_src + index, tp + 4 * 8, stride); + dctA(p_src + index, tp + 4 * 8, stride); dctB(tp, block); float v = block[0] * d->factor[0]; diff --git a/DeblockPP7/DeblockPP7_SSE4.cpp 
b/DeblockPP7/DeblockPP7_SSE4.cpp new file mode 100644 index 0000000..8bd9df3 --- /dev/null +++ b/DeblockPP7/DeblockPP7_SSE4.cpp @@ -0,0 +1,239 @@ +#ifdef VS_TARGET_CPU_X86 +#ifndef __SSE4_1__ +#define __SSE4_1__ +#endif + +#include "DeblockPP7.hpp" + +template +static inline void dctA(const T * srcp, T * dstp, const int stride) noexcept; + +template<> +inline void dctA(const int * srcp, int * dstp, const int stride) noexcept { + Vec4i s0 = Vec4i().load(srcp + 0 * stride) + Vec4i().load(srcp + 6 * stride); + Vec4i s1 = Vec4i().load(srcp + 1 * stride) + Vec4i().load(srcp + 5 * stride); + Vec4i s2 = Vec4i().load(srcp + 2 * stride) + Vec4i().load(srcp + 4 * stride); + Vec4i s3 = Vec4i().load(srcp + 3 * stride); + Vec4i s = s3 + s3; + s3 = s - s0; + s0 = s + s0; + s = s2 + s1; + s2 = s2 - s1; + const Vec4i temp0 = s0 + s; + const Vec4i temp2 = s0 - s; + const Vec4i temp1 = 2 * s3 + s2; + const Vec4i temp3 = s3 - 2 * s2; + + Vec4i r1 = blend4i<0, 4, -256, -256>(temp0, temp1); + Vec4i r2 = blend4i<-256, -256, 0, 4>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 0 * 4); + + r1 = blend4i<1, 5, -256, -256>(temp0, temp1); + r2 = blend4i<-256, -256, 1, 5>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 1 * 4); + + r1 = blend4i<2, 6, -256, -256>(temp0, temp1); + r2 = blend4i<-256, -256, 2, 6>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 2 * 4); + + r1 = blend4i<3, 7, -256, -256>(temp0, temp1); + r2 = blend4i<-256, -256, 3, 7>(temp2, temp3); + blend4i<0, 1, 6, 7>(r1, r2).store_a(dstp + 3 * 4); +} + +template<> +inline void dctA(const float * srcp, float * dstp, const int stride) noexcept { + Vec4f s0 = (Vec4f().load(srcp + 0 * stride) + Vec4f().load(srcp + 6 * stride)) * 255.f; + Vec4f s1 = (Vec4f().load(srcp + 1 * stride) + Vec4f().load(srcp + 5 * stride)) * 255.f; + Vec4f s2 = (Vec4f().load(srcp + 2 * stride) + Vec4f().load(srcp + 4 * stride)) * 255.f; + Vec4f s3 = Vec4f().load(srcp + 3 * stride) * 255.f; + Vec4f s = s3 + s3; + s3 = s - 
s0; + s0 = s + s0; + s = s2 + s1; + s2 = s2 - s1; + const Vec4f temp0 = s0 + s; + const Vec4f temp2 = s0 - s; + const Vec4f temp1 = 2 * s3 + s2; + const Vec4f temp3 = s3 - 2 * s2; + + Vec4f r1 = blend4f<0, 4, -256, -256>(temp0, temp1); + Vec4f r2 = blend4f<-256, -256, 0, 4>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 0 * 4); + + r1 = blend4f<1, 5, -256, -256>(temp0, temp1); + r2 = blend4f<-256, -256, 1, 5>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 1 * 4); + + r1 = blend4f<2, 6, -256, -256>(temp0, temp1); + r2 = blend4f<-256, -256, 2, 6>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 2 * 4); + + r1 = blend4f<3, 7, -256, -256>(temp0, temp1); + r2 = blend4f<-256, -256, 3, 7>(temp2, temp3); + blend4f<0, 1, 6, 7>(r1, r2).store_a(dstp + 3 * 4); +} + +template +static inline void dctB(const T1 * srcp, T1 * dstp) noexcept { + T2 s0 = T2().load_a(srcp + 0 * 4) + T2().load_a(srcp + 6 * 4); + T2 s1 = T2().load_a(srcp + 1 * 4) + T2().load_a(srcp + 5 * 4); + T2 s2 = T2().load_a(srcp + 2 * 4) + T2().load_a(srcp + 4 * 4); + T2 s3 = T2().load_a(srcp + 3 * 4); + T2 s = s3 + s3; + s3 = s - s0; + s0 = s + s0; + s = s2 + s1; + s2 = s2 - s1; + (s0 + s).store_a(dstp + 0 * 4); + (s0 - s).store_a(dstp + 2 * 4); + (2 * s3 + s2).store_a(dstp + 1 * 4); + (s3 - 2 * s2).store_a(dstp + 3 * 4); +} + +template +void pp7Filter_sse4(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const auto threadId = std::this_thread::get_id(); + int * buffer = d->buffer.at(threadId); + + for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { + if (d->process[plane]) { + const int width = vsapi->getFrameWidth(src, plane); + const int height = vsapi->getFrameHeight(src, plane); + const int srcStride = vsapi->getStride(src, plane) / sizeof(T); + const int stride = d->stride[plane]; + const T * srcp = reinterpret_cast(vsapi->getReadPtr(src, plane)); + T * VS_RESTRICT dstp = 
reinterpret_cast(vsapi->getWritePtr(dst, plane)); + + int * VS_RESTRICT p_src = buffer + stride * 8; + int * VS_RESTRICT block = buffer; + int * VS_RESTRICT temp = buffer + 16; + + for (int y = 0; y < height; y++) { + const int index = stride * (8 + y) + 8; + std::copy_n(srcp + srcStride * y, width, p_src + index); + for (int x = 0; x < 8; x++) { + p_src[index - 1 - x] = p_src[index + x]; + p_src[index + width + x] = p_src[index + width - 1 - x]; + } + } + for (int y = 0; y < 8; y++) { + memcpy(p_src + stride * (7 - y), p_src + stride * (8 + y), stride * sizeof(int)); + memcpy(p_src + stride * (height + 8 + y), p_src + stride * (height + 7 - y), stride * sizeof(int)); + } + + for (int y = 0; y < height; y++) { + for (int x = -8; x < 0; x += 4) { + const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x; + int * VS_RESTRICT tp = temp + 4 * x; + + dctA(p_src + index, tp + 4 * 8, stride); + } + + for (int x = 0; x < width; x++) { + const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x; + int * VS_RESTRICT tp = temp + 4 * x; + + if (!(x & 3)) + dctA(p_src + index, tp + 4 * 8, stride); + dctB(tp, block); + + int64_t v = static_cast(block[0]) * d->factor[0]; + for (int i = 1; i < 16; i++) { + const unsigned threshold1 = d->thresh[i]; + const unsigned threshold2 = threshold1 * 2; + if (block[i] + threshold1 > threshold2) { + if (block[i] + threshold2 > threshold2 * 2) { + v += static_cast(block[i]) * d->factor[i]; + } else { + if (block[i] > 0) + v += 2LL * (block[i] - static_cast(threshold1)) * d->factor[i]; + else + v += 2LL * (block[i] + static_cast(threshold1)) * d->factor[i]; + } + } + } + v = (v + (1 << 17)) >> 18; + if (static_cast(v) > d->peak) + v = (v < 0) ? 0 : d->peak; + + dstp[srcStride * y + x] = static_cast(v); + } + } + } + } +} + +template void pp7Filter_sse4(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void pp7Filter_sse4(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const 
VS_RESTRICT, const VSAPI *) noexcept; + +template<> +void pp7Filter_sse4(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const auto threadId = std::this_thread::get_id(); + float * buffer = reinterpret_cast(d->buffer.at(threadId)); + + for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { + if (d->process[plane]) { + const int width = vsapi->getFrameWidth(src, plane); + const int height = vsapi->getFrameHeight(src, plane); + const int srcStride = vsapi->getStride(src, plane) / sizeof(float); + const int stride = d->stride[plane]; + const float * srcp = reinterpret_cast(vsapi->getReadPtr(src, plane)); + float * VS_RESTRICT dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + + float * VS_RESTRICT p_src = buffer + stride * 8; + float * VS_RESTRICT block = buffer; + float * VS_RESTRICT temp = buffer + 16; + + for (int y = 0; y < height; y++) { + const int index = stride * (8 + y) + 8; + std::copy_n(srcp + srcStride * y, width, p_src + index); + for (int x = 0; x < 8; x++) { + p_src[index - 1 - x] = p_src[index + x]; + p_src[index + width + x] = p_src[index + width - 1 - x]; + } + } + for (int y = 0; y < 8; y++) { + memcpy(p_src + stride * (7 - y), p_src + stride * (8 + y), stride * sizeof(float)); + memcpy(p_src + stride * (height + 8 + y), p_src + stride * (height + 7 - y), stride * sizeof(float)); + } + + for (int y = 0; y < height; y++) { + for (int x = -8; x < 0; x += 4) { + const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x; + float * VS_RESTRICT tp = temp + 4 * x; + + dctA(p_src + index, tp + 4 * 8, stride); + } + + for (int x = 0; x < width; x++) { + const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x; + float * VS_RESTRICT tp = temp + 4 * x; + + if (!(x & 3)) + dctA(p_src + index, tp + 4 * 8, stride); + dctB(tp, block); + + float v = block[0] * d->factor[0]; + for (int i = 1; i < 16; i++) { + const unsigned threshold1 = d->thresh[i]; + const unsigned 
threshold2 = threshold1 * 2; + if (static_cast(block[i]) + threshold1 > threshold2) { + if (static_cast(block[i]) + threshold2 > threshold2 * 2) { + v += block[i] * d->factor[i]; + } else { + if (block[i] > 0.f) + v += 2.f * (block[i] - threshold1) * d->factor[i]; + else + v += 2.f * (block[i] + threshold1) * d->factor[i]; + } + } + } + + dstp[srcStride * y + x] = v * (1.f / (1 << 18)) * (1.f / 255.f); + } + } + } + } +} +#endif diff --git a/Makefile.am b/Makefile.am index e1371fc..9498d9c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -16,6 +16,13 @@ libdeblockpp7_la_SOURCES += DeblockPP7/DeblockPP7_SSE2.cpp \ DeblockPP7/vectorclass/vectorclass.h \ DeblockPP7/vectorclass/vectorf128.h \ DeblockPP7/vectorclass/vectori128.h + +noinst_LTLIBRARIES = libsse4.la + +libsse4_la_SOURCES = DeblockPP7/DeblockPP7_SSE4.cpp +libsse4_la_CXXFLAGS = $(AM_CXXFLAGS) -msse4.1 + +libdeblockpp7_la_LIBADD = libsse4.la endif libdeblockpp7_la_LDFLAGS = -no-undefined -avoid-version $(PLUGINLDFLAGS) diff --git a/README.md b/README.md index 74a9376..2c4f438 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Usage * 0 = auto detect * 1 = use c * 2 = use sse2 + * 3 = use sse4.1 * planes: A list of the planes to process. By default all planes are processed. diff --git a/configure.ac b/configure.ac index b2a972e..0ed2525 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([DeblockPP7], [2], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/issues], [DeblockPP7], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/]) +AC_INIT([DeblockPP7], [3], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/issues], [DeblockPP7], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-DeblockPP7/]) : ${CXXFLAGS=""}