Skip to content

Commit

Permalink
Fix pixel overflow and add SSE4.1 code path
Browse files Browse the repository at this point in the history
  • Loading branch information
HolyWu committed Apr 24, 2018
1 parent e75e1a4 commit 450aa8a
Show file tree
Hide file tree
Showing 9 changed files with 346 additions and 35 deletions.
26 changes: 18 additions & 8 deletions DeblockPP7/DeblockPP7.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#ifdef VS_TARGET_CPU_X86
template<typename T> extern void pp7Filter_sse2(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept;
template<typename T> extern void pp7Filter_sse4(const VSFrameRef *, VSFrameRef *, const DeblockPP7Data * const VS_RESTRICT, const VSAPI *) noexcept;
#endif

template<typename T, int scale>
Expand Down Expand Up @@ -132,8 +133,11 @@ static void pp7Filter_c(const VSFrameRef * src, VSFrameRef * dst, const DeblockP
}
}
}
v = (v + (1 << 17)) >> 18;
if (static_cast<unsigned>(v) > d->peak)
v = -v >> 63;

dstp[srcStride * y + x] = static_cast<T>((v + (1 << 17)) >> 18);
dstp[srcStride * y + x] = static_cast<T>(v);
}
}
}
Expand Down Expand Up @@ -219,21 +223,27 @@ static void selectFunctions(const unsigned opt, DeblockPP7Data * d) noexcept {
d->pp7Filter = pp7Filter_c<uint8_t>;

#ifdef VS_TARGET_CPU_X86
if ((opt == 0 && iset >= 2) || opt == 2)
if ((opt == 0 && iset >= 5) || opt == 3)
d->pp7Filter = pp7Filter_sse4<uint8_t>;
else if ((opt == 0 && iset >= 2) || opt == 2)
d->pp7Filter = pp7Filter_sse2<uint8_t>;
#endif
} else if (d->vi->format->bytesPerSample == 2) {
d->pp7Filter = pp7Filter_c<uint16_t>;

#ifdef VS_TARGET_CPU_X86
if ((opt == 0 && iset >= 2) || opt == 2)
if ((opt == 0 && iset >= 5) || opt == 3)
d->pp7Filter = pp7Filter_sse4<uint16_t>;
else if ((opt == 0 && iset >= 2) || opt == 2)
d->pp7Filter = pp7Filter_sse2<uint16_t>;
#endif
} else {
d->pp7Filter = pp7Filter_c<float>;

#ifdef VS_TARGET_CPU_X86
if ((opt == 0 && iset >= 2) || opt == 2)
if ((opt == 0 && iset >= 5) || opt == 3)
d->pp7Filter = pp7Filter_sse4<float>;
else if ((opt == 0 && iset >= 2) || opt == 2)
d->pp7Filter = pp7Filter_sse2<float>;
#endif
}
Expand Down Expand Up @@ -330,8 +340,8 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore
if (qp < 1 || qp > 63)
throw std::string{ "qp must be between 1 and 63 (inclusive)" };

if (opt < 0 || opt > 2)
throw std::string{ "opt must be 0, 1 or 2" };
if (opt < 0 || opt > 3)
throw std::string{ "opt must be 0, 1, 2 or 3" };

if (padWidth || padHeight) {
VSMap * args = vsapi->createMap();
Expand Down Expand Up @@ -361,15 +371,15 @@ static void VS_CC pp7Create(const VSMap *in, VSMap *out, void *userData, VSCore
const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
d->buffer.reserve(numThreads);

const int peak = (d->vi->format->sampleType == stInteger) ? (1 << d->vi->format->bitsPerSample) - 1 : 255;
d->peak = (d->vi->format->sampleType == stInteger) ? (1 << d->vi->format->bitsPerSample) - 1 : 255;

for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
const int width = d->vi->width >> (plane ? d->vi->format->subSamplingW : 0);
d->stride[plane] = (width + 16 + 15) & ~15;
}

for (int i = 0; i < 16; i++)
d->thresh[i] = static_cast<unsigned>((((i & 1) ? SN2 : SN0) * ((i & 4) ? SN2 : SN0) * qp * (1 << 2) - 1) * peak / 255);
d->thresh[i] = static_cast<unsigned>((((i & 1) ? SN2 : SN0) * ((i & 4) ? SN2 : SN0) * qp * (1 << 2) - 1) * d->peak / 255);
} catch (const std::string & error) {
vsapi->setError(out, ("DeblockPP7: " + error).c_str());
vsapi->freeNode(d->node);
Expand Down
2 changes: 1 addition & 1 deletion DeblockPP7/DeblockPP7.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct DeblockPP7Data {
const VSVideoInfo * vi;
bool process[3];
int stride[3];
unsigned thresh[16];
unsigned thresh[16], peak;
std::unordered_map<std::thread::id, int *> buffer;
const int16_t factor[16] = {
N / (N0 * N0), N / (N0 * N1), N / (N0 * N0), N / (N0 * N2),
Expand Down
1 change: 1 addition & 0 deletions DeblockPP7/DeblockPP7.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
<ItemGroup>
<ClCompile Include="DeblockPP7.cpp" />
<ClCompile Include="DeblockPP7_SSE2.cpp" />
<ClCompile Include="DeblockPP7_SSE4.cpp" />
<ClCompile Include="vectorclass\instrset_detect.cpp" />
</ItemGroup>
<ItemGroup>
Expand Down
3 changes: 3 additions & 0 deletions DeblockPP7/DeblockPP7.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
<ClCompile Include="DeblockPP7_SSE2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="DeblockPP7_SSE4.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="vectorclass\instrset_detect.cpp">
<Filter>Source Files</Filter>
</ClCompile>
Expand Down
100 changes: 75 additions & 25 deletions DeblockPP7/DeblockPP7_SSE2.cpp
Original file line number Diff line number Diff line change
@@ -1,26 +1,73 @@
#ifdef VS_TARGET_CPU_X86
#include "DeblockPP7.hpp"

template<typename T, int scale>
static inline void dctA(const T * srcp, T * VS_RESTRICT dstp, const int stride) noexcept {
for (int i = 0; i < 4; i++) {
T s0 = (srcp[0 * stride] + srcp[6 * stride]) * scale;
T s1 = (srcp[1 * stride] + srcp[5 * stride]) * scale;
T s2 = (srcp[2 * stride] + srcp[4 * stride]) * scale;
T s3 = srcp[3 * stride] * scale;
T s = s3 + s3;
s3 = s - s0;
s0 = s + s0;
s = s2 + s1;
s2 = s2 - s1;
dstp[0] = s0 + s;
dstp[2] = s0 - s;
dstp[1] = 2 * s3 + s2;
dstp[3] = s3 - 2 * s2;

srcp++;
dstp += 4;
}
// Primary template for dctA: declared here but intentionally left undefined.
// The implementations are supplied by the explicit specializations for int
// and float that follow, so only those two element types can be used.
template<typename T>
static inline void dctA(const T * srcp, T * dstp, const int stride) noexcept;

// Integer specialization of dctA.
//
// Reads seven 4-element vectors starting at srcp, spaced `stride` elements
// apart (unaligned loads), combines them symmetrically around the fourth
// vector, and produces four coefficient vectors. The result is written to
// dstp transposed: for each lane i (0..3), dstp[4 * i + k] receives lane i
// of coefficient k. The stores are aligned stores, so dstp is expected to be
// 16-byte aligned -- NOTE(review): confirm against the caller's buffer.
template<>
inline void dctA(const int * srcp, int * dstp, const int stride) noexcept {
    // Symmetric pair sums around the centre vector.
    const Vec4i pair06 = Vec4i().load(srcp) + Vec4i().load(srcp + stride * 6);
    const Vec4i pair15 = Vec4i().load(srcp + stride) + Vec4i().load(srcp + stride * 5);
    const Vec4i pair24 = Vec4i().load(srcp + stride * 2) + Vec4i().load(srcp + stride * 4);
    const Vec4i centre = Vec4i().load(srcp + stride * 3);

    // Butterfly stage.
    const Vec4i centre2 = centre + centre;
    const Vec4i diffOuter = centre2 - pair06;
    const Vec4i sumOuter = centre2 + pair06;
    const Vec4i sumInner = pair24 + pair15;
    const Vec4i diffInner = pair24 - pair15;

    // The four output coefficients, still one lane per input column.
    const Vec4i coef0 = sumOuter + sumInner;
    const Vec4i coef1 = 2 * diffOuter + diffInner;
    const Vec4i coef2 = sumOuter - sumInner;
    const Vec4i coef3 = diffOuter - 2 * diffInner;

    // 4x4 transpose: first interleave coefficient pairs lane by lane ...
    const Vec4i c01Low = blend4i<0, 4, 1, 5>(coef0, coef1);   // c0[0] c1[0] c0[1] c1[1]
    const Vec4i c01High = blend4i<2, 6, 3, 7>(coef0, coef1);  // c0[2] c1[2] c0[3] c1[3]
    const Vec4i c23Low = blend4i<0, 4, 1, 5>(coef2, coef3);   // c2[0] c3[0] c2[1] c3[1]
    const Vec4i c23High = blend4i<2, 6, 3, 7>(coef2, coef3);  // c2[2] c3[2] c2[3] c3[3]

    // ... then join the halves so each store holds {c0, c1, c2, c3} of one lane.
    blend4i<0, 1, 4, 5>(c01Low, c23Low).store_a(dstp);
    blend4i<2, 3, 6, 7>(c01Low, c23Low).store_a(dstp + 4);
    blend4i<0, 1, 4, 5>(c01High, c23High).store_a(dstp + 8);
    blend4i<2, 3, 6, 7>(c01High, c23High).store_a(dstp + 12);
}

// Float specialization of dctA.
//
// Same data flow as the integer specialization, except that every loaded
// value is multiplied by 255 before the butterfly stage. Reads seven
// 4-element vectors starting at srcp, spaced `stride` elements apart
// (unaligned loads), and writes the four coefficient vectors to dstp
// transposed: for each lane i (0..3), dstp[4 * i + k] receives lane i of
// coefficient k. The stores are aligned stores, so dstp is expected to be
// 16-byte aligned -- NOTE(review): confirm against the caller's buffer.
template<>
inline void dctA(const float * srcp, float * dstp, const int stride) noexcept {
    // Symmetric pair sums around the centre vector, scaled by 255.
    const Vec4f pair06 = (Vec4f().load(srcp) + Vec4f().load(srcp + stride * 6)) * 255.f;
    const Vec4f pair15 = (Vec4f().load(srcp + stride) + Vec4f().load(srcp + stride * 5)) * 255.f;
    const Vec4f pair24 = (Vec4f().load(srcp + stride * 2) + Vec4f().load(srcp + stride * 4)) * 255.f;
    const Vec4f centre = Vec4f().load(srcp + stride * 3) * 255.f;

    // Butterfly stage; operand order is kept as-is so rounding is unchanged.
    const Vec4f centre2 = centre + centre;
    const Vec4f diffOuter = centre2 - pair06;
    const Vec4f sumOuter = centre2 + pair06;
    const Vec4f sumInner = pair24 + pair15;
    const Vec4f diffInner = pair24 - pair15;

    // The four output coefficients, still one lane per input column.
    const Vec4f coef0 = sumOuter + sumInner;
    const Vec4f coef1 = 2 * diffOuter + diffInner;
    const Vec4f coef2 = sumOuter - sumInner;
    const Vec4f coef3 = diffOuter - 2 * diffInner;

    // 4x4 transpose: first interleave coefficient pairs lane by lane ...
    const Vec4f c01Low = blend4f<0, 4, 1, 5>(coef0, coef1);   // c0[0] c1[0] c0[1] c1[1]
    const Vec4f c01High = blend4f<2, 6, 3, 7>(coef0, coef1);  // c0[2] c1[2] c0[3] c1[3]
    const Vec4f c23Low = blend4f<0, 4, 1, 5>(coef2, coef3);   // c2[0] c3[0] c2[1] c3[1]
    const Vec4f c23High = blend4f<2, 6, 3, 7>(coef2, coef3);  // c2[2] c3[2] c2[3] c3[3]

    // ... then join the halves so each store holds {c0, c1, c2, c3} of one lane.
    blend4f<0, 1, 4, 5>(c01Low, c23Low).store_a(dstp);
    blend4f<2, 3, 6, 7>(c01Low, c23Low).store_a(dstp + 4);
    blend4f<0, 1, 4, 5>(c01High, c23High).store_a(dstp + 8);
    blend4f<2, 3, 6, 7>(c01High, c23High).store_a(dstp + 12);
}

template<typename T1, typename T2>
Expand Down Expand Up @@ -76,15 +123,15 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da
const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
int * VS_RESTRICT tp = temp + 4 * x;

dctA<int, 1>(p_src + index, tp + 4 * 8, stride);
dctA(p_src + index, tp + 4 * 8, stride);
}

for (int x = 0; x < width; x++) {
const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
int * VS_RESTRICT tp = temp + 4 * x;

if (!(x & 3))
dctA<int, 1>(p_src + index, tp + 4 * 8, stride);
dctA(p_src + index, tp + 4 * 8, stride);
dctB<int, Vec4i>(tp, block);

int64_t v = static_cast<int64_t>(block[0]) * d->factor[0];
Expand All @@ -102,8 +149,11 @@ void pp7Filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const DeblockPP7Da
}
}
}
v = (v + (1 << 17)) >> 18;
if (static_cast<unsigned>(v) > d->peak)
v = -v >> 63;

dstp[srcStride * y + x] = static_cast<T>((v + (1 << 17)) >> 18);
dstp[srcStride * y + x] = static_cast<T>(v);
}
}
}
Expand Down Expand Up @@ -149,15 +199,15 @@ void pp7Filter_sse2<float>(const VSFrameRef * src, VSFrameRef * dst, const Deblo
const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
float * VS_RESTRICT tp = temp + 4 * x;

dctA<float, 255>(p_src + index, tp + 4 * 8, stride);
dctA(p_src + index, tp + 4 * 8, stride);
}

for (int x = 0; x < width; x++) {
const int index = (stride + 1) * (8 - 3) + stride * y + 8 + x;
float * VS_RESTRICT tp = temp + 4 * x;

if (!(x & 3))
dctA<float, 255>(p_src + index, tp + 4 * 8, stride);
dctA(p_src + index, tp + 4 * 8, stride);
dctB<float, Vec4f>(tp, block);

float v = block[0] * d->factor[0];
Expand Down
Loading

0 comments on commit 450aa8a

Please sign in to comment.