// Triangle list for the 12 triangles (6 faces x 2) of a bounding box.
// Entries are 1-based corner numbers; SSEGather subtracts 1 before indexing.
// Face labels below follow the corner order built in RasterizeTestBBoxSSE
// (corner 1 = min,min,min ... corner 7 = max,max,max ... corner 8 = min,max,max).
static const int sBBIndexList[12 * 3] =
{
	4, 8, 7,   4, 7, 3,   // y = max face
	5, 1, 2,   5, 2, 6,   // y = min face
	5, 8, 4,   5, 4, 1,   // x = min face
	2, 3, 7,   2, 7, 6,   // x = max face
	6, 7, 8,   6, 8, 5,   // z = max face
	1, 4, 3,   1, 3, 2,   // z = min face
};
// Transforms the point *v by the four rows m[0..3]:
//   result = v.x * m[0] + v.y * m[1] + v.z * m[2] + m[3]
// The w component of *v is not read; m[3] is added unscaled.
__m128 SSETransformCoords(__m128 *v, __m128 *m)
{
	const __m128 point = *v;

	// Broadcast each of x, y, z across a full register.
	const __m128 xxxx = _mm_shuffle_ps(point, point, _MM_SHUFFLE(0,0,0,0));
	const __m128 yyyy = _mm_shuffle_ps(point, point, _MM_SHUFFLE(1,1,1,1));
	const __m128 zzzz = _mm_shuffle_ps(point, point, _MM_SHUFFLE(2,2,2,2));

	// Accumulate in the order x, y, z, then the constant row.
	__m128 sum = _mm_mul_ps(xxxx, m[0]);
	sum = _mm_add_ps(sum, _mm_mul_ps(yyyy, m[1]));
	sum = _mm_add_ps(sum, _mm_mul_ps(zzzz, m[2]));
	return _mm_add_ps(sum, m[3]);
}
__forceinline __m128i Min(const __m128i &v0, const __m128i &v1)
{
__m128i tmp;
tmp = _mm_min_epi32(v0, v1);
return tmp;
}
__forceinline __m128i Max(const __m128i &v0, const __m128i &v1)
{
__m128i tmp;
tmp = _mm_max_epi32(v0, v1);
return tmp;
}
// Four 4-wide float registers. SSEGather fills it so that each member holds
// one component (x, y, z or w) taken from four different vertices.
struct SSEVFloat4
{
	__m128 X, Y, Z, W;
};
// Collects the corners of four consecutive triangles (triId .. triId+3 in
// sBBIndexList) and transposes them into component-major form.
//
// pOut[c] receives corner c of all four triangles: X holds the four x values,
// Y the four y values, and so on, with lane k belonging to triangle triId+k.
// xformedPos is indexed with the 1-based entries of sBBIndexList minus one.
void SSEGather(SSEVFloat4 pOut[3], int triId, const __m128 xformedPos[])
{
	// First index of triangle triId; each following triangle is 3 entries on.
	const int *firstTri = &sBBIndexList[triId * 3];

	for(int corner = 0; corner < 3; ++corner)
	{
		__m128 tri0 = xformedPos[firstTri[corner + 0] - 1];
		__m128 tri1 = xformedPos[firstTri[corner + 3] - 1];
		__m128 tri2 = xformedPos[firstTri[corner + 6] - 1];
		__m128 tri3 = xformedPos[firstTri[corner + 9] - 1];

		// Rows are per-triangle (x,y,z,w) going in, per-component coming out.
		_MM_TRANSPOSE4_PS(tri0, tri1, tri2, tri3);

		SSEVFloat4 &dst = pOut[corner];
		dst.X = tri0;
		dst.Y = tri1;
		dst.Z = tri2;
		dst.W = tri3;
	}
}
// Tests whether any part of a box would pass the depth test against `buffer`.
//
// The 8 box corners are transformed by `matrix` (four rows, as consumed by
// SSETransformCoords), divided by |w| and mapped to pixel coordinates using
// res.x / res.y as width / height. The 12 box triangles are then scanned in
// strips of four horizontally adjacent pixels; triangles whose integer area
// is <= 0 are skipped.
//
// box     Box to test.
// matrix  Pointer to four __m128 rows of the transform.
// buffer  Depth buffer, addressed as buffer[y * res.x + x].
// res     res.x and res.y give the buffer dimensions.
//
// Returns true as soon as one covered pixel has a (biased, logarithmic) depth
// smaller than the stored value, false if no such pixel was found.
//
// Side effects: adds the elapsed time in milliseconds to
// RasterizationStats::RasterizeSSETimeSpent on every exit, and ORs 0x8040 into
// MXCSR without restoring it.
bool RasterizeTestBBoxSSE(Box3F box, __m128* matrix, float* buffer, Point4I res)
{
	LARGE_INTEGER frequency;
	LARGE_INTEGER t1, t2;
	double elapsedTime;
	QueryPerformanceFrequency(&frequency);
	QueryPerformanceCounter(&t1);

	__m128 verticesSSE[8];
	Point4F vertices[8];

	// 0x8000 = flush-to-zero, 0x0040 = denormals-are-zero.
	// NOTE(review): the previous MXCSR value is not restored on exit.
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	// Build the 8 corners from centre +/- half extents.
	Point3F center = box.getCenter();
	Point3F extent = box.getExtents();
	Point4F vCenter = Point4F(center.x, center.y, center.z, 1.0);
	Point4F vHalf = Point4F(extent.x*0.5, extent.y*0.5, extent.z*0.5, 1.0);
	Point4F vMin = vCenter - vHalf;
	Point4F vMax = vCenter + vHalf;
	vertices[0] = Point4F(vMin.x, vMin.y, vMin.z, 1);
	vertices[1] = Point4F(vMax.x, vMin.y, vMin.z, 1);
	vertices[2] = Point4F(vMax.x, vMax.y, vMin.z, 1);
	vertices[3] = Point4F(vMin.x, vMax.y, vMin.z, 1);
	vertices[4] = Point4F(vMin.x, vMin.y, vMax.z, 1);
	vertices[5] = Point4F(vMax.x, vMin.y, vMax.z, 1);
	vertices[6] = Point4F(vMax.x, vMax.y, vMax.z, 1);
	vertices[7] = Point4F(vMin.x, vMax.y, vMax.z, 1);

	// Loop-invariant constants for the projection below.
	const __m128 signMask = _mm_set1_ps(-0.f);
	const __m128 ones = _mm_set1_ps(1.0f);
	const __m128 sadd = _mm_setr_ps(res.x*0.5, res.y*0.5, 0, 0);
	const __m128 smult = _mm_setr_ps(res.x*0.5, res.y*(-0.5), 1, 1);

	for(int i = 0; i < 8; i++)
	{
		__m128 vert = _mm_loadu_ps(vertices[i]);
		vert = SSETransformCoords(&vert, matrix);

		// Divisor (|w|, |w|, |w|, 1): x, y, z are divided by |w|, w is kept.
		__m128 vertW = _mm_shuffle_ps(vert, vert, _MM_SHUFFLE(3,3,3,3));
		vertW = _mm_andnot_ps(signMask, vertW);
		vertW = _mm_shuffle_ps(vertW, ones, _MM_SHUFFLE(0,0,0,0));
		vertW = _mm_shuffle_ps(vertW, vertW, _MM_SHUFFLE(3,0,0,0));
		vert = _mm_div_ps(vert, vertW);

		// Map x, y to pixel coordinates (y flipped); z and w pass through.
		verticesSSE[i] = _mm_add_ps( sadd, _mm_mul_ps(vert, smult) );
	}

	// Lane k of a strip covers the pixel k columns right of the strip start.
	const __m128i laneOffset = _mm_setr_epi32(0, 1, 2, 3);
	const __m128i stripWidth = _mm_set1_epi32(4);

	// Four triangles are set up at once, one per SIMD lane.
	for(int i = 0; i < 12; i += 4)
	{
		SSEVFloat4 xformedPos[3];
		SSEGather(xformedPos, i, verticesSSE);

		__m128i fxPtX[3], fxPtY[3];
		for(int m = 0; m < 3; m++)
		{
			fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X);
			fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y);
		}

		// Edge function coefficients: E(x, y) = A*x + B*y + C.
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		// Depth plane built from the w component of the three corners:
		// depth = Z[0] + beta * Z[1] + gamma * Z[2].
		__m128 Z[3];
		Z[0] = xformedPos[0].W;
		Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].W, Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].W, Z[0]), oneOverTriArea);

		// Bounding rectangle clamped to the buffer; starts rounded down to even.
		__m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i endX = Min(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(res.x - 1));
		__m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i endY = Min(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(res.y - 1));

		// Rasterize each of the four triangles; SIMD lanes now map to pixels.
		for(int lane=0; lane < 4; lane++)
		{
			// Skip degenerate and negatively oriented triangles.
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			__m128 zz[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]);
			}

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);
			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);
			__m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]);
			__m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]);
			__m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]);

			// Moving to the next strip advances every lane by 4 pixels.
			// _mm_mullo_epi32 keeps four 32-bit products; _mm_mul_epi32 would
			// only multiply elements 0 and 2 and widen them to 64 bits.
			__m128i aa0Inc = _mm_mullo_epi32(aa0, stripWidth);
			__m128i aa1Inc = _mm_mullo_epi32(aa1, stripWidth);
			__m128i aa2Inc = _mm_mullo_epi32(aa2, stripWidth);

			// Edge functions at the first strip of the first row, evaluated at
			// x = startXx + {0,1,2,3} so each lane matches the pixel that
			// _mm_loadu_ps fetches for it from the depth buffer.
			const __m128i firstCols = _mm_add_epi32(_mm_set1_epi32(startXx), laneOffset);
			const __m128i firstRow = _mm_set1_epi32(startYy);
			__m128i alpha0 = _mm_add_epi32(_mm_mullo_epi32(aa0, firstCols), _mm_mullo_epi32(bb0, firstRow));
			alpha0 = _mm_add_epi32(cc0, alpha0);
			__m128i beta0 = _mm_add_epi32(_mm_mullo_epi32(aa1, firstCols), _mm_mullo_epi32(bb1, firstRow));
			beta0 = _mm_add_epi32(cc1, beta0);
			__m128i gama0 = _mm_add_epi32(_mm_mullo_epi32(aa2, firstCols), _mm_mullo_epi32(bb2, firstRow));
			gama0 = _mm_add_epi32(cc2, gama0);

			int rowIdx = (startYy * res.x + startXx);

			// Depth change per strip: the per-pixel x slope times 4.
			__m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1), zz[1]);
			zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2), zz[2]));
			zx = _mm_mul_ps(zx, _mm_set1_ps(4.f));

			for(int r = startYy; r < endYy; r++,
				rowIdx += res.x,
				alpha0 = _mm_add_epi32(alpha0, bb0),
				beta0 = _mm_add_epi32(beta0, bb1),
				gama0 = _mm_add_epi32(gama0, bb2))
			{
				int index = rowIdx;
				__m128i alpha = alpha0;
				__m128i beta = beta0;
				__m128i gama = gama0;

				// Per-lane depth at the first strip of this row.
				__m128 depth = zz[0];
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
				depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

				__m128i anyOut = _mm_setzero_si128();

				for(int c = startXx; c < endXx;
					c+=4,
					index+=4,
					alpha = _mm_add_epi32(alpha, aa0Inc),
					beta = _mm_add_epi32(beta, aa1Inc),
					gama = _mm_add_epi32(gama, aa2Inc),
					depth = _mm_add_ps(depth, zx))
				{
					// Sign bit set => at least one edge function is negative,
					// i.e. the pixel lies outside the triangle.
					__m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama);
					__m128 previousDepth = _mm_loadu_ps(&(buffer[index]));

					// Logarithmic depth mapping; -6.907755375 is ln(0.001).
					// NOTE(review): scale and the 0.05 bias presumably match
					// how `buffer` was written - confirm against the writer.
					__m128 curdepth = _mm_mul_ps(_mm_sub_ps(log_ps(depth),_mm_set1_ps(-6.907755375)),_mm_set1_ps(0.048254941));
					curdepth = _mm_sub_ps(curdepth, _mm_set1_ps(0.05));

					__m128 depthMask = _mm_cmplt_ps(curdepth, previousDepth);
					__m128i finalMask = _mm_andnot_si128(mask, _mm_castps_si128(depthMask));
					anyOut = _mm_or_si128(anyOut, finalMask);
				}

				// Any lane inside the triangle and nearer than the buffer?
				if(!_mm_testz_si128(anyOut, _mm_set1_epi32(0x80000000)))
				{
					QueryPerformanceCounter(&t2);
					elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
					RasterizationStats::RasterizeSSETimeSpent += elapsedTime;
					return true;
				}
			}
		}
	}

	// Account for the time of the fully-scanned (not visible) case as well.
	QueryPerformanceCounter(&t2);
	elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
	RasterizationStats::RasterizeSSETimeSpent += elapsedTime;
	return false;
}