- From: boulos@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [MANTA] r986 - trunk/Model/Primitives
- Date: Mon, 13 Mar 2006 02:13:50 -0700 (MST)
Author: boulos
Date: Mon Mar 13 02:13:49 2006
New Revision: 986
Modified:
trunk/Model/Primitives/WaldTriangle.cc
Log:
Adding a new "CSIMD" implementation that should
emulate Altivec and SSE (but currently fails to
do so).
Making mask4 implementation-specific (using a
single vec_sel call for Altivec, and simple code
for CSIMD).
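
For context, mask4 is a per-lane select: each lane of the result takes
if_true where the corresponding mask lane is all-ones and if_false where
it is all-zeros, which is what vec_sel(if_false, if_true, mask) computes
in one instruction. A rough scalar model of one 32-bit lane (illustrative
only, not code from this patch):

    #include <cstdint>
    #include <cstring>

    // One lane of mask4: result = (mask & if_true) | (~mask & if_false),
    // assuming the mask lane is all-ones (~0u) or all-zeros (0u).
    inline float select_lane(std::uint32_t mask, float if_true, float if_false)
    {
        std::uint32_t t, f;
        std::memcpy(&t, &if_true, sizeof t);   // reinterpret float bits
        std::memcpy(&f, &if_false, sizeof f);
        const std::uint32_t r = (mask & t) | (~mask & f);
        float out;
        std::memcpy(&out, &r, sizeof out);
        return out;
    }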
Modified: trunk/Model/Primitives/WaldTriangle.cc
==============================================================================
--- trunk/Model/Primitives/WaldTriangle.cc (original)
+++ trunk/Model/Primitives/WaldTriangle.cc Mon Mar 13 02:13:49 2006
@@ -8,9 +8,18 @@
using namespace Manta;
using namespace std;
-#define USE_SIMD 1
+
+#define USE_SIMD 0 // off by default
+
+#ifdef __APPLE__
#define USE_ALTIVEC 1
-#define USE_SSE 0
+#define USE_SSE 0 // note this won't work for x86 apple...
+#else
+#define USE_ALTIVEC 0
+#define USE_SSE 1
+#endif
+
+#define USE_CSIMD (USE_SIMD && !USE_ALTIVEC && !USE_SSE)
WaldTriangle::WaldTriangle(Material* mat,
const Vector& _p1, const Vector& _p2, const Vector& _p3) : PrimitiveCommon(mat)
@@ -128,7 +137,7 @@
}
}
#else
-// SSE version
+// SIMD version
#include <float.h>
#if USE_ALTIVEC // altivec
@@ -177,6 +186,11 @@
#define cmp_lt4 vec_cmplt
#define cmp_lte4 vec_cmple
+inline sse_t mask4(const sse_mask_t &mask, const sse_t& if_true, const sse_t& if_false)
+{
+ return vec_sel(if_false, if_true, mask);
+}
+
#endif
@@ -215,14 +229,191 @@
#define cmp_lt4 _mm_cmplt_ps
#define cmp_lte4 _mm_cmple_ps
-#endif
-
inline sse_t mask4(const sse_mask_t &mask, const sse_t& if_true, const sse_t& if_false)
{
- //return or4(and4(mask,if_true), andnot4(mask,if_false));
return or4(and4(mask,if_true), andnot4(if_false, mask));
}
+
+#endif
+
+#if USE_CSIMD
+
+typedef struct
+{
+ float v0,v1,v2,v3;
+} float4;
+
+typedef float4 sse_t;
+typedef float4 sse_mask_t;
+
+inline sse_t set4(float f)
+{
+ sse_t ret;
+ ret.v0 = ret.v1 = ret.v2 = ret.v3 = f;
+ return ret;
+}
+
+inline sse_t set44(float v0, float v1, float v2, float v3)
+{
+ sse_t ret;
+ ret.v0 = v0;
+ ret.v1 = v1;
+ ret.v2 = v2;
+ ret.v3 = v3;
+ return ret;
+}
+
+inline sse_t add4(const sse_t& a, const sse_t& b)
+{
+ sse_t ret;
+ ret.v0 = a.v0 + b.v0;
+ ret.v1 = a.v1 + b.v1;
+ ret.v2 = a.v2 + b.v2;
+ ret.v3 = a.v3 + b.v3;
+ return ret;
+}
+
+inline sse_t sub4(const sse_t& a, const sse_t& b)
+{
+ sse_t ret;
+ ret.v0 = a.v0 - b.v0;
+ ret.v1 = a.v1 - b.v1;
+ ret.v2 = a.v2 - b.v2;
+ ret.v3 = a.v3 - b.v3;
+ return ret;
+}
+
+inline sse_t mul4(const sse_t& a, const sse_t& b)
+{
+ sse_t ret;
+ ret.v0 = a.v0 * b.v0;
+ ret.v1 = a.v1 * b.v1;
+ ret.v2 = a.v2 * b.v2;
+ ret.v3 = a.v3 * b.v3;
+ return ret;
+}
+
+inline sse_t rcp4(const sse_t& a)
+{
+ sse_t ret;
+ ret.v0 = 1.f/a.v0;
+ ret.v1 = 1.f/a.v1;
+ ret.v2 = 1.f/a.v2;
+ ret.v3 = 1.f/a.v3;
+ return ret;
+}
+
+inline sse_mask_t and4(const sse_t& a, const sse_t& b)
+{
+ sse_mask_t ret;
+ const unsigned int* a_u = (const unsigned int*)(&a);
+ const unsigned int* b_u = (const unsigned int*)(&b);
+ unsigned int* ret_u = (unsigned int*)(&ret);
+ ret_u[0] = a_u[0] & b_u[0];
+ ret_u[1] = a_u[1] & b_u[1];
+ ret_u[2] = a_u[2] & b_u[2];
+ ret_u[3] = a_u[3] & b_u[3];
+
+ return ret;
+}
+
+// Each element of the result is the logical AND of the corresponding element of arg1 and the one's complement of the corresponding element of arg2. (from apple)
+inline sse_mask_t andnot4(const sse_t& a, const sse_t& b)
+{
+ sse_mask_t ret;
+ const unsigned int* a_u = (const unsigned int*)(&a);
+ const unsigned int* b_u = (const unsigned int*)(&b);
+ unsigned int* ret_u = (unsigned int*)(&ret);
+ ret_u[0] = a_u[0] & (~b_u[0]);
+ ret_u[1] = a_u[1] & (~b_u[1]);
+ ret_u[2] = a_u[2] & (~b_u[2]);
+ ret_u[3] = a_u[3] & (~b_u[3]);
+
+ return ret;
+}
+
+inline sse_mask_t or4(const sse_t& a, const sse_t& b)
+{
+ sse_mask_t ret;
+ const unsigned int* a_u = (const unsigned int*)(&a);
+ const unsigned int* b_u = (const unsigned int*)(&b);
+ unsigned int* ret_u = (unsigned int*)(&ret);
+ ret_u[0] = a_u[0] | b_u[0];
+ ret_u[1] = a_u[1] | b_u[1];
+ ret_u[2] = a_u[2] | b_u[2];
+ ret_u[3] = a_u[3] | b_u[3];
+ return ret;
+}
+
+// if all 4 components == 0
+inline bool none4(const sse_mask_t& mask)
+{
+ const unsigned int* mask_u = (const unsigned int*)(&mask);
+ return (mask_u[0] == 0 &&
+ mask_u[1] == 0 &&
+ mask_u[2] == 0 &&
+ mask_u[3] == 0);
+}
+
+inline sse_mask_t cmp_gt4(const sse_t& a, const sse_t& b)
+{
+ sse_mask_t ret;
+ unsigned int* ret_u = (unsigned int*)(&ret);
+ ret_u[0] = (a.v0 > b.v0) ? ~0 : 0;
+ ret_u[1] = (a.v1 > b.v1) ? ~0 : 0;
+ ret_u[2] = (a.v2 > b.v2) ? ~0 : 0;
+ ret_u[3] = (a.v3 > b.v3) ? ~0 : 0;
+ return ret;
+}
+
+inline sse_mask_t cmp_gte4(const sse_t& a, const sse_t& b)
+{
+ sse_mask_t ret;
+ unsigned int* ret_u = (unsigned int*)(&ret);
+ ret_u[0] = (a.v0 >= b.v0) ? ~0 : 0;
+ ret_u[1] = (a.v1 >= b.v1) ? ~0 : 0;
+ ret_u[2] = (a.v2 >= b.v2) ? ~0 : 0;
+ ret_u[3] = (a.v3 >= b.v3) ? ~0 : 0;
+ return ret;
+}
+
+inline sse_mask_t cmp_lt4(const sse_t& a, const sse_t& b)
+{
+ sse_mask_t ret;
+ unsigned int* ret_u = (unsigned int*)(&ret);
+ ret_u[0] = (a.v0 < b.v0) ? ~0 : 0;
+ ret_u[1] = (a.v1 < b.v1) ? ~0 : 0;
+ ret_u[2] = (a.v2 < b.v2) ? ~0 : 0;
+ ret_u[3] = (a.v3 < b.v3) ? ~0 : 0;
+ return ret;
+}
+
+inline sse_mask_t cmp_lte4(const sse_t& a, const sse_t& b)
+{
+ sse_mask_t ret;
+ unsigned int* ret_u = (unsigned int*)(&ret);
+ ret_u[0] = (a.v0 <= b.v0) ? ~0 : 0;
+ ret_u[1] = (a.v1 <= b.v1) ? ~0 : 0;
+ ret_u[2] = (a.v2 <= b.v2) ? ~0 : 0;
+ ret_u[3] = (a.v3 <= b.v3) ? ~0 : 0;
+ return ret;
+}
+
+inline sse_t mask4(const sse_mask_t& mask, const sse_t& if_true, const sse_t &if_false)
+{
+ sse_t ret;
+ unsigned int* mask_u = (unsigned int*)&mask;
+
+ ret.v0 = (mask_u[0] == (~0)) ? if_true.v0 : if_false.v0;
+ ret.v1 = (mask_u[1] == (~0)) ? if_true.v1 : if_false.v1;
+ ret.v2 = (mask_u[2] == (~0)) ? if_true.v2 : if_false.v2;
+ ret.v3 = (mask_u[3] == (~0)) ? if_true.v3 : if_false.v3;
+ return ret;
+}
+
+#endif
+
inline sse_t accurateRcp4(const sse_t& v)
{
const sse_t rcp = rcp4(v);
@@ -260,7 +451,6 @@
addr_loader.ptr = (const void*)getTexCoordMapper();
const sse_t sse_tex = set4(addr_loader.value);
- //float org_kn, org_ku, org_kv, f0;
sse_t org_kn, org_ku, org_kv, f0;
const sse_t sse_nu = set4(n_u);
@@ -292,7 +482,7 @@
data->minT[ray] = -FLT_MAX; // try to kill the ray
}
for ( int ray = sse_end * 4; ray < ray_end; ray++ )
{
data->minT[ray] = -FLT_MAX; // try to kill the ray
}
@@ -300,13 +490,10 @@
if (RaysConstantOrigin)
{
- //org_kn = data->origin[kn][rays.begin()];
org_kn = set4(data->origin[kn][ray_begin]);
- //org_ku = data->origin[ku][rays.begin()];
org_ku = set4(data->origin[ku][ray_begin]);
- //org_kv = data->origin[kv][rays.begin()];
org_kv = set4(data->origin[kv][ray_begin]);
- //f0 = n_d - (org_k + n_u * org_ku + n_v * org_kv);
+
f0 = sub4(sse_nd, add4(org_kn, add4(mul4(sse_nu,org_ku),mul4(sse_nv,org_kv))));
}
@@ -341,19 +528,21 @@
sse_mask_t mask = and4( cmp_gt4(minT[i], f), cmp_gt4(f, sse_eps));
if (none4(mask))
- continue;
+ continue;
const sse_t hu = add4(org_ku, mul4(f,dir_ku[i]));
const sse_t hv = add4(org_kv, mul4(f,dir_kv[i]));
const sse_t lambda = add4(sse_bd,
add4(mul4(hu,sse_bnu),mul4(hv,sse_bnv)));
const sse_t zero = set4(0.f);
+ const sse_t one = set4(1.f);
// barycentric test (to pass, \lambda must be >= 0)
mask = and4(mask, cmp_gte4(lambda, zero));
if (none4(mask))
continue;
const sse_t mue = add4(sse_cd,
add4(mul4(hu,sse_cnu),mul4(hv,sse_cnv)));
- mask = and4(mask, and4(cmp_gte4(mue, zero), cmp_lte4(add4(mue,lambda), set4(1.f))));
+ // barycentric test (to pass, \mue must be >= 0 and \lambda + \mue must be <= 1.f)
+ mask = and4(mask, and4(cmp_gte4(mue, zero), cmp_lte4(add4(mue,lambda), one)));
if (none4(mask))
continue;
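
One convention worth flagging when comparing the three paths: SSE's
_mm_andnot_ps(a, b) computes (~a) & b, while Altivec's vec_andc(a, b)
and the CSIMD andnot4 above compute a & (~b). If the SSE path binds
andnot4 to _mm_andnot_ps (that binding is outside the hunks shown, so
this is a guess), the expression andnot4(if_false, mask) in mask4
produces different bits under the two conventions, which may be why the
new CSIMD path "currently fails" to match. A standalone sketch of the
mismatch (illustrative only, not Manta code):

    #include <cassert>
    #include <cstdint>

    // SSE complements its FIRST argument; Altivec/CSIMD complement the SECOND.
    inline std::uint32_t andnot_sse(std::uint32_t a, std::uint32_t b)   { return ~a & b; }
    inline std::uint32_t andnot_csimd(std::uint32_t a, std::uint32_t b) { return a & ~b; }

    int main()
    {
        const std::uint32_t mask = 0u;  // "lane is false": select should yield if_false
        const std::uint32_t t = 0xAAAAAAAAu, f = 0x55555555u;
        // Select written as in mask4 above: or(and(mask, t), andnot(f, mask)).
        assert(((mask & t) | andnot_csimd(f, mask)) == f);  // CSIMD/Altivec order: correct
        assert(((mask & t) | andnot_sse(f, mask)) == 0u);   // SSE order: if_false is lost
        return 0;
    }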