Text archives Help
- From: boulos@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [MANTA] r993 - trunk/Model/Primitives
- Date: Sun, 26 Mar 2006 02:49:41 -0700 (MST)
Author: boulos
Date: Sun Mar 26 02:49:40 2006
New Revision: 993
Modified:
trunk/Model/Primitives/WaldTriangle.cc
Log:
Working version of WaldTriangle. With basic
optimization flags, the CSIMD implementation
produces the same image as both the default C
implementation and the Altivec and SSE versions.
This has not been tested on a G5 with the Manta
optimization flags for that system.
Unfortunately, the CSIMD is slower (not very
surprising) since it does more work than the
default C implementation and probably confuses
the compiler a lot.
Modified: trunk/Model/Primitives/WaldTriangle.cc
==============================================================================
--- trunk/Model/Primitives/WaldTriangle.cc (original)
+++ trunk/Model/Primitives/WaldTriangle.cc Sun Mar 26 02:49:40 2006
@@ -161,6 +161,9 @@
return loader.vec_result;
}
+#define load4(p) vec_ld(0, (p))
+#define store4(v, p) vec_st((v), 0, (p))
+
#define add4 vec_add
#define sub4 vec_sub
@@ -203,13 +206,15 @@
typedef __m128 sse_mask_t;
#define set4 _mm_set_ps1
-//#define set44 _mm_set_ps
inline sse_t set44(float v0, float v1, float v2, float v3)
{
return _mm_set_ps(v3, v2, v1, v0);
}
+#define load4 _mm_load_ps
+#define store4(v,p) _mm_store_ps((p),(v))
+
#define add4 _mm_add_ps
#define sub4 _mm_sub_ps
#define mul4 _mm_mul_ps
@@ -264,6 +269,19 @@
return ret;
}
+// assume perfect alignment
+inline sse_t load4(const float* a)
+{
+ sse_t ret = *((const sse_t*)a);
+ return ret;
+}
+
+inline void store4(const sse_t& vec, float* addr)
+{
+ sse_t* res = (sse_t*)addr;
+ *res = vec;
+}
+
inline sse_t add4(const sse_t& a, const sse_t& b)
{
sse_t ret;
@@ -307,9 +325,9 @@
inline sse_mask_t and4(const sse_t& a, const sse_t& b)
{
sse_mask_t ret;
- const unsigned int* a_u = (const unsigned int*)(&a);
- const unsigned int* b_u = (const unsigned int*)(&b);
- unsigned int* ret_u = (unsigned int*)(&ret);
+ const unsigned int* a_u = (const unsigned int*)(&a.v0);
+ const unsigned int* b_u = (const unsigned int*)(&b.v0);
+ unsigned int* ret_u = (unsigned int*)(&ret.v0);
ret_u[0] = a_u[0] & b_u[0];
ret_u[1] = a_u[1] & b_u[1];
ret_u[2] = a_u[2] & b_u[2];
@@ -318,13 +336,15 @@
return ret;
}
-// Each element of the result is the logical AND of the corresponding element of arg1 and the one's complement of the corresponding element of arg2. (from apple)
+// Each element of the result is the logical AND of the corresponding
+// element of arg1 and the one's complement of the corresponding
+// element of arg2. (from apple)
inline sse_mask_t andnot4(const sse_t& a, const sse_t& b)
{
sse_mask_t ret;
- const unsigned int* a_u = (const unsigned int*)(&a);
- const unsigned int* b_u = (const unsigned int*)(&b);
- unsigned int* ret_u = (unsigned int*)(&ret);
+ const unsigned int* a_u = (const unsigned int*)(&a.v0);
+ const unsigned int* b_u = (const unsigned int*)(&b.v0);
+ unsigned int* ret_u = (unsigned int*)(&ret.v0);
ret_u[0] = a_u[0] & (~b_u[0]);
ret_u[1] = a_u[1] & (~b_u[1]);
ret_u[2] = a_u[2] & (~b_u[2]);
@@ -336,9 +356,9 @@
inline sse_mask_t or4(const sse_t& a, const sse_t& b)
{
sse_mask_t ret;
- const unsigned int* a_u = (const unsigned int*)(&a);
- const unsigned int* b_u = (const unsigned int*)(&b);
- unsigned int* ret_u = (unsigned int*)(&ret);
+ const unsigned int* a_u = (const unsigned int*)(&a.v0);
+ const unsigned int* b_u = (const unsigned int*)(&b.v0);
+ unsigned int* ret_u = (unsigned int*)(&ret.v0);
ret_u[0] = a_u[0] | b_u[0];
ret_u[1] = a_u[1] | b_u[1];
ret_u[2] = a_u[2] | b_u[2];
@@ -349,7 +369,8 @@
// if all 4 components == 0
inline bool none4(const sse_mask_t& mask)
{
- const unsigned int* mask_u = (const unsigned int*)(&mask);
+ const unsigned int* mask_u = (const unsigned int*)(&mask.v0);
+
return (mask_u[0] == 0 &&
mask_u[1] == 0 &&
mask_u[2] == 0 &&
@@ -359,56 +380,56 @@
inline sse_mask_t cmp_gt4(const sse_t& a, const sse_t& b)
{
sse_mask_t ret;
- unsigned int* ret_u = (unsigned int*)(&ret);
- ret_u[0] = (a.v0 > b.v0) ? ~0 : 0;
- ret_u[1] = (a.v1 > b.v1) ? ~0 : 0;
- ret_u[2] = (a.v2 > b.v2) ? ~0 : 0;
- ret_u[3] = (a.v3 > b.v3) ? ~0 : 0;
+ unsigned int* ret_u = (unsigned int*)(&ret.v0);
+ ret_u[0] = (a.v0 > b.v0) ? 0xffffffff : 0;
+ ret_u[1] = (a.v1 > b.v1) ? 0xffffffff : 0;
+ ret_u[2] = (a.v2 > b.v2) ? 0xffffffff : 0;
+ ret_u[3] = (a.v3 > b.v3) ? 0xffffffff : 0;
return ret;
}
inline sse_mask_t cmp_gte4(const sse_t& a, const sse_t& b)
{
sse_mask_t ret;
- unsigned int* ret_u = (unsigned int*)(&ret);
- ret_u[0] = (a.v0 >= b.v0) ? ~0 : 0;
- ret_u[1] = (a.v1 >= b.v1) ? ~0 : 0;
- ret_u[2] = (a.v2 >= b.v2) ? ~0 : 0;
- ret_u[3] = (a.v3 >= b.v3) ? ~0 : 0;
+ unsigned int* ret_u = (unsigned int*)(&ret.v0);
+ ret_u[0] = (a.v0 >= b.v0) ? 0xffffffff : 0;
+ ret_u[1] = (a.v1 >= b.v1) ? 0xffffffff : 0;
+ ret_u[2] = (a.v2 >= b.v2) ? 0xffffffff : 0;
+ ret_u[3] = (a.v3 >= b.v3) ? 0xffffffff : 0;
return ret;
}
inline sse_mask_t cmp_lt4(const sse_t& a, const sse_t& b)
{
sse_mask_t ret;
- unsigned int* ret_u = (unsigned int*)(&ret);
- ret_u[0] = (a.v0 < b.v0) ? ~0 : 0;
- ret_u[1] = (a.v1 < b.v1) ? ~0 : 0;
- ret_u[2] = (a.v2 < b.v2) ? ~0 : 0;
- ret_u[3] = (a.v3 < b.v3) ? ~0 : 0;
+ unsigned int* ret_u = (unsigned int*)(&ret.v0);
+ ret_u[0] = (a.v0 < b.v0) ? 0xffffffff : 0;
+ ret_u[1] = (a.v1 < b.v1) ? 0xffffffff : 0;
+ ret_u[2] = (a.v2 < b.v2) ? 0xffffffff : 0;
+ ret_u[3] = (a.v3 < b.v3) ? 0xffffffff : 0;
return ret;
}
inline sse_mask_t cmp_lte4(const sse_t& a, const sse_t& b)
{
sse_mask_t ret;
- unsigned int* ret_u = (unsigned int*)(&ret);
- ret_u[0] = (a.v0 <= b.v0) ? ~0 : 0;
- ret_u[1] = (a.v1 <= b.v1) ? ~0 : 0;
- ret_u[2] = (a.v2 <= b.v2) ? ~0 : 0;
- ret_u[3] = (a.v3 <= b.v3) ? ~0 : 0;
+ unsigned int* ret_u = (unsigned int*)(&ret.v0);
+ ret_u[0] = (a.v0 <= b.v0) ? 0xffffffff : 0;
+ ret_u[1] = (a.v1 <= b.v1) ? 0xffffffff : 0;
+ ret_u[2] = (a.v2 <= b.v2) ? 0xffffffff : 0;
+ ret_u[3] = (a.v3 <= b.v3) ? 0xffffffff : 0;
return ret;
}
inline sse_t mask4(const sse_mask_t& mask, const sse_t& if_true, const sse_t &if_false)
{
sse_t ret;
- unsigned int* mask_u = (unsigned int*)&mask;
+ const unsigned int* mask_u = (const unsigned int*)&mask.v0;
- ret.v0 = (mask_u[0] == (~0)) ? if_true.v0 : if_false.v0;
- ret.v1 = (mask_u[1] == (~0)) ? if_true.v1 : if_false.v1;
- ret.v2 = (mask_u[2] == (~0)) ? if_true.v2 : if_false.v2;
- ret.v3 = (mask_u[3] == (~0)) ? if_true.v3 : if_false.v3;
+ ret.v0 = (mask_u[0] == (0xffffffff)) ? if_true.v0 : if_false.v0;
+ ret.v1 = (mask_u[1] == (0xffffffff)) ? if_true.v1 : if_false.v1;
+ ret.v2 = (mask_u[2] == (0xffffffff)) ? if_true.v2 : if_false.v2;
+ ret.v3 = (mask_u[3] == (0xffffffff)) ? if_true.v3 : if_false.v3;
return ret;
}
@@ -434,23 +455,6 @@
const sse_t* const dir_kv = (sse_t*)data->direction[kv];
sse_t* minT = (sse_t*)data->minT;
- sse_t* hitMatl = (sse_t*)data->hitMatl;
- sse_t* hitPrim = (sse_t*)data->hitPrim;
- sse_t* hitTex = (sse_t*)data->hitTex;
-
- union
- {
- const void* ptr;
- float value;
- } addr_loader;
-
- addr_loader.ptr = (const void*)getMaterial();
- const sse_t sse_matl = set4(addr_loader.value);
- addr_loader.ptr = (const void*)this;
- const sse_t sse_this = set4(addr_loader.value);
- addr_loader.ptr = (const void*)getTexCoordMapper();
- const sse_t sse_tex = set4(addr_loader.value);
-
sse_t org_kn, org_ku, org_kv, f0;
const sse_t sse_nu = set4(n_u);
@@ -470,24 +474,13 @@
// first SSE aligned ray in the packet of rays:
const int sse_begin = rays.begin() >> 2; // equivalent to Floor(rays.begin()/4)
- const int sse_end = (rays.end() >> 2) + 1; // Ceil(rays.end()/4)
+ const int sse_end = ((rays.end()-1+3) >> 2); // Ceil((rays.end()-1)/4)
const int ray_begin = rays.begin();
const int ray_end = rays.end();
const bool RaysConstantOrigin = rays.getAllFlags() & RayPacket::ConstantOrigin;
- for ( int ray = sse_begin * 4; ray < ray_begin; ray++ )
- {
- data->minT[ray] = -FLT_MAX; // try to kill the ray
- }
-
- for ( int ray = sse_end * 4; ray >= ray_end; ray-- )
- {
- data->minT[ray] = -FLT_MAX; // try to kill the ray
- }
-
-
if (RaysConstantOrigin)
{
org_kn = set4(data->origin[kn][ray_begin]);
@@ -505,20 +498,9 @@
if (!RaysConstantOrigin)
{
const int ray_index = i * 4;
- org_kn = set44(data->origin[kn][ray_index+0],
- data->origin[kn][ray_index+1],
- data->origin[kn][ray_index+2],
- data->origin[kn][ray_index+3]);
-
- org_ku = set44(data->origin[ku][ray_index+0],
- data->origin[ku][ray_index+1],
- data->origin[ku][ray_index+2],
- data->origin[ku][ray_index+3]);
-
- org_kv = set44(data->origin[kv][ray_index+0],
- data->origin[kv][ray_index+1],
- data->origin[kv][ray_index+2],
- data->origin[kv][ray_index+3]);
+ org_kn = load4(&(data->origin[kn][ray_index]));
+ org_ku = load4(&(data->origin[ku][ray_index]));
+ org_kv = load4(&(data->origin[kv][ray_index]));
f0 = sub4(sse_nd, add4(org_kn, add4(mul4(sse_nu, org_ku),mul4(sse_nv,org_kv))));
}
@@ -546,12 +528,14 @@
if (none4(mask))
continue;
- //rays.hit(i, f, getMaterial(), this, getTexCoordMapper());
- // we already know that the mask contains all the info for setting the hit
- minT[i] = mask4(mask, f, minT[i]);
- hitMatl[i] = mask4(mask, sse_matl, hitMatl[i]);
- hitPrim[i] = mask4(mask, sse_this, hitPrim[i]);
- hitTex[i] = mask4(mask, sse_tex, hitTex[i]);
+ for ( int r = 4*i; r < (4*(i+1)); r++ )
+ {
+ const float* hits = (const float*)(&f);
+ const float* valid = (const float*)(&mask);
+ if ( r >= ray_begin && r < ray_end )
+ if ( valid[r-4*i] != 0.f )
+ rays.hit(r, hits[r-4*i], getMaterial(), this, getTexCoordMapper());
+ }
}
}
#endif
- [MANTA] r993 - trunk/Model/Primitives, boulos, 03/26/2006
Archive powered by MHonArc 2.6.16.