Text archives Help
- From: boulos@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [MANTA] r1168 - in trunk/Model: Groups Primitives
- Date: Sat, 12 Aug 2006 01:04:14 -0600 (MDT)
Author: boulos
Date: Sat Aug 12 01:04:12 2006
New Revision: 1168
Modified:
trunk/Model/Groups/DynBVH.cc
trunk/Model/Primitives/WaldTriangle.cc
Log:
Adding some SSE versions of the DynBVH traversal.
The single-ray (scalar SSE) version of firstIntersects is not working yet,
so it is #ifdef'ed out.
I can't seem to use the macro-based SIMD code in
WaldTriangle, so I'm switching to USE_SIMD 0 unless on an
AltiVec system (now detected with __POWERPC__).
Modified: trunk/Model/Groups/DynBVH.cc
==============================================================================
--- trunk/Model/Groups/DynBVH.cc (original)
+++ trunk/Model/Groups/DynBVH.cc Sat Aug 12 01:04:12 2006
@@ -87,8 +87,83 @@
// return the first index (between [rays.begin(),rays.end()]) which hits the
box
int DynBVH::firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data) const
{
+#define DYNBVH_NEW_SSE MANTA_SSE
+
+#if DYNBVH_NEW_SSE
+ int sse_begin = (rays.begin() + 3)&(~3);
+ int sse_end = rays.end() & (~3);
+ const RayPacketData* data = rays.data;
+#endif
+ // we always want to do the first ray and IA in C not SIMD
for (int ray = rays.begin(); ray < rays.end(); ray++ )
{
+#if (0 && DYNBVH_NEW_SSE) // for whatever reason this code doesn't work.
+ float temp_epsilon = 1e-5f;
+ __m128 tmin = _mm_load_ss(&temp_epsilon);
+ float check_load;
+ __m128 tmax = _mm_load_ss(&data->minT[ray]);
+
+#if 1
+ for (int c = 0; c < 3; ++c) {
+ __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][c])),
+
_mm_load_ss(&data->origin[c][ray])),
+
_mm_load_ss(&data->inverseDirection[c][ray]) );
+
+ __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][c])),
+
_mm_load_ss(&data->origin[c][ray])),
+
_mm_load_ss(&data->inverseDirection[c][ray]) );
+
+ tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+ }
+#else
+ __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][0])),
+
_mm_load_ss(&data->origin[0][ray])),
+ _mm_load_ss(&data->inverseDirection[0][ray])
);
+
+ __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][0])),
+
_mm_load_ss(&data->origin[0][ray])),
+ _mm_load_ss(&data->inverseDirection[0][ray])
);
+
+ tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+
+ t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][1])),
+ _mm_load_ss(&data->origin[1][ray])),
+ _mm_load_ss(&data->inverseDirection[1][ray]) );
+
+ t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][1])),
+ _mm_load_ss(&data->origin[1][ray])),
+ _mm_load_ss(&data->inverseDirection[1][ray]) );
+
+ tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+
+ t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][2])),
+ _mm_load_ss(&data->origin[2][ray])),
+ _mm_load_ss(&data->inverseDirection[2][ray]) );
+
+ t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][2])),
+ _mm_load_ss(&data->origin[2][ray])),
+ _mm_load_ss(&data->inverseDirection[2][ray]) );
+
+ tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+#endif // USE_UNROLLED or not
+#if 0
+ __m128 valid_intersect = _mm_cmplt_ss(tmin, tmax);
+ float result;
+ _mm_store_ss(&result, valid_intersect);
+ if (result != 0.f)
+ return ray;
+#else
+ float vals[2];
+ _mm_store_ss(&(vals[0]), tmin);
+ _mm_store_ss(&(vals[1]), tmax);
+ if (vals[0] < vals[1])
+ return ray;
+#endif
+#else
float maximum_minimum = 1e-5;
float minimum_maximum = rays.getMinT(ray);
@@ -101,16 +176,21 @@
float z_minimum = (box[rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
float z_maximum = (box[1-rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
+ // Note: we don't want to exit early since we might skip the frustum
test
+/*
if ( minimum_maximum < x_minimum ||
maximum_minimum > x_maximum )
continue;
+*/
if ( minimum_maximum > x_maximum )
minimum_maximum = x_maximum;
if ( maximum_minimum < x_minimum )
maximum_minimum = x_minimum;
+/*
if ( minimum_maximum < y_minimum ||
maximum_minimum > y_maximum )
continue;
+*/
if ( minimum_maximum > y_maximum )
minimum_maximum = y_maximum;
if ( maximum_minimum < y_minimum )
@@ -119,7 +199,8 @@
if ( minimum_maximum >= z_minimum &&
maximum_minimum <= z_maximum )
return ray; // found a hit
-
+#endif
+#if 1 // enable/disable frustum test
if (ray == rays.begin())
{
// try a frustum miss
@@ -151,8 +232,140 @@
return rays.end();
}
}
+#endif // do frustum test
+#if (DYNBVH_NEW_SSE && 1)
+ // if we can use simd now, jump out (redoes some work for SIMD
aligned rays)
+ if ( ray == sse_begin || ray == sse_begin - 1 )
+ break;
+#endif
+ }
+
+#if DYNBVH_NEW_SSE
+ // process simds now
+ int pack_begin = sse_begin >> 2;
+ int pack_end = sse_end >> 2;
+ // TODO(boulos): replace operator overloads with direct access
+#if 0
+ __m128 box_x0 = _mm_set1_ps(box[0][0]);
+ __m128 box_x1 = _mm_set1_ps(box[1][0]);
+ __m128 box_y0 = _mm_set1_ps(box[0][1]);
+ __m128 box_y1 = _mm_set1_ps(box[1][1]);
+
+ __m128 box_z0 = _mm_set1_ps(box[0][2]);
+ __m128 box_z1 = _mm_set1_ps(box[1][2]);
+
+ for (int pack = pack_begin, ray = sse_begin; pack < pack_end; ++pack,
ray += 4)
+ {
+ __m128 x0 = _mm_mul_ps(_mm_sub_ps(box_x0,
_mm_load_ps(&data->origin[0][ray])),
+ _mm_load_ps(&data->inverseDirection[0][ray]));
+ __m128 x1 = _mm_mul_ps(_mm_sub_ps(box_x1,
_mm_load_ps(&data->origin[0][ray])),
+ _mm_load_ps(&data->inverseDirection[0][ray]));
+
+ __m128 xmin = _mm_min_ps(x0,x1);
+ __m128 xmax = _mm_max_ps(x0,x1);
+
+ __m128 y0 = _mm_mul_ps(_mm_sub_ps(box_y0,
_mm_load_ps(&data->origin[1][ray])),
+ _mm_load_ps(&data->inverseDirection[1][ray]));
+ __m128 y1 = _mm_mul_ps(_mm_sub_ps(box_y1,
_mm_load_ps(&data->origin[1][ray])),
+ _mm_load_ps(&data->inverseDirection[1][ray]));
+
+ __m128 ymin = _mm_min_ps(y0,y1);
+ __m128 ymax = _mm_max_ps(y0,y1);
+
+ __m128 z0 = _mm_mul_ps(_mm_sub_ps(box_z0,
_mm_load_ps(&data->origin[2][ray])),
+ _mm_load_ps(&data->inverseDirection[2][ray]));
+ __m128 z1 = _mm_mul_ps(_mm_sub_ps(box_z1,
_mm_load_ps(&data->origin[2][ray])),
+ _mm_load_ps(&data->inverseDirection[2][ray]));
+
+ __m128 zmin = _mm_min_ps(z0,z1);
+ __m128 zmax = _mm_max_ps(z0,z1);
+
+ __m128 maximum_minimum =
_mm_max_ps(xmin,_mm_max_ps(ymin,_mm_max_ps(zmin, _mm_set1_ps(1e-5f))));
+ __m128 minimum_maximum =
_mm_min_ps(xmax,_mm_min_ps(ymax,_mm_min_ps(zmax,_mm_load_ps(&data->minT[ray]))));
+ __m128 valid_intersect =
_mm_cmplt_ps(maximum_minimum,minimum_maximum);
+ if (_mm_movemask_ps(valid_intersect) != 0x0)
+ return ray;
+ }
+#else // different more register efficient version
+ // two regs for box
+ __m128 box0 = _mm_set_ps(0.f, box[0][2], box[0][1], box[0][0]);
+ __m128 box1 = _mm_set_ps(0.f, box[1][2], box[1][1], box[1][0]);
+ for (int pack = pack_begin, ray = sse_begin; pack < pack_end; ++pack,
ray += 4) {
+ // two regs for interval tracking
+ __m128 tmin = _mm_set1_ps(1e-5f);
+ __m128 tmax = _mm_load_ps(&data->minT[ray]);
+ // current is 4
+ // 1 for inv dir, 1 for ray org, 1 for shuffle output
+
+ __m128 t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0,
_MM_SHUFFLE(0,0,0,0)),
+
_mm_load_ps(&data->origin[0][ray])),
+ _mm_load_ps(&data->inverseDirection[0][ray])
);
+ __m128 t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1,
_MM_SHUFFLE(0,0,0,0)),
+
_mm_load_ps(&data->origin[0][ray])),
+ _mm_load_ps(&data->inverseDirection[0][ray])
);
+ tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
+ tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+
+ t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0,
_MM_SHUFFLE(1,1,1,1)),
+ _mm_load_ps(&data->origin[1][ray])),
+ _mm_load_ps(&data->inverseDirection[1][ray]) );
+ t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1,
_MM_SHUFFLE(1,1,1,1)),
+ _mm_load_ps(&data->origin[1][ray])),
+ _mm_load_ps(&data->inverseDirection[1][ray]) );
+
+ tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
+ tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+
+ t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0,
_MM_SHUFFLE(2,2,2,2)),
+ _mm_load_ps(&data->origin[2][ray])),
+ _mm_load_ps(&data->inverseDirection[2][ray]) );
+ t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1,
_MM_SHUFFLE(2,2,2,2)),
+ _mm_load_ps(&data->origin[2][ray])),
+ _mm_load_ps(&data->inverseDirection[2][ray]) );
+
+ tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
+ tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+
+ __m128 valid_intersect = _mm_cmplt_ps(tmin, tmax);
+ if (_mm_movemask_ps(valid_intersect) != 0x0)
+ return ray;
+ }
+#endif
+ // get remaining rays
+ for (int ray = sse_begin+1; ray < rays.end(); ++ray) {
+ float maximum_minimum = 1e-5;
+ float minimum_maximum = rays.getMinT(ray);
+
+ float x_minimum = (box[rays.getSign(ray,0)][0] -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
+ float x_maximum = (box[1-rays.getSign(ray,0)][0] -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
+
+ float y_minimum = (box[rays.getSign(ray,1)][1] -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
+ float y_maximum = (box[1-rays.getSign(ray,1)][1] -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
+
+ float z_minimum = (box[rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
+ float z_maximum = (box[1-rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
+
+ if ( minimum_maximum < x_minimum ||
+ maximum_minimum > x_maximum )
+ continue;
+ if ( minimum_maximum > x_maximum )
+ minimum_maximum = x_maximum;
+ if ( maximum_minimum < x_minimum )
+ maximum_minimum = x_minimum;
+ if ( minimum_maximum < y_minimum ||
+ maximum_minimum > y_maximum )
+ continue;
+ if ( minimum_maximum > y_maximum )
+ minimum_maximum = y_maximum;
+ if ( maximum_minimum < y_minimum )
+ maximum_minimum = y_minimum;
+
+ if ( minimum_maximum >= z_minimum &&
+ maximum_minimum <= z_maximum )
+ return ray; // found a hit
}
+#endif
return rays.end();
}
Modified: trunk/Model/Primitives/WaldTriangle.cc
==============================================================================
--- trunk/Model/Primitives/WaldTriangle.cc (original)
+++ trunk/Model/Primitives/WaldTriangle.cc Sat Aug 12 01:04:12 2006
@@ -9,14 +9,23 @@
using namespace std;
-#define USE_SIMD 0 // off by default
-
-#ifdef __APPLE__
+#ifdef __POWERPC__
#define USE_ALTIVEC 1
-#define USE_SSE 0 // note this won't work for x86 apple...
+#define USE_SSE 0
+#else
+#ifdef MANTA_SSE
+#define USE_ALTIVEC 0
+#define USE_SSE 0
+#define USE_SIMD 0
#else
#define USE_ALTIVEC 0
-#define USE_SSE 1
+#define USE_SSE 0
+#define USE_SIMD 0
+#endif
+#endif
+
+#ifndef USE_SIMD
+#define USE_SIMD 0 // off by default
#endif
#define USE_CSIMD (USE_SIMD && !USE_ALTIVEC && !USE_SSE)
@@ -474,7 +483,7 @@
// first SSE aligned ray in the packet of rays:
const int sse_begin = rays.begin() >> 2; // equivalent to
Floor(rays.begin()/4)
- const int sse_end = ((rays.end()-1+3) >> 2); // Ceil(rays.end()-1/4)
+ const int sse_end = ((rays.end()+3) >> 2); // Ceil(rays.end()-1/4)
const int ray_begin = rays.begin();
const int ray_end = rays.end();
- [MANTA] r1168 - in trunk/Model: Groups Primitives, boulos, 08/12/2006
Archive powered by MHonArc 2.6.16.