Text archives Help
- From: boulos@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [MANTA] r1170 - trunk/Model/Groups
- Date: Sun, 13 Aug 2006 17:26:17 -0600 (MDT)
Author: boulos
Date: Sun Aug 13 17:26:15 2006
New Revision: 1170
Modified:
trunk/Model/Groups/DynBVH.cc
Log:
Fixed bugs in the single-scalar SSE code. Compared
to a new, friendly C version it is only barely
faster (gcc uses single-scalar SSE too). The
interval arithmetic is a big win, though. The
relative performance improvements of the various
#ifdefs are noted in comments throughout the code.
Modified: trunk/Model/Groups/DynBVH.cc
==============================================================================
--- trunk/Model/Groups/DynBVH.cc (original)
+++ trunk/Model/Groups/DynBVH.cc Sun Aug 13 17:26:15 2006
@@ -24,6 +24,7 @@
ia_data.max_org_rcp[axis] = -DBL_MAX;
}
+ // TODO(boulos): provide an SSE version
for (int ray = rays.begin(); ray < rays.end(); ray++ )
{
for (int axis = 0; axis < 3; axis++)
@@ -97,14 +98,13 @@
// we always want to do the first ray and IA in C not SIMD
for (int ray = rays.begin(); ray < rays.end(); ray++ )
{
-#if (0 && DYNBVH_NEW_SSE) // for whatever reason this code doesn't work.
+#if (0 && DYNBVH_NEW_SSE) // hand written single scalar is slightly faster than C version
float temp_epsilon = 1e-5f;
__m128 tmin = _mm_load_ss(&temp_epsilon);
- float check_load;
__m128 tmax = _mm_load_ss(&data->minT[ray]);
-#if 1
- for (int c = 0; c < 3; ++c) {
+#if 1 // rolled vs unrolled makes no difference
+ for (int c = 0; c < 3; c++) {
__m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][c])),
_mm_load_ss(&data->origin[c][ray])),
_mm_load_ss(&data->inverseDirection[c][ray]) );
@@ -113,8 +113,8 @@
_mm_load_ss(&data->origin[c][ray])),
_mm_load_ss(&data->inverseDirection[c][ray]) );
- tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+ tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
}
#else
__m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][0])),
@@ -125,8 +125,8 @@
_mm_load_ss(&data->origin[0][ray])),
_mm_load_ss(&data->inverseDirection[0][ray])
);
- tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+ tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][1])),
_mm_load_ss(&data->origin[1][ray])),
@@ -136,8 +136,8 @@
_mm_load_ss(&data->origin[1][ray])),
_mm_load_ss(&data->inverseDirection[1][ray]) );
- tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+ tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][2])),
_mm_load_ss(&data->origin[2][ray])),
@@ -147,23 +147,41 @@
_mm_load_ss(&data->origin[2][ray])),
_mm_load_ss(&data->inverseDirection[2][ray]) );
- tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+ tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+ tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
#endif // USE_UNROLLED or not
-#if 0
- __m128 valid_intersect = _mm_cmplt_ss(tmin, tmax);
+#if 0 // for whatever reason this code path is slower
+ __m128 valid_intersect = _mm_cmple_ss(tmin, tmax);
float result;
_mm_store_ss(&result, valid_intersect);
- if (result != 0.f)
+ if (isnan(result))
return ray;
#else
float vals[2];
_mm_store_ss(&(vals[0]), tmin);
_mm_store_ss(&(vals[1]), tmax);
- if (vals[0] < vals[1])
+ if (vals[0] <= vals[1])
return ray;
#endif
#else
+#if 1 // use min,max (total fps about 10% faster on erw6 on macbook)
+ float tmin = 1e-5f;
+ float tmax = rays.getMinT(ray);
+
+ for (int c = 0; c < 3; c++) {
+ float t0 = (box[0][c] - rays.getOrigin(ray,c)) * rays.getInverseDirection(ray,c);
+ float t1 = (box[1][c] - rays.getOrigin(ray,c)) * rays.getInverseDirection(ray,c);
+
+ float near = (t0 < t1) ? t0 : t1;
+ float far = (t0 < t1) ? t1 : t0;
+ //tmin = (tmin < near) ? near : tmin; // max of tmin, near
+ //tmax = (far < tmax) ? far : tmax; // min of tmax, far
+ tmin = (near < tmin) ? tmin : near; // non nan-safe max of tmin, near
+ tmax = (tmax < far) ? tmax : far; // non nan-safe min of tmax, far
+ }
+ if (tmin <= tmax) // valid intersection
+ return ray;
+#else
float maximum_minimum = 1e-5;
float minimum_maximum = rays.getMinT(ray);
@@ -199,8 +217,9 @@
if ( minimum_maximum >= z_minimum &&
maximum_minimum <= z_maximum )
return ray; // found a hit
+#endif // use min/max or williams
#endif
-#if 1 // enable/disable frustum test
+#if 1 // enable/disable frustum test (goes from 15 fps to 21 fps)
if (ray == rays.begin())
{
// try a frustum miss
- [MANTA] r1170 - trunk/Model/Groups, boulos, 08/13/2006
Archive powered by MHonArc 2.6.16.