manta - [MANTA] r1170 - trunk/Model/Groups

Closed list
Subscribers: 0
Owners

sparker

thiago

Subscribe
Unsubscribe
Info
Admin
Archive

Post

Shared documents

Manta Interactive Ray Tracer Development Mailing List

Text archives Help

[MANTA] r1170 - trunk/Model/Groups

From: boulos@sci.utah.edu
To: manta@sci.utah.edu
Subject: [MANTA] r1170 - trunk/Model/Groups
Date: Sun, 13 Aug 2006 17:26:17 -0600 (MDT)

Author: boulos
Date: Sun Aug 13 17:26:15 2006
New Revision: 1170

Modified:
   trunk/Model/Groups/DynBVH.cc
Log:
Fixed bugs in the single-scalar SSE code. When
compared to a new, friendly C version, it's only
barely faster (gcc uses single-scalar SSE too).

The interval arithmetic is a big win, though. The
relative performance improvements of the various
#ifdef paths are noted in comments throughout the code.

Modified: trunk/Model/Groups/DynBVH.cc
==============================================================================
--- trunk/Model/Groups/DynBVH.cc        (original)
+++ trunk/Model/Groups/DynBVH.cc        Sun Aug 13 17:26:15 2006
@@ -24,6 +24,7 @@
         ia_data.max_org_rcp[axis] = -DBL_MAX;
     }

+    // TODO(boulos): provide an SSE version
     for (int ray = rays.begin(); ray < rays.end(); ray++ )
     {
         for (int axis = 0; axis < 3; axis++)
@@ -97,14 +98,13 @@
     // we always want to do the first ray and IA in C not SIMD
     for (int ray = rays.begin(); ray < rays.end(); ray++ )
     {
-#if (0 && DYNBVH_NEW_SSE) // for whatever reason this code doesn't work.
+#if (0 && DYNBVH_NEW_SSE) // hand written single scalar is slightly faster than C version
         float temp_epsilon = 1e-5f;
         __m128 tmin = _mm_load_ss(&temp_epsilon);
-        float check_load;
         __m128 tmax = _mm_load_ss(&data->minT[ray]);

-#if 1
-        for (int c = 0; c < 3; ++c) {
+#if 1 // rolled vs unrolled makes no difference
+        for (int c = 0; c < 3; c++) {
             __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][c])),

_mm_load_ss(&data->origin[c][ray])),

_mm_load_ss(&data->inverseDirection[c][ray]) );
@@ -113,8 +113,8 @@

_mm_load_ss(&data->origin[c][ray])),

_mm_load_ss(&data->inverseDirection[c][ray]) );

-            tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
-            tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+            tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+            tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
         }
#else
         __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][0])),
@@ -125,8 +125,8 @@

_mm_load_ss(&data->origin[0][ray])),
                                 _mm_load_ss(&data->inverseDirection[0][ray])
);

-        tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
-        tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+        tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+        tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));

         t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][1])),
                                      _mm_load_ss(&data->origin[1][ray])),
@@ -136,8 +136,8 @@
                                      _mm_load_ss(&data->origin[1][ray])),
                          _mm_load_ss(&data->inverseDirection[1][ray]) );

-        tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
-        tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+        tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+        tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));

         t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][2])),
                                      _mm_load_ss(&data->origin[2][ray])),
@@ -147,23 +147,41 @@
                                      _mm_load_ss(&data->origin[2][ray])),
                          _mm_load_ss(&data->inverseDirection[2][ray]) );

-        tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
-        tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+        tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
+        tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
#endif // USE_UNROLLED or not
-#if 0
-        __m128 valid_intersect = _mm_cmplt_ss(tmin, tmax);
+#if 0 // for whatever reason this code path is slower
+        __m128 valid_intersect = _mm_cmple_ss(tmin, tmax);
         float result;
         _mm_store_ss(&result, valid_intersect);
-        if (result != 0.f)
+        if (isnan(result))
             return ray;
#else
         float vals[2];
         _mm_store_ss(&(vals[0]), tmin);
         _mm_store_ss(&(vals[1]), tmax);
-        if (vals[0] < vals[1])
+        if (vals[0] <= vals[1])
             return ray;
#endif
#else
+#if 1 // use min,max (total fps about 10% faster on erw6 on macbook)
+        float tmin = 1e-5f;
+        float tmax = rays.getMinT(ray);
+
+        for (int c = 0; c < 3; c++) {
+            float t0 = (box[0][c] - rays.getOrigin(ray,c)) * rays.getInverseDirection(ray,c);
+            float t1 = (box[1][c] - rays.getOrigin(ray,c)) * rays.getInverseDirection(ray,c);
+
+            float near = (t0 < t1) ? t0 : t1;
+            float far  = (t0 < t1) ? t1 : t0;
+            //tmin = (tmin < near) ? near : tmin; // max of tmin, near
+            //tmax = (far <  tmax) ? far : tmax;  // min of tmax, far
+            tmin = (near < tmin) ? tmin : near; // non nan-safe max of tmin, near
+            tmax = (tmax < far)  ? tmax : far; // non nan-safe min of tmax, far
+        }
+        if (tmin <= tmax) // valid intersection
+            return ray;
+#else
         float maximum_minimum = 1e-5;
         float minimum_maximum = rays.getMinT(ray);

@@ -199,8 +217,9 @@
         if ( minimum_maximum >= z_minimum &&
              maximum_minimum <= z_maximum )
             return ray; // found a hit
+#endif // use min/max or williams
#endif
-#if 1 // enable/disable frustum test
+#if 1 // enable/disable frustum test (goes from 15 fps to 21 fps)
         if (ray == rays.begin())
         {
             // try a frustum miss

[MANTA] r1170 - trunk/Model/Groups, boulos, 08/13/2006

Archive powered by MHonArc 2.6.16.