Manta Interactive Ray Tracer Development Mailing List

Text archives Help


[MANTA] r1168 - in trunk/Model: Groups Primitives


Chronological Thread 
  • From: boulos@sci.utah.edu
  • To: manta@sci.utah.edu
  • Subject: [MANTA] r1168 - in trunk/Model: Groups Primitives
  • Date: Sat, 12 Aug 2006 01:04:14 -0600 (MDT)

Author: boulos
Date: Sat Aug 12 01:04:12 2006
New Revision: 1168

Modified:
   trunk/Model/Groups/DynBVH.cc
   trunk/Model/Primitives/WaldTriangle.cc
Log:
Adding some SSE versions of the DynBVH traversal.
The single-ray scalar SSE version of first hit is not working yet,
but it is #ifdef'ed out.

Can't seem to use the macro'ed SIMD code in
WaldTriangle, so switching to USE_SIMD 0 unless on an
AltiVec system (now detected with __POWERPC__).


Modified: trunk/Model/Groups/DynBVH.cc
==============================================================================
--- trunk/Model/Groups/DynBVH.cc        (original)
+++ trunk/Model/Groups/DynBVH.cc        Sat Aug 12 01:04:12 2006
@@ -87,8 +87,83 @@
 // return the first index (between [rays.begin(),rays.end()]) which hits the 
box
 int DynBVH::firstIntersects(const BBox& box, const RayPacket& rays, const 
IAData& ia_data) const
 {
+#define DYNBVH_NEW_SSE MANTA_SSE
+
+#if DYNBVH_NEW_SSE
+    int sse_begin = (rays.begin() + 3)&(~3);
+    int sse_end   = rays.end() & (~3);
+    const RayPacketData* data = rays.data;
+#endif
+    // we always want to do the first ray and IA in C not SIMD
     for (int ray = rays.begin(); ray < rays.end(); ray++ )
     {
+#if (0 && DYNBVH_NEW_SSE) // for whatever reason this code doesn't work.
+        float temp_epsilon = 1e-5f;
+        __m128 tmin = _mm_load_ss(&temp_epsilon);
+        float check_load;
+        __m128 tmax = _mm_load_ss(&data->minT[ray]);
+
+#if 1
+        for (int c = 0; c < 3; ++c) {
+            __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][c])),
+                                                
_mm_load_ss(&data->origin[c][ray])),
+                                    
_mm_load_ss(&data->inverseDirection[c][ray]) );
+
+            __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][c])),
+                                                
_mm_load_ss(&data->origin[c][ray])),
+                                    
_mm_load_ss(&data->inverseDirection[c][ray]) );
+
+            tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+            tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+        }
+#else
+        __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][0])),
+                                            
_mm_load_ss(&data->origin[0][ray])),
+                                _mm_load_ss(&data->inverseDirection[0][ray]) 
);
+
+        __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][0])),
+                                            
_mm_load_ss(&data->origin[0][ray])),
+                                _mm_load_ss(&data->inverseDirection[0][ray]) 
);
+
+        tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+        tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+
+        t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][1])),
+                                     _mm_load_ss(&data->origin[1][ray])),
+                         _mm_load_ss(&data->inverseDirection[1][ray]) );
+
+        t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][1])),
+                                     _mm_load_ss(&data->origin[1][ray])),
+                         _mm_load_ss(&data->inverseDirection[1][ray]) );
+
+        tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+        tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+
+        t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][2])),
+                                     _mm_load_ss(&data->origin[2][ray])),
+                         _mm_load_ss(&data->inverseDirection[2][ray]) );
+
+        t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][2])),
+                                     _mm_load_ss(&data->origin[2][ray])),
+                         _mm_load_ss(&data->inverseDirection[2][ray]) );
+
+        tmin = _mm_max_ss(tmin, _mm_max_ss(t0, t1));
+        tmax = _mm_min_ss(tmax, _mm_min_ss(t0, t1));
+#endif // USE_UNROLLED or not
+#if 0
+        __m128 valid_intersect = _mm_cmplt_ss(tmin, tmax);
+        float result;
+        _mm_store_ss(&result, valid_intersect);
+        if (result != 0.f)
+            return ray;
+#else
+        float vals[2];
+        _mm_store_ss(&(vals[0]), tmin);
+        _mm_store_ss(&(vals[1]), tmax);
+        if (vals[0] < vals[1])
+            return ray;
+#endif
+#else
         float maximum_minimum = 1e-5;
         float minimum_maximum = rays.getMinT(ray);
 
@@ -101,16 +176,21 @@
         float z_minimum = (box[rays.getSign(ray,2)][2]   - 
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
         float z_maximum = (box[1-rays.getSign(ray,2)][2] - 
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
 
+        // Note: we don't want to exit early since we might skip the frustum 
test
+/*
         if ( minimum_maximum < x_minimum ||
              maximum_minimum > x_maximum )
             continue;
+*/
         if ( minimum_maximum > x_maximum )
             minimum_maximum = x_maximum;
         if ( maximum_minimum < x_minimum )
             maximum_minimum = x_minimum;
+/*
         if ( minimum_maximum < y_minimum ||
              maximum_minimum > y_maximum )
             continue;
+*/
         if ( minimum_maximum > y_maximum )
                 minimum_maximum = y_maximum;
         if ( maximum_minimum < y_minimum )
@@ -119,7 +199,8 @@
         if ( minimum_maximum >= z_minimum &&
              maximum_minimum <= z_maximum )
             return ray; // found a hit
-
+#endif
+#if 1 // enable/disable frustum test
         if (ray == rays.begin())
         {
             // try a frustum miss
@@ -151,8 +232,140 @@
                 return rays.end();
             }
         }
+#endif // do frustum test
+#if (DYNBVH_NEW_SSE && 1)
+        // if we can use simd now, jump out (redoes some work for SIMD 
aligned rays)
+        if ( ray == sse_begin || ray == sse_begin - 1 )
+            break;
+#endif
+    }
+
+#if DYNBVH_NEW_SSE
+    // process simds now
+    int pack_begin = sse_begin >> 2;
+    int pack_end   = sse_end   >> 2;
+    // TODO(boulos): replace operator overloads with direct access
+#if 0
+    __m128 box_x0 = _mm_set1_ps(box[0][0]);
+    __m128 box_x1 = _mm_set1_ps(box[1][0]);
 
+    __m128 box_y0 = _mm_set1_ps(box[0][1]);
+    __m128 box_y1 = _mm_set1_ps(box[1][1]);
+
+    __m128 box_z0 = _mm_set1_ps(box[0][2]);
+    __m128 box_z1 = _mm_set1_ps(box[1][2]);
+
+    for (int pack = pack_begin, ray = sse_begin; pack < pack_end; ++pack, 
ray += 4)
+    {
+        __m128 x0 = _mm_mul_ps(_mm_sub_ps(box_x0, 
_mm_load_ps(&data->origin[0][ray])),
+                               _mm_load_ps(&data->inverseDirection[0][ray]));
+        __m128 x1 = _mm_mul_ps(_mm_sub_ps(box_x1, 
_mm_load_ps(&data->origin[0][ray])),
+                               _mm_load_ps(&data->inverseDirection[0][ray]));
+
+        __m128 xmin = _mm_min_ps(x0,x1);
+        __m128 xmax = _mm_max_ps(x0,x1);
+
+        __m128 y0 = _mm_mul_ps(_mm_sub_ps(box_y0, 
_mm_load_ps(&data->origin[1][ray])),
+                               _mm_load_ps(&data->inverseDirection[1][ray]));
+        __m128 y1 = _mm_mul_ps(_mm_sub_ps(box_y1, 
_mm_load_ps(&data->origin[1][ray])),
+                               _mm_load_ps(&data->inverseDirection[1][ray]));
+
+        __m128 ymin = _mm_min_ps(y0,y1);
+        __m128 ymax = _mm_max_ps(y0,y1);
+
+        __m128 z0 = _mm_mul_ps(_mm_sub_ps(box_z0, 
_mm_load_ps(&data->origin[2][ray])),
+                               _mm_load_ps(&data->inverseDirection[2][ray]));
+        __m128 z1 = _mm_mul_ps(_mm_sub_ps(box_z1, 
_mm_load_ps(&data->origin[2][ray])),
+                               _mm_load_ps(&data->inverseDirection[2][ray]));
+
+        __m128 zmin = _mm_min_ps(z0,z1);
+        __m128 zmax = _mm_max_ps(z0,z1);
+
+        __m128 maximum_minimum = 
_mm_max_ps(xmin,_mm_max_ps(ymin,_mm_max_ps(zmin, _mm_set1_ps(1e-5f))));
+        __m128 minimum_maximum = 
_mm_min_ps(xmax,_mm_min_ps(ymax,_mm_min_ps(zmax,_mm_load_ps(&data->minT[ray]))));
+        __m128 valid_intersect = 
_mm_cmplt_ps(maximum_minimum,minimum_maximum);
+        if (_mm_movemask_ps(valid_intersect) != 0x0)
+            return ray;
+    }
+#else // different more register efficient version
+    // two regs for box
+    __m128 box0 = _mm_set_ps(0.f, box[0][2], box[0][1], box[0][0]);
+    __m128 box1 = _mm_set_ps(0.f, box[1][2], box[1][1], box[1][0]);
+    for (int pack = pack_begin, ray = sse_begin; pack < pack_end; ++pack, 
ray += 4) {
+        // two regs for interval tracking
+        __m128 tmin = _mm_set1_ps(1e-5f);
+        __m128 tmax = _mm_load_ps(&data->minT[ray]);
+        // current is 4
+        // 1 for inv dir, 1 for ray org, 1 for shuffle output
+
+        __m128 t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0, 
_MM_SHUFFLE(0,0,0,0)),
+                                            
_mm_load_ps(&data->origin[0][ray])),
+                                _mm_load_ps(&data->inverseDirection[0][ray]) 
);
+        __m128 t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1, 
_MM_SHUFFLE(0,0,0,0)),
+                                            
_mm_load_ps(&data->origin[0][ray])),
+                                _mm_load_ps(&data->inverseDirection[0][ray]) 
);
+        tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
+        tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+
+        t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0, 
_MM_SHUFFLE(1,1,1,1)),
+                                     _mm_load_ps(&data->origin[1][ray])),
+                         _mm_load_ps(&data->inverseDirection[1][ray]) );
+        t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1, 
_MM_SHUFFLE(1,1,1,1)),
+                                     _mm_load_ps(&data->origin[1][ray])),
+                         _mm_load_ps(&data->inverseDirection[1][ray]) );
+
+        tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
+        tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+
+        t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0, 
_MM_SHUFFLE(2,2,2,2)),
+                                     _mm_load_ps(&data->origin[2][ray])),
+                         _mm_load_ps(&data->inverseDirection[2][ray]) );
+        t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1, 
_MM_SHUFFLE(2,2,2,2)),
+                                     _mm_load_ps(&data->origin[2][ray])),
+                         _mm_load_ps(&data->inverseDirection[2][ray]) );
+
+        tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
+        tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+
+        __m128 valid_intersect = _mm_cmplt_ps(tmin, tmax);
+        if (_mm_movemask_ps(valid_intersect) != 0x0)
+            return ray;
+    }
+#endif
+    // get remaining rays
+    for (int ray = sse_begin+1; ray < rays.end(); ++ray) {
+        float maximum_minimum = 1e-5;
+        float minimum_maximum = rays.getMinT(ray);
+
+        float x_minimum = (box[rays.getSign(ray,0)][0]   - 
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
+        float x_maximum = (box[1-rays.getSign(ray,0)][0] - 
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
+
+        float y_minimum = (box[rays.getSign(ray,1)][1]   - 
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
+        float y_maximum = (box[1-rays.getSign(ray,1)][1] - 
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
+
+        float z_minimum = (box[rays.getSign(ray,2)][2]   - 
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
+        float z_maximum = (box[1-rays.getSign(ray,2)][2] - 
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
+
+        if ( minimum_maximum < x_minimum ||
+             maximum_minimum > x_maximum )
+            continue;
+        if ( minimum_maximum > x_maximum )
+            minimum_maximum = x_maximum;
+        if ( maximum_minimum < x_minimum )
+            maximum_minimum = x_minimum;
+        if ( minimum_maximum < y_minimum ||
+             maximum_minimum > y_maximum )
+            continue;
+        if ( minimum_maximum > y_maximum )
+                minimum_maximum = y_maximum;
+        if ( maximum_minimum < y_minimum )
+            maximum_minimum = y_minimum;
+
+        if ( minimum_maximum >= z_minimum &&
+             maximum_minimum <= z_maximum )
+            return ray; // found a hit
     }
+#endif
     return rays.end();
 }
 

Modified: trunk/Model/Primitives/WaldTriangle.cc
==============================================================================
--- trunk/Model/Primitives/WaldTriangle.cc      (original)
+++ trunk/Model/Primitives/WaldTriangle.cc      Sat Aug 12 01:04:12 2006
@@ -9,14 +9,23 @@
 using namespace std;
 
 
-#define USE_SIMD    0 // off by default
-
-#ifdef __APPLE__
+#ifdef __POWERPC__
 #define USE_ALTIVEC 1
-#define USE_SSE     0 // note this won't work for x86 apple...
+#define USE_SSE     0
+#else
+#ifdef MANTA_SSE
+#define USE_ALTIVEC 0
+#define USE_SSE     0
+#define USE_SIMD    0
 #else
 #define USE_ALTIVEC 0
-#define USE_SSE     1
+#define USE_SSE     0
+#define USE_SIMD    0
+#endif
+#endif
+
+#ifndef USE_SIMD
+#define USE_SIMD    0 // off by default
 #endif
 
 #define USE_CSIMD   (USE_SIMD && !USE_ALTIVEC && !USE_SSE)
@@ -474,7 +483,7 @@
 
    // first SSE aligned ray in the packet of rays:
    const int sse_begin = rays.begin() >> 2; // equivalent to 
Floor(rays.begin()/4)
-   const int sse_end   = ((rays.end()-1+3) >> 2); // Ceil(rays.end()-1/4)
+   const int sse_end   = ((rays.end()+3) >> 2); // Ceil(rays.end()-1/4)
 
    const int ray_begin = rays.begin();
    const int ray_end   = rays.end();




  • [MANTA] r1168 - in trunk/Model: Groups Primitives, boulos, 08/12/2006

Archive powered by MHonArc 2.6.16.

Top of page