manta - [MANTA] r1230 - trunk/Model/Groups

Closed list
Subscribers: 0
Owners

sparker

thiago

Subscribe
Unsubscribe
Info
Admin
Archive

Post

Shared documents

Manta Interactive Ray Tracer Development Mailing List

Text archives Help

[MANTA] r1230 - trunk/Model/Groups

From: boulos@sci.utah.edu
To: manta@sci.utah.edu
Subject: [MANTA] r1230 - trunk/Model/Groups
Date: Sun, 29 Oct 2006 13:00:03 -0700 (MST)

Author: boulos
Date: Sun Oct 29 13:00:02 2006
New Revision: 1230

Modified:
   trunk/Model/Groups/DynBVH.cc
   trunk/Model/Groups/DynBVH.h
Log:
Fixing bug in alternate ray-box intersection code.  Now
both the "less registers" and "more registers" code are
equally slow.

Modified: trunk/Model/Groups/DynBVH.cc
==============================================================================
--- trunk/Model/Groups/DynBVH.cc        (original)
+++ trunk/Model/Groups/DynBVH.cc        Sun Oct 29 13:00:02 2006
@@ -32,23 +32,101 @@
         for (int axis = 0; axis < 3; axis++)
         {
             const Real new_rcp     = rays.getInverseDirection(ray, axis);
-            const Real new_org_rcp = rays.getOrigin(ray,axis) * new_rcp;
-
+            const Real new_org     = rays.getOrigin(ray,axis);
+            const Real new_org_rcp = new_org * new_rcp;
+#define USE_STDMINMAX 0
+#if USE_STDMINMAX
             ia_data.min_rcp[axis] = std::min(ia_data.min_rcp[axis], new_rcp);
             ia_data.max_rcp[axis] = std::max(ia_data.max_rcp[axis], new_rcp);

+            ia_data.min_org[axis] = std::min(ia_data.min_org[axis], new_org);
+            ia_data.max_org[axis] = std::max(ia_data.max_org[axis], new_org);
+
             ia_data.min_org_rcp[axis] = std::min(ia_data.min_org_rcp[axis],
new_org_rcp);
             ia_data.max_org_rcp[axis] = std::max(ia_data.max_org_rcp[axis],
new_org_rcp);
+#else
+            ia_data.min_rcp[axis] = (ia_data.min_rcp[axis] < new_rcp) ?
ia_data.min_rcp[axis] : new_rcp;
+            ia_data.max_rcp[axis] = (ia_data.max_rcp[axis] < new_rcp) ?
new_rcp : ia_data.max_rcp[axis];
+
+            ia_data.min_org[axis] = (ia_data.min_org[axis] < new_org) ?
ia_data.min_org[axis] : new_org;
+            ia_data.max_org[axis] = (ia_data.max_org[axis] < new_org) ?
new_org : ia_data.max_org[axis];
+
+            ia_data.min_org_rcp[axis] = (ia_data.min_org_rcp[axis] <
new_org_rcp) ?
+                ia_data.min_org_rcp[axis] : new_org_rcp;
+            ia_data.max_org_rcp[axis] = (ia_data.max_org_rcp[axis] <
new_org_rcp) ?
+                new_org_rcp : ia_data.max_org_rcp[axis];
+#endif
         }
     }

+#define USE_ASSERTED_PARENT 0
+
+#if USE_ASSERTED_PARENT
+    const BVHNode& root_node = nodes[0];
+    float tmin;
+    int firstActive = firstIntersects(root_node.bounds, rays, ia_data,
&tmin);
+    if (firstActive != rays.end()) {
+        if (root_node.isLeaf()) {
+            int lastActive = lastIntersects(root_node.bounds, rays);
+            // build a subpacket from firstActive to lastActive (inclusive,
hence +1)
+            RayPacket subpacket(rays, firstActive, lastActive+1);
+
+            for (int i = 0; i < root_node.children; i++ )
+            {
+                const int object_id = object_ids[root_node.child+i];
+                this->get(object_id)->intersect(context,subpacket);
+            }
+        } else {
+            RayPacket subpacket(rays, firstActive, rays.end());
+            intersectNode(root_node.child, context, subpacket, ia_data);
+        }
+    }
+#else
     intersectNode(0,context,rays, ia_data);
+#endif
}

+#if USE_ASSERTED_PARENT
+void DynBVH::intersectNode(int nodeID, const RenderContext& context,
RayPacket& rays, const IAData& ia_data) const {
+    const BVHNode& left_node  = nodes[nodeID+0];
+    const BVHNode& right_node = nodes[nodeID+1];
+    float left_tmin, right_tmin;
+
+    int leftFirstActive  = firstIntersects(left_node.bounds,  rays, ia_data,
&left_tmin);
+    int rightFirstActive = firstIntersects(right_node.bounds, rays, ia_data,
&right_tmin);
+
+    const BVHNode* child_nodes[2] = { (left_tmin < right_tmin) ? &left_node
: &right_node,
+                                      (left_tmin < right_tmin) ? &right_node
: &left_node };
+    int firstActive[2] = { (left_tmin < right_tmin) ? leftFirstActive  :
rightFirstActive,
+                           (left_tmin < right_tmin) ? rightFirstActive :
leftFirstActive };
+
+    for (int which = 0; which < 2; ++which) {
+        if (firstActive[which] != rays.end()) {
+            const BVHNode* node = child_nodes[which];
+            if (node->isLeaf()) {
+                int lastActive = lastIntersects(node->bounds, rays);
+
+                // build a subpacket from firstActive to lastActive
(inclusive, hence +1)
+                RayPacket subpacket(rays, firstActive[which], lastActive+1);
+
+                for (int i = 0; i < node->children; i++ )
+                {
+                    const int object_id = object_ids[node->child+i];
+                    this->get(object_id)->intersect(context,subpacket);
+                }
+            } else {
+                RayPacket subpacket(rays, firstActive[which], rays.end());
+                intersectNode(node->child, context, subpacket, ia_data);
+            }
+        }
+    }
+}
+#else
void DynBVH::intersectNode(int nodeID, const RenderContext& context,
RayPacket& rays, const IAData& ia_data) const
{
     const BVHNode& node = nodes[nodeID];
-    int firstActive = firstIntersects(node.bounds, rays, ia_data);
+    float tmin;
+    int firstActive = firstIntersects(node.bounds, rays, ia_data, &tmin);

     if (firstActive != rays.end())
     {
@@ -78,17 +156,30 @@
             RayPacket subpacket(rays, firstActive, rays.end());

             // recurse
+#if 0 // for erw6 this actually hurts slightly
+            int sign_bit = subpacket.getSign(subpacket.begin(),
static_cast<int>(node.axis));
+            // in manta sign_bit is 1 if direction is < 0
+            // if signbit is < 0, want left child first
+            intersectNode(node.child+(1-sign_bit), context, subpacket,
ia_data);
+            intersectNode(node.child+sign_bit, context, subpacket, ia_data);
+#else
+#if 1
+            int front_son = subpacket.getDirection(subpacket.begin(),
+
static_cast<int>(node.axis)) > 0 ? 0 : 1;
+            intersectNode(node.child+front_son, context, subpacket, ia_data);
+            intersectNode(node.child+1-front_son, context, subpacket,
ia_data);
+#else
             intersectNode(node.child+0, context, subpacket, ia_data);
             intersectNode(node.child+1, context, subpacket, ia_data);
+#endif
+#endif
+
         }
     }
}
-
-// TODO: add interval arithmetic tests so that we can skip big portions?
maybe collapse the
-// forward and backward search?
-
+#endif
// return the first index (between [rays.begin(),rays.end()]) which hits the
box
-int DynBVH::firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data) const
+int DynBVH::firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data, float* out_tmin) const
{
#define DYNBVH_NEW_SSE MANTA_SSE

@@ -100,73 +191,6 @@
     // we always want to do the first ray and IA in C not SIMD
     for (int ray = rays.begin(); ray < rays.end(); ray++ )
     {
-#if (0 && DYNBVH_NEW_SSE) // hand written single scalar is slightly faster
than C version
-        float temp_epsilon = 1e-5f;
-        __m128 tmin = _mm_load_ss(&temp_epsilon);
-        __m128 tmax = _mm_load_ss(&data->minT[ray]);
-
-#if 1 // rolled vs unrolled makes no difference
-        for (int c = 0; c < 3; c++) {
-            __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][c])),
-
_mm_load_ss(&data->origin[c][ray])),
-
_mm_load_ss(&data->inverseDirection[c][ray]) );
-
-            __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][c])),
-
_mm_load_ss(&data->origin[c][ray])),
-
_mm_load_ss(&data->inverseDirection[c][ray]) );
-
-            tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
-            tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
-        }
-#else
-        __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][0])),
-
_mm_load_ss(&data->origin[0][ray])),
-                                _mm_load_ss(&data->inverseDirection[0][ray])
);
-
-        __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][0])),
-
_mm_load_ss(&data->origin[0][ray])),
-                                _mm_load_ss(&data->inverseDirection[0][ray])
);
-
-        tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
-        tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
-
-        t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][1])),
-                                     _mm_load_ss(&data->origin[1][ray])),
-                         _mm_load_ss(&data->inverseDirection[1][ray]) );
-
-        t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][1])),
-                                     _mm_load_ss(&data->origin[1][ray])),
-                         _mm_load_ss(&data->inverseDirection[1][ray]) );
-
-        tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
-        tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
-
-        t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][2])),
-                                     _mm_load_ss(&data->origin[2][ray])),
-                         _mm_load_ss(&data->inverseDirection[2][ray]) );
-
-        t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][2])),
-                                     _mm_load_ss(&data->origin[2][ray])),
-                         _mm_load_ss(&data->inverseDirection[2][ray]) );
-
-        tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
-        tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
-#endif // USE_UNROLLED or not
-#if 0 // for whatever reason this code path is slower
-        __m128 valid_intersect = _mm_cmple_ss(tmin, tmax);
-        float result;
-        _mm_store_ss(&result, valid_intersect);
-        if (isnan(result))
-            return ray;
-#else
-        float vals[2];
-        _mm_store_ss(&(vals[0]), tmin);
-        _mm_store_ss(&(vals[1]), tmax);
-        if (vals[0] <= vals[1])
-            return ray;
-#endif
-#else
-#if 1 // use min,max (total fps about 10% faster on erw6 on macbook)
         float tmin = 1e-5f;
         float tmax = rays.getMinT(ray);

@@ -181,46 +205,11 @@
             tmin = (near < tmin) ? tmin : near; // non nan-safe max of tmin,
near
             tmax = (tmax < far)  ? tmax : far; // non nan-safe min of tmax,
far
         }
-        if (tmin <= tmax) // valid intersection
+        if (tmin <= tmax) {  // valid intersection
+            *out_tmin = tmin;
             return ray;
-#else
-        float maximum_minimum = 1e-5;
-        float minimum_maximum = rays.getMinT(ray);
-
-        float x_minimum = (box[rays.getSign(ray,0)][0]   -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
-        float x_maximum = (box[1-rays.getSign(ray,0)][0] -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
-
-        float y_minimum = (box[rays.getSign(ray,1)][1]   -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
-        float y_maximum = (box[1-rays.getSign(ray,1)][1] -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
-
-        float z_minimum = (box[rays.getSign(ray,2)][2]   -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
-        float z_maximum = (box[1-rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
-
-        // Note: we don't want to exit early since we might skip the frustum
test
-/*
-        if ( minimum_maximum < x_minimum ||
-             maximum_minimum > x_maximum )
-            continue;
-*/
-        if ( minimum_maximum > x_maximum )
-            minimum_maximum = x_maximum;
-        if ( maximum_minimum < x_minimum )
-            maximum_minimum = x_minimum;
-/*
-        if ( minimum_maximum < y_minimum ||
-             maximum_minimum > y_maximum )
-            continue;
-*/
-        if ( minimum_maximum > y_maximum )
-                minimum_maximum = y_maximum;
-        if ( maximum_minimum < y_minimum )
-            maximum_minimum = y_minimum;
+        }

-        if ( minimum_maximum >= z_minimum &&
-             maximum_minimum <= z_maximum )
-            return ray; // found a hit
-#endif // use min/max or williams
-#endif
#if 1 // enable/disable frustum test (goes from 15 fps to 21 fps)
         if (ray == rays.begin())
         {
@@ -230,22 +219,44 @@

             for (int axis = 0; axis < 3; axis++)
             {
+#if 1 // (box - orgIA) * rcpIA
+                // the subtraction is really (boxIA + -orgIA)
+                // or boxIA + [-max_org, -min_org]
+                // [box_min, box_max] + [-max_org, -min_org]
+                float a = box[0][axis]-ia_data.max_org[axis];
+                float b = box[1][axis]-ia_data.min_org[axis];
+
+                // now for multiplication
+                float ar0 = a * ia_data.min_rcp[axis];
+                float ar1 = a * ia_data.max_rcp[axis];
+                float br0 = b * ia_data.min_rcp[axis];
+                float br1 = b * ia_data.max_rcp[axis];
+
+                // [a,b] is valid intersection interval so a is min
+                // and b is max t-value
+                // TODO(boulos): Replace these std::min calls
+                a = std::min(ar0, std::min(ar1, std::min(br0, br1)));
+                b = std::max(ar0, std::max(ar1, std::max(br0, br1)));
+#else //box * rcpIA - org_rcpIA
                 float a = box[0][axis]*ia_data.min_rcp[axis];
                 float b = box[0][axis]*ia_data.max_rcp[axis];
-                if (a > b)
-                    std::swap(a,b);
+                float first_min = (a < b) ? a : b;
+                float first_max = (a < b) ? b : a;
+
                 float c = box[1][axis]*ia_data.min_rcp[axis];
                 float d = box[1][axis]*ia_data.max_rcp[axis];
-                if (c > d)
-                    std::swap(c,d);

-                a = std::min(a,c);
-                b = std::max(b,d);
+                float second_min = (c < d) ? c : d;
+                float second_max = (c < d) ? d : c;
+
+                a = (first_min < second_min) ? first_min : second_min;
+                b = (first_max > second_max) ? first_max : second_max;

                 a -= ia_data.max_org_rcp[axis];
                 b -= ia_data.min_org_rcp[axis];
-                tmin_frustum = std::max(tmin_frustum, a);
-                tmax_frustum = std::min(tmax_frustum, b);
+#endif
+                tmin_frustum = (tmin_frustum < a) ? a : tmin_frustum;
+                tmax_frustum = (tmax_frustum > b) ? b : tmax_frustum;
             }

             if (tmin_frustum > tmax_frustum) // frustum exit
@@ -263,10 +274,11 @@

#if DYNBVH_NEW_SSE
     // process simds now
-    int pack_begin = sse_begin >> 2;
-    int pack_end   = sse_end   >> 2;
+
     // TODO(boulos): replace operator overloads with direct access
#if 0
+    int pack_begin = sse_begin >> 2;
+    int pack_end   = sse_end   >> 2;
     __m128 box_x0 = _mm_set1_ps(box[0][0]);
     __m128 box_x1 = _mm_set1_ps(box[1][0]);

@@ -309,10 +321,11 @@
             return ray;
     }
#else // different more register efficient version
+    // NOTE(boulos): This code had a severe bug that may have made it look
more efficient than it is...
     // two regs for box
     __m128 box0 = _mm_set_ps(0.f, box[0][2], box[0][1], box[0][0]);
     __m128 box1 = _mm_set_ps(0.f, box[1][2], box[1][1], box[1][0]);
-    for (int pack = pack_begin, ray = sse_begin; pack < pack_end; ++pack,
ray += 4) {
+    for (int ray = sse_begin; ray < sse_end; ray += 4) {
         // two regs for interval tracking
         __m128 tmin = _mm_set1_ps(1e-5f);
         __m128 tmax = _mm_load_ps(&data->minT[ray]);
@@ -325,8 +338,8 @@
         __m128 t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1,
_MM_SHUFFLE(0,0,0,0)),

_mm_load_ps(&data->origin[0][ray])),
                                 _mm_load_ps(&data->inverseDirection[0][ray])
);
-        tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
-        tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+        tmin = _mm_max_ps(tmin, _mm_min_ps(t0, t1));
+        tmax = _mm_min_ps(tmax, _mm_max_ps(t0, t1));

         t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0,
_MM_SHUFFLE(1,1,1,1)),
                                      _mm_load_ps(&data->origin[1][ray])),
@@ -335,8 +348,8 @@
                                      _mm_load_ps(&data->origin[1][ray])),
                          _mm_load_ps(&data->inverseDirection[1][ray]) );

-        tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
-        tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+        tmin = _mm_max_ps(tmin, _mm_min_ps(t0, t1));
+        tmax = _mm_min_ps(tmax, _mm_max_ps(t0, t1));

         t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0,
_MM_SHUFFLE(2,2,2,2)),
                                      _mm_load_ps(&data->origin[2][ray])),
@@ -345,12 +358,26 @@
                                      _mm_load_ps(&data->origin[2][ray])),
                          _mm_load_ps(&data->inverseDirection[2][ray]) );

-        tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
-        tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+        tmin = _mm_max_ps(tmin, _mm_min_ps(t0, t1));
+        tmax = _mm_min_ps(tmax, _mm_max_ps(t0, t1));

-        __m128 valid_intersect = _mm_cmplt_ps(tmin, tmax);
-        if (_mm_movemask_ps(valid_intersect) != 0x0)
+        __m128 valid_intersect = _mm_cmple_ps(tmin, tmax);
+        if (_mm_movemask_ps(valid_intersect) != 0x0) {
+            sse_t min_with_inf = _mm_or_ps(_mm_and_ps(valid_intersect, tmin),
+                                           _mm_andnot_ps(valid_intersect,
_mm_set1_ps(1e150f)));
+            // a = (t0, t0, t1, t1)
+            sse_t a = _mm_unpacklo_ps(min_with_inf,min_with_inf);
+            // b = (t2, t2, t3, t3)
+            sse_t b = _mm_unpackhi_ps(min_with_inf,min_with_inf);
+            // c = (min(t0,t2), min(t0, t2), min(t1, t3), min(t1, t3))
+            sse_t c = _mm_min_ps(a, b);
+            // The movehl will move the high 2 values to the low 2 values.
+            // This will allow us to compare min(t0,t2) with min(t1, t3).
+            sse_t result = _mm_min_ss(c, _mm_movehl_ps(c, c));
+            // Return the first value.
+            *out_tmin = *((float*)&result);
             return ray;
+        }
     }
#endif
     // get remaining rays
@@ -384,7 +411,10 @@

         if ( minimum_maximum >= z_minimum &&
              maximum_minimum <= z_maximum )
+        {
+            *out_tmin = (maximum_minimum < z_minimum) ? maximum_minimum :
z_minimum;
             return ray; // found a hit
+        }
     }
#endif
     return rays.end();
@@ -393,6 +423,7 @@
// return the last index which hits the box
int DynBVH::lastIntersects(const BBox& box, const RayPacket& rays) const
{
+    // TODO(boulos): Consider adding SIMD march here too
     for (int ray = rays.end() - 1; ray >= rays.begin(); ray-- )
     {
         float maximum_minimum = 1e-5;

Modified: trunk/Model/Groups/DynBVH.h
==============================================================================
--- trunk/Model/Groups/DynBVH.h (original)
+++ trunk/Model/Groups/DynBVH.h Sun Oct 29 13:00:02 2006
@@ -14,6 +14,8 @@
         {
             Real min_rcp[3];
             Real max_rcp[3];
+            Real min_org[3];
+            Real max_org[3];
             Real min_org_rcp[3];
             Real max_org_rcp[3];
         };
@@ -64,7 +66,7 @@

         // return the first index (between [rays.begin(),rays.end()]) which
hits the box
-        int firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data) const;
+        int firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data, float* out_tmin) const;
         // return the last index which hits the box
         int lastIntersects(const BBox& box, const RayPacket& rays) const;

[MANTA] r1230 - trunk/Model/Groups, boulos, 10/29/2006

Archive powered by MHonArc 2.6.16.