Text archives Help
- From: boulos@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [MANTA] r1230 - trunk/Model/Groups
- Date: Sun, 29 Oct 2006 13:00:03 -0700 (MST)
Author: boulos
Date: Sun Oct 29 13:00:02 2006
New Revision: 1230
Modified:
trunk/Model/Groups/DynBVH.cc
trunk/Model/Groups/DynBVH.h
Log:
Fixing bug in alternate ray-box intersection code. Now
both the "less registers" and "more registers" code are
equally slow.
Modified: trunk/Model/Groups/DynBVH.cc
==============================================================================
--- trunk/Model/Groups/DynBVH.cc (original)
+++ trunk/Model/Groups/DynBVH.cc Sun Oct 29 13:00:02 2006
@@ -32,23 +32,101 @@
for (int axis = 0; axis < 3; axis++)
{
const Real new_rcp = rays.getInverseDirection(ray, axis);
- const Real new_org_rcp = rays.getOrigin(ray,axis) * new_rcp;
-
+ const Real new_org = rays.getOrigin(ray,axis);
+ const Real new_org_rcp = new_org * new_rcp;
+#define USE_STDMINMAX 0
+#if USE_STDMINMAX
ia_data.min_rcp[axis] = std::min(ia_data.min_rcp[axis], new_rcp);
ia_data.max_rcp[axis] = std::max(ia_data.max_rcp[axis], new_rcp);
+ ia_data.min_org[axis] = std::min(ia_data.min_org[axis], new_org);
+ ia_data.max_org[axis] = std::max(ia_data.max_org[axis], new_org);
+
ia_data.min_org_rcp[axis] = std::min(ia_data.min_org_rcp[axis],
new_org_rcp);
ia_data.max_org_rcp[axis] = std::max(ia_data.max_org_rcp[axis],
new_org_rcp);
+#else
+ ia_data.min_rcp[axis] = (ia_data.min_rcp[axis] < new_rcp) ?
ia_data.min_rcp[axis] : new_rcp;
+ ia_data.max_rcp[axis] = (ia_data.max_rcp[axis] < new_rcp) ?
new_rcp : ia_data.max_rcp[axis];
+
+ ia_data.min_org[axis] = (ia_data.min_org[axis] < new_org) ?
ia_data.min_org[axis] : new_org;
+ ia_data.max_org[axis] = (ia_data.max_org[axis] < new_org) ?
new_org : ia_data.max_org[axis];
+
+ ia_data.min_org_rcp[axis] = (ia_data.min_org_rcp[axis] <
new_org_rcp) ?
+ ia_data.min_org_rcp[axis] : new_org_rcp;
+ ia_data.max_org_rcp[axis] = (ia_data.max_org_rcp[axis] <
new_org_rcp) ?
+ new_org_rcp : ia_data.max_org_rcp[axis];
+#endif
}
}
+#define USE_ASSERTED_PARENT 0
+
+#if USE_ASSERTED_PARENT
+ const BVHNode& root_node = nodes[0];
+ float tmin;
+ int firstActive = firstIntersects(root_node.bounds, rays, ia_data,
&tmin);
+ if (firstActive != rays.end()) {
+ if (root_node.isLeaf()) {
+ int lastActive = lastIntersects(root_node.bounds, rays);
+ // build a subpacket from firstActive to lastActive (inclusive,
hence +1)
+ RayPacket subpacket(rays, firstActive, lastActive+1);
+
+ for (int i = 0; i < root_node.children; i++ )
+ {
+ const int object_id = object_ids[root_node.child+i];
+ this->get(object_id)->intersect(context,subpacket);
+ }
+ } else {
+ RayPacket subpacket(rays, firstActive, rays.end());
+ intersectNode(root_node.child, context, subpacket, ia_data);
+ }
+ }
+#else
intersectNode(0,context,rays, ia_data);
+#endif
}
+#if USE_ASSERTED_PARENT
+void DynBVH::intersectNode(int nodeID, const RenderContext& context,
RayPacket& rays, const IAData& ia_data) const {
+ const BVHNode& left_node = nodes[nodeID+0];
+ const BVHNode& right_node = nodes[nodeID+1];
+ float left_tmin, right_tmin;
+
+ int leftFirstActive = firstIntersects(left_node.bounds, rays, ia_data,
&left_tmin);
+ int rightFirstActive = firstIntersects(right_node.bounds, rays, ia_data,
&right_tmin);
+
+ const BVHNode* child_nodes[2] = { (left_tmin < right_tmin) ? &left_node
: &right_node,
+ (left_tmin < right_tmin) ? &right_node
: &left_node };
+ int firstActive[2] = { (left_tmin < right_tmin) ? leftFirstActive :
rightFirstActive,
+ (left_tmin < right_tmin) ? rightFirstActive :
leftFirstActive };
+
+ for (int which = 0; which < 2; ++which) {
+ if (firstActive[which] != rays.end()) {
+ const BVHNode* node = child_nodes[which];
+ if (node->isLeaf()) {
+ int lastActive = lastIntersects(node->bounds, rays);
+
+ // build a subpacket from firstActive to lastActive
(inclusive, hence +1)
+ RayPacket subpacket(rays, firstActive[which], lastActive+1);
+
+ for (int i = 0; i < node->children; i++ )
+ {
+ const int object_id = object_ids[node->child+i];
+ this->get(object_id)->intersect(context,subpacket);
+ }
+ } else {
+ RayPacket subpacket(rays, firstActive[which], rays.end());
+ intersectNode(node->child, context, subpacket, ia_data);
+ }
+ }
+ }
+}
+#else
void DynBVH::intersectNode(int nodeID, const RenderContext& context,
RayPacket& rays, const IAData& ia_data) const
{
const BVHNode& node = nodes[nodeID];
- int firstActive = firstIntersects(node.bounds, rays, ia_data);
+ float tmin;
+ int firstActive = firstIntersects(node.bounds, rays, ia_data, &tmin);
if (firstActive != rays.end())
{
@@ -78,17 +156,30 @@
RayPacket subpacket(rays, firstActive, rays.end());
// recurse
+#if 0 // for erw6 this actually hurts slightly
+ int sign_bit = subpacket.getSign(subpacket.begin(),
static_cast<int>(node.axis));
+ // in manta sign_bit is 1 if direction is < 0
+ // if signbit is < 0, want left child first
+ intersectNode(node.child+(1-sign_bit), context, subpacket,
ia_data);
+ intersectNode(node.child+sign_bit, context, subpacket, ia_data);
+#else
+#if 1
+ int front_son = subpacket.getDirection(subpacket.begin(),
+
static_cast<int>(node.axis)) > 0 ? 0 : 1;
+ intersectNode(node.child+front_son, context, subpacket, ia_data);
+ intersectNode(node.child+1-front_son, context, subpacket,
ia_data);
+#else
intersectNode(node.child+0, context, subpacket, ia_data);
intersectNode(node.child+1, context, subpacket, ia_data);
+#endif
+#endif
+
}
}
}
-
-// TODO: add interval arithmetic tests so that we can skip big portions?
maybe collapse the
-// forward and backward search?
-
+#endif
// return the first index (between [rays.begin(),rays.end()]) which hits the
box
-int DynBVH::firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data) const
+int DynBVH::firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data, float* out_tmin) const
{
#define DYNBVH_NEW_SSE MANTA_SSE
@@ -100,73 +191,6 @@
// we always want to do the first ray and IA in C not SIMD
for (int ray = rays.begin(); ray < rays.end(); ray++ )
{
-#if (0 && DYNBVH_NEW_SSE) // hand written single scalar is slightly faster
than C version
- float temp_epsilon = 1e-5f;
- __m128 tmin = _mm_load_ss(&temp_epsilon);
- __m128 tmax = _mm_load_ss(&data->minT[ray]);
-
-#if 1 // rolled vs unrolled makes no difference
- for (int c = 0; c < 3; c++) {
- __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][c])),
-
_mm_load_ss(&data->origin[c][ray])),
-
_mm_load_ss(&data->inverseDirection[c][ray]) );
-
- __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][c])),
-
_mm_load_ss(&data->origin[c][ray])),
-
_mm_load_ss(&data->inverseDirection[c][ray]) );
-
- tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
- }
-#else
- __m128 t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][0])),
-
_mm_load_ss(&data->origin[0][ray])),
- _mm_load_ss(&data->inverseDirection[0][ray])
);
-
- __m128 t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][0])),
-
_mm_load_ss(&data->origin[0][ray])),
- _mm_load_ss(&data->inverseDirection[0][ray])
);
-
- tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
-
- t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][1])),
- _mm_load_ss(&data->origin[1][ray])),
- _mm_load_ss(&data->inverseDirection[1][ray]) );
-
- t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][1])),
- _mm_load_ss(&data->origin[1][ray])),
- _mm_load_ss(&data->inverseDirection[1][ray]) );
-
- tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
-
- t0 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[0][2])),
- _mm_load_ss(&data->origin[2][ray])),
- _mm_load_ss(&data->inverseDirection[2][ray]) );
-
- t1 = _mm_mul_ss( _mm_sub_ss( _mm_load_ss(&(box[1][2])),
- _mm_load_ss(&data->origin[2][ray])),
- _mm_load_ss(&data->inverseDirection[2][ray]) );
-
- tmin = _mm_max_ss(tmin, _mm_min_ss(t0, t1));
- tmax = _mm_min_ss(tmax, _mm_max_ss(t0, t1));
-#endif // USE_UNROLLED or not
-#if 0 // for whatever reason this code path is slower
- __m128 valid_intersect = _mm_cmple_ss(tmin, tmax);
- float result;
- _mm_store_ss(&result, valid_intersect);
- if (isnan(result))
- return ray;
-#else
- float vals[2];
- _mm_store_ss(&(vals[0]), tmin);
- _mm_store_ss(&(vals[1]), tmax);
- if (vals[0] <= vals[1])
- return ray;
-#endif
-#else
-#if 1 // use min,max (total fps about 10% faster on erw6 on macbook)
float tmin = 1e-5f;
float tmax = rays.getMinT(ray);
@@ -181,46 +205,11 @@
tmin = (near < tmin) ? tmin : near; // non nan-safe max of tmin,
near
tmax = (tmax < far) ? tmax : far; // non nan-safe min of tmax,
far
}
- if (tmin <= tmax) // valid intersection
+ if (tmin <= tmax) { // valid intersection
+ *out_tmin = tmin;
return ray;
-#else
- float maximum_minimum = 1e-5;
- float minimum_maximum = rays.getMinT(ray);
-
- float x_minimum = (box[rays.getSign(ray,0)][0] -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
- float x_maximum = (box[1-rays.getSign(ray,0)][0] -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
-
- float y_minimum = (box[rays.getSign(ray,1)][1] -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
- float y_maximum = (box[1-rays.getSign(ray,1)][1] -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
-
- float z_minimum = (box[rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
- float z_maximum = (box[1-rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
-
- // Note: we don't want to exit early since we might skip the frustum
test
-/*
- if ( minimum_maximum < x_minimum ||
- maximum_minimum > x_maximum )
- continue;
-*/
- if ( minimum_maximum > x_maximum )
- minimum_maximum = x_maximum;
- if ( maximum_minimum < x_minimum )
- maximum_minimum = x_minimum;
-/*
- if ( minimum_maximum < y_minimum ||
- maximum_minimum > y_maximum )
- continue;
-*/
- if ( minimum_maximum > y_maximum )
- minimum_maximum = y_maximum;
- if ( maximum_minimum < y_minimum )
- maximum_minimum = y_minimum;
+ }
- if ( minimum_maximum >= z_minimum &&
- maximum_minimum <= z_maximum )
- return ray; // found a hit
-#endif // use min/max or williams
-#endif
#if 1 // enable/disable frustum test (goes from 15 fps to 21 fps)
if (ray == rays.begin())
{
@@ -230,22 +219,44 @@
for (int axis = 0; axis < 3; axis++)
{
+#if 1 // (box - orgIA) * rcpIA
+ // the subtraction is really (boxIA + -orgIA)
+ // or boxIA + [-max_org, -min_org]
+ // [box_min, box_max] + [-max_org, -min_org]
+ float a = box[0][axis]-ia_data.max_org[axis];
+ float b = box[1][axis]-ia_data.min_org[axis];
+
+ // now for multiplication
+ float ar0 = a * ia_data.min_rcp[axis];
+ float ar1 = a * ia_data.max_rcp[axis];
+ float br0 = b * ia_data.min_rcp[axis];
+ float br1 = b * ia_data.max_rcp[axis];
+
+ // [a,b] is valid intersection interval so a is min
+ // and b is max t-value
+ // TODO(boulos): Replace these std::min calls
+ a = std::min(ar0, std::min(ar1, std::min(br0, br1)));
+ b = std::max(ar0, std::max(ar1, std::max(br0, br1)));
+#else //box * rcpIA - org_rcpIA
float a = box[0][axis]*ia_data.min_rcp[axis];
float b = box[0][axis]*ia_data.max_rcp[axis];
- if (a > b)
- std::swap(a,b);
+ float first_min = (a < b) ? a : b;
+ float first_max = (a < b) ? b : a;
+
float c = box[1][axis]*ia_data.min_rcp[axis];
float d = box[1][axis]*ia_data.max_rcp[axis];
- if (c > d)
- std::swap(c,d);
- a = std::min(a,c);
- b = std::max(b,d);
+ float second_min = (c < d) ? c : d;
+ float second_max = (c < d) ? d : c;
+
+ a = (first_min < second_min) ? first_min : second_min;
+ b = (first_max > second_max) ? first_max : second_max;
a -= ia_data.max_org_rcp[axis];
b -= ia_data.min_org_rcp[axis];
- tmin_frustum = std::max(tmin_frustum, a);
- tmax_frustum = std::min(tmax_frustum, b);
+#endif
+ tmin_frustum = (tmin_frustum < a) ? a : tmin_frustum;
+ tmax_frustum = (tmax_frustum > b) ? b : tmax_frustum;
}
if (tmin_frustum > tmax_frustum) // frustum exit
@@ -263,10 +274,11 @@
#if DYNBVH_NEW_SSE
// process simds now
- int pack_begin = sse_begin >> 2;
- int pack_end = sse_end >> 2;
+
// TODO(boulos): replace operator overloads with direct access
#if 0
+ int pack_begin = sse_begin >> 2;
+ int pack_end = sse_end >> 2;
__m128 box_x0 = _mm_set1_ps(box[0][0]);
__m128 box_x1 = _mm_set1_ps(box[1][0]);
@@ -309,10 +321,11 @@
return ray;
}
#else // different more register efficient version
+ // NOTE(boulos): This code had a severe bug that may have made it look
more efficient than it is...
// two regs for box
__m128 box0 = _mm_set_ps(0.f, box[0][2], box[0][1], box[0][0]);
__m128 box1 = _mm_set_ps(0.f, box[1][2], box[1][1], box[1][0]);
- for (int pack = pack_begin, ray = sse_begin; pack < pack_end; ++pack,
ray += 4) {
+ for (int ray = sse_begin; ray < sse_end; ray += 4) {
// two regs for interval tracking
__m128 tmin = _mm_set1_ps(1e-5f);
__m128 tmax = _mm_load_ps(&data->minT[ray]);
@@ -325,8 +338,8 @@
__m128 t1 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box1, box1,
_MM_SHUFFLE(0,0,0,0)),
_mm_load_ps(&data->origin[0][ray])),
_mm_load_ps(&data->inverseDirection[0][ray])
);
- tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
- tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+ tmin = _mm_max_ps(tmin, _mm_min_ps(t0, t1));
+ tmax = _mm_min_ps(tmax, _mm_max_ps(t0, t1));
t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0,
_MM_SHUFFLE(1,1,1,1)),
_mm_load_ps(&data->origin[1][ray])),
@@ -335,8 +348,8 @@
_mm_load_ps(&data->origin[1][ray])),
_mm_load_ps(&data->inverseDirection[1][ray]) );
- tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
- tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+ tmin = _mm_max_ps(tmin, _mm_min_ps(t0, t1));
+ tmax = _mm_min_ps(tmax, _mm_max_ps(t0, t1));
t0 = _mm_mul_ps( _mm_sub_ps( _mm_shuffle_ps(box0, box0,
_MM_SHUFFLE(2,2,2,2)),
_mm_load_ps(&data->origin[2][ray])),
@@ -345,12 +358,26 @@
_mm_load_ps(&data->origin[2][ray])),
_mm_load_ps(&data->inverseDirection[2][ray]) );
- tmin = _mm_max_ps(tmin, _mm_max_ps(t0, t1));
- tmax = _mm_min_ps(tmax, _mm_min_ps(t0, t1));
+ tmin = _mm_max_ps(tmin, _mm_min_ps(t0, t1));
+ tmax = _mm_min_ps(tmax, _mm_max_ps(t0, t1));
- __m128 valid_intersect = _mm_cmplt_ps(tmin, tmax);
- if (_mm_movemask_ps(valid_intersect) != 0x0)
+ __m128 valid_intersect = _mm_cmple_ps(tmin, tmax);
+ if (_mm_movemask_ps(valid_intersect) != 0x0) {
+ sse_t min_with_inf = _mm_or_ps(_mm_and_ps(valid_intersect, tmin),
+ _mm_andnot_ps(valid_intersect,
_mm_set1_ps(1e150f)));
+ // a = (t0, t0, t1, t1)
+ sse_t a = _mm_unpacklo_ps(min_with_inf,min_with_inf);
+ // b = (t2, t2, t3, t3)
+ sse_t b = _mm_unpackhi_ps(min_with_inf,min_with_inf);
+ // c = (min(t0,t2), min(t0, t2), min(t1, t3), min(t1, t3))
+ sse_t c = _mm_min_ps(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare min(t0,t2) with min(t1, t3).
+ sse_t result = _mm_min_ss(c, _mm_movehl_ps(c, c));
+ // Return the first value.
+ *out_tmin = *((float*)&result);
return ray;
+ }
}
#endif
// get remaining rays
@@ -384,7 +411,10 @@
if ( minimum_maximum >= z_minimum &&
maximum_minimum <= z_maximum )
+ {
+ *out_tmin = (maximum_minimum < z_minimum) ? maximum_minimum :
z_minimum;
return ray; // found a hit
+ }
}
#endif
return rays.end();
@@ -393,6 +423,7 @@
// return the last index which hits the box
int DynBVH::lastIntersects(const BBox& box, const RayPacket& rays) const
{
+ // TODO(boulos): Consider adding SIMD march here too
for (int ray = rays.end() - 1; ray >= rays.begin(); ray-- )
{
float maximum_minimum = 1e-5;
Modified: trunk/Model/Groups/DynBVH.h
==============================================================================
--- trunk/Model/Groups/DynBVH.h (original)
+++ trunk/Model/Groups/DynBVH.h Sun Oct 29 13:00:02 2006
@@ -14,6 +14,8 @@
{
Real min_rcp[3];
Real max_rcp[3];
+ Real min_org[3];
+ Real max_org[3];
Real min_org_rcp[3];
Real max_org_rcp[3];
};
@@ -64,7 +66,7 @@
// return the first index (between [rays.begin(),rays.end()]) which
hits the box
- int firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data) const;
+ int firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data, float* out_tmin) const;
// return the last index which hits the box
int lastIntersects(const BBox& box, const RayPacket& rays) const;
- [MANTA] r1230 - trunk/Model/Groups, boulos, 10/29/2006
Archive powered by MHonArc 2.6.16.