Text archives Help
- From: boulos@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [Manta] r1776 - in trunk: Interface Model/Groups
- Date: Thu, 11 Oct 2007 22:02:10 -0600 (MDT)
Author: boulos
Date: Thu Oct 11 22:02:04 2007
New Revision: 1776
Modified:
trunk/Interface/RayPacket.h
trunk/Model/Groups/DynBVH.cc
Log:
Interface/RayPacket.h
Adding the ability to check whether or not a SIMD group of rays are
all inactive. Not very efficient yet...
Model/Groups/DynBVH.cc
Fixing a major performance bug in firstIntersect that skipped the
interval arithmetic test unless the ray packet happened to not start
on a SIMD aligned boundary.
Adding a SIMD version of lastIntersects (as it now comes up in the
profile).
Skipping inactive rays when appropriate.
For 6 test scenes this update has delivered an additional 20% perf
improvement.
Modified: trunk/Interface/RayPacket.h
==============================================================================
--- trunk/Interface/RayPacket.h (original)
+++ trunk/Interface/RayPacket.h Thu Oct 11 22:02:04 2007
@@ -442,6 +442,13 @@
return data->hitMatl[which] == (Material*)0xffffffff;
#endif
}
+ // TODO(boulos): Make this efficient
+ inline bool groupIsMasked(int which) const {
+ for (int i = 0; i < 4; i++) {
+ if (!rayIsMasked(which + i)) return false;
+ }
+ return true;
+ }
Real &getMinT(int which) const
{
return data->minT[which];
Modified: trunk/Model/Groups/DynBVH.cc
==============================================================================
--- trunk/Model/Groups/DynBVH.cc (original)
+++ trunk/Model/Groups/DynBVH.cc Thu Oct 11 22:02:04 2007
@@ -45,93 +45,31 @@
}
// TODO(boulos): provide an SSE version
- for (int ray = rays.begin(); ray < rays.end(); ray++ )
- {
- for (int axis = 0; axis < 3; axis++)
- {
- const Real new_rcp = rays.getInverseDirection(ray, axis);
- const Real new_org = rays.getOrigin(ray,axis);
- const Real new_org_rcp = new_org * new_rcp;
-
- ia_data.min_rcp[axis] = (ia_data.min_rcp[axis] < new_rcp) ?
ia_data.min_rcp[axis] : new_rcp;
- ia_data.max_rcp[axis] = (ia_data.max_rcp[axis] < new_rcp) ?
new_rcp : ia_data.max_rcp[axis];
-
- ia_data.min_org[axis] = (ia_data.min_org[axis] < new_org) ?
ia_data.min_org[axis] : new_org;
- ia_data.max_org[axis] = (ia_data.max_org[axis] < new_org) ?
new_org : ia_data.max_org[axis];
-
- ia_data.min_org_rcp[axis] = (ia_data.min_org_rcp[axis] <
new_org_rcp) ?
- ia_data.min_org_rcp[axis] : new_org_rcp;
- ia_data.max_org_rcp[axis] = (ia_data.max_org_rcp[axis] <
new_org_rcp) ?
- new_org_rcp : ia_data.max_org_rcp[axis];
-
- }
+ for (int ray = rays.begin(); ray < rays.end(); ray++ ) {
+ if (rays.rayIsMasked(ray)) continue;
+ for (int axis = 0; axis < 3; axis++) {
+ const Real new_rcp = rays.getInverseDirection(ray, axis);
+ const Real new_org = rays.getOrigin(ray,axis);
+ const Real new_org_rcp = new_org * new_rcp;
+
+ ia_data.min_rcp[axis] = (ia_data.min_rcp[axis] < new_rcp) ?
ia_data.min_rcp[axis] : new_rcp;
+ ia_data.max_rcp[axis] = (ia_data.max_rcp[axis] < new_rcp) ? new_rcp :
ia_data.max_rcp[axis];
+
+ ia_data.min_org[axis] = (ia_data.min_org[axis] < new_org) ?
ia_data.min_org[axis] : new_org;
+ ia_data.max_org[axis] = (ia_data.max_org[axis] < new_org) ? new_org :
ia_data.max_org[axis];
+
+ ia_data.min_org_rcp[axis] = (ia_data.min_org_rcp[axis] < new_org_rcp) ?
+ ia_data.min_org_rcp[axis] : new_org_rcp;
+ ia_data.max_org_rcp[axis] = (ia_data.max_org_rcp[axis] < new_org_rcp) ?
+ new_org_rcp : ia_data.max_org_rcp[axis];
}
+ }
-#define USE_ASSERTED_PARENT 0
-
-#if USE_ASSERTED_PARENT
- const BVHNode& root_node = nodes[0];
- float tmin;
- int firstActive = firstIntersects(root_node.bounds, rays, ia_data, &tmin);
- if (firstActive != rays.end()) {
- if (root_node.isLeaf()) {
- int lastActive = lastIntersects(root_node.bounds, rays);
- // build a subpacket from firstActive to lastActive (inclusive, hence
+1)
- RayPacket subpacket(rays, firstActive, lastActive+1);
- for (int i = 0; i < root_node.children; i++ )
- {
- const int object_id = object_ids[root_node.child+i];
- currGroup->get(object_id)->intersect(context,subpacket);
- }
- } else {
- RayPacket subpacket(rays, firstActive, rays.end());
- intersectNode(root_node.child, context, subpacket, ia_data);
- }
- }
-#else
intersectNode(0,context,rays, ia_data);
-#endif // asserted parent or not
-
#endif // dynbvh port or not
}
-#if USE_ASSERTED_PARENT
-void DynBVH::intersectNode(int nodeID, const RenderContext& context,
RayPacket& rays, const IAData& ia_data) const {
- const BVHNode& left_node = nodes[nodeID+0];
- const BVHNode& right_node = nodes[nodeID+1];
- float left_tmin, right_tmin;
-
- int leftFirstActive = firstIntersects(left_node.bounds, rays, ia_data,
&left_tmin);
- int rightFirstActive = firstIntersects(right_node.bounds, rays, ia_data,
&right_tmin);
-
- const BVHNode* child_nodes[2] = { (left_tmin < right_tmin) ? &left_node :
&right_node,
- (left_tmin < right_tmin) ? &right_node :
&left_node };
- int firstActive[2] = { (left_tmin < right_tmin) ? leftFirstActive :
rightFirstActive,
- (left_tmin < right_tmin) ? rightFirstActive :
leftFirstActive };
-
- for (int which = 0; which < 2; ++which) {
- if (firstActive[which] != rays.end()) {
- const BVHNode* node = child_nodes[which];
- if (node->isLeaf()) {
- int lastActive = lastIntersects(node->bounds, rays);
-
- // build a subpacket from firstActive to lastActive (inclusive,
hence +1)
- RayPacket subpacket(rays, firstActive[which], lastActive+1);
-
- for (int i = 0; i < node->children; i++ )
- {
- const int object_id = object_ids[node->child+i];
- currGroup->get(object_id)->intersect(context,subpacket);
- }
- } else {
- RayPacket subpacket(rays, firstActive[which], rays.end());
- intersectNode(node->child, context, subpacket, ia_data);
- }
- }
- }
-}
-#else
void DynBVH::intersectNode(int nodeID, const RenderContext& context,
RayPacket& rays, const IAData& ia_data) const
{
const BVHNode& node = nodes[nodeID];
@@ -173,7 +111,7 @@
}
}
}
-#endif
+
// return the first index (between [rays.begin(),rays.end()]) which hits the
box
int DynBVH::firstIntersects(const BBox& box, const RayPacket& rays, const
IAData& ia_data, float* out_tmin) const
{
@@ -182,6 +120,7 @@
int e = rays.rayEnd & (~3);
if (b >=e) {
for (int i = rays.begin(); i < rays.end(); i++) {
+ if (rays.rayIsMasked(i)) continue;
float tmin = 1e-5f;
float tmax = rays.getMinT(i);
@@ -201,7 +140,9 @@
}
} else {
int i = rays.rayBegin;
- for(;i<b;i++) {
+
+ {
+ // Try first hit as scalar
float tmin = 1e-5f;
float tmax = rays.getMinT(i);
@@ -218,46 +159,68 @@
*out_tmin = tmin;
return i;
}
+ }
+
+ // try a frustum miss
+ {
+ float tmin_frustum = 1e-5;
+ float tmax_frustum = FLT_MAX;
- if (i == rays.rayBegin) {
- // try a frustum miss
- float tmin_frustum = 1e-5;
- float tmax_frustum = FLT_MAX;
-
- for (int axis = 0; axis < 3; axis++) {
- // the subtraction is really (boxIA + -orgIA)
- // or boxIA + [-max_org, -min_org]
- // [box_min, box_max] + [-max_org, -min_org]
- float a = box[0][axis]-ia_data.max_org[axis];
- float b = box[1][axis]-ia_data.min_org[axis];
-
- // now for multiplication
- float ar0 = a * ia_data.min_rcp[axis];
- float ar1 = a * ia_data.max_rcp[axis];
- float br0 = b * ia_data.min_rcp[axis];
- float br1 = b * ia_data.max_rcp[axis];
-
- // [a,b] is valid intersection interval so a is min
- // and b is max t-value
-
- //a = std::min(ar0, std::min(ar1, std::min(br0, br1)));
- a = (br0 < br1) ? br0 : br1;
- a = (a < ar1) ? a : ar1;
- a = (a < ar0) ? a : ar0;
-
- //b = std::max(ar0, std::max(ar1, std::max(br0, br1)));
- b = (br0 < br1) ? br1 : br0;
- b = (b < ar1) ? ar1 : b;
- b = (b < ar0) ? ar0 : b;
+ for (int axis = 0; axis < 3; axis++) {
+ // the subtraction is really (boxIA + -orgIA)
+ // or boxIA + [-max_org, -min_org]
+ // [box_min, box_max] + [-max_org, -min_org]
+ float a = box[0][axis]-ia_data.max_org[axis];
+ float b = box[1][axis]-ia_data.min_org[axis];
- tmin_frustum = (tmin_frustum < a) ? a : tmin_frustum;
- tmax_frustum = (tmax_frustum > b) ? b : tmax_frustum;
- }
+ // now for multiplication
+ float ar0 = a * ia_data.min_rcp[axis];
+ float ar1 = a * ia_data.max_rcp[axis];
+ float br0 = b * ia_data.min_rcp[axis];
+ float br1 = b * ia_data.max_rcp[axis];
- // frustum exit
- if (tmin_frustum > tmax_frustum) {
- return rays.end();
- }
+ // [a,b] is valid intersection interval so a is min
+ // and b is max t-value
+
+ //a = std::min(ar0, std::min(ar1, std::min(br0, br1)));
+ a = (br0 < br1) ? br0 : br1;
+ a = (a < ar1) ? a : ar1;
+ a = (a < ar0) ? a : ar0;
+
+ //b = std::max(ar0, std::max(ar1, std::max(br0, br1)));
+ b = (br0 < br1) ? br1 : br0;
+ b = (b < ar1) ? ar1 : b;
+ b = (b < ar0) ? ar0 : b;
+
+ tmin_frustum = (tmin_frustum < a) ? a : tmin_frustum;
+ tmax_frustum = (tmax_frustum > b) ? b : tmax_frustum;
+ }
+
+ // frustum exit
+ if (tmin_frustum > tmax_frustum) {
+ return rays.end();
+ }
+ }
+
+ if (i < b) i++; // Avoid redoing first ray, but only if we won't do it
in SIMD
+
+ // Scalar pickup loop
+ for (; i < b; i++) {
+ float tmin = 1e-5f;
+ float tmax = rays.getMinT(i);
+
+ for (int c = 0; c < 3; c++) {
+ float t0 = (box[0][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+ float t1 = (box[1][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+
+ float near = (t0 < t1) ? t0 : t1;
+ float far = (t0 < t1) ? t1 : t0;
+ tmin = (tmin < near) ? near : tmin; // max of tmin, near
+ tmax = (far < tmax) ? far : tmax; // min of tmax, far
+ }
+ if (tmin <= tmax) { // valid intersection
+ *out_tmin = tmin;
+ return i;
}
}
@@ -271,7 +234,8 @@
__m128 box_z0 = _mm_set1_ps(box[0][2]);
__m128 box_z1 = _mm_set1_ps(box[1][2]);
- for(;i<e;i+=4){
+ for(;i<e;i+=4) {
+ if (rays.groupIsMasked(i)) continue;
__m128 x0 = _mm_mul_ps(_mm_sub_ps(box_x0,
_mm_load_ps(&data->origin[0][i])),
_mm_load_ps(&data->inverseDirection[0][i]));
__m128 x1 = _mm_mul_ps(_mm_sub_ps(box_x1,
_mm_load_ps(&data->origin[0][i])),
@@ -304,6 +268,7 @@
}
for(;i<rays.rayEnd;i++) {
+ if (rays.rayIsMasked(i)) continue;
float tmin = 1e-5f;
float tmax = rays.getMinT(i);
@@ -323,7 +288,7 @@
}
}
return rays.end();
-#else
+#else // NOT MANTA_SSE (scalar case)
for (int i = rays.begin(); i < rays.end(); i++) {
float tmin = 1e-5f;
float tmax = rays.getMinT(i);
@@ -390,40 +355,134 @@
// return the last index which hits the box
int DynBVH::lastIntersects(const BBox& box, const RayPacket& rays) const
{
- // TODO(boulos): Consider adding SIMD march here too
- for (int ray = rays.end() - 1; ray > rays.begin(); ray-- )
- {
- float maximum_minimum = 1e-5;
- float minimum_maximum = rays.getMinT(ray);
+#ifdef MANTA_SSE
+ int last_simd = rays.rayEnd & (~3);
+ int first_simd = (rays.rayBegin + 3) & (~3);
+ if (first_simd >= last_simd) {
+ for (int i = rays.rayEnd - 1; i > rays.rayBegin; i--) {
+ if (rays.rayIsMasked(i)) continue;
+ float tmin = 1e-5f;
+ float tmax = rays.getMinT(i);
- float x_minimum = (box[rays.getSign(ray,0)][0] -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
- float x_maximum = (box[1-rays.getSign(ray,0)][0] -
rays.getOrigin(ray,0)) * rays.getInverseDirection(ray,0);
+ for (int c = 0; c < 3; c++) {
+ float t0 = (box[0][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+ float t1 = (box[1][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
- float y_minimum = (box[rays.getSign(ray,1)][1] -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
- float y_maximum = (box[1-rays.getSign(ray,1)][1] -
rays.getOrigin(ray,1)) * rays.getInverseDirection(ray,1);
+ float near = (t0 < t1) ? t0 : t1;
+ float far = (t0 < t1) ? t1 : t0;
+ tmin = (tmin < near) ? near : tmin; // max of tmin, near
+ tmax = (far < tmax) ? far : tmax; // min of tmax, far
+ }
+ if (tmin <= tmax) { // valid intersection
+ return i;
+ }
+ }
+ } else {
+ int i = rays.end();
+ for (; i > last_simd;) {
+ i--;
+ if (rays.rayIsMasked(i)) continue;
+ float tmin = 1e-5f;
+ float tmax = rays.getMinT(i);
+
+ for (int c = 0; c < 3; c++) {
+ float t0 = (box[0][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+ float t1 = (box[1][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
- float z_minimum = (box[rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
- float z_maximum = (box[1-rays.getSign(ray,2)][2] -
rays.getOrigin(ray,2)) * rays.getInverseDirection(ray,2);
-
- if ( minimum_maximum < x_minimum ||
- maximum_minimum > x_maximum )
- continue;
- if ( minimum_maximum > x_maximum )
- minimum_maximum = x_maximum;
- if ( maximum_minimum < x_minimum )
- maximum_minimum = x_minimum;
- if ( minimum_maximum < y_minimum ||
- maximum_minimum > y_maximum )
- continue;
- if ( minimum_maximum > y_maximum )
- minimum_maximum = y_maximum;
- if ( maximum_minimum < y_minimum )
- maximum_minimum = y_minimum;
-
- if ( minimum_maximum >= z_minimum &&
- maximum_minimum <= z_maximum )
- return ray; // found a hit
+ float near = (t0 < t1) ? t0 : t1;
+ float far = (t0 < t1) ? t1 : t0;
+ tmin = (tmin < near) ? near : tmin; // max of tmin, near
+ tmax = (far < tmax) ? far : tmax; // min of tmax, far
+ }
+ if (tmin <= tmax) { // valid intersection
+ return i;
+ }
}
+
+ RayPacketData* data = rays.data;
+ __m128 box_x0 = _mm_set1_ps(box[0][0]);
+ __m128 box_x1 = _mm_set1_ps(box[1][0]);
+
+ __m128 box_y0 = _mm_set1_ps(box[0][1]);
+ __m128 box_y1 = _mm_set1_ps(box[1][1]);
+
+ __m128 box_z0 = _mm_set1_ps(box[0][2]);
+ __m128 box_z1 = _mm_set1_ps(box[1][2]);
+
+ for(;i>first_simd;) {
+ i -= 4;
+ if (rays.groupIsMasked(i)) continue;
+ __m128 x0 = _mm_mul_ps(_mm_sub_ps(box_x0,
_mm_load_ps(&data->origin[0][i])),
+ _mm_load_ps(&data->inverseDirection[0][i]));
+ __m128 x1 = _mm_mul_ps(_mm_sub_ps(box_x1,
_mm_load_ps(&data->origin[0][i])),
+ _mm_load_ps(&data->inverseDirection[0][i]));
+
+ __m128 xmin = _mm_min_ps(x0,x1);
+ __m128 xmax = _mm_max_ps(x0,x1);
+
+ __m128 y0 = _mm_mul_ps(_mm_sub_ps(box_y0,
_mm_load_ps(&data->origin[1][i])),
+ _mm_load_ps(&data->inverseDirection[1][i]));
+ __m128 y1 = _mm_mul_ps(_mm_sub_ps(box_y1,
_mm_load_ps(&data->origin[1][i])),
+ _mm_load_ps(&data->inverseDirection[1][i]));
+
+ __m128 ymin = _mm_min_ps(y0,y1);
+ __m128 ymax = _mm_max_ps(y0,y1);
+
+ __m128 z0 = _mm_mul_ps(_mm_sub_ps(box_z0,
_mm_load_ps(&data->origin[2][i])),
+ _mm_load_ps(&data->inverseDirection[2][i]));
+ __m128 z1 = _mm_mul_ps(_mm_sub_ps(box_z1,
_mm_load_ps(&data->origin[2][i])),
+ _mm_load_ps(&data->inverseDirection[2][i]));
+
+ __m128 zmin = _mm_min_ps(z0,z1);
+ __m128 zmax = _mm_max_ps(z0,z1);
+
+ __m128 maximum_minimum =
_mm_max_ps(xmin,_mm_max_ps(ymin,_mm_max_ps(zmin, _mm_set1_ps(1e-5f))));
+ __m128 minimum_maximum =
_mm_min_ps(xmax,_mm_min_ps(ymax,_mm_min_ps(zmax,_mm_load_ps(&data->minT[i]))));
+ __m128 valid_intersect = _mm_cmple_ps(maximum_minimum,minimum_maximum);
+ if (_mm_movemask_ps(valid_intersect) != 0x0)
+ return i+3;
+ }
+
+ for (; i > rays.rayBegin;) {
+ i--;
+ if (rays.rayIsMasked(i)) continue;
+ float tmin = 1e-5f;
+ float tmax = rays.getMinT(i);
+
+ for (int c = 0; c < 3; c++) {
+ float t0 = (box[0][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+ float t1 = (box[1][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+
+ float near = (t0 < t1) ? t0 : t1;
+ float far = (t0 < t1) ? t1 : t0;
+ tmin = (tmin < near) ? near : tmin; // max of tmin, near
+ tmax = (far < tmax) ? far : tmax; // min of tmax, far
+ }
+ if (tmin <= tmax) { // valid intersection
+ return i;
+ }
+ }
+ }
+#else
+ for (int i = rays.rayEnd - 1; i > rays.rayBegin; i--) {
+ if (rays.rayIsMasked(i)) continue;
+ float tmin = 1e-5f;
+ float tmax = rays.getMinT(i);
+
+ for (int c = 0; c < 3; c++) {
+ float t0 = (box[0][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+ float t1 = (box[1][c] - rays.getOrigin(i,c)) *
rays.getInverseDirection(i,c);
+
+ float near = (t0 < t1) ? t0 : t1;
+ float far = (t0 < t1) ? t1 : t0;
+ tmin = (tmin < near) ? near : tmin; // max of tmin, near
+ tmax = (far < tmax) ? far : tmax; // min of tmax, far
+ }
+ if (tmin <= tmax) { // valid intersection
+ return i;
+ }
+ }
+#endif
return rays.begin();
}
@@ -528,7 +587,6 @@
int left_son = node.child;
int right_son = left_son + 1;
// Recurse into two subtrees
-#if 1
taskpool_mutex.lock();
size_t tasklist_id = cur_tasklist;
@@ -541,7 +599,6 @@
cur_callback += 3;
taskpool_mutex.unlock();
-#endif
TaskList* children = new (tasklist_memory + sizeof(TaskList) *
tasklist_id) TaskList();
Task* left_child = new (task_memory + sizeof(Task) * left_task_id) Task(
@@ -550,12 +607,7 @@
&DynBVH::parallelTopDownUpdate,
left_son,
context));
-/*
- Callback::create(this,
- &DynBVH::parallelTopDownUpdate,
- left_son,
- context));
-*/
+
children->push_back(left_child);
Task* right_child = new (task_memory + sizeof(Task) * right_task_id)
Task(
@@ -564,29 +616,15 @@
&DynBVH::parallelTopDownUpdate,
right_son,
context));
-/*
- Callback::create(this,
- &DynBVH::parallelTopDownUpdate,
- right_son,
- context));
-*/
children->push_back(right_child);
-
// When the children finish, call reduction
children->setReduction(
new (callback_memory + (callback_id+2) * sizeof(bvhtask_type))
Callback_0Data_2Arg<DynBVH, int, Task*>(this,
&DynBVH::updateBoundsReduction,
node_id,
task));
-
-/*
- Callback::create(this,
- &DynBVH::updateBoundsReduction,
- node_id,
- task));
-*/
// Insert new TaskList for the BVH Object* in UpdateGraph
context.insertWork(children);
- [Manta] r1776 - in trunk: Interface Model/Groups, boulos, 10/12/2007
Archive powered by MHonArc 2.6.16.