This commit (r1127) breaks IsosurfaceOctreeVolume.cc with icc 9.0 and 9.1.
Abe
knolla@sci.utah.edu wrote:
Author: knolla
Date: Mon Jun 26 01:04:00 2006
New Revision: 1127
Modified:
trunk/Core/Math/SSEDefs.h
trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
trunk/Model/Primitives/IsosurfaceOctreeVolume.h
trunk/Model/Primitives/OctreeVolume.h
Log:
update to IsosurfaceOctreeVolume in SSE only. Slight improvement; some divide-by-zero bugs.
Modified: trunk/Core/Math/SSEDefs.h
==============================================================================
--- trunk/Core/Math/SSEDefs.h (original)
+++ trunk/Core/Math/SSEDefs.h Mon Jun 26 01:04:00 2006
@@ -11,6 +11,7 @@
#include <Core/Util/Align.h>
#include <Core/Geometry/vecdefs.h>
#include <Core/Geometry/Vector.h>
+#include <iostream>
typedef __m128 sse_t;
typedef __m128i sse_int_t;
@@ -49,6 +50,17 @@
namespace Manta
{
+ union sse_union
+ {
+ sse_t sse;
+ float f[4];
+ };
+ + union sse_int_union
+ {
+ __m128i ssei;
+ int i[4];
+ }; #if defined(__x86_64) && defined(__INTEL_COMPILER)
@@ -61,10 +73,6 @@
}
#endif
- -
-
-
static const MANTA_ALIGN(16) sse_t _mm_eps = _mm_set_ps1(1e-5);
static const MANTA_ALIGN(16) sse_t _mm_minus_eps = _mm_set_ps1(-1e-5);
@@ -246,6 +254,14 @@
float f[4];
_mm_store_ps(f,t);
return f[offset];
+ }
+ + inline float simd_cerr(sse_t t)
+ { + MANTA_ALIGN(16)
+ float f[4];
+ _mm_store_ps(f,t);
+ std::cerr << f[0] << ", " << f[1] << ", " << f[2] << ", " << f[3] << std::endl;
}
inline Vec3f as_Vec3f(sse_t t)
Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
==============================================================================
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.cc (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.cc Mon Jun 26 01:04:00 2006
@@ -78,6 +78,7 @@
min_rho = MIN(min_rho, this_rho); \
max_rho = MAX(max_rho, this_rho); \
rho[0][1][0] = static_cast<float>(this_rho); \
+ static const int axis_table[] = {4, 2, 1};
@@ -114,8 +115,8 @@
void IsosurfaceOctreeVolume::intersect(RenderContext const &context, RayPacket &packet) const
{
#ifdef MANTA_SSE
-//#if 1
- packet_intersect_implicit_bvh(packet);
+//#if 0
+ packet_intersect_sse(packet);
#else
for ( int i = packet.rayBegin; i < packet.rayEnd; i++ )
single_intersect(packet, i);
@@ -518,7 +519,7 @@
this_rho = octdata->lookup_neighbor<1,0,1>(child_cell, offset, stop_depth, leaf_depth, index_trace);
min_rho = MIN(min_rho, this_rho);
max_rho = MAX(max_rho, this_rho);
- }
+ }s
else
this_rho = scalar;
rho[1][0][1] = static_cast<float>(this_rho); @@ -748,8 +749,7 @@
*/
#ifdef MANTA_SSE
-//an octree traversal based on implicit BVH
-void IsosurfaceOctreeVolume::packet_intersect_implicit_bvh(RayPacket& rays) const
+void IsosurfaceOctreeVolume::packet_intersect_sse(RayPacket& rays) const
{
rays.computeInverseDirections();
rays.computeSigns();
@@ -817,74 +817,270 @@
{
first = MIN(first, smd);
last = smd;
+ srp.activeRays += count_nonzeros(srp.activeMask[smd]); }
- - srp.activeRays += count_nonzeros(srp.activeMask[smd]); }
if (first > last)
return;
- //cerr << "root node: first = " << (int)first << ", last = " << (int)last << endl;
- + char axis_order[3];
+ Vector direction = rays.getDirection(first<<2);
+ Vector dir2 = direction * direction;
+ if (dir2[0] > dir2[1] && dir2[0] > dir2[2])
+ {
+ if (direction[0] > 0)
+ sse_traverse<0,1,2,1>(srp, first, last, (direction[1] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+ else
+ sse_traverse<0,1,2,0>(srp, first, last, (direction[1] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+ }
+ else if (dir2[1] < dir2[2])
+ {
+ if (direction[1] > 0)
+ sse_traverse<1,0,2,1>(srp, first, last, (direction[0] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+ else
+ sse_traverse<1,0,2,0>(srp, first, last, (direction[0] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+ }
+ else
+ {
+ if (direction[2] > 0)
+ sse_traverse<2,0,1,1>(srp, first, last, (direction[0] > 0.f ? 1 : -1), (direction[1] > 0.f ? 1 : -1));
+ else
+ sse_traverse<2,0,1,0>(srp, first, last, (direction[0] > 0.f ? 1 : -1), (direction[1] > 0.f ? 1 : -1));
+ }
+}
+
+#define DBGP 0
+
+template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse(SSERayPacket& srp, char first, char last, char DU, char DV) const
+{
+ //find the bounding frustum of the rays, as (umin, vmin, umax, vmax) coordinates
+ sse_t smin_orig[3];
+ sse_t smin_dir[3];
+ sse_t smax_orig[3];
+ sse_t smax_dir[3];
+ + #pragma unroll(3)
+ for(int axis=0; axis<3; axis++)
+ {
+ smin_orig[axis] = _mm_infty;
+ smin_dir[axis] = _mm_infty;
+ smax_orig[axis] = _mm_minus_infty;
+ smax_dir[axis] = _mm_minus_infty; + }
+ for (int smd=first; smd<=last; smd++)
+ {
+ #pragma unroll(3)
+ for(int axis=0; axis<3; axis++)
+ {
+ smin_orig[axis] = min4(smin_orig[axis], srp.orig[axis][smd]);
+ smax_orig[axis] = max4(smax_orig[axis], srp.orig[axis][smd]);
+ smin_dir[axis] = min4(smin_dir[axis], srp.dir[axis][smd]);
+ smax_dir[axis] = max4(smax_dir[axis], srp.dir[axis][smd]);
+ }
+ }
+ + FrustumInterval fi;
+ fi.uvminmax_dir = set44(min4f(smin_dir[U]), min4f(smin_dir[V]), max4f(smax_dir[U]), max4f(smax_dir[V]));
+ fi.uvminmax_invdir = oneOver(fi.uvminmax_dir);
+ fi.uvminmax_orig = set44(min4f(smin_orig[U]), min4f(smin_orig[V]), max4f(smax_orig[U]), max4f(smax_orig[V]));
+ + #if DBGP
+ cerr << "fi.uvminmax_orig "; simd_cerr(fi.uvminmax_orig);
+ cerr << "fi.uvminmax_dir "; simd_cerr(fi.uvminmax_dir);
+ #endif
+ + float komin = min4f(smin_orig[K]);
+ float komax = max4f(smax_orig[K]);
+ float kdmin = min4f(smin_dir[K]);
+ float kdmax = max4f(smax_dir[K]);
+ fi.kminmax_orig = set44(komin, komin, komax, komax);
+ fi.kminmax_dir = set44(kdmin, kdmin, kdmax, kdmax);
+ fi.kminmax_invdir = oneOver(fi.kminmax_dir); + + #if DBGP
+ cerr << "fi.kminmax_orig "; simd_cerr(fi.kminmax_orig);
+ cerr << "fi.kminmax_dir "; simd_cerr(fi.kminmax_dir);
+ #endif + unsigned int index_trace[octdata->get_max_depth() + 1];
Vec3i cell(0,0,0);
- bvh_octnode(srp, first, last, cell, octdata->get_cap_depth(), 0, 0, index_trace);
+ sse_traverse_node<K,U,V,DK>(srp, first, last, DU, DV, fi, cell, octdata->get_cap_depth(), 0, 0, index_trace);
}
-void IsosurfaceOctreeVolume::bvh_octnode(SSERayPacket& srp, char first, char last, - const Vec3i& cell, int stop_depth, int depth, unsigned int index, unsigned int index_trace[]) const
+
+
+template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse_node(SSERayPacket& srp, char first, char last, + char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, unsigned int index_trace[]) const
{
- //cerr << "octnode " << (int)depth << ", " << index << "; first=" << (int)first << ",last=" << (int)last << endl;
+#if DBGP
+ cerr << "octnode, depth " << (int)depth << ", index " << (int)index << ", first=" << (int)first << ",last=" << (int)last << "cell " << cell[0] << ", " << cell[1] << ", " << cell[2] << endl;
+ cerr << "(with K=" << (int)(K) << ", U=" << (int)(U) << ", V=" << (int)(V) << ", DK=" << (int)(DK) << endl; +#endif
+
OctNode& node = octdata->get_node(depth, index);
- Vec3i child_cell;
int child_bit = octdata->get_child_bit_depth(depth);
+ Vec3i child_cell;
index_trace[depth] = index;
- int smd_first = first << 2;
-
- //intersect all children in order
+ Vector pcenter( static_cast<float>(cell[0] | child_bit), + static_cast<float>(cell[1] | child_bit), + static_cast<float>(cell[2] | child_bit));
+ #pragma unroll(2)
- for(int midplane_x=0; midplane_x!=2; midplane_x++)
- {
- int target_x;
- if (midplane_x - srp.rp->getSign(smd_first,0))
- {
- target_x = 4;
- child_cell.data[0] = cell.data[0] | child_bit;
- }
- else
- {
- target_x = 0;
- child_cell.data[0] = cell.data[0];
- }
- #pragma unroll(2)
- for(int midplane_y=0; midplane_y!=2; midplane_y++)
- {
- int target_xy;
- if (midplane_y - srp.rp->getSign(smd_first,1))
- {
- target_xy = target_x | 2;
- child_cell.data[1] = cell.data[1] | child_bit;
- }
- else
- {
- target_xy = target_x;
- child_cell.data[1] = cell.data[1];
- }
- #pragma unroll(2)
- for(int midplane_z=0; midplane_z!=2; midplane_z++)
- {
- int target_child;
- if (midplane_z - srp.rp->getSign(smd_first,2))
- {
- target_child = target_xy | 1;
- child_cell.data[2] = cell.data[2] | child_bit;
- }
- else
- {
- target_child = target_xy;
- child_cell.data[2] = cell.data[2];
- }
+ for(int k=0; k<2; k++)
+ { + sse_t child_tkenter;
+ sse_t child_tkexit;
+ int tc_K;
+ + if (k) //AFTER THE K MIDPLANE
+ {
+ if (DK)
+ {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]+(child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit;
+ #if DBGP
+ cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] + depth_bit =" << cell[K]+(child_bit<<1) << endl;
+ #endif
+ }
+ else
+ {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = 0;
+ child_cell[K] = cell[K]; + #if DBGP + cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] =" << cell[K] << endl;
+ #endif
+ }
+ + }
+ else //BEFORE THE K MIDPLANE
+ {
+ if (DK)
+ {
+ child_tkenter = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = 0;
+ child_cell[K] = cell[K];
+ #if DBGP
+ cerr << "kenter = cell[K] =" << cell[K] << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+ #endif
+ }
+ else
+ {
+ child_tkenter = mul4(sub4(set4(cell[K]+(child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit; + #if DBGP
+ cerr << "kenter = cell[K]+depth_bit =" << cell[K]+(child_bit<<1) << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+ #endif
+ }
+ }
+ + #if DBGP
+ cerr << "child_tkenter "; simd_cerr(child_tkenter);
+ cerr << "child_tkexit "; simd_cerr(child_tkexit);
+ cerr << "child_pkenter[K] = "; simd_cerr( mul4(oneOver(fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkenter)) );
+ cerr << "child_pkexit[K] = "; simd_cerr( mul4(oneOver(fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkexit)) );
+ #endif
+ + //we have child_tkenter, child_tkexit.
+ if (_mm_movemask_ps(cmp4_ge(child_tkexit, zero4()))==0)
+ {
+ #if DBGP
+ cerr << "texit was negative; continuing." << endl;
+ #endif
+ continue;
+ }
+ + //find pkenter_uvminmax, pkexit[u], pkexit[v]
+ //child_pkenter = dir*(orig/dir) + dir*t = dir(orig+t)
+ const sse_t child_pkenter_uvminmax = add4(fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkenter));
+ const sse_t child_pkexit_uvminmax = add4(fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkexit));
+ + #if DBGP
+ cerr << "child_pkenter_uvminmax "; simd_cerr(child_pkenter_uvminmax);
+ cerr << "child_pkexit_uvminmax "; simd_cerr(child_pkexit_uvminmax); + #endif
+
+ sse_union tmp_min, tmp_max;
+ tmp_min.sse = min4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ tmp_max.sse = max4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ + #if DBGP
+ cerr << "tmp_min "; simd_cerr(tmp_min.sse);
+ cerr << "tmp_max "; simd_cerr(tmp_max.sse);
+ #endif
+ + const float umin = MIN(tmp_min.f[3], tmp_min.f[1]);
+ const float vmin = MIN(tmp_min.f[2], tmp_min.f[0]);
+ const float umax = MAX(tmp_max.f[3], tmp_max.f[1]);
+ const float vmax = MAX(tmp_max.f[2], tmp_max.f[0]);
+ sse_t sse_fuvminmax = set44(umin, vmin, umax, vmax);
+ + #if DBGP
+ cerr << "sse_fuvminmax (before clamp) = "; simd_cerr(sse_fuvminmax);
+ #endif
+ + sse_fuvminmax = sub4(sse_fuvminmax, set44(cell[U], cell[V], cell[U], cell[V]));
+ sse_fuvminmax = mul4(sse_fuvminmax, set4(octdata->get_inv_child_bit_depth(depth)));
+ sse_fuvminmax = max4(sse_fuvminmax, set44(0.0f, 0.0f, -9.9e9999f, -9.9e9999f));
+ sse_fuvminmax = min4(sse_fuvminmax, set44(9.9e9999f, 9.9e9999f, 1.0f, 1.0f));
+ + #if DBGP
+ cerr << "sse_fuvminmax (after clamp) = "; simd_cerr(sse_fuvminmax);
+ #endif
+ + sse_int_union iuvminmax;
+ + //convert to int
+ iuvminmax.ssei = _mm_cvttps_epi32(sse_fuvminmax);
+ + #if DBGP
+ cerr << "iuvminmax = " << iuvminmax.i[0] << ", " << iuvminmax.i[1] << ", " << iuvminmax.i[2] << ", " << iuvminmax.i[3];
+ cerr << endl << endl;
+ #endif
+ + for(int u= (DU==1 ? iuvminmax.i[3] : iuvminmax.i[1]); (DU==1 ? u <= iuvminmax.i[1] : u >= iuvminmax.i[3]); u += DU)
+ {
+ int tc_U;
+ if (u)
+ {
+ tc_U = axis_table[U];
+ child_cell[U] = cell[U] | child_bit;
+ }
+ else
+ {
+ tc_U = 0;
+ child_cell[U] = cell[U]; + }
+
+ for(int v= (DV==1 ? iuvminmax.i[2] : iuvminmax.i[0]); (DV==1 ? v <= iuvminmax.i[0] : v >= iuvminmax.i[2]); v += DV)
+ { + int tc_V;
+ if (v)
+ {
+ tc_V = axis_table[V];
+ child_cell[V] = cell[V] | child_bit;
+ }
+ else
+ {
+ tc_V = 0;
+ child_cell[V] = cell[V]; + }
+ + int target_child = tc_K | tc_U | tc_V;
if (octdata->get_isovalue() >= node.mins[target_child] && octdata->get_isovalue() <= node.maxs[target_child])
{
@@ -894,89 +1090,202 @@
//cerr << "newfirst=" << (int)newfirst << ", last=" << (int)last << endl;
- if (newfirst <= last)
+ if (first <= last)
{
if (node.offsets[target_child]==-1)
{
- bvh_octleaf(srp, newfirst, last, child_cell, stop_depth, depth+1, - depth, node.values[target_child], child_cell, index_trace);
+ sse_traverse_leaf<K,U,V,DK>(srp, newfirst, last, DU, DV, fi,
+ child_cell, stop_depth, depth+1, depth, node.values[target_child], child_cell, index_trace);
}
else
{
unsigned int child_idx = node.children_start + node.offsets[target_child];
if (depth == octdata->get_pre_cap_depth()) //cap
{
- bvh_octcap(srp, newfirst, last, child_cell, stop_depth, depth+1, child_idx, index_trace);
+ sse_traverse_cap<K,U,V,DK>(srp, newfirst, last, DU, DV, fi, child_cell, + stop_depth, depth+1, child_idx, index_trace);
}
else
{
- bvh_octnode(srp, newfirst, last, child_cell, stop_depth, depth+1, child_idx, index_trace);
+ sse_traverse_node<K,U,V,DK>(srp, newfirst, last, DU, DV, fi, child_cell, + stop_depth, depth+1, child_idx, index_trace);
}
}
if (srp.activeRays<=0)
return;
}
-
}
}
}
}
-}
+} -void IsosurfaceOctreeVolume::bvh_octleaf(SSERayPacket& srp, char first, char last, - const Vec3i& cell, int stop_depth, int depth, +template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse_leaf(SSERayPacket& srp, char first, char last, char DU, char DV,
+ const FrustumInterval& fi, const Vec3i& cell, int stop_depth, int depth, int leaf_depth, ST leaf_value, const Vec3i& leaf_base_cell,
unsigned int index_trace[]) const
{
int child_bit = octdata->get_child_bit_depth(depth);
- int unsafe_zone = octdata->get_child_bit_depth(depth-1) - octdata->get_child_bit_depth(octdata->get_cap_depth());
- int smd_first = first << 2;
+ int unsafe_zone = (child_bit<<1) - octdata->get_child_bit_depth(octdata->get_cap_depth());
Vec3i child_cell;
-
- //intersect all children in order
+ Vector pcenter( static_cast<float>(cell[0] | child_bit), + static_cast<float>(cell[1] | child_bit), + static_cast<float>(cell[2] | child_bit));
+ #pragma unroll(2)
- for(int midplane_x=0; midplane_x!=2; midplane_x++)
- {
- int target_x;
- if (midplane_x - srp.rp->getSign(smd_first,0))
- {
- target_x = 4;
- child_cell.data[0] = cell.data[0] | child_bit;
- }
- else
- {
- target_x = 0;
- child_cell.data[0] = cell.data[0];
- }
- #pragma unroll(2)
- for(int midplane_y=0; midplane_y!=2; midplane_y++)
- {
- int target_xy;
- if (midplane_y - srp.rp->getSign(smd_first,1))
- {
- target_xy = target_x | 2;
- child_cell.data[1] = cell.data[1] | child_bit;
- }
- else
- {
- target_xy = target_x;
- child_cell.data[1] = cell.data[1];
- }
- #pragma unroll(2)
- for(int midplane_z=0; midplane_z!=2; midplane_z++)
- {
- int target_child;
- if (midplane_z - srp.rp->getSign(smd_first,2))
- {
- target_child = target_xy | 1;
- child_cell.data[2] = cell.data[2] | child_bit;
- }
- else
- {
- target_child = target_xy;
- child_cell.data[2] = cell.data[2];
- }
+ for(int k=0; k<2; k++)
+ { + sse_t child_tkenter;
+ sse_t child_tkexit;
+ int tc_K;
+ if (k) //AFTER THE K MIDPLANE
+ {
+ if (DK)
+ {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]+(child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit;
+ #if DBGP
+ cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] + depth_bit =" << cell[K]+(child_bit<<1) << endl;
+ #endif
+ }
+ else
+ {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = 0;
+ child_cell[K] = cell[K]; + #if DBGP + cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] =" << cell[K] << endl;
+ #endif
+ }
+ + }
+ else //BEFORE THE K MIDPLANE
+ {
+ if (DK)
+ {
+ child_tkenter = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = 0;
+ child_cell[K] = cell[K];
+ #if DBGP
+ cerr << "kenter = cell[K] =" << cell[K] << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+ #endif
+ }
+ else
+ {
+ child_tkenter = mul4(sub4(set4(cell[K]+(child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit; + #if DBGP
+ cerr << "kenter = cell[K]+depth_bit =" << cell[K]+(child_bit<<1) << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+ #endif
+ }
+ }
+ + #if DBGP
+ cerr << "child_tkenter "; simd_cerr(child_tkenter);
+ cerr << "child_tkexit "; simd_cerr(child_tkexit);
+ cerr << "child_pkenter[K] = "; simd_cerr( mul4(oneOver(fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkenter)) );
+ cerr << "child_pkexit[K] = "; simd_cerr( mul4(oneOver(fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkexit)) );
+ #endif
+ + //we have child_tkenter, child_tkexit.
+ if (_mm_movemask_ps(cmp4_ge(child_tkexit, zero4()))==0)
+ {
+ #if DBGP
+ cerr << "texit was negative; continuing." << endl;
+ #endif
+ continue;
+ }
+ + //find pkenter_uvminmax, pkexit[u], pkexit[v]
+ //child_pkenter = dir*(orig/dir) + dir*t = dir(orig+t)
+ const sse_t child_pkenter_uvminmax = add4(fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkenter));
+ const sse_t child_pkexit_uvminmax = add4(fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkexit));
+ + #if DBGP
+ cerr << "child_pkenter_uvminmax "; simd_cerr(child_pkenter_uvminmax);
+ cerr << "child_pkexit_uvminmax "; simd_cerr(child_pkexit_uvminmax); + #endif
+
+ sse_union tmp_min, tmp_max;
+ tmp_min.sse = min4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ tmp_max.sse = max4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ + #if DBGP
+ cerr << "tmp_min "; simd_cerr(tmp_min.sse);
+ cerr << "tmp_max "; simd_cerr(tmp_max.sse);
+ #endif
+ + const float umin = MIN(tmp_min.f[3], tmp_min.f[1]);
+ const float vmin = MIN(tmp_min.f[2], tmp_min.f[0]);
+ const float umax = MAX(tmp_max.f[3], tmp_max.f[1]);
+ const float vmax = MAX(tmp_max.f[2], tmp_max.f[0]);
+ sse_t sse_fuvminmax = set44(umin, vmin, umax, vmax);
+ + #if DBGP
+ cerr << "sse_fuvminmax (before clamp) = "; simd_cerr(sse_fuvminmax);
+ #endif
+ + sse_fuvminmax = sub4(sse_fuvminmax, set44(cell[U], cell[V], cell[U], cell[V]));
+ sse_fuvminmax = mul4(sse_fuvminmax, set4(octdata->get_inv_child_bit_depth(depth)));
+ sse_fuvminmax = max4(sse_fuvminmax, set44(0.0f, 0.0f, -9.9e9999f, -9.9e9999f));
+ sse_fuvminmax = min4(sse_fuvminmax, set44(9.9e9999f, 9.9e9999f, 1.0f, 1.0f));
+ + #if DBGP
+ cerr << "sse_fuvminmax (after clamp) = "; simd_cerr(sse_fuvminmax);
+ #endif
+ + sse_int_union iuvminmax;
+ + //convert to int
+ iuvminmax.ssei = _mm_cvttps_epi32(sse_fuvminmax);
+ + #if DBGP
+ cerr << "iuvminmax = " << iuvminmax.i[0] << ", " << iuvminmax.i[1] << ", " << iuvminmax.i[2] << ", " << iuvminmax.i[3];
+ cerr << endl << endl;
+ #endif
+
+ for(int u= (DU==1 ? iuvminmax.i[3] : iuvminmax.i[1]); (DU==1 ? u <= iuvminmax.i[1] : u >= iuvminmax.i[3]); u += DU)
+ {
+ int tc_U;
+ if (u)
+ {
+ tc_U = axis_table[U];
+ child_cell[U] = cell[U] | child_bit;
+ }
+ else
+ {
+ tc_U = 0;
+ child_cell[U] = cell[U]; + }
+
+ for(int v= (DV==1 ? iuvminmax.i[2] : iuvminmax.i[0]); (DV==1 ? v <= iuvminmax.i[0] : v >= iuvminmax.i[2]); v += DV)
+ { + int tc_V;
+ if (v)
+ {
+ tc_V = axis_table[V];
+ child_cell[V] = cell[V] | child_bit;
+ }
+ else
+ {
+ tc_V = 0;
+ child_cell[V] = cell[V]; + }
+ + int target_child = tc_K | tc_U | tc_V;
+ Vec3i local_child_cell = child_cell - leaf_base_cell;
if (local_child_cell.data[0] & unsafe_zone || local_child_cell.data[1] & unsafe_zone || local_child_cell.data[2] & unsafe_zone)
{
@@ -1113,9 +1422,10 @@
Vector cmin(child_cell.data[0], child_cell.data[1], child_cell.data[2]);
Vector cmax(child_cell.data[0]+child_bit, child_cell.data[1]+child_bit, child_cell.data[2]+child_bit);
char newfirst = first_intersects(srp, first, last, cmin, cmax);
- - bvh_octleaf(srp, newfirst, last, child_cell, stop_depth, depth+1, - leaf_depth, leaf_value, leaf_base_cell, index_trace);
+ + sse_traverse_leaf<K,U,V,DK>(srp, newfirst, last, DU, DV, fi,
+ child_cell, stop_depth, depth+1, leaf_depth, leaf_value, leaf_base_cell, index_trace);
+ }
if (srp.activeRays<=0)
return;
@@ -1123,63 +1433,177 @@
}
}
}
-
-
}
-void IsosurfaceOctreeVolume::bvh_octcap(SSERayPacket& srp, char first, char last,
- const Vec3i& cell, int stop_depth, int depth, unsigned int index, unsigned int index_trace[]) const
+template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse_cap(SSERayPacket& srp, char first, char last, + char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, unsigned int index_trace[]) const
{
- //cerr << "octcap " << index << ", first=" << (int)first << ",last=" << (int)last << endl;
+#if DBGP
+ cerr << "octcap " << index << ", first=" << (int)first << ",last=" << (int)last << endl;
+#endif OctCap& cap = octdata->get_cap(index);
+ int child_bit = octdata->get_child_bit_depth(depth);
Vec3i child_cell;
index_trace[depth] = index;
- int smd_first = first << 2;
- - //intersect all children in order
+ Vector pcenter( static_cast<float>(cell[0] | child_bit), + static_cast<float>(cell[1] | child_bit), + static_cast<float>(cell[2] | child_bit));
+ #pragma unroll(2)
- for(int midplane_x=0; midplane_x<2; midplane_x++)
- {
- int target_x;
- if (midplane_x - srp.rp->getSign(smd_first,0))
- {
- target_x = 4;
- child_cell.data[0] = cell.data[0] | 1;
- }
- else
- {
- target_x = 0;
- child_cell.data[0] = cell.data[0];
- }
- #pragma unroll(2)
- for(int midplane_y=0; midplane_y<2; midplane_y++)
- {
- int target_xy;
- if (midplane_y - srp.rp->getSign(smd_first,1))
- {
- target_xy = target_x | 2;
- child_cell.data[1] = cell.data[1] | 1;
- }
- else
- {
- target_xy = target_x;
- child_cell.data[1] = cell.data[1];
- }
- #pragma unroll(2)
- for(int midplane_z=0; midplane_z<2; midplane_z++)
- {
- int target_child;
- if (midplane_z - srp.rp->getSign(smd_first,2))
- {
- target_child = target_xy | 1;
- child_cell.data[2] = cell.data[2] | 1;
- }
- else
- {
- target_child = target_xy;
- child_cell.data[2] = cell.data[2];
- }
- + for(int k=0; k<2; k++)
+ { + sse_t child_tkenter;
+ sse_t child_tkexit;
+ int tc_K;
+ + if (k) //AFTER THE K MIDPLANE
+ {
+ if (DK)
+ {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]+(child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit;
+ #if DBGP
+ cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] + depth_bit =" << cell[K]+(child_bit<<1) << endl;
+ #endif
+ }
+ else
+ {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = 0;
+ child_cell[K] = cell[K]; + #if DBGP + cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] =" << cell[K] << endl;
+ #endif
+ }
+ + }
+ else //BEFORE THE K MIDPLANE
+ {
+ if (DK)
+ {
+ child_tkenter = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = 0;
+ child_cell[K] = cell[K];
+ #if DBGP
+ cerr << "kenter = cell[K] =" << cell[K] << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+ #endif
+ }
+ else
+ {
+ child_tkenter = mul4(sub4(set4(cell[K]+(child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit; + #if DBGP
+ cerr << "kenter = cell[K]+depth_bit =" << cell[K]+(child_bit<<1) << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+ #endif
+ }
+ }
+ + #if DBGP
+ cerr << "child_tkenter "; simd_cerr(child_tkenter);
+ cerr << "child_tkexit "; simd_cerr(child_tkexit);
+ cerr << "child_pkenter[K] = "; simd_cerr( mul4(oneOver(fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkenter)) );
+ cerr << "child_pkexit[K] = "; simd_cerr( mul4(oneOver(fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkexit)) );
+ #endif
+ + //we have child_tkenter, child_tkexit.
+ if (_mm_movemask_ps(cmp4_ge(child_tkexit, zero4()))==0)
+ {
+ #if DBGP
+ cerr << "texit was negative; continuing." << endl;
+ #endif
+ continue;
+ }
+ + //find pkenter_uvminmax, pkexit[u], pkexit[v]
+ //child_pkenter = dir*(orig/dir) + dir*t = dir(orig+t)
+ const sse_t child_pkenter_uvminmax = add4(fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkenter));
+ const sse_t child_pkexit_uvminmax = add4(fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkexit));
+ + #if DBGP
+ cerr << "child_pkenter_uvminmax "; simd_cerr(child_pkenter_uvminmax);
+ cerr << "child_pkexit_uvminmax "; simd_cerr(child_pkexit_uvminmax); + #endif
+
+ sse_union tmp_min, tmp_max;
+ tmp_min.sse = min4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ tmp_max.sse = max4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ + #if DBGP
+ cerr << "tmp_min "; simd_cerr(tmp_min.sse);
+ cerr << "tmp_max "; simd_cerr(tmp_max.sse);
+ #endif
+ + const float umin = MIN(tmp_min.f[3], tmp_min.f[1]);
+ const float vmin = MIN(tmp_min.f[2], tmp_min.f[0]);
+ const float umax = MAX(tmp_max.f[3], tmp_max.f[1]);
+ const float vmax = MAX(tmp_max.f[2], tmp_max.f[0]);
+ sse_t sse_fuvminmax = set44(umin, vmin, umax, vmax);
+ + #if DBGP
+ cerr << "sse_fuvminmax (before clamp) = "; simd_cerr(sse_fuvminmax);
+ #endif
+ + sse_fuvminmax = sub4(sse_fuvminmax, set44(cell[U], cell[V], cell[U], cell[V]));
+ sse_fuvminmax = mul4(sse_fuvminmax, set4(octdata->get_inv_child_bit_depth(depth)));
+ sse_fuvminmax = max4(sse_fuvminmax, set44(0.0f, 0.0f, -9.9e9999f, -9.9e9999f));
+ sse_fuvminmax = min4(sse_fuvminmax, set44(9.9e9999f, 9.9e9999f, 1.0f, 1.0f));
+ + #if DBGP
+ cerr << "sse_fuvminmax (after clamp) = "; simd_cerr(sse_fuvminmax);
+ #endif
+ + sse_int_union iuvminmax;
+ + //convert to int
+ iuvminmax.ssei = _mm_cvttps_epi32(sse_fuvminmax);
+ + #if DBGP
+ cerr << "iuvminmax = " << iuvminmax.i[0] << ", " << iuvminmax.i[1] << ", " << iuvminmax.i[2] << ", " << iuvminmax.i[3];
+ cerr << endl << endl;
+ #endif
+ + for(int u= (DU==1 ? iuvminmax.i[3] : iuvminmax.i[1]); (DU==1 ? u <= iuvminmax.i[1] : u >= iuvminmax.i[3]); u += DU)
+ {
+ int tc_U;
+ if (u)
+ {
+ tc_U = axis_table[U];
+ child_cell[U] = cell[U] | child_bit;
+ }
+ else
+ {
+ tc_U = 0;
+ child_cell[U] = cell[U]; + }
+
+ for(int v= (DV==1 ? iuvminmax.i[2] : iuvminmax.i[0]); (DV==1 ? v <= iuvminmax.i[0] : v >= iuvminmax.i[2]); v += DV)
+ { + int tc_V;
+ if (v)
+ {
+ tc_V = axis_table[V];
+ child_cell[V] = cell[V] | child_bit;
+ }
+ else
+ {
+ tc_V = 0;
+ child_cell[V] = cell[V]; + }
+ + int target_child = tc_K | tc_U | tc_V;
+ sse_t child_tenter[RayPacket::SSE_MaxSize];
sse_t child_texit[RayPacket::SSE_MaxSize];
sse_t hitmask[RayPacket::SSE_MaxSize];
Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.h
==============================================================================
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.h (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.h Mon Jun 26 01:04:00 2006
@@ -58,20 +58,42 @@
const float texit) const;
#ifdef MANTA_SSE
- void packet_intersect_implicit_bvh(RayPacket& rays) const;
+ struct FrustumInterval
+ {
+ //NOTE: these are all in the order umin, vmin, umax, vmax.
+ // But specifically, that order is defined by orig and dir.
+ sse_t uvminmax_orig;
+ sse_t uvminmax_dir; + sse_t uvminmax_invdir;
+ + sse_t kminmax_orig; //kmin_xforig, kmin_xforig, kmax_xforig, kmax_xforig
+ sse_t kminmax_dir;
+ sse_t kminmax_invdir; //kmin_invdir, kmin_invdir, kmax_invdir, kmax_invdir + };
+
+ void packet_intersect_sse(RayPacket& rays) const;
- void bvh_octnode(SSERayPacket& srp, char first, char last,
- const Vec3i& cell, int stop_depth, int depth, unsigned int index, - unsigned int index_trace[]) const;
+ template<char K, char U, char V, char DK>
+ void sse_traverse(SSERayPacket& srp, char first, char last, + char DU, char DV) const;
- void bvh_octleaf(SSERayPacket& srp, char first, char last, - const Vec3i& cell, int stop_depth, int depth, - int leaf_depth, ST leaf_value, const Vec3i& leaf_base_cell,
- unsigned int index_trace[]) const;
- - void bvh_octcap(SSERayPacket& srp, char first, char last,
- const Vec3i& cell, int stop_depth, int depth, unsigned int index, + template<char K, char U, char V, char DK>
+ void IsosurfaceOctreeVolume::sse_traverse_node(SSERayPacket& srp, + char first, char last, char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, + unsigned int index_trace[]) const;
+ + template<char K, char U, char V, char DK>
+ void sse_traverse_leaf(SSERayPacket& srp, char first, char last, char DU, char DV,
+ const FrustumInterval& fi, const Vec3i& cell, int stop_depth, int depth, + int leaf_depth, ST leaf_value, const Vec3i& leaf_base_cell,
unsigned int index_trace[]) const;
+ + template<char K, char U, char V, char DK>
+ void IsosurfaceOctreeVolume::sse_traverse_cap(SSERayPacket& srp, + char first, char last, char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, + unsigned int index_trace[]) const; inline char first_intersects(SSERayPacket& srp, char first, char last, const Vector& min, const Vector& max) const
Modified: trunk/Model/Primitives/OctreeVolume.h
==============================================================================
--- trunk/Model/Primitives/OctreeVolume.h (original)
+++ trunk/Model/Primitives/OctreeVolume.h Mon Jun 26 01:04:00 2006
@@ -217,6 +217,12 @@
return child_bit_depth[d];
}
+ inline float get_inv_child_bit_depth(int d) const
+ {
+ return inv_child_bit_depth[d];
+ }
+
+ inline int get_max_depth() const
{
return max_depth;
Archive powered by MHonArc 2.6.16.