Manta Interactive Ray Tracer Development Mailing List

Text archives Help


Re: [MANTA] ICC compile broken -> now fixed


Chronological Thread 
  • From: Aaron Knoll <knolla@cs.utah.edu>
  • To: manta@sci.utah.edu
  • Subject: Re: [MANTA] ICC compile broken -> now fixed
  • Date: Wed, 28 Jun 2006 17:51:15 +0200

Hey James - thanks for fixing that. I had no internet access yesterday after I committed...

-Aaron

On Jun 26, 2006, at 11:34 PM, James Bigler wrote:

The problem was a member function declared inside the class body with the class name qualifying it in front:

class MyClass {
public:
  void MyClass::doStuff() {}
};

This arises most often when the function head is copied from the .cc file to the .h file.

It is fixed in the trunk.

James

Abe Stephens wrote:
This breaks IsosurfaceOctreeVolume.cc in icc 9.0 and 9.1
Abe
knolla@sci.utah.edu wrote:
Author: knolla
Date: Mon Jun 26 01:04:00 2006
New Revision: 1127

Modified:
  trunk/Core/Math/SSEDefs.h
  trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
  trunk/Model/Primitives/IsosurfaceOctreeVolume.h
  trunk/Model/Primitives/OctreeVolume.h
Log:
update to IsosurfaceOctreeVolume in SSE only. Slight improvement; some divide-by-zero bugs.

Modified: trunk/Core/Math/SSEDefs.h
==================================================================== ==========
--- trunk/Core/Math/SSEDefs.h    (original)
+++ trunk/Core/Math/SSEDefs.h    Mon Jun 26 01:04:00 2006
@@ -11,6 +11,7 @@
#include <Core/Util/Align.h>
#include <Core/Geometry/vecdefs.h>
#include <Core/Geometry/Vector.h>
+#include <iostream>

typedef __m128 sse_t;
typedef __m128i sse_int_t;
@@ -49,6 +50,17 @@

namespace Manta
{
+    union sse_union
+    {
+        sse_t sse;
+        float f[4];
+    };
+    +    union sse_int_union
+    {
+        __m128i ssei;
+        int i[4];
+    };   #if defined(__x86_64) && defined(__INTEL_COMPILER)

@@ -61,10 +73,6 @@
    }
    #endif
-  -
-
-

    static const MANTA_ALIGN(16) sse_t _mm_eps = _mm_set_ps1(1e-5);
static const MANTA_ALIGN(16) sse_t _mm_minus_eps = _mm_set_ps1 (-1e-5);
@@ -246,6 +254,14 @@
        float f[4];
        _mm_store_ps(f,t);
        return f[offset];
+    }
+    +    inline float simd_cerr(sse_t t)
+    {  +        MANTA_ALIGN(16)
+        float f[4];
+        _mm_store_ps(f,t);
+ std::cerr << f[0] << ", " << f[1] << ", " << f[2] << ", " << f[3] << std::endl;
    }

    inline Vec3f as_Vec3f(sse_t t)

Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
==================================================================== ==========
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.cc    (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.cc Mon Jun 26 01:04:00 2006
@@ -78,6 +78,7 @@
    min_rho = MIN(min_rho, this_rho); \
    max_rho = MAX(max_rho, this_rho); \
    rho[0][1][0] = static_cast<float>(this_rho); \
+   static const int axis_table[] = {4, 2, 1};
    @@ -114,8 +115,8 @@
void IsosurfaceOctreeVolume::intersect(RenderContext const &context, RayPacket &packet) const
{
#ifdef MANTA_SSE
-//#if 1
-    packet_intersect_implicit_bvh(packet);
+//#if 0
+    packet_intersect_sse(packet);
#else
    for ( int i = packet.rayBegin; i < packet.rayEnd; i++ )
        single_intersect(packet, i);
@@ -518,7 +519,7 @@
this_rho = octdata->lookup_neighbor<1,0,1> (child_cell, offset, stop_depth, leaf_depth, index_trace);
                    min_rho = MIN(min_rho, this_rho);
                    max_rho = MAX(max_rho, this_rho);
-                }
+                }s
                else
                    this_rho = scalar;
rho[1][0][1] = static_cast<float>(this_rho); @@ -748,8 +749,7 @@
*/
#ifdef MANTA_SSE

-//an octree traversal based on implicit BVH
-void IsosurfaceOctreeVolume::packet_intersect_implicit_bvh (RayPacket& rays) const
+void IsosurfaceOctreeVolume::packet_intersect_sse(RayPacket& rays) const
{
    rays.computeInverseDirections();
    rays.computeSigns();
@@ -817,74 +817,270 @@
        {
            first = MIN(first, smd);
            last = smd;
+ srp.activeRays += count_nonzeros(srp.activeMask [smd]); }
- - srp.activeRays += count_nonzeros(srp.activeMask [smd]); }
        if (first > last)
        return;
- //cerr << "root node: first = " << (int)first << ", last = " << (int)last << endl;
-   +    char axis_order[3];
+    Vector direction = rays.getDirection(first<<2);
+    Vector dir2 = direction * direction;
+    if (dir2[0] > dir2[1] && dir2[0] > dir2[2])
+    {
+        if (direction[0] > 0)
+ sse_traverse<0,1,2,1>(srp, first, last, (direction [1] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+        else
+ sse_traverse<0,1,2,0>(srp, first, last, (direction [1] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+    }
+    else if (dir2[1] < dir2[2])
+    {
+        if (direction[1] > 0)
+ sse_traverse<1,0,2,1>(srp, first, last, (direction [0] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+        else
+ sse_traverse<1,0,2,0>(srp, first, last, (direction [0] > 0.f ? 1 : -1), (direction[2] > 0.f ? 1 : -1));
+    }
+    else
+    {
+        if (direction[2] > 0)
+ sse_traverse<2,0,1,1>(srp, first, last, (direction [0] > 0.f ? 1 : -1), (direction[1] > 0.f ? 1 : -1));
+        else
+ sse_traverse<2,0,1,0>(srp, first, last, (direction [0] > 0.f ? 1 : -1), (direction[1] > 0.f ? 1 : -1));
+    }
+}
+
+#define DBGP 0
+
+template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse(SSERayPacket& srp, char first, char last, char DU, char DV) const
+{
+ //find the bounding frustum of the rays, as (umin, vmin, umax, vmax) coordinates
+    sse_t smin_orig[3];
+    sse_t smin_dir[3];
+    sse_t smax_orig[3];
+    sse_t smax_dir[3];
+    +    #pragma unroll(3)
+    for(int axis=0; axis<3; axis++)
+    {
+        smin_orig[axis] = _mm_infty;
+        smin_dir[axis] = _mm_infty;
+        smax_orig[axis] = _mm_minus_infty;
+        smax_dir[axis] = _mm_minus_infty;        +    }
+    for (int smd=first; smd<=last; smd++)
+    {
+        #pragma unroll(3)
+        for(int axis=0; axis<3; axis++)
+        {
+ smin_orig[axis] = min4(smin_orig[axis], srp.orig [axis][smd]);
+ smax_orig[axis] = max4(smax_orig[axis], srp.orig [axis][smd]);
+ smin_dir[axis] = min4(smin_dir[axis], srp.dir[axis] [smd]);
+ smax_dir[axis] = max4(smax_dir[axis], srp.dir[axis] [smd]);
+        }
+    }
+    +    FrustumInterval fi;
+ fi.uvminmax_dir = set44(min4f(smin_dir[U]), min4f(smin_dir [V]), max4f(smax_dir[U]), max4f(smax_dir[V]));
+    fi.uvminmax_invdir = oneOver(fi.uvminmax_dir);
+ fi.uvminmax_orig = set44(min4f(smin_orig[U]), min4f(smin_orig [V]), max4f(smax_orig[U]), max4f(smax_orig[V]));
+    +    #if DBGP
+    cerr << "fi.uvminmax_orig "; simd_cerr(fi.uvminmax_orig);
+    cerr << "fi.uvminmax_dir "; simd_cerr(fi.uvminmax_dir);
+    #endif
+    +    float komin = min4f(smin_orig[K]);
+    float komax = max4f(smax_orig[K]);
+    float kdmin = min4f(smin_dir[K]);
+    float kdmax = max4f(smax_dir[K]);
+    fi.kminmax_orig = set44(komin, komin, komax, komax);
+    fi.kminmax_dir = set44(kdmin, kdmin, kdmax, kdmax);
+ fi.kminmax_invdir = oneOver(fi.kminmax_dir); + + #if DBGP
+    cerr << "fi.kminmax_orig "; simd_cerr(fi.kminmax_orig);
+    cerr << "fi.kminmax_dir "; simd_cerr(fi.kminmax_dir);
+ #endif + unsigned int index_trace[octdata- >get_max_depth() + 1];
    Vec3i cell(0,0,0);
- bvh_octnode(srp, first, last, cell, octdata->get_cap_depth (), 0, 0, index_trace);
+ sse_traverse_node<K,U,V,DK>(srp, first, last, DU, DV, fi, cell, octdata->get_cap_depth(), 0, 0, index_trace);
}

-void IsosurfaceOctreeVolume::bvh_octnode(SSERayPacket& srp, char first, char last, - const Vec3i& cell, int stop_depth, int depth, unsigned int index, unsigned int index_trace[]) const
+
+
+template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse_node(SSERayPacket& srp, char first, char last, + char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, unsigned int index_trace[]) const
{
- //cerr << "octnode " << (int)depth << ", " << index << "; first=" << (int)first << ",last=" << (int)last << endl;
+#if DBGP
+ cerr << "octnode, depth " << (int)depth << ", index " << (int)index << ", first=" << (int)first << ",last=" << (int)last << "cell " << cell[0] << ", " << cell[1] << ", " << cell[2] << endl;
+ cerr << "(with K=" << (int)(K) << ", U=" << (int)(U) << ", V=" << (int)(V) << ", DK=" << (int)(DK) << endl; +#endif
+
    OctNode& node = octdata->get_node(depth, index);
-    Vec3i child_cell;
    int child_bit = octdata->get_child_bit_depth(depth);
+    Vec3i child_cell;
    index_trace[depth] = index;
-    int smd_first = first << 2;
-
-    //intersect all children in order
+ Vector pcenter( static_cast<float>(cell[0] | child_bit), + static_cast<float>(cell[1] | child_bit), + static_cast<float>(cell[2] | child_bit));
+                        #pragma unroll(2)
-    for(int midplane_x=0; midplane_x!=2; midplane_x++)
-    {
-        int target_x;
-        if (midplane_x - srp.rp->getSign(smd_first,0))
-        {
-            target_x = 4;
-            child_cell.data[0] = cell.data[0] | child_bit;
-        }
-        else
-        {
-            target_x = 0;
-            child_cell.data[0] = cell.data[0];
-        }
-        #pragma unroll(2)
-        for(int midplane_y=0; midplane_y!=2; midplane_y++)
-        {
-            int target_xy;
-            if (midplane_y - srp.rp->getSign(smd_first,1))
-            {
-                target_xy = target_x | 2;
-                child_cell.data[1] = cell.data[1] | child_bit;
-            }
-            else
-            {
-                target_xy = target_x;
-                child_cell.data[1] = cell.data[1];
-            }
-            #pragma unroll(2)
-            for(int midplane_z=0; midplane_z!=2; midplane_z++)
-            {
-                int target_child;
-                if (midplane_z - srp.rp->getSign(smd_first,2))
-                {
-                    target_child = target_xy | 1;
-                    child_cell.data[2] = cell.data[2] | child_bit;
-                }
-                else
-                {
-                    target_child = target_xy;
-                    child_cell.data[2] = cell.data[2];
-                }
+    for(int k=0; k<2; k++)
+    {   +        sse_t child_tkenter;
+        sse_t child_tkexit;
+        int tc_K;
+                +        if (k)  //AFTER THE K MIDPLANE
+        {
+            if (DK)
+            {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]+ (child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = axis_table[K];
+                child_cell[K] = cell[K] | child_bit;
+                #if DBGP
+ cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] + depth_bit =" << cell [K]+(child_bit<<1) << endl;
+                #endif
+            }
+            else
+            {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = 0;
+ child_cell[K] = cell[K]; + #if DBGP + cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+                cerr << "kexit = cell[K] =" << cell[K] << endl;
+                #endif
+            }
+           +        }
+        else    //BEFORE THE K MIDPLANE
+        {
+            if (DK)
+            {
+ child_tkenter = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = 0;
+                child_cell[K] = cell[K];
+                #if DBGP
+                cerr << "kenter = cell[K] =" << cell[K] << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+                #endif
+            }
+            else
+            {
+ child_tkenter = mul4(sub4(set4(cell[K]+ (child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit; + #if DBGP
+ cerr << "kenter = cell[K]+depth_bit =" << cell[K] +(child_bit<<1) << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+                #endif
+            }
+        }
+        +        #if DBGP
+        cerr << "child_tkenter "; simd_cerr(child_tkenter);
+        cerr << "child_tkexit "; simd_cerr(child_tkexit);
+ cerr << "child_pkenter[K] = "; simd_cerr( mul4(oneOver (fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkenter)) );
+ cerr << "child_pkexit[K] = "; simd_cerr( mul4(oneOver (fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkexit)) );
+        #endif
+     +        //we have child_tkenter, child_tkexit.
+        if (_mm_movemask_ps(cmp4_ge(child_tkexit, zero4()))==0)
+        {
+            #if DBGP
+            cerr << "texit was negative; continuing." << endl;
+            #endif
+            continue;
+        }
+         +        //find pkenter_uvminmax, pkexit[u], pkexit[v]
+        //child_pkenter = dir*(orig/dir) + dir*t = dir(orig+t)
+ const sse_t child_pkenter_uvminmax = add4 (fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkenter));
+ const sse_t child_pkexit_uvminmax = add4 (fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkexit));
+        +        #if DBGP
+ cerr << "child_pkenter_uvminmax "; simd_cerr (child_pkenter_uvminmax);
+ cerr << "child_pkexit_uvminmax "; simd_cerr (child_pkexit_uvminmax); + #endif
+
+        sse_union tmp_min, tmp_max;
+ tmp_min.sse = min4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ tmp_max.sse = max4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+        +        #if DBGP
+        cerr << "tmp_min "; simd_cerr(tmp_min.sse);
+        cerr << "tmp_max "; simd_cerr(tmp_max.sse);
+        #endif
+ + const float umin = MIN(tmp_min.f[3], tmp_min.f [1]);
+        const float vmin = MIN(tmp_min.f[2], tmp_min.f[0]);
+        const float umax = MAX(tmp_max.f[3], tmp_max.f[1]);
+        const float vmax = MAX(tmp_max.f[2], tmp_max.f[0]);
+        sse_t sse_fuvminmax = set44(umin, vmin, umax, vmax);
+        +        #if DBGP
+ cerr << "sse_fuvminmax (before clamp) = "; simd_cerr (sse_fuvminmax);
+        #endif
+ + sse_fuvminmax = sub4(sse_fuvminmax, set44(cell[U], cell[V], cell[U], cell[V]));
+ sse_fuvminmax = mul4(sse_fuvminmax, set4(octdata- >get_inv_child_bit_depth(depth)));
+ sse_fuvminmax = max4(sse_fuvminmax, set44(0.0f, 0.0f, -9.9e9999f, -9.9e9999f));
+ sse_fuvminmax = min4(sse_fuvminmax, set44(9.9e9999f, 9.9e9999f, 1.0f, 1.0f));
+       +        #if DBGP
+ cerr << "sse_fuvminmax (after clamp) = "; simd_cerr (sse_fuvminmax);
+        #endif
+        +        sse_int_union iuvminmax;
+       +        //convert to int
+        iuvminmax.ssei = _mm_cvttps_epi32(sse_fuvminmax);
+        +        #if DBGP
+ cerr << "iuvminmax = " << iuvminmax.i[0] << ", " << iuvminmax.i[1] << ", " << iuvminmax.i[2] << ", " << iuvminmax.i[3];
+        cerr << endl << endl;
+        #endif
+ + for(int u= (DU==1 ? iuvminmax.i[3] : iuvminmax.i [1]); (DU==1 ? u <= iuvminmax.i[1] : u >= iuvminmax.i[3]); u += DU)
+        {
+            int tc_U;
+            if (u)
+            {
+                tc_U = axis_table[U];
+                child_cell[U] = cell[U] | child_bit;
+            }
+            else
+            {
+                tc_U = 0;
+                child_cell[U] = cell[U];   +            }
+
+ for(int v= (DV==1 ? iuvminmax.i[2] : iuvminmax.i [0]); (DV==1 ? v <= iuvminmax.i[0] : v >= iuvminmax.i[2]); v += DV)
+            {               +                int tc_V;
+                if (v)
+                {
+                    tc_V = axis_table[V];
+                    child_cell[V] = cell[V] | child_bit;
+                }
+                else
+                {
+                    tc_V = 0;
+                    child_cell[V] = cell[V];   +                }
+ + int target_child = tc_K | tc_U | tc_V;
if (octdata->get_isovalue() >= node.mins[target_child] && octdata->get_isovalue() <= node.maxs [target_child])
                {
@@ -894,89 +1090,202 @@
//cerr << "newfirst=" << (int)newfirst << ", last=" << (int)last << endl;
                    -                    if (newfirst <= last)
+                    if (first <= last)
                    {
                        if (node.offsets[target_child]==-1)
                        {
- bvh_octleaf(srp, newfirst, last, child_cell, stop_depth, depth+1, - depth, node.values[target_child], child_cell, index_trace);
+ sse_traverse_leaf<K,U,V,DK>(srp, newfirst, last, DU, DV, fi,
+ child_cell, stop_depth, depth+1, depth, node.values[target_child], child_cell, index_trace);
                        }
                        else
                        {
unsigned int child_idx = node.children_start + node.offsets[target_child];
if (depth == octdata- >get_pre_cap_depth()) //cap
                            {
- bvh_octcap(srp, newfirst, last, child_cell, stop_depth, depth+1, child_idx, index_trace);
+ sse_traverse_cap<K,U,V,DK>(srp, newfirst, last, DU, DV, fi, child_cell, + stop_depth, depth+1, child_idx, index_trace);
                            }
                            else
                            {
- bvh_octnode(srp, newfirst, last, child_cell, stop_depth, depth+1, child_idx, index_trace);
+ sse_traverse_node<K,U,V,DK>(srp, newfirst, last, DU, DV, fi, child_cell, + stop_depth, depth+1, child_idx, index_trace);
                            }
                        }
                        if (srp.activeRays<=0)
                            return;
                    }
-
                }
            }
        }
    }
-}
+} -void IsosurfaceOctreeVolume::bvh_octleaf (SSERayPacket& srp, char first, char last, - const Vec3i& cell, int stop_depth, int depth, +template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse_leaf(SSERayPacket& srp, char first, char last, char DU, char DV,
+ const FrustumInterval& fi, const Vec3i& cell, int stop_depth, int depth, int leaf_depth, ST leaf_value, const Vec3i& leaf_base_cell,
            unsigned int index_trace[]) const
{
    int child_bit = octdata->get_child_bit_depth(depth);
- int unsafe_zone = octdata->get_child_bit_depth(depth-1) - octdata->get_child_bit_depth(octdata->get_cap_depth());
-    int smd_first = first << 2;
+ int unsafe_zone = (child_bit<<1) - octdata- >get_child_bit_depth(octdata->get_cap_depth());
    Vec3i child_cell;
-
-    //intersect all children in order
+ Vector pcenter( static_cast<float>(cell[0] | child_bit), + static_cast<float>(cell[1] | child_bit), + static_cast<float>(cell[2] | child_bit));
+                        #pragma unroll(2)
-    for(int midplane_x=0; midplane_x!=2; midplane_x++)
-    {
-        int target_x;
-        if (midplane_x - srp.rp->getSign(smd_first,0))
-        {
-            target_x = 4;
-            child_cell.data[0] = cell.data[0] | child_bit;
-        }
-        else
-        {
-            target_x = 0;
-            child_cell.data[0] = cell.data[0];
-        }
-        #pragma unroll(2)
-        for(int midplane_y=0; midplane_y!=2; midplane_y++)
-        {
-            int target_xy;
-            if (midplane_y - srp.rp->getSign(smd_first,1))
-            {
-                target_xy = target_x | 2;
-                child_cell.data[1] = cell.data[1] | child_bit;
-            }
-            else
-            {
-                target_xy = target_x;
-                child_cell.data[1] = cell.data[1];
-            }
-            #pragma unroll(2)
-            for(int midplane_z=0; midplane_z!=2; midplane_z++)
-            {
-                int target_child;
-                if (midplane_z - srp.rp->getSign(smd_first,2))
-                {
-                    target_child = target_xy | 1;
-                    child_cell.data[2] = cell.data[2] | child_bit;
-                }
-                else
-                {
-                    target_child = target_xy;
-                    child_cell.data[2] = cell.data[2];
-                }
+    for(int k=0; k<2; k++)
+    {   +        sse_t child_tkenter;
+        sse_t child_tkexit;
+        int tc_K;
                +        if (k)  //AFTER THE K MIDPLANE
+        {
+            if (DK)
+            {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]+ (child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = axis_table[K];
+                child_cell[K] = cell[K] | child_bit;
+                #if DBGP
+ cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] + depth_bit =" << cell [K]+(child_bit<<1) << endl;
+                #endif
+            }
+            else
+            {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = 0;
+ child_cell[K] = cell[K]; + #if DBGP + cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+                cerr << "kexit = cell[K] =" << cell[K] << endl;
+                #endif
+            }
+           +        }
+        else    //BEFORE THE K MIDPLANE
+        {
+            if (DK)
+            {
+ child_tkenter = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = 0;
+                child_cell[K] = cell[K];
+                #if DBGP
+                cerr << "kenter = cell[K] =" << cell[K] << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+                #endif
+            }
+            else
+            {
+ child_tkenter = mul4(sub4(set4(cell[K]+ (child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit; + #if DBGP
+ cerr << "kenter = cell[K]+depth_bit =" << cell[K] +(child_bit<<1) << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+                #endif
+            }
+        }
+        +        #if DBGP
+        cerr << "child_tkenter "; simd_cerr(child_tkenter);
+        cerr << "child_tkexit "; simd_cerr(child_tkexit);
+ cerr << "child_pkenter[K] = "; simd_cerr( mul4(oneOver (fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkenter)) );
+ cerr << "child_pkexit[K] = "; simd_cerr( mul4(oneOver (fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkexit)) );
+        #endif
+     +        //we have child_tkenter, child_tkexit.
+        if (_mm_movemask_ps(cmp4_ge(child_tkexit, zero4()))==0)
+        {
+            #if DBGP
+            cerr << "texit was negative; continuing." << endl;
+            #endif
+            continue;
+        }
+         +        //find pkenter_uvminmax, pkexit[u], pkexit[v]
+        //child_pkenter = dir*(orig/dir) + dir*t = dir(orig+t)
+ const sse_t child_pkenter_uvminmax = add4 (fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkenter));
+ const sse_t child_pkexit_uvminmax = add4 (fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkexit));
+        +        #if DBGP
+ cerr << "child_pkenter_uvminmax "; simd_cerr (child_pkenter_uvminmax);
+ cerr << "child_pkexit_uvminmax "; simd_cerr (child_pkexit_uvminmax); + #endif
+
+        sse_union tmp_min, tmp_max;
+ tmp_min.sse = min4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ tmp_max.sse = max4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+        +        #if DBGP
+        cerr << "tmp_min "; simd_cerr(tmp_min.sse);
+        cerr << "tmp_max "; simd_cerr(tmp_max.sse);
+        #endif
+ + const float umin = MIN(tmp_min.f[3], tmp_min.f [1]);
+        const float vmin = MIN(tmp_min.f[2], tmp_min.f[0]);
+        const float umax = MAX(tmp_max.f[3], tmp_max.f[1]);
+        const float vmax = MAX(tmp_max.f[2], tmp_max.f[0]);
+        sse_t sse_fuvminmax = set44(umin, vmin, umax, vmax);
+        +        #if DBGP
+ cerr << "sse_fuvminmax (before clamp) = "; simd_cerr (sse_fuvminmax);
+        #endif
+ + sse_fuvminmax = sub4(sse_fuvminmax, set44(cell[U], cell[V], cell[U], cell[V]));
+ sse_fuvminmax = mul4(sse_fuvminmax, set4(octdata- >get_inv_child_bit_depth(depth)));
+ sse_fuvminmax = max4(sse_fuvminmax, set44(0.0f, 0.0f, -9.9e9999f, -9.9e9999f));
+ sse_fuvminmax = min4(sse_fuvminmax, set44(9.9e9999f, 9.9e9999f, 1.0f, 1.0f));
+       +        #if DBGP
+ cerr << "sse_fuvminmax (after clamp) = "; simd_cerr (sse_fuvminmax);
+        #endif
+        +        sse_int_union iuvminmax;
+       +        //convert to int
+        iuvminmax.ssei = _mm_cvttps_epi32(sse_fuvminmax);
+        +        #if DBGP
+ cerr << "iuvminmax = " << iuvminmax.i[0] << ", " << iuvminmax.i[1] << ", " << iuvminmax.i[2] << ", " << iuvminmax.i[3];
+        cerr << endl << endl;
+        #endif
+
+ for(int u= (DU==1 ? iuvminmax.i[3] : iuvminmax.i[1]); (DU==1 ? u <= iuvminmax.i[1] : u >= iuvminmax.i[3]); u += DU)
+        {
+            int tc_U;
+            if (u)
+            {
+                tc_U = axis_table[U];
+                child_cell[U] = cell[U] | child_bit;
+            }
+            else
+            {
+                tc_U = 0;
+                child_cell[U] = cell[U];   +            }
+
+ for(int v= (DV==1 ? iuvminmax.i[2] : iuvminmax.i [0]); (DV==1 ? v <= iuvminmax.i[0] : v >= iuvminmax.i[2]); v += DV)
+            {               +                int tc_V;
+                if (v)
+                {
+                    tc_V = axis_table[V];
+                    child_cell[V] = cell[V] | child_bit;
+                }
+                else
+                {
+                    tc_V = 0;
+                    child_cell[V] = cell[V];   +                }
+ + int target_child = tc_K | tc_U | tc_V;
+ Vec3i local_child_cell = child_cell - leaf_base_cell;
if (local_child_cell.data[0] & unsafe_zone || local_child_cell.data[1] & unsafe_zone || local_child_cell.data [2] & unsafe_zone)
                {
@@ -1113,9 +1422,10 @@
Vector cmin(child_cell.data[0], child_cell.data[1], child_cell.data[2]);
Vector cmax(child_cell.data[0]+child_bit, child_cell.data[1]+child_bit, child_cell.data[2]+child_bit);
char newfirst = first_intersects(srp, first, last, cmin, cmax);
- - bvh_octleaf(srp, newfirst, last, child_cell, stop_depth, depth+1, - leaf_depth, leaf_value, leaf_base_cell, index_trace);
+ + sse_traverse_leaf<K,U,V,DK>(srp, newfirst, last, DU, DV, fi,
+ child_cell, stop_depth, depth+1, leaf_depth, leaf_value, leaf_base_cell, index_trace);
+                        }
                    if (srp.activeRays<=0)
                        return;
@@ -1123,63 +1433,177 @@
            }
        }
    }
-
-
}

-void IsosurfaceOctreeVolume::bvh_octcap(SSERayPacket& srp, char first, char last,
- const Vec3i& cell, int stop_depth, int depth, unsigned int index, unsigned int index_trace[]) const
+template<char K, char U, char V, char DK>
+void IsosurfaceOctreeVolume::sse_traverse_cap(SSERayPacket& srp, char first, char last, + char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, unsigned int index_trace[]) const
{
- //cerr << "octcap " << index << ", first=" << (int)first << ",last=" << (int)last << endl;
+#if DBGP
+ cerr << "octcap " << index << ", first=" << (int)first << ",last=" << (int)last << endl;
+#endif        OctCap& cap = octdata->get_cap(index);
+    int child_bit = octdata->get_child_bit_depth(depth);
    Vec3i child_cell;
    index_trace[depth] = index;
-    int smd_first = first << 2;
-    -    //intersect all children in order
+ Vector pcenter( static_cast<float>(cell[0] | child_bit), + static_cast<float>(cell[1] | child_bit), + static_cast<float>(cell[2] | child_bit));
+                    #pragma unroll(2)
-    for(int midplane_x=0; midplane_x<2; midplane_x++)
-    {
-        int target_x;
-        if (midplane_x - srp.rp->getSign(smd_first,0))
-        {
-            target_x = 4;
-            child_cell.data[0] = cell.data[0] | 1;
-        }
-        else
-        {
-            target_x = 0;
-            child_cell.data[0] = cell.data[0];
-        }
-        #pragma unroll(2)
-        for(int midplane_y=0; midplane_y<2; midplane_y++)
-        {
-            int target_xy;
-            if (midplane_y - srp.rp->getSign(smd_first,1))
-            {
-                target_xy = target_x | 2;
-                child_cell.data[1] = cell.data[1] | 1;
-            }
-            else
-            {
-                target_xy = target_x;
-                child_cell.data[1] = cell.data[1];
-            }
-            #pragma unroll(2)
-            for(int midplane_z=0; midplane_z<2; midplane_z++)
-            {
-                int target_child;
-                if (midplane_z - srp.rp->getSign(smd_first,2))
-                {
-                    target_child = target_xy | 1;
-                    child_cell.data[2] = cell.data[2] | 1;
-                }
-                else
-                {
-                    target_child = target_xy;
-                    child_cell.data[2] = cell.data[2];
-                }
- +    for(int k=0; k<2; k++)
+    {   +        sse_t child_tkenter;
+        sse_t child_tkexit;
+        int tc_K;
+                +        if (k)  //AFTER THE K MIDPLANE
+        {
+            if (DK)
+            {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]+ (child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = axis_table[K];
+                child_cell[K] = cell[K] | child_bit;
+                #if DBGP
+ cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+ cerr << "kexit = cell[K] + depth_bit =" << cell [K]+(child_bit<<1) << endl;
+                #endif
+            }
+            else
+            {
+ child_tkenter = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = 0;
+ child_cell[K] = cell[K]; + #if DBGP + cerr << "kenter = pcenter[K] =" << pcenter[K] << endl;
+                cerr << "kexit = cell[K] =" << cell[K] << endl;
+                #endif
+            }
+           +        }
+        else    //BEFORE THE K MIDPLANE
+        {
+            if (DK)
+            {
+ child_tkenter = mul4(sub4(set4(cell[K]), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = 0;
+                child_cell[K] = cell[K];
+                #if DBGP
+                cerr << "kenter = cell[K] =" << cell[K] << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+                #endif
+            }
+            else
+            {
+ child_tkenter = mul4(sub4(set4(cell[K]+ (child_bit<<1)), fi.kminmax_orig), fi.kminmax_invdir);
+ child_tkexit = mul4(sub4(set4(pcenter[K]), fi.kminmax_orig), fi.kminmax_invdir);
+                tc_K = axis_table[K];
+ child_cell[K] = cell[K] | child_bit; + #if DBGP
+ cerr << "kenter = cell[K]+depth_bit =" << cell[K] +(child_bit<<1) << endl;
+ cerr << "kexit = pcenter[K] =" << pcenter[K] << endl;
+                #endif
+            }
+        }
+        +        #if DBGP
+        cerr << "child_tkenter "; simd_cerr(child_tkenter);
+        cerr << "child_tkexit "; simd_cerr(child_tkexit);
+ cerr << "child_pkenter[K] = "; simd_cerr( mul4(oneOver (fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkenter)) );
+ cerr << "child_pkexit[K] = "; simd_cerr( mul4(oneOver (fi.kminmax_invdir), add4(fi.kminmax_orig, child_tkexit)) );
+        #endif
+     +        //we have child_tkenter, child_tkexit.
+        if (_mm_movemask_ps(cmp4_ge(child_tkexit, zero4()))==0)
+        {
+            #if DBGP
+            cerr << "texit was negative; continuing." << endl;
+            #endif
+            continue;
+        }
+         +        //find pkenter_uvminmax, pkexit[u], pkexit[v]
+        //child_pkenter = dir*(orig/dir) + dir*t = dir(orig+t)
+ const sse_t child_pkenter_uvminmax = add4 (fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkenter));
+ const sse_t child_pkexit_uvminmax = add4 (fi.uvminmax_orig, mul4(fi.uvminmax_dir, child_tkexit));
+        +        #if DBGP
+ cerr << "child_pkenter_uvminmax "; simd_cerr (child_pkenter_uvminmax);
+ cerr << "child_pkexit_uvminmax "; simd_cerr (child_pkexit_uvminmax); + #endif
+
+        sse_union tmp_min, tmp_max;
+ tmp_min.sse = min4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+ tmp_max.sse = max4(child_pkenter_uvminmax, child_pkexit_uvminmax);
+        +        #if DBGP
+        cerr << "tmp_min "; simd_cerr(tmp_min.sse);
+        cerr << "tmp_max "; simd_cerr(tmp_max.sse);
+        #endif
+ + const float umin = MIN(tmp_min.f[3], tmp_min.f [1]);
+        const float vmin = MIN(tmp_min.f[2], tmp_min.f[0]);
+        const float umax = MAX(tmp_max.f[3], tmp_max.f[1]);
+        const float vmax = MAX(tmp_max.f[2], tmp_max.f[0]);
+        sse_t sse_fuvminmax = set44(umin, vmin, umax, vmax);
+        +        #if DBGP
+ cerr << "sse_fuvminmax (before clamp) = "; simd_cerr (sse_fuvminmax);
+        #endif
+ + sse_fuvminmax = sub4(sse_fuvminmax, set44(cell[U], cell[V], cell[U], cell[V]));
+ sse_fuvminmax = mul4(sse_fuvminmax, set4(octdata- >get_inv_child_bit_depth(depth)));
+ sse_fuvminmax = max4(sse_fuvminmax, set44(0.0f, 0.0f, -9.9e9999f, -9.9e9999f));
+ sse_fuvminmax = min4(sse_fuvminmax, set44(9.9e9999f, 9.9e9999f, 1.0f, 1.0f));
+       +        #if DBGP
+ cerr << "sse_fuvminmax (after clamp) = "; simd_cerr (sse_fuvminmax);
+        #endif
+        +        sse_int_union iuvminmax;
+       +        //convert to int
+        iuvminmax.ssei = _mm_cvttps_epi32(sse_fuvminmax);
+        +        #if DBGP
+ cerr << "iuvminmax = " << iuvminmax.i[0] << ", " << iuvminmax.i[1] << ", " << iuvminmax.i[2] << ", " << iuvminmax.i[3];
+        cerr << endl << endl;
+        #endif
+ + for(int u= (DU==1 ? iuvminmax.i[3] : iuvminmax.i[1]); (DU==1 ? u <= iuvminmax.i[1] : u >= iuvminmax.i [3]); u += DU)
+        {
+            int tc_U;
+            if (u)
+            {
+                tc_U = axis_table[U];
+                child_cell[U] = cell[U] | child_bit;
+            }
+            else
+            {
+                tc_U = 0;
+                child_cell[U] = cell[U];   +            }
+
+ for(int v= (DV==1 ? iuvminmax.i[2] : iuvminmax.i [0]); (DV==1 ? v <= iuvminmax.i[0] : v >= iuvminmax.i[2]); v += DV)
+            {               +                int tc_V;
+                if (v)
+                {
+                    tc_V = axis_table[V];
+                    child_cell[V] = cell[V] | child_bit;
+                }
+                else
+                {
+                    tc_V = 0;
+                    child_cell[V] = cell[V];   +                }
+ + int target_child = tc_K | tc_U | tc_V;
+ sse_t child_tenter [RayPacket::SSE_MaxSize];
                sse_t child_texit[RayPacket::SSE_MaxSize];
                sse_t hitmask[RayPacket::SSE_MaxSize];

Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.h
==================================================================== ==========
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.h    (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.h Mon Jun 26 01:04:00 2006
@@ -58,20 +58,42 @@
                          const float texit) const;
                    #ifdef MANTA_SSE
- void packet_intersect_implicit_bvh(RayPacket& rays) const;
+            struct FrustumInterval
+            {
+                //NOTE: these are all in the order umin, vmin, umax, vmax.
+                // But specifically, that order is defined by orig and dir.
+                sse_t uvminmax_orig;
+                sse_t uvminmax_dir;
+                sse_t uvminmax_invdir;
+
+                sse_t kminmax_orig; //kmin_xforig, kmin_xforig, kmax_xforig, kmax_xforig
+                sse_t kminmax_dir;
+                sse_t kminmax_invdir; //kmin_invdir, kmin_invdir, kmax_invdir, kmax_invdir
+            };
+
+            void packet_intersect_sse(RayPacket& rays) const;
- void bvh_octnode(SSERayPacket& srp, char first, char last,
- const Vec3i& cell, int stop_depth, int depth, unsigned int index, - unsigned int index_trace[]) const;
+            template<char K, char U, char V, char DK>
+ void sse_traverse(SSERayPacket& srp, char first, char last, + char DU, char DV) const;
- void bvh_octleaf (SSERayPacket& srp, char first, char last, - const Vec3i& cell, int stop_depth, int depth, - int leaf_depth, ST leaf_value, const Vec3i& leaf_base_cell,
-                            unsigned int index_trace[]) const;
- - void bvh_octcap(SSERayPacket& srp, char first, char last,
- const Vec3i& cell, int stop_depth, int depth, unsigned int index, + template<char K, char U, char V, char DK>
+ void IsosurfaceOctreeVolume::sse_traverse_node (SSERayPacket& srp, + char first, char last, char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, + unsigned int index_trace[]) const;
+ + template<char K, char U, char V, char DK>
+ void sse_traverse_leaf(SSERayPacket& srp, char first, char last, char DU, char DV,
+ const FrustumInterval& fi, const Vec3i& cell, int stop_depth, int depth, + int leaf_depth, ST leaf_value, const Vec3i& leaf_base_cell,
                unsigned int index_trace[]) const;
+ + template<char K, char U, char V, char DK>
+ void IsosurfaceOctreeVolume::sse_traverse_cap (SSERayPacket& srp, + char first, char last, char DU, char DV, const FrustumInterval& fi,
+ Vec3i& cell, char stop_depth, char depth, unsigned int index, + unsigned int index_trace[]) const; inline char first_intersects (SSERayPacket& srp, char first, char last, const Vector& min, const Vector& max) const

Modified: trunk/Model/Primitives/OctreeVolume.h
==============================================================================
--- trunk/Model/Primitives/OctreeVolume.h    (original)
+++ trunk/Model/Primitives/OctreeVolume.h Mon Jun 26 01:04:00 2006
@@ -217,6 +217,12 @@
            return child_bit_depth[d];
        }

+        inline float get_inv_child_bit_depth(int d) const
+        {
+            return inv_child_bit_depth[d];
+        }
+
+
         inline int get_max_depth() const
        {
            return max_depth;







Archive powered by MHonArc 2.6.16.

Top of page