Manta Interactive Ray Tracer Development Mailing List

Text archives Help


[MANTA] r1099 - in trunk: Core Core/Math Interface Model/Groups Model/Intersections Model/Primitives


Chronological Thread 
  • From: knolla@sci.utah.edu
  • To: manta@sci.utah.edu
  • Subject: [MANTA] r1099 - in trunk: Core Core/Math Interface Model/Groups Model/Intersections Model/Primitives
  • Date: Wed, 7 Jun 2006 09:06:16 -0600 (MDT)

Author: knolla
Date: Wed Jun  7 09:06:10 2006
New Revision: 1099

Added:
   trunk/Core/Math/SSEDefs.h
Modified:
   trunk/Core/CMakeLists.txt
   trunk/Interface/RayPacket.h
   trunk/Model/Groups/SSEKDTree.cc
   trunk/Model/Intersections/IsosurfaceImplicit.cc
   trunk/Model/Intersections/IsosurfaceImplicit.h
   trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
   trunk/Model/Primitives/IsosurfaceOctreeVolume.h
   trunk/Model/Primitives/OctreeVolume.h
Log:
implemented first pass at SSE octree using implicit BVH. Slow and buggy.

Modified: trunk/Core/CMakeLists.txt
==============================================================================
--- trunk/Core/CMakeLists.txt   (original)
+++ trunk/Core/CMakeLists.txt   Wed Jun  7 09:06:10 2006
@@ -48,6 +48,7 @@
      Math/Noise.cc
      Math/ipow.h
      Math/CatmullRomInterpolator.h
+     Math/SSEDefs.h
      )
 SET (CORE_SOURCES ${CORE_SOURCES}
      Util/Args.h

Added: trunk/Core/Math/SSEDefs.h
==============================================================================
--- (empty file)
+++ trunk/Core/Math/SSEDefs.h   Wed Jun  7 09:06:10 2006
@@ -0,0 +1,251 @@
+// SSEDefs.h
+// A comprehensive set of macros for SSE
+
+#ifndef _MANTA_SSEDEFS_H_
+#define _MANTA_SSEDEFS_H_
+
+#ifdef MANTA_SSE
+#include <cstring>
+#include <limits>
+#include <xmmintrin.h>
+#include <Core/Util/Align.h>
+#include <Core/Geometry/vecdefs.h>
+#include <Core/Geometry/Vector.h>
+
+// Short names for the 4-wide SSE float and integer vector types.
+typedef __m128 sse_t;
+typedef __m128i sse_int_t;
+
+//add to these macros as necessary.
+// Each macro below is a plain alias for the intrinsic on its right; the
+// names ending in "i" alias the integer (__m128i) variants.
+#define or4 _mm_or_ps
+#define or4i _mm_or_si128
+#define and4 _mm_and_ps
+#define and4i _mm_and_si128
+#define andnot4 _mm_andnot_ps
+#define andnot4i _mm_andnot_si128
+#define mul4 _mm_mul_ps
+#define add4 _mm_add_ps
+#define sub4 _mm_sub_ps
+#define min4 _mm_min_ps
+#define max4 _mm_max_ps
+// set4 / set4i take one value; set44 / set44i take four values.
+#define set4 _mm_set_ps1
+#define set44 _mm_set_ps
+#define set4i _mm_set1_epi32
+#define set44i _mm_set_epi32
+#define zero4 _mm_setzero_ps
+#define getmask4 _mm_movemask_ps
+// Per-lane comparisons.
+#define cmp4_ge _mm_cmpge_ps
+#define cmp4_le _mm_cmple_ps
+#define cmp4_gt _mm_cmpgt_ps
+#define cmp4_lt _mm_cmplt_ps
+#define cmp4_eq _mm_cmpeq_ps
+
+//AARONBAD - this should really be somewhere in Core/Math
+// Scalar MIN/MAX, defined only if not already provided. These are macros,
+// so an argument may be evaluated more than once.
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+#ifndef MAX
+#define MAX(a,b) (((a)>(b))?(a):(b))
+#endif
+
+namespace Manta
+{
+    static const MANTA_ALIGN(16) sse_t _mm_eps = _mm_set_ps1(1e-5);
+    static const MANTA_ALIGN(16) sse_t _mm_minus_eps = _mm_set_ps1(-1e-5);
+    static const MANTA_ALIGN(16) sse_t _mm_epsilon = _mm_set_ps1(1e-5);
+    static const MANTA_ALIGN(16) sse_t _mm_one = _mm_set_ps1(1.f);
+    static const MANTA_ALIGN(16) sse_t _mm_zero = _mm_set_ps1(0.f);
+    static const MANTA_ALIGN(16) sse_t _mm_one_half = _mm_set_ps1(0.5f);
+    static const MANTA_ALIGN(16) sse_t _mm_two = _mm_set_ps1(2.f);
+    static const MANTA_ALIGN(16) sse_t _mm_256 = _mm_set_ps1(256);
+    static const MANTA_ALIGN(16) sse_t _mm_255 = _mm_set_ps1(255);
+    static const MANTA_ALIGN(16) sse_t _mm_infty = _mm_set_ps1(9.9e9999f);
+    static const MANTA_ALIGN(16) sse_t _mm_minus_infty = 
_mm_set_ps1(-9.9e9999f);
+    static const int _mm_intabsmask = 0x7fffffff;
+    static const int _mm_intsignbit = 0x80000000;
+    static const int _mm_inttruemask = 0xffffffff;
+    static const MANTA_ALIGN(16) sse_t _mm_absmask = 
_mm_set_ps1((float&)_mm_intabsmask);
+    static const MANTA_ALIGN(16) sse_t _mm_signbit = 
_mm_set_ps1((float&)_mm_intsignbit);
+    static const MANTA_ALIGN(16) sse_t _mm_true = 
_mm_set_ps1((float&)_mm_inttruemask);
+    static const int minusOneI = -1;
+    static const MANTA_ALIGN(16) sse_t _mm_minusOne = _mm_set_ps1((float 
&)minusOneI);
+
+    /*! per-lane linear interpolation: v0 + t*(v1-v0) */
+    inline sse_t lerp4(const sse_t t, const sse_t v0, const sse_t v1)
+    {
+      const sse_t span = _mm_sub_ps(v1, v0);
+      const sse_t step = _mm_mul_ps(t, span);
+      return _mm_add_ps(v0, step);
+    }
+
+    inline sse_t dot4(const sse_t &ox, const sse_t &oy, const sse_t &oz,
+                      const sse_t &vx, const sse_t &vy, const sse_t &vz)
+    {
+      return _mm_add_ps(_mm_add_ps(_mm_mul_ps(vx,ox),
+                                   _mm_mul_ps(vy,oy)),
+                        _mm_mul_ps(vz,oz));
+    }
+
+    // Bitwise select: takes the bits of dest where mask is set and the bits
+    // of src where it is clear (mask ? dest : src).
+    inline sse_t mask4(const sse_t &mask, const sse_t &dest, const sse_t &src)
+    {
+        const sse_t from_dest = _mm_and_ps(mask, dest);
+        const sse_t from_src = _mm_andnot_ps(mask, src);
+        return _mm_or_ps(from_dest, from_src);
+    }
+
+    // Inverse bitwise select: takes the bits of dest where mask is clear and
+    // the bits of src where it is set (mask ? src : dest).
+    inline sse_t masknot4(const sse_t &mask, const sse_t &dest, const sse_t &src)
+    {
+        const sse_t from_dest = _mm_andnot_ps(mask, dest);
+        const sse_t from_src = _mm_and_ps(mask, src);
+        return _mm_or_ps(from_dest, from_src);
+    }
+
+    // Integer variant of mask4: bits of dest where mask is set, bits of src
+    // where it is clear.
+    inline sse_int_t mask4i(const sse_int_t &mask, const sse_int_t &dest, const sse_int_t &src)
+    {
+      const sse_int_t from_dest = _mm_and_si128(mask, dest);
+      const sse_int_t from_src = _mm_andnot_si128(mask, src);
+      return _mm_or_si128(from_dest, from_src);
+    }
+
+    // Per-lane absolute value, obtained by clearing the sign bit.
+    inline sse_t abs4(const sse_t &v)
+    {
+      // (~signbit) & v
+      const sse_t magnitude = _mm_andnot_ps(_mm_signbit, v);
+      return magnitude;
+    }
+
+    // Per-lane reciprocal: the _mm_rcp_ps estimate r refined once as
+    // 2*r - r*r*v.
+    inline sse_t accurateReciprocal(const sse_t v)
+    {
+      const sse_t estimate = _mm_rcp_ps(v);
+      const sse_t doubled = add4(estimate, estimate);
+      const sse_t correction = mul4(mul4(estimate, estimate), v);
+      return sub4(doubled, correction);
+    }
+
+    // Per-lane reciprocal: the _mm_rcp_ps estimate refined three times with
+    // the same 2*r - r*r*v step used by accurateReciprocal.
+    inline sse_t uberAccurateReciprocal(const sse_t v)
+    {
+        sse_t r = _mm_rcp_ps(v);
+        for (int pass = 0; pass < 3; ++pass)
+            r = sub4(add4(r, r), mul4(mul4(r, r), v));
+        return r;
+    }
+
+    // Per-lane 1/v; same computation as accurateReciprocal (estimate plus
+    // one refinement step), so simply forward to it.
+    inline sse_t oneOver(const sse_t v)
+    {
+      return accurateReciprocal(v);
+    }
+
+    // Per-lane 1/sqrt(v): the _mm_rsqrt_ps estimate y refined once as
+    // y * (1.5 - 0.5 * y*y*v).
+    inline sse_t accurateReciprocalSqrt(const sse_t v)
+    {
+      const sse_t y = _mm_rsqrt_ps(v);
+      const sse_t yyv = mul4(mul4(y, y), v);
+      const sse_t half_yyv = mul4(_mm_one_half, yyv);
+      const sse_t factor = sub4(_mm_set_ps1(1.5f), half_yyv);
+      return mul4(y, factor);
+    }
+
+
+    // Per-lane reciprocal straight from _mm_rcp_ps, with no refinement step
+    // (compare accurateReciprocal, which refines the same estimate).
+    inline sse_t reciprocal(const sse_t v)
+    {
+      return _mm_rcp_ps(v);
+    }
+
+    // Returns base + u*du + v*dv per lane, with the scalars u and v
+    // broadcast to all four lanes.
+    inline sse_t lin(const sse_t &base,
+                     float u,const sse_t du,
+                     float v, const sse_t dv)
+    {
+      const sse_t u_term = mul4(set4(u), du);
+      const sse_t v_term = mul4(set4(v), dv);
+      return add4(add4(base, u_term), v_term);
+    }
+
+    // Returns base + u*du + v*dv per lane, with per-lane u and v.
+    inline sse_t lin(const sse_t &base,
+                     const sse_t &u,const sse_t du,
+                     const sse_t &v, const sse_t dv)
+    {
+      const sse_t u_term = mul4(u, du);
+      const sse_t v_term = mul4(v, dv);
+      return add4(add4(base, u_term), v_term);
+    }
+
+
+    // Horizontal dot product of two 4-component vectors: multiplies a and b
+    // lane-wise and sums all four products. Every lane of the result holds
+    // the same sum. All four lanes contribute, including the fourth.
+    inline sse_t dot4(const sse_t &a, const sse_t &b)
+    {
+      const sse_t xyzw = _mm_mul_ps(a,b);
+      // Swap the low and high halves so lanes 0/2 and 1/3 can be added.
+      const sse_t zwxy = _mm_shuffle_ps(xyzw,xyzw,_MM_SHUFFLE(1,0,3,2));
+      const sse_t xz_yw_zx_wy = _mm_add_ps(zwxy,xyzw);
+      // Reverse the lane order so each lane meets the other partial sum.
+      const sse_t wy_zx_yw_xz = 
+_mm_shuffle_ps(xz_yw_zx_wy,xz_yw_zx_wy,_MM_SHUFFLE(0,1,2,3));
+      const sse_t res = _mm_add_ps(xz_yw_zx_wy,wy_zx_yw_xz);
+      return res;
+    }
+
+    // Scalar dot product of two 4-component vectors (all four lanes are
+    // summed). The low lane is extracted with _mm_cvtss_f32 rather than a
+    // (float&) cast, which cast away const and punned the vector type.
+    inline float dot1(const sse_t &a, const sse_t &b)
+    {
+      const sse_t d = dot4(a,b);
+      return _mm_cvtss_f32(d);
+    }
+
+    // Scales v in place by the reciprocal square root of dot4(v,v).
+    // _mm_rsqrt_ps is an approximation, so the result is only approximately
+    // unit length. All four lanes enter the dot product.
+    // (The stray ';' that followed the closing brace has been removed.)
+    inline void normalize(sse_t &v)
+    {
+      const sse_t dot = dot4(v,v);
+      v = _mm_mul_ps(v, _mm_rsqrt_ps(dot));
+    }
+
+    // Squared length of a: the dot product of a with itself, summed over
+    // all four lanes.
+    inline float sqrLength(const sse_t &a)
+    {
+      return dot1(a,a);
+    }
+
+    // Length of a: square root of dot4(a,a), summed over all four lanes.
+    // The low lane is extracted with _mm_cvtss_f32 rather than a (float&)
+    // cast, which cast away const and punned the vector type.
+    inline float length(sse_t a)
+    {
+      const sse_t d = dot4(a,a);
+      const sse_t v = _mm_sqrt_ps(d);
+      return _mm_cvtss_f32(v);
+    }
+
+    /*! horizontal minimum over all four lanes */
+    inline float min4f(sse_t t)
+    {
+      MANTA_ALIGN(16)
+      float lanes[4];
+      _mm_store_ps(lanes,t);
+      const float lo01 = MIN(lanes[0],lanes[1]);
+      const float lo23 = MIN(lanes[2],lanes[3]);
+      return MIN(lo01,lo23);
+    }
+
+    /*! horizontal maximum over all four lanes */
+    inline float max4f(sse_t t)
+    {
+      MANTA_ALIGN(16)
+      float lanes[4];
+      _mm_store_ps(lanes,t);
+      const float hi01 = MAX(lanes[0],lanes[1]);
+      const float hi23 = MAX(lanes[2],lanes[3]);
+      return MAX(hi01,hi23);
+    }
+
+    /*! horizontal minimum over lanes 0..2; lane 3 is ignored */
+    inline float min3f(sse_t t)
+    {
+      MANTA_ALIGN(16)
+      float lanes[4];
+      _mm_store_ps(lanes,t);
+      const float lo01 = MIN(lanes[0],lanes[1]);
+      return MIN(lo01,lanes[2]);
+    }
+
+    /*! horizontal maximum over lanes 0..2; lane 3 is ignored */
+    inline float max3f(sse_t t)
+    {
+      MANTA_ALIGN(16)
+      float lanes[4];
+      _mm_store_ps(lanes,t);
+      const float hi01 = MAX(lanes[0],lanes[1]);
+      return MAX(hi01,lanes[2]);
+    }
+
+    // Returns lane `offset` of t by storing the vector to a temporary array.
+    // offset indexes a 4-element array directly; no range check is done, so
+    // callers must pass 0..3.
+    inline float simd_component(sse_t t, int offset)
+    {  
+        MANTA_ALIGN(16)
+        float f[4];
+        _mm_store_ps(f,t);
+        return f[offset];
+    }
+
+    // Builds a Vec3f from lanes 2, 1, 0 of t (in that argument order);
+    // lane 3 is ignored.
+    // NOTE(review): the reversed lane order presumably matches vectors built
+    // with set44(w, x, y, z), whose last argument lands in lane 0 -- confirm.
+    inline Vec3f as_Vec3f(sse_t t)
+    {
+        MANTA_ALIGN(16)
+        float f[4];
+        _mm_store_ps(f,t);
+        return Vec3f(f[2], f[1], f[0]);
+    }
+    
+    // Builds a Vector from lanes 2, 1, 0 of t (in that argument order);
+    // lane 3 is ignored.
+    // NOTE(review): the reversed lane order presumably matches vectors built
+    // with set44(w, x, y, z), whose last argument lands in lane 0 -- confirm.
+    inline Vector as_Vector(sse_t t)
+    {
+        MANTA_ALIGN(16)
+        float f[4];
+        _mm_store_ps(f,t);
+        return Vector(f[2], f[1], f[0]);
+    }
+
+};
+
+#endif  //#ifdef MANTA_SSE
+#endif
\ No newline at end of file

Modified: trunk/Interface/RayPacket.h
==============================================================================
--- trunk/Interface/RayPacket.h (original)
+++ trunk/Interface/RayPacket.h Wed Jun  7 09:06:10 2006
@@ -14,6 +14,10 @@
 #include <RayPacketParameters.h>
 #include <MantaSSE.h>
 
+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif
+
 // #include <sgi_stl_warnings_off.h>
 // #include <algorithm>
 // #include <sgi_stl_warnings_on.h>
@@ -22,12 +26,14 @@
   class Material;
   class RenderContext;
 
-
   class MANTA_ALIGN(16) RayPacketData {
   public:
     enum {
       MaxScratchpadSize = SCRATCHPAD_MAXSIZE,
-      MaxSize              = RAYPACKET_MAXSIZE
+      MaxSize              = RAYPACKET_MAXSIZE,
+#ifdef MANTA_SSE      
+      SSE_MaxSize   = RAYPACKET_MAXSIZE/4,
+#endif      
     };
     RayPacketData()
       {
@@ -55,10 +61,10 @@
     MANTA_ALIGN(16) Real inverseDirection[3][MaxSize];
     MANTA_ALIGN(16) Real minT[MaxSize];
 
-    Real image[2][MaxSize];
-    Real normal[3][MaxSize];
-    Real hitPosition[3][MaxSize];
-    Real texCoords[3][MaxSize];
+    MANTA_ALIGN(16) Real image[2][MaxSize];
+    MANTA_ALIGN(16) Real normal[3][MaxSize];
+    MANTA_ALIGN(16) Real hitPosition[3][MaxSize];
+    MANTA_ALIGN(16) Real texCoords[3][MaxSize];
 
 
     // Color-based arrays
@@ -72,11 +78,26 @@
     // Char-based arrays
     char scratchpad_data[MaxSize][MaxScratchpadSize];
   };
+  
+#ifdef MANTA_SSE
+  // Bundle of sse_t pointers used to address a ray packet's per-ray arrays
+  // four rays at a time. The struct owns nothing; the SSE octree traversal
+  // fills it by casting the RayPacketData arrays (origin, direction,
+  // inverseDirection, normal, minT) to sse_t*.
+  struct MANTA_ALIGN(16) SSERayPacket
+  {
+    sse_t* orig[3];     // ray origins, one pointer per axis
+    sse_t* dir[3];      // ray directions, one pointer per axis
+    sse_t* inv_dir[3];  // inverse directions, one pointer per axis
+    sse_t* normal[3];   // hit normals, one pointer per axis
+    sse_t* minT;        // hit distances
+  };
+#endif  
 
   class RayPacket {
   public:
     enum {
       MaxSize               = RayPacketData::MaxSize,
+      
+#ifdef MANTA_SSE
+      SSE_MaxSize           = RayPacketData::SSE_MaxSize,
+#endif
 
       // Flags.
       ConstantOrigin        = 0x0001,
@@ -280,9 +301,22 @@
     {
       if(flags & HaveInverseDirections)
         return;
+#if MANTA_SSE
+      int b = (rayBegin + 3) & (~3);
+      int e = rayEnd & (~3);
+#pragma unroll(3)
+      for(int j=0; j<3; j++)
+      {
+        sse_t* dirs = (sse_t*)data->direction[j];
+        sse_t* inv_dirs = (sse_t*)data->inverseDirection[j];
+        for(int smd=b; smd<e; smd++)
+          inv_dirs[smd] = oneOver(dirs[smd]);
+      }
+#else
       for(int i=rayBegin;i<rayEnd;i++)
         for(int j=0;j<3;j++)
           data->inverseDirection[j][i] = 1./data->direction[j][i];
+#endif
       flags |= HaveInverseDirections;
     }
     void computeSigns()

Modified: trunk/Model/Groups/SSEKDTree.cc
==============================================================================
--- trunk/Model/Groups/SSEKDTree.cc     (original)
+++ trunk/Model/Groups/SSEKDTree.cc     Wed Jun  7 09:06:10 2006
@@ -48,6 +48,8 @@
 
 #include <stdio.h>
 
+#include <Core/Math/SSEDefs.h>
+
 using namespace Manta;
 using namespace Manta::Kdtree;
 using namespace SCIRun;
@@ -551,11 +553,14 @@
 }
 
 // Newton-Raphson Iteration for 1/x
+/*
+AARONBAD - now implemented in Core/Math/SSEDefs.h
 inline __m128 accurateReciprocal(const __m128& v) {
 
   const __m128 rcp = _mm_rcp_ps(v);
   return _mm_sub_ps(_mm_add_ps(rcp, rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
 }
+*/
 
 void intersectTriangleEdgeSSE( IntersectPacket* result,
                                const RayPacket& rays,

Modified: trunk/Model/Intersections/IsosurfaceImplicit.cc
==============================================================================
--- trunk/Model/Intersections/IsosurfaceImplicit.cc     (original)
+++ trunk/Model/Intersections/IsosurfaceImplicit.cc     Wed Jun  7 09:06:10 
2006
@@ -1,8 +1,10 @@
 
 #include <Model/Intersections/IsosurfaceImplicit.h>
 #include <Core/Math/CubicSolver.h>
+#include <iostream>
 
 using namespace Manta;
+using namespace std;
 
 //From Steven Parker's 1997 RTRT isosurface intersection
 bool IsosurfaceImplicit::single_intersect(const Vector& orig, const Vector& 
dir, 
@@ -112,4 +114,153 @@
             + (x-x_0)*(y-y_0)*rho[1][1][1];
 }
 
+
+#ifdef MANTA_SSE
+//SSE packet implementation
+//Based on Marmitt et al. 04, Wald 05 SSE intersections (OpenRT)
+//  as well as Knoll DynRT implementation
+void IsosurfaceImplicit::sse_intersect(RayPacket& rays, SSERayPacket& srp, 
+            char first, char last, const Vector& pmin, const Vector& pmax, 
float rho[2][2][2], 
+            float isovalue, sse_t tenter[], sse_t texit[], sse_t hitmask[], 
+            const Manta::Primitive* prim, const Manta::Material* matl)
+{
+    //cerr << "sse_intersect: first=" << (int)first << ",last=" << (int)last 
<< endl;
+
+    for(int smd=first; smd<last; smd++)
+    {
+        if (_mm_movemask_ps(hitmask[smd])==0)
+            continue;
+    
+        //compute p0, p1
+        sse_t p0[3];
+        sse_t p1[3];
+        
+        #pragma unroll(3)                      
+        for(int axis=0; axis<3; axis++)
+        {                      
+            p0[axis] = sub4(add4(srp.orig[axis][smd], 
mul4(srp.dir[axis][smd], tenter[smd])), set4(pmin[axis]));
+            p1[axis] = sub4(add4(srp.orig[axis][smd], 
mul4(srp.dir[axis][smd], texit[smd])), set4(pmin[axis]));
+        }
+        
+        CubicPoly4 poly;
+        poly.generate(p0, p1, rho, isovalue);
+
+        sse_t t0 = zero4();                                        
+        sse_t t1 = _mm_one;
+        sse_t D0 = poly.d;
+        sse_t D1 = add4(add4(poly.a,poly.b), add4(poly.c,poly.d));
+        
+        //AARONBAD - we'd want something like this to avoid extra work
+        //sse_t sse_thisvoxelmask = and4(hitmask[smd], cmp4_lt(tenter[smd], 
srp.minT[smd]));
+
+        //find which rays have differing signs for D0, D1. Only retain the 
ones that have same signs?
+        sse_t differingSigns = cmp4_lt(mul4(D0,D1), zero4());
+        sse_t sse_thisvoxelmask = and4(hitmask[smd], differingSigns);
+        int int_thisvoxelmask = _mm_movemask_ps(sse_thisvoxelmask);
+        
+        if (int_thisvoxelmask == 0)    //if none of them hit, don't bother 
iterating any more
+            continue;
+                    
+        #define NEUBAUER_ITERATIONS 3                    
+        #pragma unroll(NEUBAUER_ITERATIONS)
+        for (int i=0;i<NEUBAUER_ITERATIONS;i++)
+        {
+            //compute linear interpolation
+            const sse_t denom = accurateReciprocal(sub4(D0,D1));
+            sse_t t = add4(t0,mul4(mul4(D0,denom), sub4(t1,t0)));
+    
+            //re-evaluate
+            sse_t D = poly.eval(t);
+    
+            //conditionally store
+            const sse_t frontHalf = _mm_cmplt_ps(mul4(D0,D), zero4());
+            t1 = or4(_mm_and_ps(frontHalf,t), _mm_andnot_ps(frontHalf,t1));
+            t0 = or4(_mm_and_ps(frontHalf,t0), _mm_andnot_ps(frontHalf,t));
+            D1 = or4(_mm_and_ps(frontHalf,D), _mm_andnot_ps(frontHalf,D1));
+            D0 = or4(_mm_and_ps(frontHalf,D0), _mm_andnot_ps(frontHalf,D));
+        }
+
+        //compute hit distance
+        const sse_t denom = accurateReciprocal(sub4(D0,D1));
+        sse_t t = add4(t0, mul4(mul4(D0,denom), sub4(t1,t0)));
+        sse_t hit_t = add4(tenter[smd], mul4(t, sub4(texit[smd], 
tenter[smd])));
+            
+        //sse_thisvoxelmask = and4(sse_thisvoxelmask, 
cmp4_lt(hit_t,srp.minT[smd]));
+        srp.minT[smd] = mask4(sse_thisvoxelmask, hit_t, srp.minT[smd]);
+        int_thisvoxelmask = _mm_movemask_ps(sse_thisvoxelmask);
+        if (int_thisvoxelmask)
+        {
+            sse_t normal[3];
+            sse_normal(rays, srp, smd, normal, pmin, pmax, rho);
+            #pragma unroll(3)
+            for(int axis=0; axis<3; axis++)
+                srp.normal[axis][smd] = mask4(sse_thisvoxelmask, 
normal[axis], srp.normal[axis][smd]);
+                
+            #pragma unroll(4);
+            for(int ray=0; ray<4; ray++)
+            {
+                if (int_thisvoxelmask & (1<<ray))
+                {
+                    int realray=(smd<<2)+ray;
+                    rays.data->hitMatl[realray] = matl;
+                    rays.data->hitPrim[realray] = prim;
+                }
+            }
+        }
+    }
+}
+            
+// Computes an (unnormalized) normal for the four rays of SIMD group smd.
+// The hit point is orig + dir * srp.minT[smd]; each normal component is the
+// sum over the four cell edges parallel to that axis of the corner-value
+// difference along the edge, weighted by products of the hit point's
+// distances to pmin/pmax on the other two axes (U and V).
+// The `ray` parameter is not referenced in the body.
+void IsosurfaceImplicit::sse_normal(RayPacket &ray, SSERayPacket& srp, int 
smd, 
+            sse_t normal[], const Vector& pmin, const Vector& pmax,
+            const float rho[2][2][2])
+{
+    // Hit position per axis, from the hit distance stored in srp.minT.
+    sse_t phit[3];
+    #pragma unroll(3)
+    for(int axis=0; axis<3; axis++)
+        phit[axis] = add4(srp.orig[axis][smd], mul4(srp.dir[axis][smd], 
srp.minT[smd]));
+
+    int axis, U, V;    
+    
+    //axis=0
+    // The min_m_hit_* terms are (pmin - phit); the rho differences paired
+    // with an odd number of them are written with swapped operands.
+    axis=0;
+    U=1;
+    V=2;
+    sse_t max_m_hit_U = sub4(set4(pmax[U]), phit[U]);
+    sse_t max_m_hit_V = sub4(set4(pmax[V]), phit[V]);
+    sse_t min_m_hit_U = sub4(set4(pmin[U]), phit[U]);
+    sse_t min_m_hit_V = sub4(set4(pmin[V]), phit[V]);
+    normal[axis] = mul4(mul4(max_m_hit_U, max_m_hit_V), set4(rho[1][0][0] - 
rho[0][0][0]));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, max_m_hit_V), 
set4(rho[0][1][0] - rho[1][1][0])));
+    normal[axis] = add4(normal[axis], mul4(mul4(max_m_hit_U, min_m_hit_V), 
set4(rho[0][0][1] - rho[1][0][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, min_m_hit_V), 
set4(rho[1][1][1] - rho[0][1][1])));
+    
+    //axis=1
+    axis=1;
+    U=0;
+    V=2;
+    max_m_hit_U = sub4(set4(pmax[U]), phit[U]);
+    max_m_hit_V = sub4(set4(pmax[V]), phit[V]);
+    min_m_hit_U = sub4(set4(pmin[U]), phit[U]);
+    min_m_hit_V = sub4(set4(pmin[V]), phit[V]);
+    normal[axis] = mul4(mul4(max_m_hit_U, max_m_hit_V), set4(rho[0][1][0] - 
rho[0][0][0]));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, max_m_hit_V), 
set4(rho[1][0][0] - rho[1][1][0])));
+    normal[axis] = add4(normal[axis], mul4(mul4(max_m_hit_U, min_m_hit_V), 
set4(rho[0][0][1] - rho[0][1][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, min_m_hit_V), 
set4(rho[1][1][1] - rho[1][0][1])));
+    
+    //axis=2
+    axis=2;
+    U=0;
+    V=1;
+    max_m_hit_U = sub4(set4(pmax[U]), phit[U]);
+    max_m_hit_V = sub4(set4(pmax[V]), phit[V]);
+    min_m_hit_U = sub4(set4(pmin[U]), phit[U]);
+    min_m_hit_V = sub4(set4(pmin[V]), phit[V]);
+    normal[axis] = mul4(mul4(max_m_hit_U, max_m_hit_V), set4(rho[0][0][1] - 
rho[0][0][0]));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, max_m_hit_V), 
set4(rho[1][0][0] - rho[1][0][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(max_m_hit_U, min_m_hit_V), 
set4(rho[0][1][0] - rho[0][1][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, min_m_hit_V), 
set4(rho[1][1][1] - rho[1][1][0])));
+
+}
+
+#endif 
 

Modified: trunk/Model/Intersections/IsosurfaceImplicit.h
==============================================================================
--- trunk/Model/Intersections/IsosurfaceImplicit.h      (original)
+++ trunk/Model/Intersections/IsosurfaceImplicit.h      Wed Jun  7 09:06:10 
2006
@@ -4,6 +4,13 @@
 #include <Interface/RayPacket.h>
 #include <Core/Geometry/Vector.h>
 #include <Interface/Material.h>
+#include <Interface/Primitive.h>
+
+#include <MantaSSE.h>
+
+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif
 
 namespace Manta
 {
@@ -15,6 +22,119 @@
                       
         static void single_normal(Vector& outNormal, const Vector& pmin, 
                       const Vector& pmax, const Vector& p, float 
rho[2][2][2]);
+                      
+        //TODO - non-SSE packet intersection              
+                      
+#ifdef MANTA_SSE
+        static void sse_intersect(RayPacket& rays, SSERayPacket& srp, 
+                    char first, char last, const Vector& pmin, const Vector& 
pmax, float rho[2][2][2], 
+                    float isovalue, sse_t tenter[], sse_t texit[], sse_t 
hitmask[], 
+                    const Manta::Primitive* prim, const Manta::Material* 
matl);
+                    
+        static void sse_normal(RayPacket &ray, SSERayPacket& srp, int smd, 
+                    sse_t normal[], const Vector& pmin, const Vector& pmax,
+                    const float rho[2][2][2]);
+                    
+        // Four cubic polynomials a*t^3 + b*t^2 + c*t + d, one per SSE lane.
+        // generate() fills the coefficients from the eight corner values of
+        // a cell and the ray segment p0 -> p1; eval() evaluates at t.
+        struct CubicPoly4
+        {
+            MANTA_ALIGN(16) sse_t a, b, c, d;          
+        
+            // p0, p1: per-axis segment end points (three sse_t each).
+            // voxels_cell: the eight corner values. isovalue is subtracted
+            // from the constant term d, so the roots of the polynomial are
+            // the isovalue crossings.
+            // NOTE(review): the (1 - p0) weights presumably assume p0/p1 are
+            // expressed in unit-cell coordinates -- confirm with callers.
+            inline void generate(sse_t p0[], sse_t p1[], const float 
voxels_cell[2][2][2], float isovalue)
+            {
+                sse_t e0[3];
+                sse_t e1[3];
+                sse_t d1[3];
+                
+                // e1 = p0, e0 = 1 - p0, d1 = p1 - p0 (per axis).
+                #pragma unroll(3)
+                for(int axis=0; axis<3; axis++)
+                {
+                    e0[axis] = sub4(_mm_one, p0[axis]);
+                    e1[axis] = p0[axis];
+                    d1[axis] = sub4(p1[axis], p0[axis]);
+                }
+
+                // Triple products, named by the factor used on axes 0,1,2:
+                // R = d1, N = e1, O = e0. interimROO is reused as scratch
+                // for the shared two-factor product of each group and only
+                // holds the true R*O*O product after the last assignment.
+                sse_t interimROO = mul4(d1[1], d1[2]);
+                const sse_t interimRRR = mul4(d1[0], interimROO);
+                const sse_t interimNRR = mul4(e1[0], interimROO);
+                const sse_t interimORR = mul4(e0[0], interimROO);
+                
+                interimROO = mul4(e1[1], d1[2]);
+                const sse_t interimRNR = mul4(d1[0], interimROO);
+                const sse_t interimNNR = mul4(e1[0], interimROO);
+                const sse_t interimONR = mul4(e0[0], interimROO);
+                
+                interimROO = mul4(d1[1], e1[2]);
+                const sse_t interimRRN = mul4(d1[0], interimROO);
+                const sse_t interimNRN = mul4(e1[0], interimROO);
+                const sse_t interimORN = mul4(e0[0], interimROO);
+                
+                interimROO = mul4(d1[1], e0[2]);
+                const sse_t interimRRO = mul4(d1[0], interimROO);
+                const sse_t interimNRO = mul4(e1[0], interimROO);
+                const sse_t interimORO = mul4(e0[0], interimROO);
+                
+                interimROO = mul4(e0[1], d1[2]);
+                const sse_t interimROR = mul4(d1[0], interimROO);
+                const sse_t interimNOR = mul4(e1[0], interimROO);
+                const sse_t interimOOR = mul4(e0[0], interimROO);
+                
+                interimROO = mul4(d1[0], e1[1]);
+                const sse_t interimRNN = mul4(interimROO, e1[2]);
+                const sse_t interimRNO = mul4(interimROO, e0[2]);
+                
+                interimROO = mul4(d1[0], e0[1]);
+                const sse_t interimRON = mul4(interimROO, e1[2]);
+                interimROO = mul4(interimROO, e0[2]);
+                
+                // Cubic coefficient.
+                a = mul4(interimRRR, _mm_set_ps1(+ voxels_cell[1][1][1]
+                        - voxels_cell[0][1][1]
+                                - voxels_cell[1][0][1]
+                                + voxels_cell[0][0][1]
+                                - voxels_cell[1][1][0]
+                                + voxels_cell[0][1][0]
+                                + voxels_cell[1][0][0]
+                                - voxels_cell[0][0][0]));
+                
+                // Quadratic coefficient, accumulated one corner at a time.
+                b = 
mul4(_mm_set_ps1(voxels_cell[1][1][1]),add4(add4(interimNRR,interimRNR),interimRRN));
+                b = 
sub4(b,mul4(_mm_set_ps1(voxels_cell[1][1][0]),sub4(add4(interimNRR,interimRNR),interimRRO)));
+                b = 
sub4(b,mul4(_mm_set_ps1(voxels_cell[1][0][1]),add4(sub4(interimNRR,interimROR),interimRRN)));
+                b = 
add4(b,mul4(_mm_set_ps1(voxels_cell[1][0][0]),sub4(sub4(interimNRR,interimROR),interimRRO)));
+                b = 
add4(b,mul4(_mm_set_ps1(voxels_cell[0][1][1]),sub4(sub4(interimORR,interimRNR),interimRRN)));
+                b = 
sub4(b,mul4(_mm_set_ps1(voxels_cell[0][1][0]),add4(sub4(interimORR,interimRNR),interimRRO)));
+                b = 
sub4(b,mul4(_mm_set_ps1(voxels_cell[0][0][1]),sub4(add4(interimORR,interimROR),interimRRN)));
+                b = 
add4(b,mul4(_mm_set_ps1(voxels_cell[0][0][0]),add4(add4(interimORR,interimROR),interimRRO)));
+                
+                // Linear coefficient, accumulated one corner at a time.
+                c = 
mul4(_mm_set_ps1(voxels_cell[1][1][1]),add4(add4(interimRNN,interimNRN),interimNNR));
+                c = 
add4(c,mul4(_mm_set_ps1(voxels_cell[1][1][0]),sub4(add4(interimRNO,interimNRO),interimNNR)));
+                c = 
add4(c,mul4(_mm_set_ps1(voxels_cell[1][0][1]),add4(sub4(interimRON,interimNRN),interimNOR)));
+                c = 
add4(c,mul4(_mm_set_ps1(voxels_cell[1][0][0]),sub4(sub4(interimROO,interimNRO),interimNOR)));
+                c = 
add4(c,mul4(_mm_set_ps1(voxels_cell[0][1][1]),sub4(add4(interimORN,interimONR),interimRNN)));
+                c = 
sub4(c,mul4(_mm_set_ps1(voxels_cell[0][1][0]),add4(sub4(interimRNO,interimORO),interimONR)));
+                c = 
sub4(c,mul4(_mm_set_ps1(voxels_cell[0][0][1]),sub4(add4(interimRON,interimORN),interimOOR)));
+                c = 
sub4(c,mul4(_mm_set_ps1(voxels_cell[0][0][0]),add4(add4(interimROO,interimORO),interimOOR)));
+
+                // Constant term: the corner values blended with weights
+                // e1 (= p0) and e0 (= 1 - p0) on each axis.
+                d = add4(mul4(e1[0], add4(mul4(e1[1], add4(mul4(e1[2], 
+                               set4(voxels_cell[1][1][1])),
+                               mul4(e0[2], set4(voxels_cell[1][1][0])))),
+                               mul4(e0[1], add4(mul4(e1[2], 
set4(voxels_cell[1][0][1])),
+                                   mul4(e0[2], 
set4(voxels_cell[1][0][0])))))),
+                mul4(e0[0], add4(mul4(e1[1], add4(mul4(e1[2], 
+                    set4(voxels_cell[0][1][1])),
+                    mul4(e0[2], set4(voxels_cell[0][1][0])))),
+                    mul4(e0[1], add4(mul4(e1[2], set4(voxels_cell[0][0][1])),
+                        mul4(e0[2], set4(voxels_cell[0][0][0])))))));        
  
+                
+                d = sub4(d, set4(isovalue));
+            }
+        
+            // Horner evaluation: ((a*t + b)*t + c)*t + d, per lane.
+            inline sse_t eval(const sse_t &t) const
+            {
+                return add4(mul4(add4(mul4(add4(mul4(a,t),b),t),c),t),d);
+            }
+        };     
+            
+                    
+#endif                      
     };
 };
 

Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
==============================================================================
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.cc    (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.cc    Wed Jun  7 09:06:10 
2006
@@ -7,6 +7,10 @@
 #include <Interface/RayPacket.h>
 #include <Model/Intersections/IsosurfaceImplicit.h>
 
+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif
+
 #define USE_OCTREE_DATA
 
 #define MIN4(a,b,c,d) min(min(a,b), min(c,d));
@@ -109,8 +113,12 @@
 
 void IsosurfaceOctreeVolume::intersect(RenderContext const &context, 
RayPacket &packet) const
 {
+#if 1
+       packet_intersect_implicit_bvh(packet);
+#else
     for ( int i = packet.rayBegin; i < packet.rayEnd; i++ )
         single_intersect(packet, i);
+#endif         
 }
 
 void IsosurfaceOctreeVolume::single_intersect(RayPacket& rays, int 
which_one) const
@@ -305,7 +313,6 @@
 #endif
             if (node.offsets[target_child]==-1)
             {
-                return false;
                 if (single_traverse_leaf(rays, which_one, orig, dir, 
inv_dir, stop_depth, 
                     next_depth, depth, node.values[target_child], 
                     child_cell, index_trace, child_cell, child_tenter, 
child_texit))
@@ -735,4 +742,271 @@
     return false;
 }
 
+/*
+       Begin packet intersection code, for SSE packets only.
+*/
+#ifdef MANTA_SSE
+
+//an octree traversal based on implicit BVH
+// Entry point of the SSE packet path. Points an SSERayPacket at the packet's
+// data arrays, tests every SIMD group against the box [0, octdata->dims] to
+// find the half-open group range [first, last) that may hit the volume, and
+// then starts bvh_octnode() at depth 0, index 0.
+void IsosurfaceOctreeVolume::packet_intersect_implicit_bvh(RayPacket& rays) 
const
+{
+    rays.computeInverseDirections();
+    rays.computeSigns();
+    RayPacketData* data = rays.data;
+    SSERayPacket srp;
+
+    //intersect the global bounding box: find first, last
+    //  this will require a special-case AABB intersection
+
+    // Reinterpret the per-axis packet arrays as arrays of sse_t (no copy).
+    #pragma unroll(3)
+    for(int axis=0; axis<3; axis++)
+    {
+        srp.orig[axis] = (sse_t*)(data->origin[axis]);
+        srp.dir[axis] = (sse_t*)(data->direction[axis]);
+        srp.inv_dir[axis] = (sse_t*)(data->inverseDirection[axis]);
+        srp.normal[axis] = (sse_t*)(data->normal[axis]);
+    }
+    srp.minT = (sse_t*)(data->minT);
+    
+    // Start with an empty range; first/last are tightened as groups hit.
+    int first = RayPacket::SSE_MaxSize;
+    int last = -1;
+    #pragma unroll(RayPacket::SSE_MaxSize)    
+    for(int smd=0; smd<RayPacket::SSE_MaxSize; smd++)
+    {
+        sse_t dgt0[3];
+        sse_t tnear[3];
+        sse_t tfar[3];
+        sse_t tnear_unpadded[3];
+        sse_t tfar_unpadded[3];
+            
+        #pragma unroll(3)
+        for(int axis=0; axis<3; axis++)
+        {
+            // dgt0 marks lanes whose direction component is >= 0; it picks
+            // which slab plane is the near one and which is the far one.
+            dgt0[axis] = cmp4_ge(srp.dir[axis][smd], zero4());
+            sse_t t0 = mul4(sub4(zero4(), srp.orig[axis][smd]), 
srp.inv_dir[axis][smd]);
+            sse_t t1 = mul4(sub4(set4(octdata->dims[axis]), 
srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+            sse_t t1p = mul4(sub4(set4(octdata->padded_dims[axis]), 
srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+
+            // NOTE(review): mask4(m, x, y) presumably selects x where m is
+            // set and y elsewhere - confirm against SSEDefs.h.
+            tnear_unpadded[axis] = mask4(dgt0[axis], t0, t1);
+            tfar_unpadded[axis] = mask4(dgt0[axis], t1, t0);
+            // NOTE(review): the padded tnear/tfar values are computed here
+            // but never read later in this function.
+            tnear[axis] = mask4(dgt0[axis], t0, t1p);
+            tfar[axis] = mask4(dgt0[axis], t1p, t0);
+        }
+        
+        sse_t tenter_unpadded = max4(max4(tnear_unpadded[0], 
tnear_unpadded[1]), tnear_unpadded[2]);
+        sse_t texit_unpadded = min4(min4(tfar_unpadded[0], 
tfar_unpadded[1]), tfar_unpadded[2]);
+                       
+        if (_mm_movemask_ps(_mm_cmple_ps(tenter_unpadded, texit_unpadded)) 
== 0)       //if none of them were valid
+            continue;
+               
+        first = MIN(first, smd);
+        last = smd;
+    }
+    // Make last exclusive (one past the last group that hit).
+    last++;
+    
+    // No group hit the box: first stayed at SSE_MaxSize and last became 0.
+    if (first >= last)
+        return;
+        
+    //cerr << "root node: first = " << (int)first << ", last = " << 
(int)last << endl;
+       
+    // One slot per depth level; bvh_octnode()/bvh_octcap() store the node
+    // index they visit at index_trace[depth].
+    unsigned int index_trace[octdata->get_max_depth() + 1];
+    Vec3i cell(0,0,0);
+    bvh_octnode(rays, srp, first, last, cell, octdata->get_cap_depth(), 0, 
0, index_trace);
+}
+
+// Visits the eight children of the interior node (depth, index). For each
+// child it narrows the active SIMD group range with intersect_octant() and,
+// when the isovalue lies within the child's [mins, maxs] range, descends into
+// bvh_octleaf(), bvh_octcap() or bvh_octnode(). Returns true as soon as one of
+// those calls returns true, otherwise false.
+bool IsosurfaceOctreeVolume::bvh_octnode(RayPacket& rays, SSERayPacket& srp, 
char first, char last, 
+            const Vec3i& cell, int stop_depth, int depth, unsigned int 
index, unsigned int index_trace[]) const
+{
+    //cerr << "octnode " << (int)depth << ", " << index << "; first=" << 
(int)first << ",last=" << (int)last << endl;
+    OctNode& node = octdata->get_node(depth, index);
+    Vec3i child_cell = cell;
+    int child_bit = octdata->get_child_bit_depth(depth);
+    
+    index_trace[depth] = index;
+
+    //intersect all children in order
+    // On each axis the loop counter minus rays.getSign(0, axis) decides
+    // whether the upper half (child_bit set) or the lower half comes first.
+    // NOTE(review): presumably getSign(0, axis) is the direction sign of
+    // ray 0 only - confirm that order is valid for the whole packet.
+    #pragma unroll(2)
+    for(int midplane_x=0; midplane_x!=2; midplane_x++)
+    {
+        int target_x;
+        if (midplane_x - rays.getSign(0,0))
+        {
+            target_x = 4;
+            child_cell.data[0] = cell.data[0] | child_bit;
+        }
+        else
+        {
+            target_x = 0;
+            child_cell.data[0] = cell.data[0];
+        }
+        #pragma unroll(2)
+        for(int midplane_y=0; midplane_y!=2; midplane_y++)
+        {
+            int target_xy;
+            if (midplane_y - rays.getSign(0,1))
+            {
+                target_xy = target_x | 2;
+                child_cell.data[1] = cell.data[1] | child_bit;
+            }
+            else
+            {
+                target_xy = target_x;
+                child_cell.data[1] = cell.data[1];
+            }
+            #pragma unroll(2)
+            for(int midplane_z=0; midplane_z!=2; midplane_z++)
+            {
+                // target_child packs the octant as x=bit 2, y=bit 1, z=bit 0.
+                int target_child;
+                if (midplane_z - rays.getSign(0,2))
+                {
+                    target_child = target_xy | 1;
+                    child_cell.data[2] = cell.data[2] | child_bit;
+                }
+                else
+                {
+                    target_child = target_xy;
+                    child_cell.data[2] = cell.data[2];
+                }

+                // The child's box spans child_bit cells along every axis.
+                char newfirst, newlast;
+                Vector pmin(child_cell.data[0], child_cell.data[1], 
child_cell.data[2]);
+                Vector pmax(child_cell.data[0]+child_bit, 
child_cell.data[1]+child_bit, child_cell.data[2]+child_bit);
+                intersect_octant(srp, first, last, newfirst, newlast, pmin, 
pmax);
+                
+                //cerr << "newfirst=" << (int)newfirst << ", newlast=" << 
(int)newlast << endl;
+                
+                // Descend only if some group hit the box and the isovalue
+                // falls inside the child's stored min/max range.
+                if (newfirst < newlast && octdata->get_isovalue() >= 
node.mins[target_child] && octdata->get_isovalue() <= node.maxs[target_child])
+                {
+                    // An offset of -1 routes the child to bvh_octleaf() with
+                    // the value stored in the node itself.
+                    if (node.offsets[target_child]==-1)
+                    {
+                        if (bvh_octleaf(rays, srp, newfirst, newlast, 
child_cell, stop_depth, depth, node.values[target_child], index_trace))
+                            return true;
+                    }
+                    else
+                    {
+                        unsigned int child_idx = node.children_start + 
node.offsets[target_child];
+                        if (depth == octdata->get_pre_cap_depth())     //cap
+                        {
+                            if (bvh_octcap(rays, srp, newfirst, newlast, 
child_cell, stop_depth, depth+1, child_idx, index_trace))
+                                return true;
+                        }
+                        else
+                        {
+                            if (bvh_octnode(rays, srp, newfirst, newlast, 
child_cell, stop_depth, depth+1, child_idx, index_trace))
+                                return true;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return false;
+}
+
+// Leaf handler of the packet traversal. Leaf intersection is not implemented
+// yet, so no hit is reported. The explicit return matters: bvh_octnode()
+// tests this result, and flowing off the end of a bool function is
+// undefined behavior.
+bool IsosurfaceOctreeVolume::bvh_octleaf(RayPacket& rays, SSERayPacket& srp, 
char first, char last, 
+            const Vec3i& cell, int stop_depth, int depth, ST value, unsigned 
int index_trace[]) const
+{
+    return false;
+}
+
+// Visits the eight unit cells of cap `index`. For every cell hit by some SIMD
+// group in [first, last) it gathers the cell's corner values into rho and, if
+// the isovalue lies in [min_rho, max_rho], calls
+// IsosurfaceImplicit::sse_intersect() on that cell.
+// NOTE(review): as written this always returns false, whatever
+// sse_intersect() does.
+bool IsosurfaceOctreeVolume::bvh_octcap(RayPacket& rays, SSERayPacket& srp, 
char first, char last, 
+            const Vec3i& cell, int stop_depth, int depth, unsigned int 
index, unsigned int index_trace[]) const
+{
+    //cerr << "octcap " << index << ", first=" << (int)first << ",last=" << 
(int)last << endl;
+    OctCap& cap = octdata->get_cap(index);
+    Vec3i child_cell = cell;
+    index_trace[depth] = index;
+    
+    //intersect all children in order
+    // Same ordering scheme as bvh_octnode(), with a fixed child bit of 1.
+    #pragma unroll(2)
+    for(int midplane_x=0; midplane_x<2; midplane_x++)
+    {
+        int target_x;
+        if (midplane_x - rays.getSign(0,0))
+        {
+            target_x = 4;
+            child_cell.data[0] = cell.data[0] | 1;
+        }
+        else
+        {
+            target_x = 0;
+            child_cell.data[0] = cell.data[0];
+        }
+        #pragma unroll(2)
+        for(int midplane_y=0; midplane_y<2; midplane_y++)
+        {
+            int target_xy;
+            if (midplane_y - rays.getSign(0,1))
+            {
+                target_xy = target_x | 2;
+                child_cell.data[1] = cell.data[1] | 1;
+            }
+            else
+            {
+                target_xy = target_x;
+                child_cell.data[1] = cell.data[1];
+            }
+            #pragma unroll(2)
+            for(int midplane_z=0; midplane_z<2; midplane_z++)
+            {
+                int target_child;
+                if (midplane_z - rays.getSign(0,2))
+                {
+                    target_child = target_xy | 1;
+                    child_cell.data[2] = cell.data[2] | 1;
+                }
+                else
+                {
+                    target_child = target_xy;
+                    child_cell.data[2] = cell.data[2];
+                }

+                // Per-group entry/exit distances and hit masks for this
+                // cell, filled in by intersect_octant().
+                sse_t child_tenter[RayPacket::SSE_MaxSize];
+                sse_t child_texit[RayPacket::SSE_MaxSize];
+                sse_t hitmask[RayPacket::SSE_MaxSize];
+                char newfirst, newlast;
+                Vector cmin(child_cell.data[0], child_cell.data[1], 
child_cell.data[2]);
+                Vector cmax(child_cell.data[0]+1, child_cell.data[1]+1, 
child_cell.data[2]+1);
+                intersect_octant(srp, first, last, newfirst, newlast, cmin, 
cmax, child_tenter, child_texit, hitmask);
+                
+                // Empty range: no group hit this cell.
+                if (newfirst >= newlast)
+                    continue;
+                    
+#ifdef USE_OCTREE_DATA
+                float rho[2][2][2];
+                ST min_rho, max_rho, this_rho;
+                min_rho = max_rho = this_rho = cap.values[target_child];
+                rho[0][0][0] = static_cast<float>(this_rho);
+                int prev_depth = depth-1;
+                Vec3i offset(0,0,1);
+                // NOTE(review): octvol_fill_cell is not defined in this view;
+                // presumably a macro that fills the rest of rho and updates
+                // min_rho/max_rho using the locals declared above - confirm.
+                octvol_fill_cell(cap, 1);
+#else  
+                //use original grid data
+                float rho[2][2][2];
+                ST min_rho, max_rho;
+#define MYDATA octdata->indata
+                min_rho = max_rho = lookup_safe(MYDATA, child_cell.data[0], 
child_cell.data[1], child_cell.data[2]);
+                rho[0][0][0] = static_cast<float>(min_rho);
+                // Corners 1..7: bits 2/1/0 of c give the x/y/z offset.
+                for(int c=1; c<8; c++)
+                {
+                    Vec3i offset((c&4)!=0, (c&2)!=0, c&1);
+                    Vec3i neighboridx = child_cell + offset;
+                    ST this_rho = lookup_safe(MYDATA, neighboridx.data[0], 
neighboridx.data[1], neighboridx.data[2]);
+                    rho[offset.data[0]][offset.data[1]][offset.data[2]] = 
static_cast<float>(this_rho);
+                    min_rho = MIN(this_rho, min_rho);
+                    max_rho = MAX(this_rho, max_rho);
+                }
+#endif    
+
+                // Intersect the cell only if the isovalue lies within the
+                // range of its corner values.
+                if (octdata->get_isovalue() >= min_rho && 
octdata->get_isovalue() <= max_rho)
+                {
+                    //cerr << "in cap " << (unsigned long)(&cap) << ", 
octant " << target_child << endl;
+                    IsosurfaceImplicit::sse_intersect(rays, srp, newfirst, 
newlast, cmin, cmax, rho, 
+                        octdata->get_isovalue(), child_tenter, child_texit, 
hitmask, this, PrimitiveCommon::getMaterial());
+                }
+            }
+        }
+    }
+    return false;
+}
 
+#endif  //#ifdef MANTA_SSE

Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.h
==============================================================================
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.h     (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.h     Wed Jun  7 09:06:10 
2006
@@ -8,6 +8,14 @@
 #include <Core/Color/Color.h>
 #include <Interface/Texture.h>
 #include <Model/Primitives/OctreeVolume.h>
+#include <Interface/RayPacket.h>
+
+#include <MantaSSE.h>
+
+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif
+
 
 namespace Manta
 {
@@ -44,6 +52,94 @@
                           const Vector& orig, const Vector& dir, const 
Vector& inv_dir, int res, 
                           int depth, unsigned int cap_index, unsigned int 
index_trace[], Vec3i& cell, const float tenter, 
                           const float texit) const;
+                    
+#ifdef MANTA_SSE
+                       void packet_intersect_implicit_bvh(RayPacket& rays) 
const;
+            
+            bool bvh_octnode(RayPacket& rays, SSERayPacket& srp, char first, 
char last, 
+                            const Vec3i& cell, int stop_depth, int depth, 
unsigned int index, 
+                            unsigned int index_trace[]) const;
+                            
+            bool bvh_octleaf(RayPacket& rays, SSERayPacket& srp, char first, 
char last, 
+                                        const Vec3i& cell, int stop_depth, 
int depth, ST value, 
+                                        unsigned int index_trace[]) const;
+            
+            bool bvh_octcap(RayPacket& rays, SSERayPacket& srp, char first, 
char last, 
+                const Vec3i& cell, int stop_depth, int depth, unsigned int 
index, 
+                unsigned int index_trace[]) const;
+            
+            // Slab-tests the SIMD groups in [first, last) against the box
+            // [min, max] and reports in newfirst/newlast the half-open range
+            // of groups in which at least one lane hit. If nothing hits, the
+            // result is newfirst == last and newlast == first + 1.
+            inline void intersect_octant(SSERayPacket& srp, char first, char 
last, 
+                char& newfirst, char& newlast, const Vector& min, const 
Vector& max) const
+            {
+                newfirst = last;
+                newlast = first;                
+                for(char smd=first; smd<last; smd++)
+                {
+                    sse_t dgt0[3];
+                    sse_t tnear[3];
+                    sse_t tfar[3];
+                        
+                    #pragma unroll(3)
+                    for(int axis=0; axis<3; axis++)
+                    {
+                        dgt0[axis] = cmp4_ge(srp.dir[axis][smd], zero4());   
    //use signs?
+                        sse_t t0 = mul4(sub4(set4(min[axis]), 
srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        sse_t t1 = mul4(sub4(set4(max[axis]), 
srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        // NOTE(review): mask4(m, x, y) presumably selects x
+                        // where m is set and y elsewhere - confirm.
+                        tnear[axis] = mask4(dgt0[axis], t0, t1);
+                        tfar[axis] = mask4(dgt0[axis], t1, t0);
+                    }
+                    
+                    sse_t tenter = max4(max4(tnear[0], tnear[1]), tnear[2]);
+                    sse_t texit = min4(min4(tfar[0], tfar[1]), tfar[2]);
+                        
+                    if (_mm_movemask_ps(cmp4_le(tenter, texit)) != 0)  //if 
any hit
+                    {
+                        newfirst = MIN(newfirst, smd);
+                        newlast = smd;
+                    }
+                }
+                // Make newlast exclusive.
+                newlast++;
+            } 
+            
+            // Same slab test as the overload without output arrays, but also
+            // stores each tested group's entry distance, exit distance and
+            // per-lane hit mask in tenter[smd], texit[smd] and hitmask[smd].
+            // hitmask is zeroed for all SSE_MaxSize groups first; tenter and
+            // texit are written only for groups in [first, last).
+            inline void intersect_octant(SSERayPacket& srp, char first, char 
last, 
+                    char& newfirst, char& newlast, const Vector& min, 
+                    const Vector& max, sse_t tenter[], sse_t texit[], sse_t 
hitmask[]) const
+            {
+                #pragma unroll(RayPacket::SSE_MaxSize)
+                for(char smd=0; smd<RayPacket::SSE_MaxSize; smd++)
+                    hitmask[smd] = zero4();
+                    
+                newfirst = last;
+                newlast = first;
+                for(char smd=first; smd<last; smd++)
+                {
+                    sse_t dgt0[3];
+                    sse_t tnear[3];
+                    sse_t tfar[3];
+                        
+                    #pragma unroll(3)
+                    for(int axis=0; axis<3; axis++)
+                    {
+                        dgt0[axis] = cmp4_ge(srp.dir[axis][smd], zero4());   
    //use signs?
+                        sse_t t0 = mul4(sub4(set4(min[axis]), 
srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        sse_t t1 = mul4(sub4(set4(max[axis]), 
srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        tnear[axis] = mask4(dgt0[axis], t0, t1);
+                        tfar[axis] = mask4(dgt0[axis], t1, t0);
+                    }
+                    
+                    tenter[smd] = max4(max4(tnear[0], tnear[1]), tnear[2]);
+                    texit[smd] = min4(min4(tfar[0], tfar[1]), tfar[2]);
+                    
+                    // A lane hits when its entry distance does not exceed
+                    // its exit distance.
+                    hitmask[smd] = cmp4_le(tenter[smd], texit[smd]);
+                    if (_mm_movemask_ps(hitmask[smd]) != 0)    //if any hit
+                    {
+                        newfirst = MIN(newfirst, smd);
+                        newlast = smd;
+                    }
+                }
+                // Make newlast exclusive.
+                newlast++;
+            }                 
+#endif                                           
     };
 };
 

Modified: trunk/Model/Primitives/OctreeVolume.h
==============================================================================
--- trunk/Model/Primitives/OctreeVolume.h       (original)
+++ trunk/Model/Primitives/OctreeVolume.h       Wed Jun  7 09:06:10 2006
@@ -328,7 +328,8 @@
                 if (depth == pre_cap_depth)
                 {
                     index = node.children_start + node.offsets[target_child];
-                    int target_child = ((p.data[0] & 1) << 2) | ((p.data[1] 
& 1) << 1) | (p.data[2] & 1);
+                    int target_child = ((p.data[0] & 1) << 
+                    2) | ((p.data[1] & 1) << 1) | (p.data[2] & 1);
                     return 
steps[current_timestep].caps[index].values[target_child];
                 }
                 




  • [MANTA] r1099 - in trunk: Core Core/Math Interface Model/Groups Model/Intersections Model/Primitives, knolla, 06/07/2006

Archive powered by MHonArc 2.6.16.

Top of page