manta - [MANTA] r1099 - in trunk: Core Core/Math Interface Model/Groups Model/Intersections Model/Primitives

Closed list
Subscribers: 0
Owners

sparker

thiago

Subscribe
Unsubscribe
Info
Admin
Archive

Post

Shared documents

Manta Interactive Ray Tracer Development Mailing List

Text archives Help

[MANTA] r1099 - in trunk: Core Core/Math Interface Model/Groups Model/Intersections Model/Primitives

From: knolla@sci.utah.edu
To: manta@sci.utah.edu
Subject: [MANTA] r1099 - in trunk: Core Core/Math Interface Model/Groups Model/Intersections Model/Primitives
Date: Wed, 7 Jun 2006 09:06:16 -0600 (MDT)

Author: knolla
Date: Wed Jun  7 09:06:10 2006
New Revision: 1099

Added:
   trunk/Core/Math/SSEDefs.h
Modified:
   trunk/Core/CMakeLists.txt
   trunk/Interface/RayPacket.h
   trunk/Model/Groups/SSEKDTree.cc
   trunk/Model/Intersections/IsosurfaceImplicit.cc
   trunk/Model/Intersections/IsosurfaceImplicit.h
   trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
   trunk/Model/Primitives/IsosurfaceOctreeVolume.h
   trunk/Model/Primitives/OctreeVolume.h
Log:
implemented first pass at SSE octree using implicit BVH. Slow and buggy.

Modified: trunk/Core/CMakeLists.txt
==============================================================================
--- trunk/Core/CMakeLists.txt   (original)
+++ trunk/Core/CMakeLists.txt   Wed Jun  7 09:06:10 2006
@@ -48,6 +48,7 @@
      Math/Noise.cc
      Math/ipow.h
      Math/CatmullRomInterpolator.h
+     Math/SSEDefs.h
      )
SET (CORE_SOURCES ${CORE_SOURCES}
      Util/Args.h

Added: trunk/Core/Math/SSEDefs.h
==============================================================================
--- (empty file)
+++ trunk/Core/Math/SSEDefs.h   Wed Jun  7 09:06:10 2006
@@ -0,0 +1,251 @@
+// SSEDefs.h
+// A comprehensive set of macros for SSE
+
+#ifndef _MANTA_SSEDEFS_H_
+#define _MANTA_SSEDEFS_H_
+
+#ifdef MANTA_SSE
+#include <xmmintrin.h>
+#include <Core/Util/Align.h>
+#include <Core/Geometry/vecdefs.h>
+#include <Core/Geometry/Vector.h>
+
+typedef __m128 sse_t;
+typedef __m128i sse_int_t;
+
+//add to these macros as necessary.
+#define or4 _mm_or_ps
+#define or4i _mm_or_si128
+#define and4 _mm_and_ps
+#define and4i _mm_and_si128
+#define andnot4 _mm_andnot_ps
+#define andnot4i _mm_andnot_si128
+#define mul4 _mm_mul_ps
+#define add4 _mm_add_ps
+#define sub4 _mm_sub_ps
+#define min4 _mm_min_ps
+#define max4 _mm_max_ps
+#define set4 _mm_set_ps1
+#define set44 _mm_set_ps
+#define set4i _mm_set1_epi32
+#define set44i _mm_set_epi32
+#define zero4 _mm_setzero_ps
+#define getmask4 _mm_movemask_ps
+#define cmp4_ge _mm_cmpge_ps
+#define cmp4_le _mm_cmple_ps
+#define cmp4_gt _mm_cmpgt_ps
+#define cmp4_lt _mm_cmplt_ps
+#define cmp4_eq _mm_cmpeq_ps
+
+//AARONBAD - this should really be somewhere in Core/Math
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+#ifndef MAX
+#define MAX(a,b) (((a)>(b))?(a):(b))
+#endif
+
+namespace Manta
+{
+    static const MANTA_ALIGN(16) sse_t _mm_eps = _mm_set_ps1(1e-5); /* Broadcast constants: every lane of each sse_t below holds the same value. */
+    static const MANTA_ALIGN(16) sse_t _mm_minus_eps = _mm_set_ps1(-1e-5);
+    static const MANTA_ALIGN(16) sse_t _mm_epsilon = _mm_set_ps1(1e-5); // same value as _mm_eps
+    static const MANTA_ALIGN(16) sse_t _mm_one = _mm_set_ps1(1.f);
+    static const MANTA_ALIGN(16) sse_t _mm_zero = _mm_set_ps1(0.f);
+    static const MANTA_ALIGN(16) sse_t _mm_one_half = _mm_set_ps1(0.5f);
+    static const MANTA_ALIGN(16) sse_t _mm_two = _mm_set_ps1(2.f);
+    static const MANTA_ALIGN(16) sse_t _mm_256 = _mm_set_ps1(256);
+    static const MANTA_ALIGN(16) sse_t _mm_255 = _mm_set_ps1(255);
+    static const MANTA_ALIGN(16) sse_t _mm_infty = _mm_set_ps1(9.9e9999f); // literal is out of float range; presumably relied on to become +inf -- TODO confirm on all compilers
+    static const MANTA_ALIGN(16) sse_t _mm_minus_infty =
_mm_set_ps1(-9.9e9999f);
+    static const int _mm_intabsmask = 0x7fffffff; // all bits except the sign bit
+    static const int _mm_intsignbit = 0x80000000; // sign bit only
+    static const int _mm_inttruemask = 0xffffffff; // all bits set
+    static const MANTA_ALIGN(16) sse_t _mm_absmask =
_mm_set_ps1((float&)_mm_intabsmask); // the int's bit pattern reinterpreted as a float, not a numeric conversion
+    static const MANTA_ALIGN(16) sse_t _mm_signbit =
_mm_set_ps1((float&)_mm_intsignbit); // used by abs4() to clear the sign bit
+    static const MANTA_ALIGN(16) sse_t _mm_true =
_mm_set_ps1((float&)_mm_inttruemask); // all-ones lane mask
+    static const int minusOneI = -1;
+    static const MANTA_ALIGN(16) sse_t _mm_minusOne = _mm_set_ps1((float
&)minusOneI); // bit pattern of integer -1 (all ones), i.e. a mask like _mm_true -- NOT the float value -1.0f
+
+    /*! Per-lane linear interpolation: v0 + t*(v1 - v0). */
+    inline sse_t lerp4(const sse_t t, const sse_t v0, const sse_t v1)
+    {
+      const sse_t span = sub4(v1, v0);
+      const sse_t step = mul4(t, span);
+      return add4(v0, step);
+    }
+
+    // Four independent 3-component dot products, one per lane:
+    // result = vx*ox + vy*oy + vz*oz.
+    inline sse_t dot4(const sse_t &ox, const sse_t &oy, const sse_t &oz,
+                      const sse_t &vx, const sse_t &vy, const sse_t &vz)
+    {
+      const sse_t xx = mul4(vx, ox);
+      const sse_t yy = mul4(vy, oy);
+      const sse_t zz = mul4(vz, oz);
+      return add4(add4(xx, yy), zz);
+    }
+
+    // Bitwise select, equivalent to mask ? dest : src.
+    // Bits set in mask come from dest, cleared bits come from src.
+    inline sse_t mask4(const sse_t &mask, const sse_t &dest, const sse_t &src)
+    {
+        const sse_t fromDest = and4(mask, dest);
+        const sse_t fromSrc = andnot4(mask, src);
+        return or4(fromDest, fromSrc);
+    }
+
+    // Inverted bitwise select, equivalent to ~mask ? dest : src.
+    // Bits cleared in mask come from dest, set bits come from src.
+    inline sse_t masknot4(const sse_t &mask, const sse_t &dest, const sse_t &src)
+    {
+        const sse_t fromDest = andnot4(mask, dest);
+        const sse_t fromSrc = and4(mask, src);
+        return or4(fromDest, fromSrc);
+    }
+
+    // Integer-register version of mask4: bits set in mask come from dest,
+    // cleared bits come from src.
+    inline sse_int_t mask4i(const sse_int_t &mask, const sse_int_t &dest, const sse_int_t &src)
+    {
+      const sse_int_t fromDest = and4i(mask, dest);
+      const sse_int_t fromSrc = andnot4i(mask, src);
+      return or4i(fromDest, fromSrc);
+    }
+
+    // Per-lane absolute value: computes (~_mm_signbit) & v, which clears
+    // the sign bit of every lane.
+    inline sse_t abs4(const sse_t &v)
+    {
+      const sse_t magnitude = andnot4(_mm_signbit, v);
+      return magnitude;
+    }
+
+    // 1/v per lane: hardware estimate r0 refined by one Newton-Raphson step,
+    //   r1 = 2*r0 - r0*r0*v
+    inline sse_t accurateReciprocal(const sse_t v)
+    {
+      const sse_t r0 = _mm_rcp_ps(v);
+      const sse_t doubled = add4(r0, r0);
+      const sse_t correction = mul4(mul4(r0, r0), v);
+      return sub4(doubled, correction);
+    }
+
+    // 1/v per lane: hardware estimate refined by three Newton-Raphson steps,
+    // each computing r = 2*r - r*r*v.
+    inline sse_t uberAccurateReciprocal(const sse_t v)
+    {
+        sse_t r = _mm_rcp_ps(v);
+        for (int pass = 0; pass < 3; ++pass)
+        {
+            const sse_t doubled = add4(r, r);
+            const sse_t correction = mul4(mul4(r, r), v);
+            r = sub4(doubled, correction);
+        }
+        return r;
+    }
+
+    // 1/v per lane with one Newton-Raphson refinement. This is the same
+    // computation as accurateReciprocal(), so it simply forwards to it.
+    inline sse_t oneOver(const sse_t v)
+    {
+      return accurateReciprocal(v);
+    }
+
+    // 1/sqrt(v) per lane: hardware estimate y0 refined by one Newton-Raphson
+    // step,  y1 = y0 * (1.5 - 0.5 * y0*y0*v)
+    inline sse_t accurateReciprocalSqrt(const sse_t v)
+    {
+      const sse_t y0 = _mm_rsqrt_ps(v);
+      const sse_t y0y0v = mul4(mul4(y0, y0), v);
+      const sse_t halfTerm = mul4(_mm_one_half, y0y0v);
+      return mul4(y0, sub4(_mm_set_ps1(1.5f), halfTerm));
+    }
+
+
+    // Raw hardware reciprocal estimate (_mm_rcp_ps) with no refinement step;
+    // see accurateReciprocal() for the refined version.
+    inline sse_t reciprocal(const sse_t v)
+    {
+      const sse_t estimate = _mm_rcp_ps(v);
+      return estimate;
+    }
+
+    // base + u*du + v*dv per lane, with the scalars u and v broadcast to
+    // all four lanes.
+    inline sse_t lin(const sse_t &base,
+                     float u,const sse_t du,
+                     float v, const sse_t dv)
+    {
+      const sse_t uTerm = mul4(set4(u), du);
+      const sse_t vTerm = mul4(set4(v), dv);
+      return add4(add4(base, uTerm), vTerm);
+    }
+
+    // base + u*du + v*dv per lane, where u and v are themselves per-lane
+    // values.
+    inline sse_t lin(const sse_t &base,
+                     const sse_t &u,const sse_t du,
+                     const sse_t &v, const sse_t dv)
+    {
+      const sse_t uTerm = mul4(u, du);
+      const sse_t vTerm = mul4(v, dv);
+      return add4(add4(base, uTerm), vTerm);
+    }
+
+
+    // Horizontal dot product of two 4-lane vectors. All four lanes take
+    // part, and every lane of the result holds the same full sum.
+    inline sse_t dot4(const sse_t &a, const sse_t &b)
+    {
+      const sse_t prod = mul4(a, b);
+      // swap the two halves and add: lanes become (0+2, 1+3, 2+0, 3+1)
+      const sse_t swapped = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1,0,3,2));
+      const sse_t pairSums = add4(swapped, prod);
+      // reverse the lanes and add again so each lane sums all four products
+      const sse_t reversed = _mm_shuffle_ps(pairSums, pairSums, _MM_SHUFFLE(0,1,2,3));
+      return add4(pairSums, reversed);
+    }
+
+    // Scalar 4-component dot product of a and b: lane 0 of dot4(a,b).
+    // Uses _mm_cvtss_f32 to read lane 0 instead of reinterpreting the
+    // const register through a (float&) cast, which dropped const and
+    // relied on type punning.
+    inline float dot1(const sse_t &a, const sse_t &b)
+    {
+      return _mm_cvtss_f32(dot4(a,b));
+    }
+
+    // Scale v in place by the reciprocal square root of dot4(v,v).
+    // Note that all four lanes contribute to the dot product, and that
+    // _mm_rsqrt_ps is a hardware estimate, so the result is only
+    // approximately unit length.
+    // (The stray ';' that followed the function body has been removed.)
+    inline void normalize(sse_t &v)
+    {
+      const sse_t lenSq = dot4(v,v);
+      v = _mm_mul_ps(v, _mm_rsqrt_ps(lenSq));
+    }
+
+    // Squared length of a as a scalar: dot1(a,a), so all four lanes count.
+    inline float sqrLength(const sse_t &a)
+    {
+      const float lenSq = dot1(a, a);
+      return lenSq;
+    }
+
+    // Length of a as a scalar: sqrt of the 4-lane dot product of a with
+    // itself. Only lane 0 is needed, so the scalar sqrt is used and lane 0
+    // is read with _mm_cvtss_f32 rather than through a (float&) type-pun.
+    inline float length(sse_t a)
+    {
+      const sse_t lenSq = dot4(a,a);
+      return _mm_cvtss_f32(_mm_sqrt_ss(lenSq));
+    }
+
+    // Horizontal minimum over all four lanes, returned as a scalar.
+    inline float min4f(sse_t t)
+    {
+      MANTA_ALIGN(16) float lanes[4];
+      _mm_store_ps(lanes, t);
+      const float low01 = MIN(lanes[0], lanes[1]);
+      const float low23 = MIN(lanes[2], lanes[3]);
+      return MIN(low01, low23);
+    }
+
+    // Horizontal maximum over all four lanes, returned as a scalar.
+    inline float max4f(sse_t t)
+    {
+      MANTA_ALIGN(16) float lanes[4];
+      _mm_store_ps(lanes, t);
+      const float high01 = MAX(lanes[0], lanes[1]);
+      const float high23 = MAX(lanes[2], lanes[3]);
+      return MAX(high01, high23);
+    }
+
+    // Horizontal minimum over lanes 0..2; lane 3 is ignored.
+    inline float min3f(sse_t t)
+    {
+      MANTA_ALIGN(16) float lanes[4];
+      _mm_store_ps(lanes, t);
+      const float low01 = MIN(lanes[0], lanes[1]);
+      return MIN(low01, lanes[2]);
+    }
+
+    /*! get horizontal maximum of the first three lanes (lane 3 is ignored) */
+    inline float max3f(sse_t t)
+    {
+      MANTA_ALIGN(16)
+      float f[4];
+      _mm_store_ps(f,t);
+      return MAX(MAX(f[0],f[1]),f[2]);
+    }
+
+    // Return lane `offset` of t as a scalar. The register is spilled to a
+    // 4-float buffer and indexed directly; offset is not range-checked.
+    inline float simd_component(sse_t t, int offset)
+    {
+        MANTA_ALIGN(16) float lanes[4];
+        _mm_store_ps(lanes, t);
+        const float picked = lanes[offset];
+        return picked;
+    }
+
+    // Build a Vec3f from lanes 2, 1, 0 of t (passed to the constructor in
+    // that order); lane 3 is dropped.
+    // NOTE(review): the lanes are read in reverse of memory order --
+    // confirm this matches how callers pack x/y/z into the register.
+    inline Vec3f as_Vec3f(sse_t t)
+    {
+        MANTA_ALIGN(16) float lanes[4];
+        _mm_store_ps(lanes, t);
+        const Vec3f result(lanes[2], lanes[1], lanes[0]);
+        return result;
+    }
+
+    // Build a Vector from lanes 2, 1, 0 of t (passed to the constructor in
+    // that order); lane 3 is dropped.
+    // NOTE(review): the lanes are read in reverse of memory order --
+    // confirm this matches how callers pack x/y/z into the register.
+    inline Vector as_Vector(sse_t t)
+    {
+        MANTA_ALIGN(16) float lanes[4];
+        _mm_store_ps(lanes, t);
+        const Vector result(lanes[2], lanes[1], lanes[0]);
+        return result;
+    }
+
+};
+
+#endif  //#ifdef MANTA_SSE
+#endif
\ No newline at end of file

Modified: trunk/Interface/RayPacket.h
==============================================================================
--- trunk/Interface/RayPacket.h (original)
+++ trunk/Interface/RayPacket.h Wed Jun  7 09:06:10 2006
@@ -14,6 +14,10 @@
#include <RayPacketParameters.h>
#include <MantaSSE.h>

+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif
+
// #include <sgi_stl_warnings_off.h>
// #include <algorithm>
// #include <sgi_stl_warnings_on.h>
@@ -22,12 +26,14 @@
   class Material;
   class RenderContext;

-
   class MANTA_ALIGN(16) RayPacketData {
   public:
     enum {
       MaxScratchpadSize = SCRATCHPAD_MAXSIZE,
-      MaxSize              = RAYPACKET_MAXSIZE
+      MaxSize              = RAYPACKET_MAXSIZE,
+#ifdef MANTA_SSE
+      SSE_MaxSize   = RAYPACKET_MAXSIZE/4,
+#endif
     };
     RayPacketData()
       {
@@ -55,10 +61,10 @@
     MANTA_ALIGN(16) Real inverseDirection[3][MaxSize];
     MANTA_ALIGN(16) Real minT[MaxSize];

-    Real image[2][MaxSize];
-    Real normal[3][MaxSize];
-    Real hitPosition[3][MaxSize];
-    Real texCoords[3][MaxSize];
+    MANTA_ALIGN(16) Real image[2][MaxSize];
+    MANTA_ALIGN(16) Real normal[3][MaxSize];
+    MANTA_ALIGN(16) Real hitPosition[3][MaxSize];
+    MANTA_ALIGN(16) Real texCoords[3][MaxSize];

     // Color-based arrays
@@ -72,11 +78,26 @@
     // Char-based arrays
     char scratchpad_data[MaxSize][MaxScratchpadSize];
   };
+
+#ifdef MANTA_SSE
+  struct MANTA_ALIGN(16) SSERayPacket
+  { /* Bundle of sse_t pointers used to address ray-packet data four rays
+     * at a time. It holds raw pointers only; nothing here allocates or
+     * frees. IsosurfaceOctreeVolume::packet_intersect_implicit_bvh fills it
+     * by casting the matching RayPacketData arrays to sse_t*.
+     */
+    sse_t* orig[3];      // ray origins, one pointer per axis
+    sse_t* dir[3];       // ray directions, one pointer per axis
+    sse_t* inv_dir[3];   // inverse directions, one pointer per axis
+    sse_t* normal[3];    // hit normals, one pointer per axis
+    sse_t* minT;         // current hit distance per ray
+  };
+#endif

   class RayPacket {
   public:
     enum {
       MaxSize               = RayPacketData::MaxSize,
+
+#ifdef MANTA_SSE
+      SSE_MaxSize           = RayPacketData::SSE_MaxSize,
+#endif

       // Flags.
       ConstantOrigin        = 0x0001,
@@ -280,9 +301,22 @@
     {
       if(flags & HaveInverseDirections)
         return;
+#if MANTA_SSE
+      int b = (rayBegin + 3) & (~3);
+      int e = rayEnd & (~3);
+#pragma unroll(3)
+      for(int j=0; j<3; j++)
+      {
+        sse_t* dirs = (sse_t*)data->direction[j];
+        sse_t* inv_dirs = (sse_t*)data->inverseDirection[j];
+        for(int smd=b; smd<e; smd++)
+          inv_dirs[smd] = oneOver(dirs[smd]);
+      }
+#else
       for(int i=rayBegin;i<rayEnd;i++)
         for(int j=0;j<3;j++)
           data->inverseDirection[j][i] = 1./data->direction[j][i];
+#endif
       flags |= HaveInverseDirections;
     }
     void computeSigns()

Modified: trunk/Model/Groups/SSEKDTree.cc
==============================================================================
--- trunk/Model/Groups/SSEKDTree.cc     (original)
+++ trunk/Model/Groups/SSEKDTree.cc     Wed Jun  7 09:06:10 2006
@@ -48,6 +48,8 @@

#include <stdio.h>

+#include <Core/Math/SSEDefs.h>
+
using namespace Manta;
using namespace Manta::Kdtree;
using namespace SCIRun;
@@ -551,11 +553,14 @@
}

// Newton-Raphson Iteration for 1/x
+/*
+AARONBAD - now implemented in Core/Math/SSEDefs.h
inline __m128 accurateReciprocal(const __m128& v) {

   const __m128 rcp = _mm_rcp_ps(v);
   return _mm_sub_ps(_mm_add_ps(rcp, rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
}
+*/

void intersectTriangleEdgeSSE( IntersectPacket* result,
                                const RayPacket& rays,

Modified: trunk/Model/Intersections/IsosurfaceImplicit.cc
==============================================================================
--- trunk/Model/Intersections/IsosurfaceImplicit.cc     (original)
+++ trunk/Model/Intersections/IsosurfaceImplicit.cc     Wed Jun  7 09:06:10
2006
@@ -1,8 +1,10 @@

#include <Model/Intersections/IsosurfaceImplicit.h>
#include <Core/Math/CubicSolver.h>
+#include <iostream>

using namespace Manta;
+using namespace std;

//From Steven Parker's 1997 RTRT isosurface intersection
bool IsosurfaceImplicit::single_intersect(const Vector& orig, const Vector&
dir,
@@ -112,4 +114,153 @@
             + (x-x_0)*(y-y_0)*rho[1][1][1];
}

+
+#ifdef MANTA_SSE
+//SSE packet implementation
+//Based on Marmitt et al. 04, Wald 05 SSE intersections (OpenRT)
+//  as well as Knoll DynRT implementation
+void IsosurfaceImplicit::sse_intersect(RayPacket& rays, SSERayPacket& srp,
+            char first, char last, const Vector& pmin, const Vector& pmax,
float rho[2][2][2],
+            float isovalue, sse_t tenter[], sse_t texit[], sse_t hitmask[],
+            const Manta::Primitive* prim, const Manta::Material* matl)
+{   /* Intersect groups of four rays with the isosurface inside one cell.
+     *
+     * For every group index smd in [first, last) whose hitmask has at least
+     * one lane set:
+     *   1. compute the cell entry/exit points p0/p1 (ray evaluated at
+     *      tenter/texit, minus pmin);
+     *   2. build the cubic (rho along the segment) - isovalue, with the
+     *      segment parameter t running from 0 at p0 to 1 at p1;
+     *   3. keep only lanes where the cubic changes sign between t=0 and t=1;
+     *   4. run NEUBAUER_ITERATIONS interpolate-and-rebracket steps, then one
+     *      final linear interpolation, to locate the root;
+     *   5. map the root back to a ray distance, write it to srp.minT, write
+     *      the normal, and record matl/prim for each lane that hit.
+     *
+     * NOTE(review): p0/p1 are only offset by pmin, not scaled by the cell
+     * size -- presumably the cell is one unit wide; confirm against callers.
+     * NOTE(review): the hit_t < minT test below is commented out, so minT is
+     * overwritten for every lane with a sign change, whatever it held before.
+     */
+    //cerr << "sse_intersect: first=" << (int)first << ",last=" << (int)last
<< endl;
+
+    for(int smd=first; smd<last; smd++)
+    {
+        if (_mm_movemask_ps(hitmask[smd])==0)
+            continue;
+
+        //compute p0, p1
+        sse_t p0[3];
+        sse_t p1[3];
+
+        #pragma unroll(3)
+        for(int axis=0; axis<3; axis++)
+        {
+            p0[axis] = sub4(add4(srp.orig[axis][smd],
mul4(srp.dir[axis][smd], tenter[smd])), set4(pmin[axis])); // entry point relative to pmin
+            p1[axis] = sub4(add4(srp.orig[axis][smd],
mul4(srp.dir[axis][smd], texit[smd])), set4(pmin[axis])); // exit point relative to pmin
+        }
+
+        CubicPoly4 poly;
+        poly.generate(p0, p1, rho, isovalue);
+
+        sse_t t0 = zero4();
+        sse_t t1 = _mm_one;
+        sse_t D0 = poly.d; // polynomial value at t = 0
+        sse_t D1 = add4(add4(poly.a,poly.b), add4(poly.c,poly.d)); // polynomial value at t = 1
+
+        //AARONBAD - we'd want something like this to avoid extra work
+        //sse_t sse_thisvoxelmask = and4(hitmask[smd], cmp4_lt(tenter[smd],
srp.minT[smd]));
+
+        //find which rays have differing signs for D0, D1. Only retain the
ones that have same signs?
+        sse_t differingSigns = cmp4_lt(mul4(D0,D1), zero4()); // lane set when D0*D1 < 0
+        sse_t sse_thisvoxelmask = and4(hitmask[smd], differingSigns); // the lanes with a sign change are the ones kept
+        int int_thisvoxelmask = _mm_movemask_ps(sse_thisvoxelmask);
+
+        if (int_thisvoxelmask == 0)    //if none of them hit, don't bother
iterating any more
+            continue;
+
+        #define NEUBAUER_ITERATIONS 3
+        #pragma unroll(NEUBAUER_ITERATIONS)
+        for (int i=0;i<NEUBAUER_ITERATIONS;i++)
+        {
+            //compute linear interpolation
+            const sse_t denom = accurateReciprocal(sub4(D0,D1));
+            sse_t t = add4(t0,mul4(mul4(D0,denom), sub4(t1,t0))); // t0 + D0/(D0-D1) * (t1-t0)
+
+            //re-evaluate
+            sse_t D = poly.eval(t);
+
+            //conditionally store
+            const sse_t frontHalf = _mm_cmplt_ps(mul4(D0,D), zero4()); // sign change lies in [t0, t]
+            t1 = or4(_mm_and_ps(frontHalf,t), _mm_andnot_ps(frontHalf,t1));
+            t0 = or4(_mm_and_ps(frontHalf,t0), _mm_andnot_ps(frontHalf,t));
+            D1 = or4(_mm_and_ps(frontHalf,D), _mm_andnot_ps(frontHalf,D1));
+            D0 = or4(_mm_and_ps(frontHalf,D0), _mm_andnot_ps(frontHalf,D));
+        }
+
+        //compute hit distance
+        const sse_t denom = accurateReciprocal(sub4(D0,D1));
+        sse_t t = add4(t0, mul4(mul4(D0,denom), sub4(t1,t0)));
+        sse_t hit_t = add4(tenter[smd], mul4(t, sub4(texit[smd],
tenter[smd]))); // map segment parameter back to ray distance
+
+        //sse_thisvoxelmask = and4(sse_thisvoxelmask,
cmp4_lt(hit_t,srp.minT[smd]));
+        srp.minT[smd] = mask4(sse_thisvoxelmask, hit_t, srp.minT[smd]);
+        int_thisvoxelmask = _mm_movemask_ps(sse_thisvoxelmask);
+        if (int_thisvoxelmask)
+        {
+            sse_t normal[3];
+            sse_normal(rays, srp, smd, normal, pmin, pmax, rho); // reads the minT just written above
+            #pragma unroll(3)
+            for(int axis=0; axis<3; axis++)
+                srp.normal[axis][smd] = mask4(sse_thisvoxelmask,
normal[axis], srp.normal[axis][smd]);
+
+            #pragma unroll(4);
+            for(int ray=0; ray<4; ray++)
+            {
+                if (int_thisvoxelmask & (1<<ray))
+                {
+                    int realray=(smd<<2)+ray; // scalar ray index = 4*smd + lane
+                    rays.data->hitMatl[realray] = matl;
+                    rays.data->hitPrim[realray] = prim;
+                }
+            }
+        }
+    }
+}
+
+void IsosurfaceImplicit::sse_normal(RayPacket &ray, SSERayPacket& srp, int
smd,
+            sse_t normal[], const Vector& pmin, const Vector& pmax,
+            const float rho[2][2][2])
+{   /* Compute the surface normal for the four rays of group smd.
+     *
+     * The hit point is orig + dir * srp.minT[smd]. For each output axis,
+     * the two remaining axes (U, V) supply the weights (pmax - hit) and
+     * (pmin - hit), which multiply differences of rho corner values taken
+     * along the output axis; the four products are summed into
+     * normal[axis].
+     *
+     * The result is written to normal[0..2] and is not normalised here.
+     * The `ray` parameter is not referenced in this function.
+     */
+    sse_t phit[3];
+    #pragma unroll(3)
+    for(int axis=0; axis<3; axis++)
+        phit[axis] = add4(srp.orig[axis][smd], mul4(srp.dir[axis][smd],
srp.minT[smd]));
+
+    int axis, U, V;
+
+    //axis=0
+    axis=0;
+    U=1;
+    V=2;
+    sse_t max_m_hit_U = sub4(set4(pmax[U]), phit[U]); // pmax - hit along U
+    sse_t max_m_hit_V = sub4(set4(pmax[V]), phit[V]); // pmax - hit along V
+    sse_t min_m_hit_U = sub4(set4(pmin[U]), phit[U]); // pmin - hit along U
+    sse_t min_m_hit_V = sub4(set4(pmin[V]), phit[V]); // pmin - hit along V
+    normal[axis] = mul4(mul4(max_m_hit_U, max_m_hit_V), set4(rho[1][0][0] -
rho[0][0][0]));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, max_m_hit_V),
set4(rho[0][1][0] - rho[1][1][0])));
+    normal[axis] = add4(normal[axis], mul4(mul4(max_m_hit_U, min_m_hit_V),
set4(rho[0][0][1] - rho[1][0][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, min_m_hit_V),
set4(rho[1][1][1] - rho[0][1][1])));
+
+    //axis=1
+    axis=1;
+    U=0;
+    V=2;
+    max_m_hit_U = sub4(set4(pmax[U]), phit[U]);
+    max_m_hit_V = sub4(set4(pmax[V]), phit[V]);
+    min_m_hit_U = sub4(set4(pmin[U]), phit[U]);
+    min_m_hit_V = sub4(set4(pmin[V]), phit[V]);
+    normal[axis] = mul4(mul4(max_m_hit_U, max_m_hit_V), set4(rho[0][1][0] -
rho[0][0][0]));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, max_m_hit_V),
set4(rho[1][0][0] - rho[1][1][0])));
+    normal[axis] = add4(normal[axis], mul4(mul4(max_m_hit_U, min_m_hit_V),
set4(rho[0][0][1] - rho[0][1][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, min_m_hit_V),
set4(rho[1][1][1] - rho[1][0][1])));
+
+    //axis=2
+    axis=2;
+    U=0;
+    V=1;
+    max_m_hit_U = sub4(set4(pmax[U]), phit[U]);
+    max_m_hit_V = sub4(set4(pmax[V]), phit[V]);
+    min_m_hit_U = sub4(set4(pmin[U]), phit[U]);
+    min_m_hit_V = sub4(set4(pmin[V]), phit[V]);
+    normal[axis] = mul4(mul4(max_m_hit_U, max_m_hit_V), set4(rho[0][0][1] -
rho[0][0][0]));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, max_m_hit_V),
set4(rho[1][0][0] - rho[1][0][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(max_m_hit_U, min_m_hit_V),
set4(rho[0][1][0] - rho[0][1][1])));
+    normal[axis] = add4(normal[axis], mul4(mul4(min_m_hit_U, min_m_hit_V),
set4(rho[1][1][1] - rho[1][1][0])));
+
+}
+
+#endif

Modified: trunk/Model/Intersections/IsosurfaceImplicit.h
==============================================================================
--- trunk/Model/Intersections/IsosurfaceImplicit.h      (original)
+++ trunk/Model/Intersections/IsosurfaceImplicit.h      Wed Jun  7 09:06:10
2006
@@ -4,6 +4,13 @@
#include <Interface/RayPacket.h>
#include <Core/Geometry/Vector.h>
#include <Interface/Material.h>
+#include <Interface/Primitive.h>
+
+#include <MantaSSE.h>
+
+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif

namespace Manta
{
@@ -15,6 +22,119 @@

         static void single_normal(Vector& outNormal, const Vector& pmin,
                       const Vector& pmax, const Vector& p, float
rho[2][2][2]);
+
+        //TODO - non-SSE packet intersection
+
+#ifdef MANTA_SSE
+        static void sse_intersect(RayPacket& rays, SSERayPacket& srp,
+                    char first, char last, const Vector& pmin, const Vector&
pmax, float rho[2][2][2],
+                    float isovalue, sse_t tenter[], sse_t texit[], sse_t
hitmask[],
+                    const Manta::Primitive* prim, const Manta::Material*
matl);
+
+        static void sse_normal(RayPacket &ray, SSERayPacket& srp, int smd,
+                    sse_t normal[], const Vector& pmin, const Vector& pmax,
+                    const float rho[2][2][2]);
+
+        struct CubicPoly4
+        {   /* Four cubic polynomials a*t^3 + b*t^2 + c*t + d, one per SSE
+             * lane. generate() fills the coefficients so that the polynomial
+             * equals the trilinear interpolation of voxels_cell along the
+             * segment p0 + t*(p1 - p0), minus isovalue. eval() evaluates it.
+             */
+            MANTA_ALIGN(16) sse_t a, b, c, d; // per-lane coefficients, highest power first
+
+            inline void generate(sse_t p0[], sse_t p1[], const float
voxels_cell[2][2][2], float isovalue)
+            {   /* Naming of the interim products: the three letters are the
+                 * x, y and z factors, where
+                 *   R = d1[axis]  (p1 - p0, the segment delta),
+                 *   N = e1[axis]  (p0, weight of the corner with index 1),
+                 *   O = e0[axis]  (1 - p0, weight of the corner with index 0).
+                 * interimROO is also reused as a scratch value for the
+                 * two-factor partial products; only its final assignment is
+                 * the actual R*O*O product.
+                 * NOTE(review): e0 = 1 - p0 presumably means p0/p1 are given
+                 * in unit-cell coordinates; confirm against callers.
+                 */
+                sse_t e0[3];
+                sse_t e1[3];
+                sse_t d1[3];
+
+                #pragma unroll(3)
+                for(int axis=0; axis<3; axis++)
+                {
+                    e0[axis] = sub4(_mm_one, p0[axis]);
+                    e1[axis] = p0[axis];
+                    d1[axis] = sub4(p1[axis], p0[axis]);
+                }
+
+                sse_t interimROO = mul4(d1[1], d1[2]);
+                const sse_t interimRRR = mul4(d1[0], interimROO);
+                const sse_t interimNRR = mul4(e1[0], interimROO);
+                const sse_t interimORR = mul4(e0[0], interimROO);
+
+                interimROO = mul4(e1[1], d1[2]);
+                const sse_t interimRNR = mul4(d1[0], interimROO);
+                const sse_t interimNNR = mul4(e1[0], interimROO);
+                const sse_t interimONR = mul4(e0[0], interimROO);
+
+                interimROO = mul4(d1[1], e1[2]);
+                const sse_t interimRRN = mul4(d1[0], interimROO);
+                const sse_t interimNRN = mul4(e1[0], interimROO);
+                const sse_t interimORN = mul4(e0[0], interimROO);
+
+                interimROO = mul4(d1[1], e0[2]);
+                const sse_t interimRRO = mul4(d1[0], interimROO);
+                const sse_t interimNRO = mul4(e1[0], interimROO);
+                const sse_t interimORO = mul4(e0[0], interimROO);
+
+                interimROO = mul4(e0[1], d1[2]);
+                const sse_t interimROR = mul4(d1[0], interimROO);
+                const sse_t interimNOR = mul4(e1[0], interimROO);
+                const sse_t interimOOR = mul4(e0[0], interimROO);
+
+                interimROO = mul4(d1[0], e1[1]);
+                const sse_t interimRNN = mul4(interimROO, e1[2]);
+                const sse_t interimRNO = mul4(interimROO, e0[2]);
+
+                interimROO = mul4(d1[0], e0[1]);
+                const sse_t interimRON = mul4(interimROO, e1[2]);
+                interimROO = mul4(interimROO, e0[2]); // from here on interimROO really is d1[0]*e0[1]*e0[2]
+
+                a = mul4(interimRRR, _mm_set_ps1(+ voxels_cell[1][1][1]
+                        - voxels_cell[0][1][1]
+                                - voxels_cell[1][0][1]
+                                + voxels_cell[0][0][1]
+                                - voxels_cell[1][1][0]
+                                + voxels_cell[0][1][0]
+                                + voxels_cell[1][0][0]
+                                - voxels_cell[0][0][0])); // t^3 coefficient
+
+                b =
mul4(_mm_set_ps1(voxels_cell[1][1][1]),add4(add4(interimNRR,interimRNR),interimRRN)); // t^2 coefficient, accumulated corner by corner
+                b =
sub4(b,mul4(_mm_set_ps1(voxels_cell[1][1][0]),sub4(add4(interimNRR,interimRNR),interimRRO)));
+                b =
sub4(b,mul4(_mm_set_ps1(voxels_cell[1][0][1]),add4(sub4(interimNRR,interimROR),interimRRN)));
+                b =
add4(b,mul4(_mm_set_ps1(voxels_cell[1][0][0]),sub4(sub4(interimNRR,interimROR),interimRRO)));
+                b =
add4(b,mul4(_mm_set_ps1(voxels_cell[0][1][1]),sub4(sub4(interimORR,interimRNR),interimRRN)));
+                b =
sub4(b,mul4(_mm_set_ps1(voxels_cell[0][1][0]),add4(sub4(interimORR,interimRNR),interimRRO)));
+                b =
sub4(b,mul4(_mm_set_ps1(voxels_cell[0][0][1]),sub4(add4(interimORR,interimROR),interimRRN)));
+                b =
add4(b,mul4(_mm_set_ps1(voxels_cell[0][0][0]),add4(add4(interimORR,interimROR),interimRRO)));
+
+                c =
mul4(_mm_set_ps1(voxels_cell[1][1][1]),add4(add4(interimRNN,interimNRN),interimNNR)); // t^1 coefficient, accumulated corner by corner
+                c =
add4(c,mul4(_mm_set_ps1(voxels_cell[1][1][0]),sub4(add4(interimRNO,interimNRO),interimNNR)));
+                c =
add4(c,mul4(_mm_set_ps1(voxels_cell[1][0][1]),add4(sub4(interimRON,interimNRN),interimNOR)));
+                c =
add4(c,mul4(_mm_set_ps1(voxels_cell[1][0][0]),sub4(sub4(interimROO,interimNRO),interimNOR)));
+                c =
add4(c,mul4(_mm_set_ps1(voxels_cell[0][1][1]),sub4(add4(interimORN,interimONR),interimRNN)));
+                c =
sub4(c,mul4(_mm_set_ps1(voxels_cell[0][1][0]),add4(sub4(interimRNO,interimORO),interimONR)));
+                c =
sub4(c,mul4(_mm_set_ps1(voxels_cell[0][0][1]),sub4(add4(interimRON,interimORN),interimOOR)));
+                c =
sub4(c,mul4(_mm_set_ps1(voxels_cell[0][0][0]),add4(add4(interimROO,interimORO),interimOOR)));
+
+                d = add4(mul4(e1[0], add4(mul4(e1[1], add4(mul4(e1[2],
+                               set4(voxels_cell[1][1][1])),
+                               mul4(e0[2], set4(voxels_cell[1][1][0])))),
+                               mul4(e0[1], add4(mul4(e1[2],
set4(voxels_cell[1][0][1])),
+                                   mul4(e0[2],
set4(voxels_cell[1][0][0])))))),
+                mul4(e0[0], add4(mul4(e1[1], add4(mul4(e1[2],
+                    set4(voxels_cell[0][1][1])),
+                    mul4(e0[2], set4(voxels_cell[0][1][0])))),
+                    mul4(e0[1], add4(mul4(e1[2], set4(voxels_cell[0][0][1])),
+                        mul4(e0[2], set4(voxels_cell[0][0][0]))))))); // constant term: trilinear value at p0

+
+                d = sub4(d, set4(isovalue)); // shift so that roots are isovalue crossings
+            }
+
+            inline sse_t eval(const sse_t &t) const
+            {
+                return add4(mul4(add4(mul4(add4(mul4(a,t),b),t),c),t),d); // Horner form: ((a*t + b)*t + c)*t + d
+            }
+        };
+
+
+#endif
     };
};

Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.cc
==============================================================================
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.cc    (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.cc    Wed Jun  7 09:06:10
2006
@@ -7,6 +7,10 @@
#include <Interface/RayPacket.h>
#include <Model/Intersections/IsosurfaceImplicit.h>

+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif
+
#define USE_OCTREE_DATA

#define MIN4(a,b,c,d) min(min(a,b), min(c,d));
@@ -109,8 +113,12 @@

void IsosurfaceOctreeVolume::intersect(RenderContext const &context,
RayPacket &packet) const
{
+    // Intersect every ray of the packet with the isosurface.
+    // The packet traversal is only compiled when MANTA_SSE is defined
+    // (packet_intersect_implicit_bvh lives inside #ifdef MANTA_SSE), so it
+    // is selected on that macro; other builds fall back to the per-ray
+    // path instead of failing to build.
+#ifdef MANTA_SSE
+       packet_intersect_implicit_bvh(packet);
+#else
     for ( int i = packet.rayBegin; i < packet.rayEnd; i++ )
         single_intersect(packet, i);
+#endif
}

void IsosurfaceOctreeVolume::single_intersect(RayPacket& rays, int
which_one) const
@@ -305,7 +313,6 @@
#endif
             if (node.offsets[target_child]==-1)
             {
-                return false;
                 if (single_traverse_leaf(rays, which_one, orig, dir,
inv_dir, stop_depth,
                     next_depth, depth, node.values[target_child],
                     child_cell, index_trace, child_cell, child_tenter,
child_texit))
@@ -735,4 +742,271 @@
     return false;
}

+/*
+       Begin packet intersection code, for SSE packets only.
+*/
+#ifdef MANTA_SSE
+
+//an octree traversal based on implicit BVH
+void IsosurfaceOctreeVolume::packet_intersect_implicit_bvh(RayPacket& rays)
const
+{   /* Entry point of the SSE packet traversal.
+     *
+     * 1. Makes sure inverse directions and signs are computed.
+     * 2. Fills an SSERayPacket with sse_t* casts of the packet's origin,
+     *    direction, inverseDirection, normal and minT arrays.
+     * 3. Slab-tests every 4-ray group against the box [0, octdata->dims]
+     *    and records the range [first, last) of groups in which at least
+     *    one lane has tenter <= texit.
+     * 4. If that range is non-empty, starts bvh_octnode at the root
+     *    (cell 0,0,0, depth 0, index 0).
+     *
+     * NOTE(review): the group loop covers all SSE_MaxSize groups and does
+     * not consult rays.rayBegin/rayEnd.
+     * NOTE(review): the padded tnear/tfar values are computed but not read
+     * afterwards.
+     * NOTE(review): the box test only checks tenter <= texit; there is no
+     * check that the exit distance is in front of the ray origin.
+     */
+    rays.computeInverseDirections();
+    rays.computeSigns();
+    RayPacketData* data = rays.data;
+    SSERayPacket srp;
+
+    //intersect the global bounding box: find first, last
+    //  this will require a special-case AABB intersection
+
+    #pragma unroll(3)
+    for(int axis=0; axis<3; axis++)
+    {
+        srp.orig[axis] = (sse_t*)(data->origin[axis]);
+        srp.dir[axis] = (sse_t*)(data->direction[axis]);
+        srp.inv_dir[axis] = (sse_t*)(data->inverseDirection[axis]);
+        srp.normal[axis] = (sse_t*)(data->normal[axis]);
+    }
+    srp.minT = (sse_t*)(data->minT);
+
+    int first = RayPacket::SSE_MaxSize; // sentinel: no group has hit yet
+    int last = -1;
+    #pragma unroll(RayPacket::SSE_MaxSize)
+    for(int smd=0; smd<RayPacket::SSE_MaxSize; smd++)
+    {
+        sse_t dgt0[3];
+        sse_t tnear[3];
+        sse_t tfar[3];
+        sse_t tnear_unpadded[3];
+        sse_t tfar_unpadded[3];
+
+        #pragma unroll(3)
+        for(int axis=0; axis<3; axis++)
+        {
+            dgt0[axis] = cmp4_ge(srp.dir[axis][smd], zero4()); // lane set when direction >= 0 on this axis
+            sse_t t0 = mul4(sub4(zero4(), srp.orig[axis][smd]),
srp.inv_dir[axis][smd]); // distance to the plane at 0
+            sse_t t1 = mul4(sub4(set4(octdata->dims[axis]),
srp.orig[axis][smd]), srp.inv_dir[axis][smd]); // distance to the plane at dims
+            sse_t t1p = mul4(sub4(set4(octdata->padded_dims[axis]),
srp.orig[axis][smd]), srp.inv_dir[axis][smd]); // distance to the plane at padded_dims
+
+            tnear_unpadded[axis] = mask4(dgt0[axis], t0, t1);
+            tfar_unpadded[axis] = mask4(dgt0[axis], t1, t0);
+            tnear[axis] = mask4(dgt0[axis], t0, t1p);
+            tfar[axis] = mask4(dgt0[axis], t1p, t0);
+        }
+
+        sse_t tenter_unpadded = max4(max4(tnear_unpadded[0],
tnear_unpadded[1]), tnear_unpadded[2]);
+        sse_t texit_unpadded = min4(min4(tfar_unpadded[0],
tfar_unpadded[1]), tfar_unpadded[2]);
+
+        if (_mm_movemask_ps(_mm_cmple_ps(tenter_unpadded, texit_unpadded))
== 0)       //if none of them were valid
+            continue;
+
+        first = MIN(first, smd);
+        last = smd;
+    }
+    last++; // make the range half-open: [first, last)
+
+    if (first >= last)
+        return;
+
+    //cerr << "root node: first = " << (int)first << ", last = " <<
(int)last << endl;
+
+    unsigned int index_trace[octdata->get_max_depth() + 1]; // runtime-sized array, one slot per depth level
+    Vec3i cell(0,0,0);
+    bvh_octnode(rays, srp, first, last, cell, octdata->get_cap_depth(), 0,
0, index_trace);
+}
+
+bool IsosurfaceOctreeVolume::bvh_octnode(RayPacket& rays, SSERayPacket& srp,
char first, char last,
+            const Vec3i& cell, int stop_depth, int depth, unsigned int
index, unsigned int index_trace[]) const
+{   /* Visit the eight children of the interior node (depth, index).
+     *
+     * The visiting order on each axis comes from rays.getSign(0, axis):
+     * when (midplane - sign) is non-zero the child on the upper side of
+     * that axis is chosen (cell coordinate OR-ed with child_bit).
+     * NOTE(review): presumably getSign(0, axis) is the sign of ray 0 only,
+     * i.e. the whole packet is ordered by its first ray -- confirm.
+     *
+     * For each child, intersect_octant narrows [first, last) to
+     * [newfirst, newlast). The child is entered only if that range is
+     * non-empty and the isovalue lies within the child's [mins, maxs].
+     * A child with offset -1 goes to bvh_octleaf; otherwise it goes to
+     * bvh_octcap when depth equals the pre-cap depth, or recurses into
+     * bvh_octnode.
+     *
+     * Returns true as soon as any child call returns true, otherwise false.
+     * index_trace[depth] is set to index on entry.
+     */
+    //cerr << "octnode " << (int)depth << ", " << index << "; first=" <<
(int)first << ",last=" << (int)last << endl;
+    OctNode& node = octdata->get_node(depth, index);
+    Vec3i child_cell = cell;
+    int child_bit = octdata->get_child_bit_depth(depth);
+
+    index_trace[depth] = index;
+
+    //intersect all children in order
+    #pragma unroll(2)
+    for(int midplane_x=0; midplane_x!=2; midplane_x++)
+    {
+        int target_x;
+        if (midplane_x - rays.getSign(0,0))
+        {
+            target_x = 4;
+            child_cell.data[0] = cell.data[0] | child_bit;
+        }
+        else
+        {
+            target_x = 0;
+            child_cell.data[0] = cell.data[0];
+        }
+        #pragma unroll(2)
+        for(int midplane_y=0; midplane_y!=2; midplane_y++)
+        {
+            int target_xy;
+            if (midplane_y - rays.getSign(0,1))
+            {
+                target_xy = target_x | 2;
+                child_cell.data[1] = cell.data[1] | child_bit;
+            }
+            else
+            {
+                target_xy = target_x;
+                child_cell.data[1] = cell.data[1];
+            }
+            #pragma unroll(2)
+            for(int midplane_z=0; midplane_z!=2; midplane_z++)
+            {
+                int target_child; // 3-bit child index: x contributes 4, y contributes 2, z contributes 1
+                if (midplane_z - rays.getSign(0,2))
+                {
+                    target_child = target_xy | 1;
+                    child_cell.data[2] = cell.data[2] | child_bit;
+                }
+                else
+                {
+                    target_child = target_xy;
+                    child_cell.data[2] = cell.data[2];
+                }
+
+                char newfirst, newlast;
+                Vector pmin(child_cell.data[0], child_cell.data[1],
child_cell.data[2]);
+                Vector pmax(child_cell.data[0]+child_bit,
child_cell.data[1]+child_bit, child_cell.data[2]+child_bit); // child box spans child_bit units per axis
+                intersect_octant(srp, first, last, newfirst, newlast, pmin,
pmax);
+
+                //cerr << "newfirst=" << (int)newfirst << ", newlast=" <<
(int)newlast << endl;
+
+                if (newfirst < newlast && octdata->get_isovalue() >=
node.mins[target_child] && octdata->get_isovalue() <= node.maxs[target_child])
+                {
+                    if (node.offsets[target_child]==-1)
+                    {
+                        if (bvh_octleaf(rays, srp, newfirst, newlast,
child_cell, stop_depth, depth, node.values[target_child], index_trace))
+                            return true;
+                    }
+                    else
+                    {
+                        unsigned int child_idx = node.children_start +
node.offsets[target_child];
+                        if (depth == octdata->get_pre_cap_depth())     //cap
+                        {
+                            if (bvh_octcap(rays, srp, newfirst, newlast,
child_cell, stop_depth, depth+1, child_idx, index_trace))
+                                return true;
+                        }
+                        else
+                        {
+                            if (bvh_octnode(rays, srp, newfirst, newlast,
child_cell, stop_depth, depth+1, child_idx, index_trace))
+                                return true;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return false;
+}
+
+bool IsosurfaceOctreeVolume::bvh_octleaf(RayPacket& rays, SSERayPacket& srp,
char first, char last,
+            const Vec3i& cell, int stop_depth, int depth, ST value, unsigned
int index_trace[]) const
+{
+    // Leaf intersection is not implemented yet. bvh_octnode tests this
+    // function's result, so an empty body (falling off the end of a
+    // non-void function) is undefined behaviour. Return false so the
+    // traversal simply moves on to the next child.
+    return false;
+}
+
+bool IsosurfaceOctreeVolume::bvh_octcap(RayPacket& rays, SSERayPacket& srp,  // Cap-level step of the SSE packet traversal: visits the 8 unit child cells of one cap.
char first, char last,  // first/last: slot range of srp forwarded to intersect_octant
+            const Vec3i& cell, int stop_depth, int depth, unsigned int  // cell: base integer coordinate; each child ORs 1 into an axis to select the upper half
index, unsigned int index_trace[]) const  // index: cap passed to octdata->get_cap; stop_depth is not referenced directly in this body
+{  // Always returns false: the result of the sse_intersect call below, if any, is not inspected.
+    //cerr << "octcap " << index << ", first=" << (int)first << ",last=" <<
(int)last << endl;
+    OctCap& cap = octdata->get_cap(index);
+    Vec3i child_cell = cell;
+    index_trace[depth] = index;  // record which cap was visited at this depth
+
+    //intersect all children in order
+    #pragma unroll(2)
+    for(int midplane_x=0; midplane_x<2; midplane_x++)
+    {
+        int target_x;
+        if (midplane_x - rays.getSign(0,0))  // nonzero when the loop step differs from getSign(0,0), presumably the x sign of ray 0 - TODO confirm
+        {
+            target_x = 4;  // bit 2 of the child index marks the upper x half
+            child_cell.data[0] = cell.data[0] | 1;
+        }
+        else
+        {
+            target_x = 0;
+            child_cell.data[0] = cell.data[0];
+        }
+        #pragma unroll(2)
+        for(int midplane_y=0; midplane_y<2; midplane_y++)
+        {
+            int target_xy;
+            if (midplane_y - rays.getSign(0,1))  // same ordering test as above, for the y axis
+            {
+                target_xy = target_x | 2;  // bit 1 of the child index marks the upper y half
+                child_cell.data[1] = cell.data[1] | 1;
+            }
+            else
+            {
+                target_xy = target_x;
+                child_cell.data[1] = cell.data[1];
+            }
+            #pragma unroll(2)
+            for(int midplane_z=0; midplane_z<2; midplane_z++)
+            {
+                int target_child;
+                if (midplane_z - rays.getSign(0,2))  // same ordering test as above, for the z axis
+                {
+                    target_child = target_xy | 1;  // bit 0 of the child index marks the upper z half
+                    child_cell.data[2] = cell.data[2] | 1;
+                }
+                else
+                {
+                    target_child = target_xy;
+                    child_cell.data[2] = cell.data[2];
+                }
+
+                sse_t child_tenter[RayPacket::SSE_MaxSize];  // per-slot box entry values written by intersect_octant
+                sse_t child_texit[RayPacket::SSE_MaxSize];  // per-slot box exit values written by intersect_octant
+                sse_t hitmask[RayPacket::SSE_MaxSize];  // per-slot lane masks where entry <= exit
+                char newfirst, newlast;
+                Vector cmin(child_cell.data[0], child_cell.data[1],
child_cell.data[2]);
+                Vector cmax(child_cell.data[0]+1, child_cell.data[1]+1,  // child box spans one unit along each axis
child_cell.data[2]+1);
+                intersect_octant(srp, first, last, newfirst, newlast, cmin,
cmax, child_tenter, child_texit, hitmask);
+
+                if (newfirst >= newlast)  // no slot in the range hits this child box
+                    continue;
+
+#ifdef USE_OCTREE_DATA
+                float rho[2][2][2];
+                ST min_rho, max_rho, this_rho;
+                min_rho = max_rho = this_rho = cap.values[target_child];
+                rho[0][0][0] = static_cast<float>(this_rho);
+                int prev_depth = depth-1;
+                Vec3i offset(0,0,1);
+                octvol_fill_cell(cap, 1);  // NOTE(review): not defined in this chunk; presumably a macro that fills rho, min_rho and max_rho through the locals declared above - confirm
+#else
+                //use original grid data: gather the 8 corner samples of the child cell and track their min and max
+                float rho[2][2][2];
+                ST min_rho, max_rho;
+#define MYDATA octdata->indata
+                min_rho = max_rho = lookup_safe(MYDATA, child_cell.data[0],
child_cell.data[1], child_cell.data[2]);
+                rho[0][0][0] = static_cast<float>(min_rho);
+                for(int c=1; c<8; c++)
+                {
+                    Vec3i offset((c&4)!=0, (c&2)!=0, c&1);  // decode c into a 0/1 offset per axis
+                    Vec3i neighboridx = child_cell + offset;
+                    ST this_rho = lookup_safe(MYDATA, neighboridx.data[0],
neighboridx.data[1], neighboridx.data[2]);
+                    rho[offset.data[0]][offset.data[1]][offset.data[2]] =
static_cast<float>(this_rho);
+                    min_rho = MIN(this_rho, min_rho);
+                    max_rho = MAX(this_rho, max_rho);
+                }
+#endif
+
+                if (octdata->get_isovalue() >= min_rho &&  // only intersect cells whose corner value range contains the isovalue
octdata->get_isovalue() <= max_rho)
+                {
+                    //cerr << "in cap " << (unsigned long)(&cap) << ",
octant " << target_child << endl;
+                    IsosurfaceImplicit::sse_intersect(rays, srp, newfirst,
newlast, cmin, cmax, rho,
+                        octdata->get_isovalue(), child_tenter, child_texit,
hitmask, this, PrimitiveCommon::getMaterial());
+                }
+            }
+        }
+    }
+    return false;
+}

+#endif  //#ifdef MANTA_SSE

Modified: trunk/Model/Primitives/IsosurfaceOctreeVolume.h
==============================================================================
--- trunk/Model/Primitives/IsosurfaceOctreeVolume.h     (original)
+++ trunk/Model/Primitives/IsosurfaceOctreeVolume.h     Wed Jun  7 09:06:10 2006
@@ -8,6 +8,14 @@
#include <Core/Color/Color.h>
#include <Interface/Texture.h>
#include <Model/Primitives/OctreeVolume.h>
+#include <Interface/RayPacket.h>
+
+#include <MantaSSE.h>
+
+#ifdef MANTA_SSE
+#include <Core/Math/SSEDefs.h>
+#endif
+

namespace Manta
{
@@ -44,6 +52,94 @@
                           const Vector& orig, const Vector& dir, const
Vector& inv_dir, int res,
                           int depth, unsigned int cap_index, unsigned int
index_trace[], Vec3i& cell, const float tenter,
                           const float texit) const;
+
+#ifdef MANTA_SSE
+                       void packet_intersect_implicit_bvh(RayPacket& rays)
const;
+
+            bool bvh_octnode(RayPacket& rays, SSERayPacket& srp, char first,
char last,
+                            const Vec3i& cell, int stop_depth, int depth,
unsigned int index,
+                            unsigned int index_trace[]) const;
+
+            bool bvh_octleaf(RayPacket& rays, SSERayPacket& srp, char first,
char last,
+                                        const Vec3i& cell, int stop_depth,
int depth, ST value,
+                                        unsigned int index_trace[]) const;
+
+            bool bvh_octcap(RayPacket& rays, SSERayPacket& srp, char first,
char last,
+                const Vec3i& cell, int stop_depth, int depth, unsigned int
index,
+                unsigned int index_trace[]) const;
+
+            /**
+             * Narrows the slot range [first, last) of an SSE ray packet to the
+             * slots in which at least one lane passes a slab test against the
+             * axis-aligned box [min, max].
+             *
+             * @param srp       packet whose orig, dir and inv_dir entries are read per slot
+             * @param first     first slot to test
+             * @param last      one past the last slot to test
+             * @param newfirst  out: first slot with a hit
+             * @param newlast   out: one past the last slot with a hit
+             * @param min       lower box corner
+             * @param max       upper box corner
+             *
+             * When no slot hits, including when the input range is empty, both
+             * outputs equal last, so newfirst >= newlast holds and callers that
+             * test that condition see an empty range.
+             */
+            inline void intersect_octant(SSERayPacket& srp, char first, char last,
+                char& newfirst, char& newlast, const Vector& min, const Vector& max) const
+            {
+                // Start from an empty result; a hit below widens it.
+                newfirst = last;
+                newlast = last;
+                for(char smd=first; smd<last; smd++)
+                {
+                    sse_t dgt0[3];
+                    sse_t tnear[3];
+                    sse_t tfar[3];
+
+                    #pragma unroll(3)
+                    for(int axis=0; axis<3; axis++)
+                    {
+                        dgt0[axis] = cmp4_ge(srp.dir[axis][smd], zero4());    //use signs?
+                        sse_t t0 = mul4(sub4(set4(min[axis]), srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        sse_t t1 = mul4(sub4(set4(max[axis]), srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        // Order the two slab distances per lane according to the direction mask.
+                        tnear[axis] = mask4(dgt0[axis], t0, t1);
+                        tfar[axis] = mask4(dgt0[axis], t1, t0);
+                    }
+
+                    sse_t tenter = max4(max4(tnear[0], tnear[1]), tnear[2]);
+                    sse_t texit = min4(min4(tfar[0], tfar[1]), tfar[2]);
+
+                    if (_mm_movemask_ps(cmp4_le(tenter, texit)) != 0)  //if any hit
+                    {
+                        newfirst = MIN(newfirst, smd);
+                        // Exclusive upper bound, updated only on a hit so an
+                        // empty or fully missed range is never reported as a hit.
+                        newlast = smd + 1;
+                    }
+                }
+            }
+
+            /**
+             * Same slab test as the range-only overload, but also stores the
+             * per-slot entry values, exit values and hit masks for the caller.
+             *
+             * @param srp       packet whose orig, dir and inv_dir entries are read per slot
+             * @param first     first slot to test
+             * @param last      one past the last slot to test
+             * @param newfirst  out: first slot with a hit
+             * @param newlast   out: one past the last slot with a hit
+             * @param min       lower box corner
+             * @param max       upper box corner
+             * @param tenter    out: entry value per slot, written only for slots in [first, last)
+             * @param texit     out: exit value per slot, written only for slots in [first, last)
+             * @param hitmask   out: lane mask per slot; zeroed for all RayPacket::SSE_MaxSize
+             *                  slots first, then set where tenter <= texit
+             *
+             * When no slot hits, including when the input range is empty, both
+             * range outputs equal last, so newfirst >= newlast holds.
+             */
+            inline void intersect_octant(SSERayPacket& srp, char first, char last,
+                    char& newfirst, char& newlast, const Vector& min,
+                    const Vector& max, sse_t tenter[], sse_t texit[], sse_t hitmask[]) const
+            {
+                // Clear every mask so slots outside [first, last) read as misses.
+                #pragma unroll(RayPacket::SSE_MaxSize)
+                for(int slot=0; slot<RayPacket::SSE_MaxSize; slot++)
+                    hitmask[slot] = zero4();
+
+                // Start from an empty result; a hit below widens it.
+                newfirst = last;
+                newlast = last;
+                for(char smd=first; smd<last; smd++)
+                {
+                    sse_t dgt0[3];
+                    sse_t tnear[3];
+                    sse_t tfar[3];
+
+                    #pragma unroll(3)
+                    for(int axis=0; axis<3; axis++)
+                    {
+                        dgt0[axis] = cmp4_ge(srp.dir[axis][smd], zero4());    //use signs?
+                        sse_t t0 = mul4(sub4(set4(min[axis]), srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        sse_t t1 = mul4(sub4(set4(max[axis]), srp.orig[axis][smd]), srp.inv_dir[axis][smd]);
+                        // Order the two slab distances per lane according to the direction mask.
+                        tnear[axis] = mask4(dgt0[axis], t0, t1);
+                        tfar[axis] = mask4(dgt0[axis], t1, t0);
+                    }
+
+                    tenter[smd] = max4(max4(tnear[0], tnear[1]), tnear[2]);
+                    texit[smd] = min4(min4(tfar[0], tfar[1]), tfar[2]);
+
+                    hitmask[smd] = cmp4_le(tenter[smd], texit[smd]);
+                    if (_mm_movemask_ps(hitmask[smd]) != 0)    //if any hit
+                    {
+                        newfirst = MIN(newfirst, smd);
+                        // Exclusive upper bound, updated only on a hit so an
+                        // empty or fully missed range is never reported as a hit.
+                        newlast = smd + 1;
+                    }
+                }
+            }
+#endif
     };
};

Modified: trunk/Model/Primitives/OctreeVolume.h
==============================================================================
--- trunk/Model/Primitives/OctreeVolume.h       (original)
+++ trunk/Model/Primitives/OctreeVolume.h       Wed Jun  7 09:06:10 2006
@@ -328,7 +328,8 @@
                 if (depth == pre_cap_depth)
                 {
                     index = node.children_start + node.offsets[target_child];
-                    int target_child = ((p.data[0] & 1) << 2) | ((p.data[1]
& 1) << 1) | (p.data[2] & 1);
+                    int target_child = ((p.data[0] & 1) <<
+                    2) | ((p.data[1] & 1) << 1) | (p.data[2] & 1);
                     return
steps[current_timestep].caps[index].values[target_child];
                 }

[MANTA] r1099 - in trunk: Core Core/Math Interface Model/Groups Model/Intersections Model/Primitives, knolla, 06/07/2006

Archive powered by MHonArc 2.6.16.