Manta Interactive Ray Tracer Development Mailing List

Text archives Help


[MANTA] r1158 - in trunk: Core/Math Model/Primitives


Chronological Thread 
  • From: bigler@sci.utah.edu
  • To: manta@sci.utah.edu
  • Subject: [MANTA] r1158 - in trunk: Core/Math Model/Primitives
  • Date: Thu, 27 Jul 2006 15:03:01 -0600 (MDT)

Author: bigler
Date: Thu Jul 27 15:03:01 2006
New Revision: 1158

Modified:
   trunk/Core/Math/SSEDefs.h
   trunk/Model/Primitives/Sphere.cc
Log:

Core/Math/SSEDefs.h

  Added a few functions: xor4, set4l, unpacklo, unpackhi, maskmove4i,
  load44, load44i, store44, store44i, cast_i2f, cast_f2i, and sqrt4.

  Removed MIN and MAX macros.  One should really use the SCIRun
  versions found in SCIRun/Core/Math/MinMax.h.

  Switched the low and high variable names for _mm_set1_epi64x.  I
  think this is what it is supposed to be.

  Replaced min4f, max4f, min3f, and max3f with faster versions that
  don't have branches.

  simd_cerr should return void.

Model/Primitives/Sphere.cc

  Mainly cosmetic.  Replaced all the SSE function names with
  corresponding abbreviations found in SSEDefs.h.

  Fixed indentation.


Modified: trunk/Core/Math/SSEDefs.h
==============================================================================
--- trunk/Core/Math/SSEDefs.h   (original)
+++ trunk/Core/Math/SSEDefs.h   Thu Jul 27 15:03:01 2006
@@ -23,6 +23,7 @@
 #define and4i _mm_and_si128
 #define andnot4 _mm_andnot_ps
 #define andnot4i _mm_andnot_si128
+#define xor4 _mm_xor_ps
 #define mul4 _mm_mul_ps
 #define add4 _mm_add_ps
 #define sub4 _mm_sub_ps
@@ -32,21 +33,24 @@
 #define set44 _mm_set_ps
 #define set4i _mm_set1_epi32
 #define set44i _mm_set_epi32
+#define set4l _mm_set1_epi64x // See comments below
+#define unpacklo _mm_unpacklo_ps // (a,b) => [a0, b0, a1, b1]
+#define unpackhi _mm_unpackhi_ps // (a,b) => [a2, b2, a3, b3]
 #define zero4 _mm_setzero_ps
 #define getmask4 _mm_movemask_ps
+#define maskmove4i _mm_maskmoveu_si128 // destination need not be aligned
 #define cmp4_ge _mm_cmpge_ps
 #define cmp4_le _mm_cmple_ps
 #define cmp4_gt _mm_cmpgt_ps
 #define cmp4_lt _mm_cmplt_ps
 #define cmp4_eq _mm_cmpeq_ps
-
-//AARONBAD - this should really be somewhere in Core/Math
-#ifndef MIN
-#define MIN(a,b) (((a)<(b))?(a):(b))
-#endif
-#ifndef MAX
-#define MAX(a,b) (((a)>(b))?(a):(b))
-#endif
+#define load44 _mm_load_ps
+#define load44i _mm_load_si128
+#define store44 _mm_store_ps
+#define store44i _mm_store_si128
+#define cast_i2f _mm_castsi128_ps
+#define cast_f2i _mm_castps_si128
+#define sqrt4 _mm_sqrt_ps
 
 namespace Manta
 {
@@ -64,12 +68,18 @@
 
 #if defined(__x86_64) && defined(__INTEL_COMPILER)
 
-    static inline
-       __m128i _mm_set1_epi64x(long long val)
+  // Stores 1 64 bit int twice in 4 32 bit ints
+  //
+  // [high32, low32, high32, low32]
+  //
+  // I'm not sure about the high versus low, but the code does what
+  // GCC does (for example, when copying 64 bit pointers).
+  static inline
+  __m128i _mm_set1_epi64x(long long val)
        {
-           const int low  = (0xFFFFFFFF00000000L & val) >> 32 ;
-           const int high = (        0xFFFFFFFFL & val);
-           return _mm_set_epi32(low, high, low, high);
+           const int high = (0xFFFFFFFF00000000L & val) >> 32 ;
+           const int low  = (        0xFFFFFFFFL & val);
+           return _mm_set_epi32(high, low, high, low);
        }
     
 #endif
@@ -211,41 +221,76 @@
     inline float length(sse_t a)
     {
       const sse_t d = dot4(a,a);
-      const sse_t v = _mm_sqrt_ps(d);
+      const sse_t v = sqrt4(d);
       return (float &)v;
     }
 
+    // Get horizontal minimum of all components
     inline float min4f(sse_t t)
     {
-      MANTA_ALIGN(16)
-      float f[4];
-      _mm_store_ps(f,t);
-      return MIN(MIN(f[0],f[1]),MIN(f[2],f[3]));
+      // a = (t0, t0, t1, t1)
+      sse_t a = unpacklo(t,t);
+      // b = (t2, t2, t3, t3)
+      sse_t b = unpackhi(t,t);
+      // c = (min(t0,t2), min(t0, t2), min(t1, t3), min(t1, t3))
+      sse_t c = min4(a, b);
+      // The movehl will move the high 2 values to the low 2 values.
+      // This will allow us to compare min(t0,t2) with min(t1, t3).
+      sse_t min = _mm_min_ss(c, _mm_movehl_ps(c, c));
+      // Return the first value.
+      return *((float*)&min);
     }
 
+    // Get horizontal maximum of all components
     inline float max4f(sse_t t)
     {
-      MANTA_ALIGN(16)
-      float f[4];
-      _mm_store_ps(f,t);
-      return MAX(MAX(f[0],f[1]),MAX(f[2],f[3]));
+      // a = (t0, t0, t1, t1)
+      sse_t a = unpacklo(t,t);
+      // b = (t2, t2, t3, t3)
+      sse_t b = unpackhi(t,t);
+      // c = (max(t0,t2), max(t0, t2), max(t1, t3), max(t1, t3))
+      sse_t c = max4(a, b);
+      // The movehl will move the high 2 values to the low 2 values.
+      // This will allow us to compare max(t0,t2) with max(t1, t3).
+      sse_t max = _mm_max_ss(c, _mm_movehl_ps(c, c));
+      // Return the first value.
+      return *((float*)&max);
     }
 
+    // Get horizontal minimum of the first 3 components
     inline float min3f(sse_t t)
     {
-      MANTA_ALIGN(16)
-      float f[4];
-      _mm_store_ps(f,t);
-      return MIN(MIN(f[0],f[1]),f[2]);
+      // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
+      // but you need t1 to be in the 2 index in order to use movehl
+      // later.
+      sse_t a = unpacklo(t,t);
+      // b = (t2, t3, t2, t3)
+      sse_t b = _mm_movehl_ps(t,t);
+      // c = (min(t0,t2), min(t0, t3), min(t1, t2), min(t1, t3))
+      sse_t c = min4(a, b);
+      // The movehl will move the high 2 values to the low 2 values.
+      // This will allow us to compare t1 with min(t0, t2).
+      sse_t min = _mm_min_ss(c, _mm_movehl_ps(a, a));
+      // Return the first value.
+      return *((float*)&min);
     }
 
-    /*! get horizontal minimum of a whole 4-way simd */
+    // Get horizontal maximum of the first 3 components
     inline float max3f(sse_t t)
     {
-      MANTA_ALIGN(16)
-      float f[4];
-      _mm_store_ps(f,t);
-      return MAX(MAX(f[0],f[1]),f[2]);
+      // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
+      // but you need t1 to be in the 2 index in order to use movehl
+      // later.
+      sse_t a = unpacklo(t,t);
+      // b = (t2, t3, t2, t3)
+      sse_t b = _mm_movehl_ps(t,t);
+      // c = (max(t0,t2), max(t0, t3), max(t1, t2), max(t1, t3))
+      sse_t c = max4(a, b);
+      // The movehl will move the high 2 values to the low 2 values.
+      // This will allow us to compare t1 with max(t0, t2).
+      sse_t max = _mm_max_ss(c, _mm_movehl_ps(a, a));
+      // Return the first value.
+      return *((float*)&max);
     }
 
     inline float simd_component(sse_t t, int offset)
@@ -256,7 +301,7 @@
         return f[offset];
     }
     
-    inline float simd_cerr(sse_t t)
+    inline void simd_cerr(sse_t t)
     {  
         MANTA_ALIGN(16)
         float f[4];

Modified: trunk/Model/Primitives/Sphere.cc
==============================================================================
--- trunk/Model/Primitives/Sphere.cc    (original)
+++ trunk/Model/Primitives/Sphere.cc    Thu Jul 27 15:03:01 2006
@@ -13,7 +13,7 @@
 using namespace std;
 
 Sphere::Sphere(Material* material, const Vector& center, Real radius)
-: PrimitiveCommon(material, this), center(center), radius(radius)
+  : PrimitiveCommon(material, this), center(center), radius(radius)
 {
   inv_radius = 1/radius;
 }
@@ -103,97 +103,97 @@
         }
         RayPacketData* data = rays.data;
         for(;i<e;i+=4){
-          __m128 Ox = _mm_sub_ps(_mm_load_ps(&data->origin[0][i]), 
_mm_set1_ps(center[0]));
-          __m128 Oy = _mm_sub_ps(_mm_load_ps(&data->origin[1][i]), 
_mm_set1_ps(center[1]));
-          __m128 Oz = _mm_sub_ps(_mm_load_ps(&data->origin[2][i]), 
_mm_set1_ps(center[2]));
-          __m128 Dx = _mm_load_ps(&data->direction[0][i]);
-          __m128 Dy = _mm_load_ps(&data->direction[1][i]);
-          __m128 Dz = _mm_load_ps(&data->direction[2][i]);
-          __m128 B = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Ox, Dx), 
_mm_mul_ps(Oy, Dy)), _mm_mul_ps(Oz, Dz));
-          __m128 disc = _mm_sub_ps(_mm_mul_ps(B, B), _mm_set1_ps(C));
-          __m128 hit = _mm_cmpge_ps(disc, _mm_setzero_ps());
-          if(_mm_movemask_ps(hit) == 0)
+          sse_t Ox = sub4( load44(&data->origin[0][i]), set4(center[0]));
+          sse_t Oy = sub4( load44(&data->origin[1][i]), set4(center[1]));
+          sse_t Oz = sub4( load44(&data->origin[2][i]), set4(center[2]));
+          sse_t Dx = load44(&data->direction[0][i]);
+          sse_t Dy = load44(&data->direction[1][i]);
+          sse_t Dz = load44(&data->direction[2][i]);
+          sse_t B = add4(add4(mul4(Ox, Dx), mul4(Oy, Dy)), mul4(Oz, Dz));
+          sse_t disc = sub4(mul4(B, B), set4(C));
+          sse_t hit = cmp4_ge(disc, zero4());
+          if(getmask4(hit) == 0)
             continue;
 
-          __m128 r = _mm_sqrt_ps(disc);
+          sse_t r = sqrt4(disc);
           // -(r+B)   The xor negates the value
-          __m128 t0 = _mm_xor_ps(_mm_add_ps(r, B), 
(__m128)_mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-          __m128 hit0 = _mm_and_ps(hit, _mm_cmpgt_ps(t0, 
_mm_set1_ps(T_EPSILON)));
-          if(_mm_movemask_ps(hit0) != 0){
-            hit0 = _mm_and_ps(hit, _mm_cmplt_ps(t0, 
_mm_load_ps(&data->minT[i])));
-           if(_mm_movemask_ps(hit0) == 15){
-             _mm_store_ps(&data->minT[i], t0);
+          sse_t t0 = xor4(add4(r, B), cast_i2f(set4i(0x80000000)));
+          sse_t hit0 = and4(hit, cmp4_gt(t0, set4(T_EPSILON)));
+          if(getmask4(hit0) != 0){
+            hit0 = and4(hit, cmp4_lt(t0, load44(&data->minT[i])));
+            if(getmask4(hit0) == 15){
+              store44(&data->minT[i], t0);
 #ifdef __x86_64
-             _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi64x((long long)getMaterial()));
-             _mm_store_si128((__m128i*)&data->hitMatl[i+2], 
_mm_set1_epi64x((long long)getMaterial()));
-             _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi64x((long long)this));
-             _mm_store_si128((__m128i*)&data->hitPrim[i+2], 
_mm_set1_epi64x((long long)this));
-             _mm_store_si128((__m128i*)&data->hitTex[i], 
_mm_set1_epi64x((long long)getTexCoordMapper()));
-             _mm_store_si128((__m128i*)&data->hitTex[i+2], 
_mm_set1_epi64x((long long)getTexCoordMapper()));
+              store44i((sse_int_t*)&data->hitMatl[i], set4l((long 
long)getMaterial()));
+              store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long 
long)getMaterial()));
+              store44i((sse_int_t*)&data->hitPrim[i], set4l((long 
long)this));
+              store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long 
long)this));
+              store44i((sse_int_t*)&data->hitTex[i], set4l((long 
long)getTexCoordMapper()));
+              store44i((sse_int_t*)&data->hitTex[i+2], set4l((long 
long)getTexCoordMapper()));
 #else
-             _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi32((int)getMaterial()));
-             _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi32((int)this));
-             _mm_store_si128((__m128i*)&data->hitTex[i], 
_mm_set1_epi32((int)getTexCoordMapper()));
+              store44i((sse_int_t*)&data->hitMatl[i], 
set4i((int)getMaterial()));
+              store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+              store44i((sse_int_t*)&data->hitTex[i], 
set4i((int)getTexCoordMapper()));
 #endif
-           } else {
-             _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t0), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->minT[i]);
+            } else {
+              maskmove4i(cast_f2i(t0), cast_f2i(hit0), 
(char*)&data->minT[i]);
 #ifdef __x86_64
-             __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit0, 
hit0));
-             __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit0, 
hit0));
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
lohit, (char*)&data->hitMatl[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
hihit, (char*)&data->hitMatl[i+2]);
-         
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit, 
(char*)&data->hitPrim[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+              sse_int_t lohit = cast_f2i(unpacklo(hit0, hit0));
+              sse_int_t hihit = cast_f2i(unpackhi(hit0, hit0));
+              maskmove4i(set4l((long long)getMaterial()), lohit, 
(char*)&data->hitMatl[i]);
+              maskmove4i(set4l((long long)getMaterial()), hihit, 
(char*)&data->hitMatl[i+2]);
+
+              maskmove4i(set4l((long long)this), lohit, 
(char*)&data->hitPrim[i]);
+              maskmove4i(set4l((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
+              maskmove4i(set4l((long long)getTexCoordMapper()), lohit, 
(char*)&data->hitTex[i]);
+              maskmove4i(set4l((long long)getTexCoordMapper()), hihit, 
(char*)&data->hitTex[i+2]);
 #else
-             _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitMatl[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi32((int)this), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitPrim[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitTex[i]);
+              maskmove4i(set4i((int)getMaterial()), cast_f2i(hit0), 
(char*)&data->hitMatl[i]);
+              maskmove4i(set4i((int)this), cast_f2i(hit0), 
(char*)&data->hitPrim[i]);
+              maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit0), 
(char*)&data->hitTex[i]);
 #endif
-           }
+            }
             // Mask off rays that successfully hit at t0
             hit = _mm_andnot_ps(hit, hit0);
-            if(_mm_movemask_ps(hit) == 0)
+            if(getmask4(hit) == 0)
               continue;
           }
 
-          __m128 t1 = _mm_sub_ps(r, B);
-          __m128 hit1 = _mm_and_ps(hit, _mm_cmpgt_ps(t1, 
_mm_set1_ps(T_EPSILON)));
-          hit1 = _mm_and_ps(hit1, _mm_cmplt_ps(t1, 
_mm_load_ps(&data->minT[i])));
-         if(_mm_movemask_ps(hit1) == 15){
-           _mm_store_ps(&data->minT[i], t1);
+          sse_t t1 = sub4(r, B);
+          sse_t hit1 = and4(hit, cmp4_gt(t1, set4(T_EPSILON)));
+          hit1 = and4(hit1, cmp4_lt(t1, load44(&data->minT[i])));
+          if(getmask4(hit1) == 15){
+            store44(&data->minT[i], t1);
 #ifdef __x86_64
-           _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi64x((long long)getMaterial()));
-           _mm_store_si128((__m128i*)&data->hitMatl[i+2], 
_mm_set1_epi64x((long long)getMaterial()));
-           _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi64x((long long)this));
-           _mm_store_si128((__m128i*)&data->hitPrim[i+2], 
_mm_set1_epi64x((long long)this));
-           _mm_store_si128((__m128i*)&data->hitTex[i], _mm_set1_epi64x((long 
long)getTexCoordMapper()));
-           _mm_store_si128((__m128i*)&data->hitTex[i+2], 
_mm_set1_epi64x((long long)getTexCoordMapper()));
+            store44i((sse_int_t*)&data->hitMatl[i], set4l((long 
long)getMaterial()));
+            store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long 
long)getMaterial()));
+            store44i((sse_int_t*)&data->hitPrim[i], set4l((long long)this));
+            store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long 
long)this));
+            store44i((sse_int_t*)&data->hitTex[i], set4l((long 
long)getTexCoordMapper()));
+            store44i((sse_int_t*)&data->hitTex[i+2], set4l((long 
long)getTexCoordMapper()));
 #else
-           _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi32((int)getMaterial()));
-           _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi32((int)this));
-           _mm_store_si128((__m128i*)&data->hitTex[i], 
_mm_set1_epi32((int)getTexCoordMapper()));
+            store44i((sse_int_t*)&data->hitMatl[i], 
set4i((int)getMaterial()));
+            store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+            store44i((sse_int_t*)&data->hitTex[i], 
set4i((int)getTexCoordMapper()));
 #endif
-         } else {
-           _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t1), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->minT[i]);
+          } else {
+            maskmove4i(cast_f2i(t1), cast_f2i(hit1), (char*)&data->minT[i]);
 #ifdef __x86_64
-           __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit1, 
hit1));
-           __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit1, 
hit1));
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
lohit, (char*)&data->hitMatl[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
hihit, (char*)&data->hitMatl[i+2]);
-         
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit, 
(char*)&data->hitPrim[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+            sse_int_t lohit = cast_f2i(unpacklo(hit1, hit1));
+            sse_int_t hihit = cast_f2i(unpackhi(hit1, hit1));
+            maskmove4i(set4l((long long)getMaterial()), lohit, 
(char*)&data->hitMatl[i]);
+            maskmove4i(set4l((long long)getMaterial()), hihit, 
(char*)&data->hitMatl[i+2]);
+
+            maskmove4i(set4l((long long)this), lohit, 
(char*)&data->hitPrim[i]);
+            maskmove4i(set4l((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
+            maskmove4i(set4l((long long)getTexCoordMapper()), lohit, 
(char*)&data->hitTex[i]);
+            maskmove4i(set4l((long long)getTexCoordMapper()), hihit, 
(char*)&data->hitTex[i+2]);
 #else
-           _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitMatl[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi32((int)this), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitPrim[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitTex[i]);
+            maskmove4i(set4i((int)getMaterial()), cast_f2i(hit1), 
(char*)&data->hitMatl[i]);
+            maskmove4i(set4i((int)this), cast_f2i(hit1), 
(char*)&data->hitPrim[i]);
+            maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit1), 
(char*)&data->hitTex[i]);
 #endif
-         }
+          }
         }
         for(;i<rays.rayEnd;i++){
           Vector D(rays.getDirection(i));
@@ -229,7 +229,7 @@
       }
 #endif
     }
-    break;
+  break;
   case RayPacket::ConstantOrigin:
     {
       // Rays of constant origin for not normalized directions
@@ -297,98 +297,98 @@
         }
         RayPacketData* data = rays.data;
         for(;i<e;i+=4){
-          __m128 Ox = _mm_sub_ps(_mm_load_ps(&data->origin[0][i]), 
_mm_set1_ps(center[0]));
-          __m128 Oy = _mm_sub_ps(_mm_load_ps(&data->origin[1][i]), 
_mm_set1_ps(center[1]));
-          __m128 Oz = _mm_sub_ps(_mm_load_ps(&data->origin[2][i]), 
_mm_set1_ps(center[2]));
-          __m128 Dx = _mm_load_ps(&data->direction[0][i]);
-          __m128 Dy = _mm_load_ps(&data->direction[1][i]);
-          __m128 Dz = _mm_load_ps(&data->direction[2][i]);
-          __m128 B = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Ox, Dx), 
_mm_mul_ps(Oy, Dy)), _mm_mul_ps(Oz, Dz));
-          __m128 C = _mm_sub_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(Ox, Ox), 
_mm_mul_ps(Oy, Oy)), _mm_mul_ps(Oz, Oz)), _mm_set1_ps(radius*radius));
-          __m128 disc = _mm_sub_ps(_mm_mul_ps(B, B), C);
-          __m128 hit = _mm_cmpge_ps(disc, _mm_setzero_ps());
-          if(_mm_movemask_ps(hit) == 0)
+          sse_t Ox = sub4(load44(&data->origin[0][i]), set4(center[0]));
+          sse_t Oy = sub4(load44(&data->origin[1][i]), set4(center[1]));
+          sse_t Oz = sub4(load44(&data->origin[2][i]), set4(center[2]));
+          sse_t Dx = load44(&data->direction[0][i]);
+          sse_t Dy = load44(&data->direction[1][i]);
+          sse_t Dz = load44(&data->direction[2][i]);
+          sse_t B = add4(add4(mul4(Ox, Dx), mul4(Oy, Dy)), mul4(Oz, Dz));
+          sse_t C = sub4(add4(add4(mul4(Ox, Ox), mul4(Oy, Oy)), mul4(Oz, 
Oz)), set4(radius*radius));
+          sse_t disc = sub4(mul4(B, B), C);
+          sse_t hit = cmp4_ge(disc, zero4());
+          if(getmask4(hit) == 0)
             continue;
 
-          __m128 r = _mm_sqrt_ps(disc);
+          sse_t r = _mm_sqrt_ps(disc);
           // -(r+B)   The xor negates the value
-          __m128 t0 = _mm_xor_ps(_mm_add_ps(r, B), 
(__m128)_mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-          __m128 hit0 = _mm_and_ps(hit, _mm_cmpgt_ps(t0, 
_mm_set1_ps(T_EPSILON)));
-          if(_mm_movemask_ps(hit0) != 0){
-            hit0 = _mm_and_ps(hit, _mm_cmplt_ps(t0, 
_mm_load_ps(&data->minT[i])));
-           if(_mm_movemask_ps(hit0) == 15){
-             _mm_store_ps(&data->minT[i], t0);
+          sse_t t0 = xor4(add4(r, B), cast_i2f(set4i(0x80000000)));
+          sse_t hit0 = and4(hit, cmp4_gt(t0, set4(T_EPSILON)));
+          if(getmask4(hit0) != 0){
+            hit0 = and4(hit, cmp4_lt(t0, load44(&data->minT[i])));
+            if(getmask4(hit0) == 15){
+              store44(&data->minT[i], t0);
 #ifdef __x86_64
-             _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi64x((long long)getMaterial()));
-             _mm_store_si128((__m128i*)&data->hitMatl[i+2], 
_mm_set1_epi64x((long long)getMaterial()));
-             _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi64x((long long)this));
-             _mm_store_si128((__m128i*)&data->hitPrim[i+2], 
_mm_set1_epi64x((long long)this));
-             _mm_store_si128((__m128i*)&data->hitTex[i], 
_mm_set1_epi64x((long long)getTexCoordMapper()));
-             _mm_store_si128((__m128i*)&data->hitTex[i+2], 
_mm_set1_epi64x((long long)getTexCoordMapper()));
+              store44i((sse_int_t*)&data->hitMatl[i], set4l((long 
long)getMaterial()));
+              store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long 
long)getMaterial()));
+              store44i((sse_int_t*)&data->hitPrim[i], set4l((long 
long)this));
+              store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long 
long)this));
+              store44i((sse_int_t*)&data->hitTex[i], set4l((long 
long)getTexCoordMapper()));
+              store44i((sse_int_t*)&data->hitTex[i+2], set4l((long 
long)getTexCoordMapper()));
 #else
-             _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi32((int)getMaterial()));
-             _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi32((int)this));
-             _mm_store_si128((__m128i*)&data->hitTex[i], 
_mm_set1_epi32((int)getTexCoordMapper()));
+              store44i((sse_int_t*)&data->hitMatl[i], 
set4i((int)getMaterial()));
+              store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+              store44i((sse_int_t*)&data->hitTex[i], 
set4i((int)getTexCoordMapper()));
 #endif
-           } else {
-             _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t0), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->minT[i]);
+            } else {
+              maskmove4i(cast_f2i(t0), cast_f2i(hit0), 
(char*)&data->minT[i]);
 #ifdef __x86_64
-             __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit0, 
hit0));
-             __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit0, 
hit0));
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
lohit, (char*)&data->hitMatl[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
hihit, (char*)&data->hitMatl[i+2]);
-         
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit, 
(char*)&data->hitPrim[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+              sse_int_t lohit = cast_f2i(unpacklo(hit0, hit0));
+              sse_int_t hihit = cast_f2i(unpackhi(hit0, hit0));
+              maskmove4i(set4l((long long)getMaterial()), lohit, 
(char*)&data->hitMatl[i]);
+              maskmove4i(set4l((long long)getMaterial()), hihit, 
(char*)&data->hitMatl[i+2]);
+
+              maskmove4i(set4l((long long)this), lohit, 
(char*)&data->hitPrim[i]);
+              maskmove4i(set4l((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
+              maskmove4i(set4l((long long)getTexCoordMapper()), lohit, 
(char*)&data->hitTex[i]);
+              maskmove4i(set4l((long long)getTexCoordMapper()), hihit, 
(char*)&data->hitTex[i+2]);
 #else
-             _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitMatl[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi32((int)this), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitPrim[i]);
-             _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()), 
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitTex[i]);
+              maskmove4i(set4i((int)getMaterial()), cast_f2i(hit0), 
(char*)&data->hitMatl[i]);
+              maskmove4i(set4i((int)this), cast_f2i(hit0), 
(char*)&data->hitPrim[i]);
+              maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit0), 
(char*)&data->hitTex[i]);
 #endif
-           }
+            }
             // Mask off rays that successfully hit at t0
             hit = _mm_andnot_ps(hit, hit0);
-            if(_mm_movemask_ps(hit) == 0)
+            if(getmask4(hit) == 0)
               continue;
           }
 
-          __m128 t1 = _mm_sub_ps(r, B);
-          __m128 hit1 = _mm_and_ps(hit, _mm_cmpgt_ps(t1, 
_mm_set1_ps(T_EPSILON)));
-          hit1 = _mm_and_ps(hit1, _mm_cmplt_ps(t1, 
_mm_load_ps(&data->minT[i])));
-         if(_mm_movemask_ps(hit1) == 15){
-           _mm_store_ps(&data->minT[i], t1);
+          sse_t t1 = sub4(r, B);
+          sse_t hit1 = and4(hit, cmp4_gt(t1, set4(T_EPSILON)));
+          hit1 = and4(hit1, cmp4_lt(t1, load44(&data->minT[i])));
+          if(getmask4(hit1) == 15){
+            store44(&data->minT[i], t1);
 #ifdef __x86_64
-           _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi64x((long long)getMaterial()));
-           _mm_store_si128((__m128i*)&data->hitMatl[i+2], 
_mm_set1_epi64x((long long)getMaterial()));
-           _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi64x((long long)this));
-           _mm_store_si128((__m128i*)&data->hitPrim[i+2], 
_mm_set1_epi64x((long long)this));
-           _mm_store_si128((__m128i*)&data->hitTex[i], _mm_set1_epi64x((long 
long)getTexCoordMapper()));
-           _mm_store_si128((__m128i*)&data->hitTex[i+2], 
_mm_set1_epi64x((long long)getTexCoordMapper()));
+            store44i((sse_int_t*)&data->hitMatl[i], set4l((long 
long)getMaterial()));
+            store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long 
long)getMaterial()));
+            store44i((sse_int_t*)&data->hitPrim[i], set4l((long long)this));
+            store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long 
long)this));
+            store44i((sse_int_t*)&data->hitTex[i], set4l((long 
long)getTexCoordMapper()));
+            store44i((sse_int_t*)&data->hitTex[i+2], set4l((long 
long)getTexCoordMapper()));
 #else
-           _mm_store_si128((__m128i*)&data->hitMatl[i], 
_mm_set1_epi32((int)getMaterial()));
-           _mm_store_si128((__m128i*)&data->hitPrim[i], 
_mm_set1_epi32((int)this));
-           _mm_store_si128((__m128i*)&data->hitTex[i], 
_mm_set1_epi32((int)getTexCoordMapper()));
+            store44i((sse_int_t*)&data->hitMatl[i], 
set4i((int)getMaterial()));
+            store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+            store44i((sse_int_t*)&data->hitTex[i], 
set4i((int)getTexCoordMapper()));
 #endif
-         } else {
-           _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t1), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->minT[i]);
+          } else {
+            maskmove4i(cast_f2i(t1), cast_f2i(hit1), (char*)&data->minT[i]);
 #ifdef __x86_64
-           __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit1, 
hit1));
-           __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit1, 
hit1));
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
lohit, (char*)&data->hitMatl[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()), 
hihit, (char*)&data->hitMatl[i+2]);
-         
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit, 
(char*)&data->hitPrim[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi64x((long 
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+            sse_int_t lohit = cast_f2i(unpacklo(hit1, hit1));
+            sse_int_t hihit = cast_f2i(unpackhi(hit1, hit1));
+            maskmove4i(set4l((long long)getMaterial()), lohit, 
(char*)&data->hitMatl[i]);
+            maskmove4i(set4l((long long)getMaterial()), hihit, 
(char*)&data->hitMatl[i+2]);
+
+            maskmove4i(set4l((long long)this), lohit, 
(char*)&data->hitPrim[i]);
+            maskmove4i(set4l((long long)this), hihit, 
(char*)&data->hitPrim[i+2]);
+            maskmove4i(set4l((long long)getTexCoordMapper()), lohit, 
(char*)&data->hitTex[i]);
+            maskmove4i(set4l((long long)getTexCoordMapper()), hihit, 
(char*)&data->hitTex[i+2]);
 #else
-           _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitMatl[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi32((int)this), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitPrim[i]);
-           _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()), 
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitTex[i]);
+            maskmove4i(set4i((int)getMaterial()), cast_f2i(hit1), 
(char*)&data->hitMatl[i]);
+            maskmove4i(set4i((int)this), cast_f2i(hit1), 
(char*)&data->hitPrim[i]);
+            maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit1), 
(char*)&data->hitTex[i]);
 #endif
-         }
+          }
         }
         for(;i<rays.rayEnd;i++){
           Vector O(rays.getOrigin(i)-center);




  • [MANTA] r1158 - in trunk: Core/Math Model/Primitives, bigler, 07/27/2006

Archive powered by MHonArc 2.6.16.

Top of page