Text archives Help
- From: bigler@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [MANTA] r1158 - in trunk: Core/Math Model/Primitives
- Date: Thu, 27 Jul 2006 15:03:01 -0600 (MDT)
Author: bigler
Date: Thu Jul 27 15:03:01 2006
New Revision: 1158
Modified:
trunk/Core/Math/SSEDefs.h
trunk/Model/Primitives/Sphere.cc
Log:
Core/Math/SSEDefs.h
Added a few functions: xor4, set4l, unpacklo, unpackhi, maskmove4i,
load44, load44i, store44, store44i, cast_i2f, cast_f2i, and sqrt4.
Removed MIN and MAX macros. One should really use the SCIRun
versions found in SCIRun/Core/Math/MinMax.h.
Switched the low and high variable names for _mm_set1_epi64x. I
think this is what it is supposed to be.
Replaced min4f, max4f, min3f, and max3f with faster versions that
don't have branches.
simd_cerr should return void.
Model/Primitives/Sphere.cc
Mainly cosmetic. Replaced all the SSE function names with
corresponding abbreviations found in SSEDefs.h.
Fixed indentation.
Modified: trunk/Core/Math/SSEDefs.h
==============================================================================
--- trunk/Core/Math/SSEDefs.h (original)
+++ trunk/Core/Math/SSEDefs.h Thu Jul 27 15:03:01 2006
@@ -23,6 +23,7 @@
#define and4i _mm_and_si128
#define andnot4 _mm_andnot_ps
#define andnot4i _mm_andnot_si128
+#define xor4 _mm_xor_ps
#define mul4 _mm_mul_ps
#define add4 _mm_add_ps
#define sub4 _mm_sub_ps
@@ -32,21 +33,24 @@
#define set44 _mm_set_ps
#define set4i _mm_set1_epi32
#define set44i _mm_set_epi32
+#define set4l _mm_set1_epi64x // See comments below
+#define unpacklo _mm_unpacklo_ps // (a,b) => [a0, b0, a1, b1]
+#define unpackhi _mm_unpackhi_ps // (a,b) => [a2, b2, a3, b3]
#define zero4 _mm_setzero_ps
#define getmask4 _mm_movemask_ps
+#define maskmove4i _mm_maskmoveu_si128 // destination need not be aligned
#define cmp4_ge _mm_cmpge_ps
#define cmp4_le _mm_cmple_ps
#define cmp4_gt _mm_cmpgt_ps
#define cmp4_lt _mm_cmplt_ps
#define cmp4_eq _mm_cmpeq_ps
-
-//AARONBAD - this should really be somewhere in Core/Math
-#ifndef MIN
-#define MIN(a,b) (((a)<(b))?(a):(b))
-#endif
-#ifndef MAX
-#define MAX(a,b) (((a)>(b))?(a):(b))
-#endif
+#define load44 _mm_load_ps
+#define load44i _mm_load_si128
+#define store44 _mm_store_ps
+#define store44i _mm_store_si128
+#define cast_i2f _mm_castsi128_ps
+#define cast_f2i _mm_castps_si128
+#define sqrt4 _mm_sqrt_ps
namespace Manta
{
@@ -64,12 +68,18 @@
#if defined(__x86_64) && defined(__INTEL_COMPILER)
- static inline
- __m128i _mm_set1_epi64x(long long val)
+ // Stores 1 64 bit int twice in 4 32 bit ints
+ //
+ // [high32, low32, high32, low32]
+ //
+ // I'm not sure about the high versus low, but the code does what
+ // GCC does (for example, when copying 64 bit pointers).
+ static inline
+ __m128i _mm_set1_epi64x(long long val)
{
- const int low = (0xFFFFFFFF00000000L & val) >> 32 ;
- const int high = ( 0xFFFFFFFFL & val);
- return _mm_set_epi32(low, high, low, high);
+ const int high = (0xFFFFFFFF00000000L & val) >> 32 ;
+ const int low = ( 0xFFFFFFFFL & val);
+ return _mm_set_epi32(high, low, high, low);
}
#endif
@@ -211,41 +221,76 @@
inline float length(sse_t a)
{
const sse_t d = dot4(a,a);
- const sse_t v = _mm_sqrt_ps(d);
+ const sse_t v = sqrt4(d);
return (float &)v;
}
+ // Get horizontal minimum of all components
inline float min4f(sse_t t)
{
- MANTA_ALIGN(16)
- float f[4];
- _mm_store_ps(f,t);
- return MIN(MIN(f[0],f[1]),MIN(f[2],f[3]));
+ // a = (t0, t0, t1, t1)
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t2, t3, t3)
+ sse_t b = unpackhi(t,t);
+ // c = (min(t0,t2), min(t0, t2), min(t1, t3), min(t1, t3))
+ sse_t c = min4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare min(t0,t2) with min(t1, t3).
+ sse_t min = _mm_min_ss(c, _mm_movehl_ps(c, c));
+ // Return the first value.
+ return *((float*)&min);
}
+ // Get horizontal maximum of all components
inline float max4f(sse_t t)
{
- MANTA_ALIGN(16)
- float f[4];
- _mm_store_ps(f,t);
- return MAX(MAX(f[0],f[1]),MAX(f[2],f[3]));
+ // a = (t0, t0, t1, t1)
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t2, t3, t3)
+ sse_t b = unpackhi(t,t);
+ // c = (max(t0,t2), max(t0, t2), max(t1, t3), max(t1, t3))
+ sse_t c = max4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare max(t0,t2) with max(t1, t3).
+ sse_t max = _mm_max_ss(c, _mm_movehl_ps(c, c));
+ // Return the first value.
+ return *((float*)&max);
}
+ // Get horizontal minimum of the first 3 components
inline float min3f(sse_t t)
{
- MANTA_ALIGN(16)
- float f[4];
- _mm_store_ps(f,t);
- return MIN(MIN(f[0],f[1]),f[2]);
+ // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
+ // but you need t1 to be in the 2 index in order to use movehl
+ // later.
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t3, t2, t3)
+ sse_t b = _mm_movehl_ps(t,t);
+ // c = (min(t0,t2), min(t0, t3), min(t1, t2), min(t1, t3))
+ sse_t c = min4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare t1 with min(t0, t2).
+ sse_t min = _mm_min_ss(c, _mm_movehl_ps(a, a));
+ // Return the first value.
+ return *((float*)&min);
}
- /*! get horizontal minimum of a whole 4-way simd */
+ // Get horizontal maximum of the first 3 components
inline float max3f(sse_t t)
{
- MANTA_ALIGN(16)
- float f[4];
- _mm_store_ps(f,t);
- return MAX(MAX(f[0],f[1]),f[2]);
+ // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
+ // but you need t1 to be in the 2 index in order to use movehl
+ // later.
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t3, t2, t3)
+ sse_t b = _mm_movehl_ps(t,t);
+ // c = (max(t0,t2), max(t0, t3), max(t1, t2), max(t1, t3))
+ sse_t c = max4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare t1 with max(t0, t2).
+ sse_t max = _mm_max_ss(c, _mm_movehl_ps(a, a));
+ // Return the first value.
+ return *((float*)&max);
}
inline float simd_component(sse_t t, int offset)
@@ -256,7 +301,7 @@
return f[offset];
}
- inline float simd_cerr(sse_t t)
+ inline void simd_cerr(sse_t t)
{
MANTA_ALIGN(16)
float f[4];
Modified: trunk/Model/Primitives/Sphere.cc
==============================================================================
--- trunk/Model/Primitives/Sphere.cc (original)
+++ trunk/Model/Primitives/Sphere.cc Thu Jul 27 15:03:01 2006
@@ -13,7 +13,7 @@
using namespace std;
Sphere::Sphere(Material* material, const Vector& center, Real radius)
-: PrimitiveCommon(material, this), center(center), radius(radius)
+ : PrimitiveCommon(material, this), center(center), radius(radius)
{
inv_radius = 1/radius;
}
@@ -103,97 +103,97 @@
}
RayPacketData* data = rays.data;
for(;i<e;i+=4){
- __m128 Ox = _mm_sub_ps(_mm_load_ps(&data->origin[0][i]),
_mm_set1_ps(center[0]));
- __m128 Oy = _mm_sub_ps(_mm_load_ps(&data->origin[1][i]),
_mm_set1_ps(center[1]));
- __m128 Oz = _mm_sub_ps(_mm_load_ps(&data->origin[2][i]),
_mm_set1_ps(center[2]));
- __m128 Dx = _mm_load_ps(&data->direction[0][i]);
- __m128 Dy = _mm_load_ps(&data->direction[1][i]);
- __m128 Dz = _mm_load_ps(&data->direction[2][i]);
- __m128 B = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Ox, Dx),
_mm_mul_ps(Oy, Dy)), _mm_mul_ps(Oz, Dz));
- __m128 disc = _mm_sub_ps(_mm_mul_ps(B, B), _mm_set1_ps(C));
- __m128 hit = _mm_cmpge_ps(disc, _mm_setzero_ps());
- if(_mm_movemask_ps(hit) == 0)
+ sse_t Ox = sub4( load44(&data->origin[0][i]), set4(center[0]));
+ sse_t Oy = sub4( load44(&data->origin[1][i]), set4(center[1]));
+ sse_t Oz = sub4( load44(&data->origin[2][i]), set4(center[2]));
+ sse_t Dx = load44(&data->direction[0][i]);
+ sse_t Dy = load44(&data->direction[1][i]);
+ sse_t Dz = load44(&data->direction[2][i]);
+ sse_t B = add4(add4(mul4(Ox, Dx), mul4(Oy, Dy)), mul4(Oz, Dz));
+ sse_t disc = sub4(mul4(B, B), set4(C));
+ sse_t hit = cmp4_ge(disc, zero4());
+ if(getmask4(hit) == 0)
continue;
- __m128 r = _mm_sqrt_ps(disc);
+ sse_t r = sqrt4(disc);
// -(r+B) The xor negates the value
- __m128 t0 = _mm_xor_ps(_mm_add_ps(r, B),
(__m128)_mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
- __m128 hit0 = _mm_and_ps(hit, _mm_cmpgt_ps(t0,
_mm_set1_ps(T_EPSILON)));
- if(_mm_movemask_ps(hit0) != 0){
- hit0 = _mm_and_ps(hit, _mm_cmplt_ps(t0,
_mm_load_ps(&data->minT[i])));
- if(_mm_movemask_ps(hit0) == 15){
- _mm_store_ps(&data->minT[i], t0);
+ sse_t t0 = xor4(add4(r, B), cast_i2f(set4i(0x80000000)));
+ sse_t hit0 = and4(hit, cmp4_gt(t0, set4(T_EPSILON)));
+ if(getmask4(hit0) != 0){
+ hit0 = and4(hit, cmp4_lt(t0, load44(&data->minT[i])));
+ if(getmask4(hit0) == 15){
+ store44(&data->minT[i], t0);
#ifdef __x86_64
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitMatl[i+2],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitPrim[i+2],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitTex[i],
_mm_set1_epi64x((long long)getTexCoordMapper()));
- _mm_store_si128((__m128i*)&data->hitTex[i+2],
_mm_set1_epi64x((long long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4l((long
long)this));
+ store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long
long)this));
+ store44i((sse_int_t*)&data->hitTex[i], set4l((long
long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitTex[i+2], set4l((long
long)getTexCoordMapper()));
#else
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi32((int)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi32((int)this));
- _mm_store_si128((__m128i*)&data->hitTex[i],
_mm_set1_epi32((int)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i],
set4i((int)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+ store44i((sse_int_t*)&data->hitTex[i],
set4i((int)getTexCoordMapper()));
#endif
- } else {
- _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t0),
(__m128i)_mm_castps_si128(hit0), (char*)&data->minT[i]);
+ } else {
+ maskmove4i(cast_f2i(t0), cast_f2i(hit0),
(char*)&data->minT[i]);
#ifdef __x86_64
- __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit0,
hit0));
- __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit0,
hit0));
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
lohit, (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
hihit, (char*)&data->hitMatl[i+2]);
-
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit,
(char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+ sse_int_t lohit = cast_f2i(unpacklo(hit0, hit0));
+ sse_int_t hihit = cast_f2i(unpackhi(hit0, hit0));
+ maskmove4i(set4l((long long)getMaterial()), lohit,
(char*)&data->hitMatl[i]);
+ maskmove4i(set4l((long long)getMaterial()), hihit,
(char*)&data->hitMatl[i+2]);
+
+ maskmove4i(set4l((long long)this), lohit,
(char*)&data->hitPrim[i]);
+ maskmove4i(set4l((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), lohit,
(char*)&data->hitTex[i]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), hihit,
(char*)&data->hitTex[i+2]);
#else
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()),
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)this),
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()),
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitTex[i]);
+ maskmove4i(set4i((int)getMaterial()), cast_f2i(hit0),
(char*)&data->hitMatl[i]);
+ maskmove4i(set4i((int)this), cast_f2i(hit0),
(char*)&data->hitPrim[i]);
+ maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit0),
(char*)&data->hitTex[i]);
#endif
- }
+ }
// Mask off rays that successfully hit at t0
hit = _mm_andnot_ps(hit, hit0);
- if(_mm_movemask_ps(hit) == 0)
+ if(getmask4(hit) == 0)
continue;
}
- __m128 t1 = _mm_sub_ps(r, B);
- __m128 hit1 = _mm_and_ps(hit, _mm_cmpgt_ps(t1,
_mm_set1_ps(T_EPSILON)));
- hit1 = _mm_and_ps(hit1, _mm_cmplt_ps(t1,
_mm_load_ps(&data->minT[i])));
- if(_mm_movemask_ps(hit1) == 15){
- _mm_store_ps(&data->minT[i], t1);
+ sse_t t1 = sub4(r, B);
+ sse_t hit1 = and4(hit, cmp4_gt(t1, set4(T_EPSILON)));
+ hit1 = and4(hit1, cmp4_lt(t1, load44(&data->minT[i])));
+ if(getmask4(hit1) == 15){
+ store44(&data->minT[i], t1);
#ifdef __x86_64
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitMatl[i+2],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitPrim[i+2],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitTex[i], _mm_set1_epi64x((long
long)getTexCoordMapper()));
- _mm_store_si128((__m128i*)&data->hitTex[i+2],
_mm_set1_epi64x((long long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4l((long long)this));
+ store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long
long)this));
+ store44i((sse_int_t*)&data->hitTex[i], set4l((long
long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitTex[i+2], set4l((long
long)getTexCoordMapper()));
#else
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi32((int)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi32((int)this));
- _mm_store_si128((__m128i*)&data->hitTex[i],
_mm_set1_epi32((int)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i],
set4i((int)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+ store44i((sse_int_t*)&data->hitTex[i],
set4i((int)getTexCoordMapper()));
#endif
- } else {
- _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t1),
(__m128i)_mm_castps_si128(hit1), (char*)&data->minT[i]);
+ } else {
+ maskmove4i(cast_f2i(t1), cast_f2i(hit1), (char*)&data->minT[i]);
#ifdef __x86_64
- __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit1,
hit1));
- __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit1,
hit1));
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
lohit, (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
hihit, (char*)&data->hitMatl[i+2]);
-
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit,
(char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+ sse_int_t lohit = cast_f2i(unpacklo(hit1, hit1));
+ sse_int_t hihit = cast_f2i(unpackhi(hit1, hit1));
+ maskmove4i(set4l((long long)getMaterial()), lohit,
(char*)&data->hitMatl[i]);
+ maskmove4i(set4l((long long)getMaterial()), hihit,
(char*)&data->hitMatl[i+2]);
+
+ maskmove4i(set4l((long long)this), lohit,
(char*)&data->hitPrim[i]);
+ maskmove4i(set4l((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), lohit,
(char*)&data->hitTex[i]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), hihit,
(char*)&data->hitTex[i+2]);
#else
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()),
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)this),
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()),
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitTex[i]);
+ maskmove4i(set4i((int)getMaterial()), cast_f2i(hit1),
(char*)&data->hitMatl[i]);
+ maskmove4i(set4i((int)this), cast_f2i(hit1),
(char*)&data->hitPrim[i]);
+ maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit1),
(char*)&data->hitTex[i]);
#endif
- }
+ }
}
for(;i<rays.rayEnd;i++){
Vector D(rays.getDirection(i));
@@ -229,7 +229,7 @@
}
#endif
}
- break;
+ break;
case RayPacket::ConstantOrigin:
{
// Rays of constant origin for not normalized directions
@@ -297,98 +297,98 @@
}
RayPacketData* data = rays.data;
for(;i<e;i+=4){
- __m128 Ox = _mm_sub_ps(_mm_load_ps(&data->origin[0][i]),
_mm_set1_ps(center[0]));
- __m128 Oy = _mm_sub_ps(_mm_load_ps(&data->origin[1][i]),
_mm_set1_ps(center[1]));
- __m128 Oz = _mm_sub_ps(_mm_load_ps(&data->origin[2][i]),
_mm_set1_ps(center[2]));
- __m128 Dx = _mm_load_ps(&data->direction[0][i]);
- __m128 Dy = _mm_load_ps(&data->direction[1][i]);
- __m128 Dz = _mm_load_ps(&data->direction[2][i]);
- __m128 B = _mm_add_ps(_mm_add_ps(_mm_mul_ps(Ox, Dx),
_mm_mul_ps(Oy, Dy)), _mm_mul_ps(Oz, Dz));
- __m128 C = _mm_sub_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(Ox, Ox),
_mm_mul_ps(Oy, Oy)), _mm_mul_ps(Oz, Oz)), _mm_set1_ps(radius*radius));
- __m128 disc = _mm_sub_ps(_mm_mul_ps(B, B), C);
- __m128 hit = _mm_cmpge_ps(disc, _mm_setzero_ps());
- if(_mm_movemask_ps(hit) == 0)
+ sse_t Ox = sub4(load44(&data->origin[0][i]), set4(center[0]));
+ sse_t Oy = sub4(load44(&data->origin[1][i]), set4(center[1]));
+ sse_t Oz = sub4(load44(&data->origin[2][i]), set4(center[2]));
+ sse_t Dx = load44(&data->direction[0][i]);
+ sse_t Dy = load44(&data->direction[1][i]);
+ sse_t Dz = load44(&data->direction[2][i]);
+ sse_t B = add4(add4(mul4(Ox, Dx), mul4(Oy, Dy)), mul4(Oz, Dz));
+ sse_t C = sub4(add4(add4(mul4(Ox, Ox), mul4(Oy, Oy)), mul4(Oz,
Oz)), set4(radius*radius));
+ sse_t disc = sub4(mul4(B, B), C);
+ sse_t hit = cmp4_ge(disc, zero4());
+ if(getmask4(hit) == 0)
continue;
- __m128 r = _mm_sqrt_ps(disc);
+ sse_t r = _mm_sqrt_ps(disc);
// -(r+B) The xor negates the value
- __m128 t0 = _mm_xor_ps(_mm_add_ps(r, B),
(__m128)_mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
- __m128 hit0 = _mm_and_ps(hit, _mm_cmpgt_ps(t0,
_mm_set1_ps(T_EPSILON)));
- if(_mm_movemask_ps(hit0) != 0){
- hit0 = _mm_and_ps(hit, _mm_cmplt_ps(t0,
_mm_load_ps(&data->minT[i])));
- if(_mm_movemask_ps(hit0) == 15){
- _mm_store_ps(&data->minT[i], t0);
+ sse_t t0 = xor4(add4(r, B), cast_i2f(set4i(0x80000000)));
+ sse_t hit0 = and4(hit, cmp4_gt(t0, set4(T_EPSILON)));
+ if(getmask4(hit0) != 0){
+ hit0 = and4(hit, cmp4_lt(t0, load44(&data->minT[i])));
+ if(getmask4(hit0) == 15){
+ store44(&data->minT[i], t0);
#ifdef __x86_64
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitMatl[i+2],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitPrim[i+2],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitTex[i],
_mm_set1_epi64x((long long)getTexCoordMapper()));
- _mm_store_si128((__m128i*)&data->hitTex[i+2],
_mm_set1_epi64x((long long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4l((long
long)this));
+ store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long
long)this));
+ store44i((sse_int_t*)&data->hitTex[i], set4l((long
long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitTex[i+2], set4l((long
long)getTexCoordMapper()));
#else
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi32((int)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi32((int)this));
- _mm_store_si128((__m128i*)&data->hitTex[i],
_mm_set1_epi32((int)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i],
set4i((int)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+ store44i((sse_int_t*)&data->hitTex[i],
set4i((int)getTexCoordMapper()));
#endif
- } else {
- _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t0),
(__m128i)_mm_castps_si128(hit0), (char*)&data->minT[i]);
+ } else {
+ maskmove4i(cast_f2i(t0), cast_f2i(hit0),
(char*)&data->minT[i]);
#ifdef __x86_64
- __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit0,
hit0));
- __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit0,
hit0));
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
lohit, (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
hihit, (char*)&data->hitMatl[i+2]);
-
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit,
(char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+ sse_int_t lohit = cast_f2i(unpacklo(hit0, hit0));
+ sse_int_t hihit = cast_f2i(unpackhi(hit0, hit0));
+ maskmove4i(set4l((long long)getMaterial()), lohit,
(char*)&data->hitMatl[i]);
+ maskmove4i(set4l((long long)getMaterial()), hihit,
(char*)&data->hitMatl[i+2]);
+
+ maskmove4i(set4l((long long)this), lohit,
(char*)&data->hitPrim[i]);
+ maskmove4i(set4l((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), lohit,
(char*)&data->hitTex[i]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), hihit,
(char*)&data->hitTex[i+2]);
#else
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()),
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)this),
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()),
(__m128i)_mm_castps_si128(hit0), (char*)&data->hitTex[i]);
+ maskmove4i(set4i((int)getMaterial()), cast_f2i(hit0),
(char*)&data->hitMatl[i]);
+ maskmove4i(set4i((int)this), cast_f2i(hit0),
(char*)&data->hitPrim[i]);
+ maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit0),
(char*)&data->hitTex[i]);
#endif
- }
+ }
// Mask off rays that successfully hit at t0
hit = _mm_andnot_ps(hit, hit0);
- if(_mm_movemask_ps(hit) == 0)
+ if(getmask4(hit) == 0)
continue;
}
- __m128 t1 = _mm_sub_ps(r, B);
- __m128 hit1 = _mm_and_ps(hit, _mm_cmpgt_ps(t1,
_mm_set1_ps(T_EPSILON)));
- hit1 = _mm_and_ps(hit1, _mm_cmplt_ps(t1,
_mm_load_ps(&data->minT[i])));
- if(_mm_movemask_ps(hit1) == 15){
- _mm_store_ps(&data->minT[i], t1);
+ sse_t t1 = sub4(r, B);
+ sse_t hit1 = and4(hit, cmp4_gt(t1, set4(T_EPSILON)));
+ hit1 = and4(hit1, cmp4_lt(t1, load44(&data->minT[i])));
+ if(getmask4(hit1) == 15){
+ store44(&data->minT[i], t1);
#ifdef __x86_64
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitMatl[i+2],
_mm_set1_epi64x((long long)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitPrim[i+2],
_mm_set1_epi64x((long long)this));
- _mm_store_si128((__m128i*)&data->hitTex[i], _mm_set1_epi64x((long
long)getTexCoordMapper()));
- _mm_store_si128((__m128i*)&data->hitTex[i+2],
_mm_set1_epi64x((long long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitMatl[i+2], set4l((long
long)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4l((long long)this));
+ store44i((sse_int_t*)&data->hitPrim[i+2], set4l((long
long)this));
+ store44i((sse_int_t*)&data->hitTex[i], set4l((long
long)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitTex[i+2], set4l((long
long)getTexCoordMapper()));
#else
- _mm_store_si128((__m128i*)&data->hitMatl[i],
_mm_set1_epi32((int)getMaterial()));
- _mm_store_si128((__m128i*)&data->hitPrim[i],
_mm_set1_epi32((int)this));
- _mm_store_si128((__m128i*)&data->hitTex[i],
_mm_set1_epi32((int)getTexCoordMapper()));
+ store44i((sse_int_t*)&data->hitMatl[i],
set4i((int)getMaterial()));
+ store44i((sse_int_t*)&data->hitPrim[i], set4i((int)this));
+ store44i((sse_int_t*)&data->hitTex[i],
set4i((int)getTexCoordMapper()));
#endif
- } else {
- _mm_maskmoveu_si128((__m128i)_mm_castps_si128(t1),
(__m128i)_mm_castps_si128(hit1), (char*)&data->minT[i]);
+ } else {
+ maskmove4i(cast_f2i(t1), cast_f2i(hit1), (char*)&data->minT[i]);
#ifdef __x86_64
- __m128i lohit = (__m128i)_mm_castps_si128(_mm_unpacklo_ps(hit1,
hit1));
- __m128i hihit = (__m128i)_mm_castps_si128(_mm_unpackhi_ps(hit1,
hit1));
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
lohit, (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)getMaterial()),
hihit, (char*)&data->hitMatl[i+2]);
-
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), lohit,
(char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), lohit, (char*)&data->hitTex[i]);
- _mm_maskmoveu_si128(_mm_set1_epi64x((long
long)getTexCoordMapper()), hihit, (char*)&data->hitTex[i+2]);
+ sse_int_t lohit = cast_f2i(unpacklo(hit1, hit1));
+ sse_int_t hihit = cast_f2i(unpackhi(hit1, hit1));
+ maskmove4i(set4l((long long)getMaterial()), lohit,
(char*)&data->hitMatl[i]);
+ maskmove4i(set4l((long long)getMaterial()), hihit,
(char*)&data->hitMatl[i+2]);
+
+ maskmove4i(set4l((long long)this), lohit,
(char*)&data->hitPrim[i]);
+ maskmove4i(set4l((long long)this), hihit,
(char*)&data->hitPrim[i+2]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), lohit,
(char*)&data->hitTex[i]);
+ maskmove4i(set4l((long long)getTexCoordMapper()), hihit,
(char*)&data->hitTex[i+2]);
#else
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getMaterial()),
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitMatl[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)this),
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitPrim[i]);
- _mm_maskmoveu_si128(_mm_set1_epi32((int)getTexCoordMapper()),
(__m128i)_mm_castps_si128(hit1), (char*)&data->hitTex[i]);
+ maskmove4i(set4i((int)getMaterial()), cast_f2i(hit1),
(char*)&data->hitMatl[i]);
+ maskmove4i(set4i((int)this), cast_f2i(hit1),
(char*)&data->hitPrim[i]);
+ maskmove4i(set4i((int)getTexCoordMapper()), cast_f2i(hit1),
(char*)&data->hitTex[i]);
#endif
- }
+ }
}
for(;i<rays.rayEnd;i++){
Vector O(rays.getOrigin(i)-center);
- [MANTA] r1158 - in trunk: Core/Math Model/Primitives, bigler, 07/27/2006
Archive powered by MHonArc 2.6.16.