- From: arobison@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [Manta] r1721 - in trunk: Core/Math Model/Cameras
- Date: Wed, 12 Sep 2007 01:40:20 -0600 (MDT)
Author: arobison
Date: Wed Sep 12 01:40:18 2007
New Revision: 1721
Modified:
trunk/Core/Math/SSEDefs.h
trunk/Model/Cameras/ThinLensCamera.cc
Log:
Adding three new SSE inline functions: sin4, cos4, and sincos4.
sin4 and cos4 compute the sine and cosine of their vector argument,
while sincos4 computes both the sine and the cosine of its argument
simultaneously, since the two share a lot of computation.
ThinLensCamera::makeRays has been SSE-ified.
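
As a quick illustration (not part of the commit), the new functions could
be exercised like this -- a minimal sketch assuming a MANTA_SSE build and
that SSEDefs.h is reachable under the include path shown; set4 and
simd_component are the existing Manta helpers visible in the diff below:

  #include <Core/Math/SSEDefs.h>  // assumed include path
  #include <cmath>
  #include <cstdio>

  int main()
  {
    using namespace Manta;
    const sse_t theta = set4(0.75f);   // four copies of the same angle
    sse_t s, c;
    sincos4(theta, &s, &c);            // one call, both results
    std::printf("sin: %f (libm %f)\n", simd_component(s, 0), std::sin(0.75f));
    std::printf("cos: %f (libm %f)\n", simd_component(c, 0), std::cos(0.75f));
    return 0;
  }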
Modified: trunk/Core/Math/SSEDefs.h
==============================================================================
--- trunk/Core/Math/SSEDefs.h (original)
+++ trunk/Core/Math/SSEDefs.h Wed Sep 12 01:40:18 2007
@@ -69,17 +69,17 @@
namespace Manta
{
- union sse_union
- {
- sse_t sse;
- float f[4];
- };
+ union sse_union
+ {
+ sse_t sse;
+ float f[4];
+ };
- union sse_int_union
- {
- __m128i ssei;
- int i[4];
- };
+ union sse_int_union
+ {
+ __m128i ssei;
+ int i[4];
+ };
#if defined(__x86_64) && defined(__INTEL_COMPILER)
@@ -90,12 +90,12 @@
// I'm not sure about the high versus low, but the code does what
// GCC does (for example, when copying 64 bit pointers).
static inline
- __m128i _mm_set1_epi64x(long long val)
- {
- const int high = (0xFFFFFFFF00000000L & val) >> 32 ;
- const int low = ( 0xFFFFFFFFL & val);
- return _mm_set_epi32(high, low, high, low);
- }
+ __m128i _mm_set1_epi64x(long long val)
+ {
+ const int high = (0xFFFFFFFF00000000L & val) >> 32 ;
+ const int low = ( 0xFFFFFFFFL & val);
+ return _mm_set_epi32(high, low, high, low);
+ }
#endif
@@ -108,275 +108,288 @@
static inline __m128 _mm_castpd_ps (__m128d val) { return (__m128) val;
}
#endif
- static const MANTA_ALIGN(16) sse_t _mm_eps = _mm_set_ps1(1e-5);
- static const MANTA_ALIGN(16) sse_t _mm_minus_eps = _mm_set_ps1(-1e-5);
- static const MANTA_ALIGN(16) sse_t _mm_epsilon = _mm_set_ps1(1e-5);
- static const MANTA_ALIGN(16) sse_t _mm_one = _mm_set_ps1(1.f);
- static const MANTA_ALIGN(16) sse_int_t _mm_one_si128 = _mm_set1_epi32(1);
- static const MANTA_ALIGN(16) sse_t _mm_zero = _mm_setzero_ps();
- static const MANTA_ALIGN(16) sse_int_t _mm_zero_si128 =_mm_setzero_si128();
- static const MANTA_ALIGN(16) sse_t _mm_one_half = _mm_set_ps1(0.5f);
- static const MANTA_ALIGN(16) sse_t _mm_two = _mm_set_ps1(2.f);
- static const MANTA_ALIGN(16) sse_t _mm_256 = _mm_set_ps1(256);
- static const MANTA_ALIGN(16) sse_t _mm_255 = _mm_set_ps1(255);
- static const MANTA_ALIGN(16) sse_t _mm_infty = _mm_set_ps1(9.9e9999f);
- static const MANTA_ALIGN(16) sse_t _mm_minus_infty = _mm_set_ps1(-9.9e9999f);
- static const int _mm_intabsmask = 0x7fffffff;
- static const int _mm_intsignbit = 0x80000000;
- static const int _mm_inttruemask = 0xffffffff;
- static const MANTA_ALIGN(16) sse_t _mm_absmask = _mm_set_ps1((float&)_mm_intabsmask);
- static const MANTA_ALIGN(16) sse_t _mm_signbit = _mm_set_ps1((float&)_mm_intsignbit);
- static const MANTA_ALIGN(16) sse_t _mm_true = _mm_set_ps1((float&)_mm_inttruemask);
- static const int minusOneI = -1;
- static const MANTA_ALIGN(16) sse_t _mm_minusOne = _mm_set_ps1((float &)minusOneI);
+ static const MANTA_ALIGN(16) sse_t _mm_eps = _mm_set_ps1(1e-5);
+ static const MANTA_ALIGN(16) sse_t _mm_minus_eps = _mm_set_ps1(-1e-5);
+ static const MANTA_ALIGN(16) sse_t _mm_epsilon = _mm_set_ps1(1e-5);
+ static const MANTA_ALIGN(16) sse_t _mm_one = _mm_set_ps1(1.f);
+ static const MANTA_ALIGN(16) sse_int_t _mm_one_si128 = _mm_set1_epi32(1);
+ static const MANTA_ALIGN(16) sse_int_t _mm_two_si128 = _mm_set1_epi32(2);
+ static const MANTA_ALIGN(16) sse_int_t _mm_four_si128 = _mm_set1_epi32(4);
+ static const MANTA_ALIGN(16) sse_int_t _mm_inv_one_si128 = _mm_set1_epi32(~1);
+ static const MANTA_ALIGN(16) sse_t _mm_zero = _mm_setzero_ps();
+ static const MANTA_ALIGN(16) sse_int_t _mm_zero_si128 =_mm_setzero_si128();
+ static const MANTA_ALIGN(16) sse_t _mm_one_half = _mm_set_ps1(0.5f);
+ static const MANTA_ALIGN(16) sse_t _mm_two = _mm_set_ps1(2.f);
+ static const MANTA_ALIGN(16) sse_t _mm_256 = _mm_set_ps1(256);
+ static const MANTA_ALIGN(16) sse_t _mm_255 = _mm_set_ps1(255);
+ static const MANTA_ALIGN(16) sse_t _mm_infty = _mm_set_ps1(9.9e9999f);
+ static const MANTA_ALIGN(16) sse_t _mm_minus_infty = _mm_set_ps1(-9.9e9999f);
+ static const int _mm_intabsmask = 0x7fffffff;
+ static const int _mm_intsignbit = 0x80000000;
+ static const int _mm_inttruemask = 0xffffffff;
+ static const MANTA_ALIGN(16) sse_t _mm_absmask = _mm_set_ps1((float&)_mm_intabsmask);
+ static const MANTA_ALIGN(16) sse_t _mm_signbit = _mm_set_ps1((float&)_mm_intsignbit);
+ static const MANTA_ALIGN(16) sse_t _mm_true = _mm_set_ps1((float&)_mm_inttruemask);
+ static const int minusOneI = -1;
+ static const MANTA_ALIGN(16) sse_t _mm_minusOne = _mm_set_ps1((float &)minusOneI);
static const MANTA_ALIGN(16) sse_int_t _mm_absmask_si128 = set4i(_mm_intabsmask);
static const MANTA_ALIGN(16) sse_int_t _mm_signbit_si128 = set4i(_mm_intsignbit);
+ static const MANTA_ALIGN(16) sse_t _ps_cephes_FOPI = _mm_set_ps1(1.27323954473516);
+ static const MANTA_ALIGN(16) sse_t _ps_minus_cephes_DP1 = _mm_set_ps1(-0.78515625);
+ static const MANTA_ALIGN(16) sse_t _ps_minus_cephes_DP2 = _mm_set_ps1(-2.4187564849853515625e-4);
+ static const MANTA_ALIGN(16) sse_t _ps_minus_cephes_DP3 = _mm_set_ps1(-3.77489497744594108e-8);
+ static const MANTA_ALIGN(16) sse_t _ps_sincof_p0 = _mm_set_ps1(-1.9515295891E-4);
+ static const MANTA_ALIGN(16) sse_t _ps_sincof_p1 = _mm_set_ps1(8.3321608736E-3);
+ static const MANTA_ALIGN(16) sse_t _ps_sincof_p2 = _mm_set_ps1(-1.6666654611E-1);
+ static const MANTA_ALIGN(16) sse_t _ps_coscof_p0 = _mm_set_ps1(2.443315711809948E-005);
+ static const MANTA_ALIGN(16) sse_t _ps_coscof_p1 = _mm_set_ps1(-1.388731625493765E-003);
+ static const MANTA_ALIGN(16) sse_t _ps_coscof_p2 = _mm_set_ps1(4.166664568298827E-002);
+
+ /*! return v0 + t*(v1-v0) */
+ inline sse_t lerp4(const sse_t t, const sse_t v0, const sse_t v1)
+ {
+ return add4(v0,mul4(t,sub4(v1,v0)));
+ }
+
+ inline sse_t dot4(const sse_t &ox, const sse_t &oy, const sse_t &oz,
+ const sse_t &vx, const sse_t &vy, const sse_t &vz)
+ {
+ return _mm_add_ps(_mm_add_ps(_mm_mul_ps(vx,ox),
+ _mm_mul_ps(vy,oy)),
+ _mm_mul_ps(vz,oz));
+ }
+
+ //equivalent to mask ? newD : oldD
+ inline sse_t mask4(const sse_t &mask, const sse_t &newD, const sse_t &oldD)
+ {
+ return or4(and4(mask,newD),andnot4(mask,oldD));
+ }
+
+ //equivalent to ~mask ? newD : oldD
+ inline sse_t masknot4(const sse_t &mask, const sse_t &newD, const sse_t &oldD)
+ {
+ return or4(andnot4(mask,newD),and4(mask,oldD));
+ }
+
+ inline sse_int_t mask4i(const sse_int_t &mask, const sse_int_t &newD, const sse_int_t &oldD)
+ {
+ return or4i(and4i(mask,newD),andnot4i(mask,oldD));
+ }
+
+ inline sse_t abs4(const sse_t &v)
+ {
+ return andnot4(_mm_signbit,v);
+ }
+
+ inline sse_t accurateReciprocal(const sse_t v)
+ {
+ const sse_t rcp = _mm_rcp_ps(v);
+ return _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
+ }
+
+ inline sse_t uberAccurateReciprocal(const sse_t v)
+ {
+ sse_t rcp = _mm_rcp_ps(v);
+ rcp = _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
+ rcp = _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
+ rcp = _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
+ return rcp;
+ }
+
+ inline sse_t oneOver(const sse_t v)
+ {
+ const sse_t rcp = _mm_rcp_ps(v);
+ return _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
+ }
+
+ inline sse_t accurateReciprocalSqrt(const sse_t v)
+ {
+ const sse_t rcp_sqrt = _mm_rsqrt_ps(v);
+ const sse_t one_point_five = _mm_set_ps1(1.5f);
+ return _mm_mul_ps(rcp_sqrt, _mm_sub_ps(one_point_five, _mm_mul_ps(_mm_one_half, _mm_mul_ps(_mm_mul_ps(rcp_sqrt,rcp_sqrt),v))));
+ }
+
+
+ inline sse_t reciprocal(const sse_t v)
+ {
+ return _mm_rcp_ps(v);
+ }
+
+ inline sse_t lin(const sse_t &base,
+ float u,const sse_t du,
+ float v, const sse_t dv)
+ {
+ return _mm_add_ps(_mm_add_ps(base,_mm_mul_ps(_mm_set_ps1(u),du)),
+ _mm_mul_ps(_mm_set_ps1(v),dv));
+ }
+
+ inline sse_t lin(const sse_t &base,
+ const sse_t &u,const sse_t du,
+ const sse_t &v, const sse_t dv)
+ {
+ return _mm_add_ps(_mm_add_ps(base,_mm_mul_ps(u,du)),
+ _mm_mul_ps(v,dv));
+ }
+
- /*! return v0 + t*(v1-v0) */
- inline sse_t lerp4(const sse_t t, const sse_t v0, const sse_t v1)
- {
- return add4(v0,mul4(t,sub4(v1,v0)));
- }
-
- inline sse_t dot4(const sse_t &ox, const sse_t &oy, const sse_t &oz,
- const sse_t &vx, const sse_t &vy, const sse_t &vz)
- {
- return _mm_add_ps(_mm_add_ps(_mm_mul_ps(vx,ox),
- _mm_mul_ps(vy,oy)),
- _mm_mul_ps(vz,oz));
- }
-
- //equivalent to mask ? newD : oldD
- inline sse_t mask4(const sse_t &mask, const sse_t &newD, const sse_t &oldD)
- {
- return or4(and4(mask,newD),andnot4(mask,oldD));
- }
-
- //equivalent to ~mask ? newD : oldD
- inline sse_t masknot4(const sse_t &mask, const sse_t &newD, const sse_t &oldD)
- {
- return or4(andnot4(mask,newD),and4(mask,oldD));
- }
-
- inline sse_int_t mask4i(const sse_int_t &mask, const sse_int_t &newD, const sse_int_t &oldD)
- {
- return or4i(and4i(mask,newD),andnot4i(mask,oldD));
- }
-
- inline sse_t abs4(const sse_t &v)
- {
- return andnot4(_mm_signbit,v);
- }
-
- inline sse_t accurateReciprocal(const sse_t v)
- {
- const sse_t rcp = _mm_rcp_ps(v);
- return _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
- }
-
- inline sse_t uberAccurateReciprocal(const sse_t v)
- {
- sse_t rcp = _mm_rcp_ps(v);
- rcp = _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
- rcp = _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
- rcp = _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
- return rcp;
- }
-
- inline sse_t oneOver(const sse_t v)
- {
- const sse_t rcp = _mm_rcp_ps(v);
- return _mm_sub_ps(_mm_add_ps(rcp,rcp),_mm_mul_ps(_mm_mul_ps(rcp,rcp),v));
- }
-
- inline sse_t accurateReciprocalSqrt(const sse_t v)
- {
- const sse_t rcp_sqrt = _mm_rsqrt_ps(v);
- const sse_t one_point_five = _mm_set_ps1(1.5f);
- return _mm_mul_ps(rcp_sqrt, _mm_sub_ps(one_point_five, _mm_mul_ps(_mm_one_half, _mm_mul_ps(_mm_mul_ps(rcp_sqrt,rcp_sqrt),v))));
- }
-
-
- inline sse_t reciprocal(const sse_t v)
- {
- return _mm_rcp_ps(v);
- }
-
- inline sse_t lin(const sse_t &base,
- float u,const sse_t du,
- float v, const sse_t dv)
- {
- return _mm_add_ps(_mm_add_ps(base,_mm_mul_ps(_mm_set_ps1(u),du)),
- _mm_mul_ps(_mm_set_ps1(v),dv));
- }
-
- inline sse_t lin(const sse_t &base,
- const sse_t &u,const sse_t du,
- const sse_t &v, const sse_t dv)
- {
- return _mm_add_ps(_mm_add_ps(base,_mm_mul_ps(u,du)),
- _mm_mul_ps(v,dv));
- }
-
-
- inline sse_t dot4(const sse_t &a, const sse_t &b)
- {
- const sse_t xyzw = _mm_mul_ps(a,b);
- const sse_t zwxy = _mm_shuffle_ps(xyzw,xyzw,_MM_SHUFFLE(1,0,3,2));
- const sse_t xz_yw_zx_wy = _mm_add_ps(zwxy,xyzw);
- const sse_t wy_zx_yw_xz = _mm_shuffle_ps(xz_yw_zx_wy,xz_yw_zx_wy,_MM_SHUFFLE(0,1,2,3));
- const sse_t res = _mm_add_ps(xz_yw_zx_wy,wy_zx_yw_xz);
- return res;
- }
-
- inline float dot1(const sse_t &a, const sse_t &b)
- {
- const sse_t d = dot4(a,b);
- return (float&)d;
- }
-
- inline void normalize(sse_t &v)
- {
- const sse_t dot = dot4(v,v);
- v = _mm_mul_ps(v, _mm_rsqrt_ps(dot));
- };
-
- inline float sqrLength(const sse_t &a)
- {
- return dot1(a,a);
- }
-
- inline float length(sse_t a)
- {
- const sse_t d = dot4(a,a);
- const sse_t v = sqrt4(d);
- return (float &)v;
- }
-
- // Get horizontal minimum of all components
- inline float min4f(sse_t t)
- {
- // a = (t0, t0, t1, t1)
- sse_t a = unpacklo(t,t);
- // b = (t2, t2, t3, t3)
- sse_t b = unpackhi(t,t);
- // c = (min(t0,t2), min(t0, t2), min(t1, t3), min(t1, t3))
- sse_t c = min4(a, b);
- // The movehl will move the high 2 values to the low 2 values.
- // This will allow us to compare min(t0,t2) with min(t1, t3).
- sse_t min = _mm_min_ss(c, _mm_movehl_ps(c, c));
- // Return the first value.
- return *((float*)&min);
- }
-
- // Get horizontal maximum of all components
- inline float max4f(sse_t t)
- {
- // a = (t0, t0, t1, t1)
- sse_t a = unpacklo(t,t);
- // b = (t2, t2, t3, t3)
- sse_t b = unpackhi(t,t);
- // c = (max(t0,t2), max(t0, t2), max(t1, t3), max(t1, t3))
- sse_t c = max4(a, b);
- // The movehl will move the high 2 values to the low 2 values.
- // This will allow us to compare max(t0,t2) with max(t1, t3).
- sse_t max = _mm_max_ss(c, _mm_movehl_ps(c, c));
- // Return the first value.
- return *((float*)&max);
- }
-
- // Get horizontal minimum of the first 3 components
- inline float min3f(sse_t t)
- {
- // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
- // but you need t1 to be in the 2 index in order to use movehl
- // later.
- sse_t a = unpacklo(t,t);
- // b = (t2, t3, t2, t3)
- sse_t b = _mm_movehl_ps(t,t);
- // c = (min(t0,t2), min(t0, t3), min(t1, t2), min(t1, t3))
- sse_t c = min4(a, b);
- // The movehl will move the high 2 values to the low 2 values.
- // This will allow us to compare t1 with min(t0, t2).
- sse_t min = _mm_min_ss(c, _mm_movehl_ps(a, a));
- // Return the first value.
- return *((float*)&min);
- }
-
- // Get horizontal maximum of the first 3 components
- inline float max3f(sse_t t)
- {
- // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
- // but you need t1 to be in the 2 index in order to use movehl
- // later.
- sse_t a = unpacklo(t,t);
- // b = (t2, t3, t2, t3)
- sse_t b = _mm_movehl_ps(t,t);
- // c = (max(t0,t2), max(t0, t3), max(t1, t2), max(t1, t3))
- sse_t c = max4(a, b);
- // The movehl will move the high 2 values to the low 2 values.
- // This will allow us to compare t1 with max(t0, t2).
- sse_t max = _mm_max_ss(c, _mm_movehl_ps(a, a));
- // Return the first value.
- return *((float*)&max);
- }
-
- inline float simd_component(sse_t t, int offset)
- {
- MANTA_ALIGN(16)
- float f[4];
- _mm_store_ps(f,t);
- return f[offset];
- }
+ inline sse_t dot4(const sse_t &a, const sse_t &b)
+ {
+ const sse_t xyzw = _mm_mul_ps(a,b);
+ const sse_t zwxy = _mm_shuffle_ps(xyzw,xyzw,_MM_SHUFFLE(1,0,3,2));
+ const sse_t xz_yw_zx_wy = _mm_add_ps(zwxy,xyzw);
+ const sse_t wy_zx_yw_xz = _mm_shuffle_ps(xz_yw_zx_wy,xz_yw_zx_wy,_MM_SHUFFLE(0,1,2,3));
+ const sse_t res = _mm_add_ps(xz_yw_zx_wy,wy_zx_yw_xz);
+ return res;
+ }
+
+ inline float dot1(const sse_t &a, const sse_t &b)
+ {
+ const sse_t d = dot4(a,b);
+ return (float&)d;
+ }
+
+ inline void normalize(sse_t &v)
+ {
+ const sse_t dot = dot4(v,v);
+ v = _mm_mul_ps(v, _mm_rsqrt_ps(dot));
+ };
+
+ inline float sqrLength(const sse_t &a)
+ {
+ return dot1(a,a);
+ }
+
+ inline float length(sse_t a)
+ {
+ const sse_t d = dot4(a,a);
+ const sse_t v = sqrt4(d);
+ return (float &)v;
+ }
+
+ // Get horizontal minimum of all components
+ inline float min4f(sse_t t)
+ {
+ // a = (t0, t0, t1, t1)
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t2, t3, t3)
+ sse_t b = unpackhi(t,t);
+ // c = (min(t0,t2), min(t0, t2), min(t1, t3), min(t1, t3))
+ sse_t c = min4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare min(t0,t2) with min(t1, t3).
+ sse_t min = _mm_min_ss(c, _mm_movehl_ps(c, c));
+ // Return the first value.
+ return *((float*)&min);
+ }
+
+ // Get horizontal maximum of all components
+ inline float max4f(sse_t t)
+ {
+ // a = (t0, t0, t1, t1)
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t2, t3, t3)
+ sse_t b = unpackhi(t,t);
+ // c = (max(t0,t2), max(t0, t2), max(t1, t3), max(t1, t3))
+ sse_t c = max4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare max(t0,t2) with max(t1, t3).
+ sse_t max = _mm_max_ss(c, _mm_movehl_ps(c, c));
+ // Return the first value.
+ return *((float*)&max);
+ }
+
+ // Get horizontal minimum of the first 3 components
+ inline float min3f(sse_t t)
+ {
+ // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
+ // but you need t1 to be in the 2 index in order to use movehl
+ // later.
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t3, t2, t3)
+ sse_t b = _mm_movehl_ps(t,t);
+ // c = (min(t0,t2), min(t0, t3), min(t1, t2), min(t1, t3))
+ sse_t c = min4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare t1 with min(t0, t2).
+ sse_t min = _mm_min_ss(c, _mm_movehl_ps(a, a));
+ // Return the first value.
+ return *((float*)&min);
+ }
+
+ // Get horizontal maximum of the first 3 components
+ inline float max3f(sse_t t)
+ {
+ // a = (t0, t0, t1, t1), you might be tempted to make this a movelh,
+ // but you need t1 to be in the 2 index in order to use movehl
+ // later.
+ sse_t a = unpacklo(t,t);
+ // b = (t2, t3, t2, t3)
+ sse_t b = _mm_movehl_ps(t,t);
+ // c = (max(t0,t2), max(t0, t3), max(t1, t2), max(t1, t3))
+ sse_t c = max4(a, b);
+ // The movehl will move the high 2 values to the low 2 values.
+ // This will allow us to compare t1 with max(t0, t2).
+ sse_t max = _mm_max_ss(c, _mm_movehl_ps(a, a));
+ // Return the first value.
+ return *((float*)&max);
+ }
+
+ inline float simd_component(sse_t t, int offset)
+ {
+ MANTA_ALIGN(16)
+ float f[4];
+ _mm_store_ps(f,t);
+ return f[offset];
+ }
- inline void simd_cerr(sse_t t)
- {
- std::cerr << t << std::endl;
- }
-
- inline Vec3f as_Vec3f(sse_t t)
- {
- MANTA_ALIGN(16)
- float f[4];
- _mm_store_ps(f,t);
- return Vec3f(f[2], f[1], f[0]);
- }
+ inline void simd_cerr(sse_t t)
+ {
+ std::cerr << t << std::endl;
+ }
+
+ inline Vec3f as_Vec3f(sse_t t)
+ {
+ MANTA_ALIGN(16)
+ float f[4];
+ _mm_store_ps(f,t);
+ return Vec3f(f[2], f[1], f[0]);
+ }
- inline Vector as_Vector(sse_t t)
- {
- MANTA_ALIGN(16)
- float f[4];
- _mm_store_ps(f,t);
- return Vector(f[2], f[1], f[0]);
- }
+ inline Vector as_Vector(sse_t t)
+ {
+ MANTA_ALIGN(16)
+ float f[4];
+ _mm_store_ps(f,t);
+ return Vector(f[2], f[1], f[0]);
+ }
- inline int count_nonzeros(sse_t t)
- {
- int mask = _mm_movemask_ps(t);
- // Isolate each bit with a mask, then shift it down to the one's
- // spot. Look, ma. No branches!
- int nonzeros = (((mask & (1 << 0)) >> 0) +
- ((mask & (1 << 1)) >> 1) +
- ((mask & (1 << 2)) >> 2) +
- ((mask & (1 << 3)) >> 3));
- return nonzeros;
- }
+ inline int count_nonzeros(sse_t t)
+ {
+ int mask = _mm_movemask_ps(t);
+ // Isolate each bit with a mask, then shift it down to the one's
+ // spot. Look, ma. No branches!
+ int nonzeros = (((mask & (1 << 0)) >> 0) +
+ ((mask & (1 << 1)) >> 1) +
+ ((mask & (1 << 2)) >> 2) +
+ ((mask & (1 << 3)) >> 3));
+ return nonzeros;
+ }
- inline int count_zeros(sse_t t)
- {
- // You could use this if your compiler isn't stupid.
- // return 4-count_nonzeros(t);
-
- // But most compilers are stupid, so we'll manually inline it:
- int mask = _mm_movemask_ps(t);
- // Isolate each bit with a mask, then shift it down to the one's
- // spot. Look, ma. No branches!
- int nonzeros = (((mask & (1 << 0)) >> 0) +
- ((mask & (1 << 1)) >> 1) +
- ((mask & (1 << 2)) >> 2) +
- ((mask & (1 << 3)) >> 3));
- return 4-nonzeros;
- }
+ inline int count_zeros(sse_t t)
+ {
+ // You could use this if your compiler isn't stupid.
+ // return 4-count_nonzeros(t);
+
+ // But most compilers are stupid, so we'll manually inline it:
+ int mask = _mm_movemask_ps(t);
+ // Isolate each bit with a mask, then shift it down to the one's
+ // spot. Look, ma. No branches!
+ int nonzeros = (((mask & (1 << 0)) >> 0) +
+ ((mask & (1 << 1)) >> 1) +
+ ((mask & (1 << 2)) >> 2) +
+ ((mask & (1 << 3)) >> 3));
+ return 4-nonzeros;
+ }
// The sse code below is based off of this algorithm
@@ -440,6 +453,357 @@
return product;
#endif
}
+
+ ///////////////////////
+ // SSE sine and cosine, based off of code available at:
+ // http://gruntthepeon.free.fr/ssemath/
+ // with the following license:
+
+ /* Copyright (C) 2007 Julien Pommier
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ (this is the zlib license)
+ */
+
+ // end license
+ ////////////////////////////
+
+ typedef union xmm_mm_union {
+ __m128 xmm;
+ __m64 mm[2];
+ } xmm_mm_union;
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
+ xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
+ }
+
+ inline sse_t sin4(sse_t x) {
+ typedef __m128 v4sf;
+ typedef __m64 v2si;
+
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+ v2si mm0, mm1, mm2, mm3;
+ sign_bit = x;
+ /* take the absolute value */
+ x = _mm_and_ps(x, _mm_absmask);
+ /* extract the sign bit (upper one) */
+ sign_bit = _mm_and_ps(sign_bit, _mm_signbit);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm2 = _mm_movehl_ps(xmm2, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm2);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+ mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+ //printf("\nsin_ps: mm2:mm3 = "); print2i(mm2); print2i(mm3); printf("\n");
+ //printf("sin_ps: y="); print4(y); printf("\n");
+
+
+ /* get the swap sign flag */
+ mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
+ mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+
+ /* get the polynom selection mask
+ there is one polynom for 0 <= x <= Pi/4
+ and another one for Pi/4<x<=Pi/2
+
+ Both branches will be computed.
+ */
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+ v4sf swap_sign_bit, poly_mask;
+ COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+ sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = _ps_minus_cephes_DP1;
+ xmm2 = _ps_minus_cephes_DP2;
+ xmm3 = _ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ x = _mm_add_ps(x, xmm1);
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ y = _ps_coscof_p0;
+ v4sf z = _mm_mul_ps(x,x);
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, _mm_one);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = _ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, x);
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+ y = _mm_andnot_ps(xmm3, y);
+ y = _mm_add_ps(y,y2);
+ /* update the sign */
+ y = _mm_xor_ps(y, sign_bit);
+ _mm_empty(); /* good-bye mmx */
+ return y;
+ }
+
+ inline sse_t cos4(sse_t x) {
+ typedef __m128 v4sf;
+ typedef __m64 v2si;
+
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+ v2si mm0, mm1, mm2, mm3;
+ /* take the absolute value */
+ x = _mm_and_ps(x, _mm_absmask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm2 = _mm_movehl_ps(xmm2, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm2);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+ mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+ mm2 = _mm_sub_pi32(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_sub_pi32(mm3, *((v2si*)&_mm_two_si128));
+
+ /* get the swap sign flag in mm0:mm1 and the
+ polynom selection mask in mm2:mm3 */
+
+ mm0 = _mm_andnot_si64(mm2, *((v2si*)&_mm_four_si128));
+ mm1 = _mm_andnot_si64(mm3, *((v2si*)&_mm_four_si128));
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+ v4sf sign_bit, poly_mask;
+ COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = _ps_minus_cephes_DP1;
+ xmm2 = _ps_minus_cephes_DP2;
+ xmm3 = _ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ x = _mm_add_ps(x, xmm1);
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ y = _ps_coscof_p0;
+ v4sf z = _mm_mul_ps(x,x);
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, _mm_one);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = _ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, x);
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+ y = _mm_andnot_ps(xmm3, y);
+ y = _mm_add_ps(y,y2);
+ /* update the sign */
+ y = _mm_xor_ps(y, sign_bit);
+
+ _mm_empty(); /* good-bye mmx */
+ return y;
+ }
+
+ inline void sincos4(sse_t x, sse_t* s, sse_t* c) {
+ typedef __m128 v4sf;
+ typedef __m64 v2si;
+
+ v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+ v2si mm0, mm1, mm2, mm3, mm4, mm5;
+ sign_bit_sin = x;
+ /* take the absolute value */
+ x = _mm_and_ps(x, _mm_absmask);
+ /* extract the sign bit (upper one) */
+ sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_signbit);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm3 = _mm_movehl_ps(xmm3, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm3);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+ mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+ mm4 = mm2;
+ mm5 = mm3;
+
+ /* get the swap sign flag for the sine */
+ mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
+ mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+ v4sf swap_sign_bit_sin;
+ COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+ /* get the polynom selection mask for the sine */
+
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+ v4sf poly_mask;
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = _ps_minus_cephes_DP1;
+ xmm2 = _ps_minus_cephes_DP2;
+ xmm3 = _ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ x = _mm_add_ps(x, xmm1);
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+
+ /* get the sign flag for the cosine */
+
+ mm4 = _mm_sub_pi32(mm4, *((v2si*)&_mm_two_si128));
+ mm5 = _mm_sub_pi32(mm5, *((v2si*)&_mm_two_si128));
+ mm4 = _mm_andnot_si64(mm4, *((v2si*)&_mm_four_si128));
+ mm5 = _mm_andnot_si64(mm5, *((v2si*)&_mm_four_si128));
+ mm4 = _mm_slli_pi32(mm4, 29);
+ mm5 = _mm_slli_pi32(mm5, 29);
+ v4sf sign_bit_cos;
+ COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+
+ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ v4sf z = _mm_mul_ps(x,x);
+ y = _ps_coscof_p0;
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, _mm_one);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = _ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, x);
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ v4sf ysin2 = _mm_and_ps(xmm3, y2);
+ v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+ y2 = _mm_sub_ps(y2,ysin2);
+ y = _mm_sub_ps(y, ysin1);
+
+ xmm1 = _mm_add_ps(ysin1,ysin2);
+ xmm2 = _mm_add_ps(y,y2);
+
+ /* update the sign */
+ *s = _mm_xor_ps(xmm1, sign_bit_sin);
+ *c = _mm_xor_ps(xmm2, sign_bit_cos);
+ _mm_empty(); /* good-bye mmx */
+ }
};
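
For reference, the scheme that sin4/cos4/sincos4 vectorize is the Cephes
single-precision one: reduce the argument to an octant of pi/4, then pick
one of two minimax polynomials. Below is a scalar sketch of the sine path
(an illustration only, not part of the commit; accurate while |x| is small
enough for the float reduction to hold, and the cosine path differs only
in its sign and polynomial-selection logic):

  #include <cmath>

  // Scalar sketch of the Cephes-style sine that sin4 above vectorizes.
  float sin_sketch(float x)
  {
    float sign = (x < 0.0f) ? -1.0f : 1.0f;
    x = std::fabs(x);

    int j = (int)(x * 1.27323954473516f);  // octant index: x / (pi/4)
    j = (j + 1) & ~1;                      // round up to an even octant
    float y = (float)j;
    if (j & 4) sign = -sign;               // octants 4..7 negate the result

    // "Extended precision modular arithmetic": subtract y*(pi/4) in three
    // pieces (DP1+DP2+DP3 ~= pi/4) so the remainder keeps its accuracy.
    x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f)
        - y * 3.77489497744594108e-8f;

    const float z = x * x;
    float r;
    if (j & 2)   // this half of the quadrant uses the cosine polynomial
      r = 1.0f - 0.5f * z
          + z * z * ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
                     + 4.166664568298827e-2f);
    else         // and this half the sine polynomial
      r = x + x * z * ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
                       - 1.6666654611e-1f);
    return sign * r;
  }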
Modified: trunk/Model/Cameras/ThinLensCamera.cc
==============================================================================
--- trunk/Model/Cameras/ThinLensCamera.cc (original)
+++ trunk/Model/Cameras/ThinLensCamera.cc Wed Sep 12 01:40:18 2007
@@ -161,6 +161,114 @@
context.rng->nextPacket(lens_coord_x, rays);
context.rng->nextPacket(lens_coord_y, rays);
+#ifdef MANTA_SSE
+ int b = (rays.rayBegin + 3) & (~3);
+ int e = rays.rayEnd & (~3);
+
+ if(b>=e) {
+ for(int i = rays.begin(); i < rays.end(); ++i) {
+ Real imageX = rays.getImageCoordinates(i, 0);
+ Real imageY = rays.getImageCoordinates(i, 1);
+
+ Real theta = 2.0 * M_PI * lens_coord_x.get(i);
+ Real r = radius * SCIRun::Sqrt( lens_coord_y.get(i) );
+
+ Vector origin = eye + r*Cos(theta)*u + r*Sin(theta)*v;
+
+ rays.setRay(i, origin, imageX*v+imageY*u+direction*focal_length);
+ }
+ } else {
+ int i = rays.rayBegin;
+ for(;i<b;i++) {
+ Real imageX = rays.getImageCoordinates(i, 0);
+ Real imageY = rays.getImageCoordinates(i, 1);
+
+ Real theta = 2.0 * M_PI * lens_coord_x.get(i);
+ Real r = radius * SCIRun::Sqrt( lens_coord_y.get(i) );
+
+ Vector origin = eye + r*Cos(theta)*u + r*Sin(theta)*v;
+
+ rays.setRay(i, origin, imageX*v+imageY*u+direction*focal_length);
+ }
+ for(i=e;i<rays.rayEnd;i++) {
+ Real imageX = rays.getImageCoordinates(i, 0);
+ Real imageY = rays.getImageCoordinates(i, 1);
+
+ Real theta = 2.0 * M_PI * lens_coord_x.get(i);
+ Real r = radius * SCIRun::Sqrt( lens_coord_y.get(i) );
+
+ Vector origin = eye + r*Cos(theta)*u + r*Sin(theta)*v;
+
+ rays.setRay(i, origin, imageX*v+imageY*u+direction*focal_length);
+ }
+
+ RayPacketData* data = rays.data;
+
+ const sse_t eyex = set4(eye.data[0]);
+ const sse_t eyey = set4(eye.data[1]);
+ const sse_t eyez = set4(eye.data[2]);
+
+ const sse_t v0 = set4(v[0]);
+ const sse_t v1 = set4(v[1]);
+ const sse_t v2 = set4(v[2]);
+
+ const sse_t u0 = set4(u[0]);
+ const sse_t u1 = set4(u[1]);
+ const sse_t u2 = set4(u[2]);
+
+ const sse_t dirx = set4(direction[0]);
+ const sse_t diry = set4(direction[1]);
+ const sse_t dirz = set4(direction[2]);
+
+ const sse_t twoPI = set4(2.0*M_PI);
+ const sse_t radius4 = set4(radius);
+ const sse_t focal4 = set4(focal_length);
+
+ for(i=b;i<e;i+=4){
+ const sse_t imageX = load44(&data->image[0][i]);
+ const sse_t imageY = load44(&data->image[1][i]);
+ const sse_t lensCoordsX = load44(&lens_coord_x.data[i]);
+ const sse_t lensCoordsY = load44(&lens_coord_y.data[i]);
+ const sse_t theta4 = mul4(twoPI, lensCoordsX);
+ const sse_t r4 = mul4(radius4, sqrt4( lensCoordsY ));
+
+#if 0
+ const sse_t rCosTheta = mul4(r4, cos4(theta4));
+ const sse_t rSinTheta = mul4(r4, sin4(theta4));
+#else
+ sse_t rCosTheta, rSinTheta;
+ sincos4(theta4, &rSinTheta, &rCosTheta);
+ rCosTheta = mul4(r4, rCosTheta);
+ rSinTheta = mul4(r4, rSinTheta);
+#endif
+
+ const sse_t origin_x = add4( add4( eyex, mul4(rCosTheta, u0) ),
+ mul4(rSinTheta, v0));
+ const sse_t origin_y = add4( add4( eyey, mul4(rCosTheta, u1) ),
+ mul4(rSinTheta, v1));
+ const sse_t origin_z = add4( add4( eyez, mul4(rCosTheta, u2) ),
+ mul4(rSinTheta, v2));
+
+ store44(&data->origin[0][i], origin_x);
+ store44(&data->origin[1][i], origin_y);
+ store44(&data->origin[2][i], origin_z);
+
+ const sse_t direction_x = add4( add4( mul4( imageX, v0 ),
+ mul4( imageY, u0 ) ),
+ mul4( dirx, focal4 ) );
+ const sse_t direction_y = add4( add4( mul4( imageX, v1 ),
+ mul4( imageY, u1 ) ),
+ mul4( diry, focal4 ) );
+ const sse_t direction_z = add4( add4( mul4( imageX, v2 ),
+ mul4( imageY, u2 ) ),
+ mul4( dirz, focal4 ) );
+
+ store44(&data->direction[0][i], direction_x);
+ store44(&data->direction[1][i], direction_y);
+ store44(&data->direction[2][i], direction_z);
+ }
+ }
+#else
for(int i = rays.begin(); i < rays.end(); ++i) {
Real imageX = rays.getImageCoordinates(i, 0);
Real imageY = rays.getImageCoordinates(i, 1);
@@ -172,6 +280,7 @@
rays.setRay(i, origin, imageX*v+imageY*u+direction*focal_length);
}
+#endif
}
void ThinLensCamera::scaleFOV(Real scale)
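
A note on the loop bounds in the SSE path above: b rounds rays.rayBegin up
to the next multiple of four and e rounds rays.rayEnd down, so [b,e) is
processed four rays at a time while the fringes (and, when b >= e, packets
too small to hold a full pack of four) take the scalar path. A toy sketch
of the arithmetic, with hypothetical bounds (not Manta code):

  #include <cstdio>

  int main()
  {
    const int rayBegin = 3, rayEnd = 14;  // hypothetical packet bounds
    const int b = (rayBegin + 3) & (~3);  // 4: begin rounded up to a multiple of 4
    const int e = rayEnd & (~3);          // 12: end rounded down
    std::printf("scalar [%d,%d), SSE [%d,%d) by fours, scalar [%d,%d)\n",
                rayBegin, b, b, e, e, rayEnd);
    return 0;
  }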