manta - [Manta] r1726 - in trunk: Core Core/Math Model/Cameras

Closed list
Subscribers: 0
Owners

sparker

thiago

Subscribe
Unsubscribe
Info
Admin
Archive

Post

Shared documents

Manta Interactive Ray Tracer Development Mailing List

Text archives Help

[Manta] r1726 - in trunk: Core Core/Math Model/Cameras

From: bigler@sci.utah.edu
To: manta@sci.utah.edu
Subject: [Manta] r1726 - in trunk: Core Core/Math Model/Cameras
Date: Tue, 18 Sep 2007 14:13:08 -0600 (MDT)

Author: bigler
Date: Tue Sep 18 14:13:07 2007
New Revision: 1726

Added:
   trunk/Core/Math/TrigSSE.cc
   trunk/Core/Math/TrigSSE.h
Modified:
   trunk/Core/CMakeLists.txt
   trunk/Core/Math/SSEDefs.cc
   trunk/Core/Math/SSEDefs.h
   trunk/Model/Cameras/ThinLensCamera.cc
Log:
Core/CMakeLists.txt
Core/Math/SSEDefs.cc
Core/Math/SSEDefs.h
Core/Math/TrigSSE.cc
Core/Math/TrigSSE.h

  Moved the trig functions sin4, cos4, sincos4 to their own files,
  TrigSSE.{h,cc}.

Model/Cameras/ThinLensCamera.cc

  Include TrigSSE.h for new location of sin4.

Modified: trunk/Core/CMakeLists.txt
==============================================================================
--- trunk/Core/CMakeLists.txt   (original)
+++ trunk/Core/CMakeLists.txt   Tue Sep 18 14:13:07 2007
@@ -57,6 +57,8 @@
      Math/Noise.h
      Math/SSEDefs.cc
      Math/SSEDefs.h
+     Math/TrigSSE.cc
+     Math/TrigSSE.h
      Math/ipow.h
      )
SET (CORE_SOURCES ${CORE_SOURCES}

Modified: trunk/Core/Math/SSEDefs.cc
==============================================================================
--- trunk/Core/Math/SSEDefs.cc  (original)
+++ trunk/Core/Math/SSEDefs.cc  Tue Sep 18 14:13:07 2007
@@ -38,357 +38,4 @@
#endif
}

-
-///////////////////////
-// SSE sine and cosine, based off of code available at:
-// http://gruntthepeon.free.fr/ssemath/
-// with the following license:
-
-/* Copyright (C) 2007  Julien Pommier
-
-   This software is provided 'as-is', without any express or implied
-   warranty.  In no event will the authors be held liable for any damages
-   arising from the use of this software.
-
-   Permission is granted to anyone to use this software for any purpose,
-   including commercial applications, and to alter it and redistribute it
-   freely, subject to the following restrictions:
-
-   1. The origin of this software must not be misrepresented; you must not
-   claim that you wrote the original software. If you use this software
-   in a product, an acknowledgment in the product documentation would be
-   appreciated but is not required.
-   2. Altered source versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
-   3. This notice may not be removed or altered from any source distribution.
-
-   (this is the zlib license)
-*/
-
-// end license
-////////////////////////////
-
-typedef union xmm_mm_union {
-  __m128 xmm;
-  __m64 mm[2];
-} xmm_mm_union;
-
-#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
-    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
-  }
-
-sse_t Manta::sin4(sse_t x) {
-  typedef __m128 v4sf;
-  typedef __m64 v2si;
-
-  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
-  v2si mm0, mm1, mm2, mm3;
-  sign_bit = x;
-  /* take the absolute value */
-  x = _mm_and_ps(x, _mm_absmask);
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm_and_ps(sign_bit, _mm_signbit);
-
-  /* scale by 4/Pi */
-  y = _mm_mul_ps(x, _ps_cephes_FOPI);
-
-  /* store the integer part of y in mm0:mm1 */
-  xmm2 = _mm_movehl_ps(xmm2, y);
-  mm2 = _mm_cvttps_pi32(y);
-  mm3 = _mm_cvttps_pi32(xmm2);
-
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
-  mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
-  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
-  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
-
-  y = _mm_cvtpi32x2_ps(mm2, mm3);
-  //printf("\nsin_ps: mm2:mm3 = "); print2i(mm2); print2i(mm3); printf("\n");
-  //printf("sin_ps: y="); print4(y); printf("\n");
-
-
-  /* get the swap sign flag */
-  mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
-  mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
-  mm0 = _mm_slli_pi32(mm0, 29);
-  mm1 = _mm_slli_pi32(mm1, 29);
-
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
-  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
-  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
-  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-
-  v4sf swap_sign_bit, poly_mask;
-  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
-  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = _ps_minus_cephes_DP1;
-  xmm2 = _ps_minus_cephes_DP2;
-  xmm3 = _ps_minus_cephes_DP3;
-  xmm1 = _mm_mul_ps(y, xmm1);
-  xmm2 = _mm_mul_ps(y, xmm2);
-  xmm3 = _mm_mul_ps(y, xmm3);
-  x = _mm_add_ps(x, xmm1);
-  x = _mm_add_ps(x, xmm2);
-  x = _mm_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = _ps_coscof_p0;
-  v4sf z = _mm_mul_ps(x,x);
-
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p1);
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p2);
-  y = _mm_mul_ps(y, z);
-  y = _mm_mul_ps(y, z);
-  v4sf tmp = _mm_mul_ps(z, _mm_one_half);
-  y = _mm_sub_ps(y, tmp);
-  y = _mm_add_ps(y, _mm_one);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v4sf y2 = _ps_sincof_p0;
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p1);
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p2);
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_mul_ps(y2, x);
-  y2 = _mm_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
-  y = _mm_andnot_ps(xmm3, y);
-  y = _mm_add_ps(y,y2);
-  /* update the sign */
-  y = _mm_xor_ps(y, sign_bit);
-  _mm_empty(); /* good-bye mmx */
-  return y;
-}
-
-sse_t Manta::cos4(sse_t x) {
-  typedef __m128 v4sf;
-  typedef __m64 v2si;
-
-  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
-  v2si mm0, mm1, mm2, mm3;
-  /* take the absolute value */
-  x = _mm_and_ps(x, _mm_absmask);
-
-  /* scale by 4/Pi */
-  y = _mm_mul_ps(x, _ps_cephes_FOPI);
-
-  /* store the integer part of y in mm0:mm1 */
-  xmm2 = _mm_movehl_ps(xmm2, y);
-  mm2 = _mm_cvttps_pi32(y);
-  mm3 = _mm_cvttps_pi32(xmm2);
-
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
-  mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
-  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
-  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
-
-  y = _mm_cvtpi32x2_ps(mm2, mm3);
-
-
-  mm2 = _mm_sub_pi32(mm2, *((v2si*)&_mm_two_si128));
-  mm3 = _mm_sub_pi32(mm3, *((v2si*)&_mm_two_si128));
-
-  /* get the swap sign flag in mm0:mm1 and the
-     polynom selection mask in mm2:mm3 */
-
-  mm0 = _mm_andnot_si64(mm2, *((v2si*)&_mm_four_si128));
-  mm1 = _mm_andnot_si64(mm3, *((v2si*)&_mm_four_si128));
-  mm0 = _mm_slli_pi32(mm0, 29);
-  mm1 = _mm_slli_pi32(mm1, 29);
-
-  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
-  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
-
-  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
-  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-
-  v4sf sign_bit, poly_mask;
-  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
-  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = _ps_minus_cephes_DP1;
-  xmm2 = _ps_minus_cephes_DP2;
-  xmm3 = _ps_minus_cephes_DP3;
-  xmm1 = _mm_mul_ps(y, xmm1);
-  xmm2 = _mm_mul_ps(y, xmm2);
-  xmm3 = _mm_mul_ps(y, xmm3);
-  x = _mm_add_ps(x, xmm1);
-  x = _mm_add_ps(x, xmm2);
-  x = _mm_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = _ps_coscof_p0;
-  v4sf z = _mm_mul_ps(x,x);
-
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p1);
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p2);
-  y = _mm_mul_ps(y, z);
-  y = _mm_mul_ps(y, z);
-  v4sf tmp = _mm_mul_ps(z, _mm_one_half);
-  y = _mm_sub_ps(y, tmp);
-  y = _mm_add_ps(y, _mm_one);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v4sf y2 = _ps_sincof_p0;
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p1);
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p2);
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_mul_ps(y2, x);
-  y2 = _mm_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
-  y = _mm_andnot_ps(xmm3, y);
-  y = _mm_add_ps(y,y2);
-  /* update the sign */
-  y = _mm_xor_ps(y, sign_bit);
-
-  _mm_empty(); /* good-bye mmx */
-  return y;
-}
-
-void Manta::sincos4(sse_t x, sse_t* s, sse_t* c) {
-  typedef __m128 v4sf;
-  typedef __m64 v2si;
-
-  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
-  v2si mm0, mm1, mm2, mm3, mm4, mm5;
-  sign_bit_sin = x;
-  /* take the absolute value */
-  x = _mm_and_ps(x, _mm_absmask);
-  /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_signbit);
-
-  /* scale by 4/Pi */
-  y = _mm_mul_ps(x, _ps_cephes_FOPI);
-
-  /* store the integer part of y in mm0:mm1 */
-  xmm3 = _mm_movehl_ps(xmm3, y);
-  mm2 = _mm_cvttps_pi32(y);
-  mm3 = _mm_cvttps_pi32(xmm3);
-
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
-  mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
-  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
-  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
-
-  y = _mm_cvtpi32x2_ps(mm2, mm3);
-
-  mm4 = mm2;
-  mm5 = mm3;
-
-  /* get the swap sign flag for the sine */
-  mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
-  mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
-  mm0 = _mm_slli_pi32(mm0, 29);
-  mm1 = _mm_slli_pi32(mm1, 29);
-  v4sf swap_sign_bit_sin;
-  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
-
-  /* get the polynom selection mask for the sine */
-
-  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
-  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
-  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
-  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-  v4sf poly_mask;
-  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = _ps_minus_cephes_DP1;
-  xmm2 = _ps_minus_cephes_DP2;
-  xmm3 = _ps_minus_cephes_DP3;
-  xmm1 = _mm_mul_ps(y, xmm1);
-  xmm2 = _mm_mul_ps(y, xmm2);
-  xmm3 = _mm_mul_ps(y, xmm3);
-  x = _mm_add_ps(x, xmm1);
-  x = _mm_add_ps(x, xmm2);
-  x = _mm_add_ps(x, xmm3);
-
-
-  /* get the sign flag for the cosine */
-
-  mm4 = _mm_sub_pi32(mm4, *((v2si*)&_mm_two_si128));
-  mm5 = _mm_sub_pi32(mm5, *((v2si*)&_mm_two_si128));
-  mm4 = _mm_andnot_si64(mm4, *((v2si*)&_mm_four_si128));
-  mm5 = _mm_andnot_si64(mm5, *((v2si*)&_mm_four_si128));
-  mm4 = _mm_slli_pi32(mm4, 29);
-  mm5 = _mm_slli_pi32(mm5, 29);
-  v4sf sign_bit_cos;
-  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
-
-  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  v4sf z = _mm_mul_ps(x,x);
-  y = _ps_coscof_p0;
-
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p1);
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p2);
-  y = _mm_mul_ps(y, z);
-  y = _mm_mul_ps(y, z);
-  v4sf tmp = _mm_mul_ps(z, _mm_one_half);
-  y = _mm_sub_ps(y, tmp);
-  y = _mm_add_ps(y, _mm_one);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v4sf y2 = _ps_sincof_p0;
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p1);
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p2);
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_mul_ps(y2, x);
-  y2 = _mm_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  v4sf ysin2 = _mm_and_ps(xmm3, y2);
-  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
-  y2 = _mm_sub_ps(y2,ysin2);
-  y = _mm_sub_ps(y, ysin1);
-
-  xmm1 = _mm_add_ps(ysin1,ysin2);
-  xmm2 = _mm_add_ps(y,y2);
-
-  /* update the sign */
-  *s = _mm_xor_ps(xmm1, sign_bit_sin);
-  *c = _mm_xor_ps(xmm2, sign_bit_cos);
-  _mm_empty(); /* good-bye mmx */
-}
-
-
#endif // MANTA_SSE

Modified: trunk/Core/Math/SSEDefs.h
==============================================================================
--- trunk/Core/Math/SSEDefs.h   (original)
+++ trunk/Core/Math/SSEDefs.h   Tue Sep 18 14:13:07 2007
@@ -454,15 +454,9 @@
#endif
   }

-  // Some Trig functions
-  sse_t sin4(sse_t x);
-  sse_t cos4(sse_t x);
-  // Computes both the sin and cos of a given value.  This is useful
-  // if you want both value, because many of the intermediate
-  // computation can be shared.
-  void sincos4(sse_t x, sse_t* s, sse_t* c);
+  // For trig functions like sin and cos, see TrigSSE.h

-  // For functions that compute pow, exp, log, exp2, and log2, see Expon.h
+  // For functions that compute pow, exp, log, exp2, and log2, see ExponSSE.h
};

#endif  //#ifdef MANTA_SSE

Added: trunk/Core/Math/TrigSSE.cc
==============================================================================
--- (empty file)
+++ trunk/Core/Math/TrigSSE.cc  Tue Sep 18 14:13:07 2007
@@ -0,0 +1,385 @@
+/*
+  For more information, please see: http://software.sci.utah.edu
+
+  The MIT License
+
+  Copyright (c) 2005-2007
+  Scientific Computing and Imaging Institute, University of Utah
+
+  License for the specific language governing rights and limitations under
+  Permission is hereby granted, free of charge, to any person obtaining a
+  copy of this software and associated documentation files (the "Software"),
+  to deal in the Software without restriction, including without limitation
+  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+  and/or sell copies of the Software, and to permit persons to whom the
+  Software is furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included
+  in all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+  DEALINGS IN THE SOFTWARE.
+*/
+
+#include <MantaSSE.h>
+#include <Core/Math/TrigSSE.h>
+
+#ifdef MANTA_SSE
+
+///////////////////////
+// SSE sine and cosine, based off of code available at:
+// http://gruntthepeon.free.fr/ssemath/
+// with the following license:
+
+/* Copyright (C) 2007  Julien Pommier
+
+   This software is provided 'as-is', without any express or implied
+   warranty.  In no event will the authors be held liable for any damages
+   arising from the use of this software.
+
+   Permission is granted to anyone to use this software for any purpose,
+   including commercial applications, and to alter it and redistribute it
+   freely, subject to the following restrictions:
+
+   1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+   2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+   3. This notice may not be removed or altered from any source distribution.
+
+   (this is the zlib license)
+*/
+
+// end license
+////////////////////////////
+
+typedef union xmm_mm_union {
+  __m128 xmm;
+  __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
+  }
+
+sse_t Manta::sin4(sse_t x) {
+  typedef __m128 v4sf;
+  typedef __m64 v2si;
+
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+  v2si mm0, mm1, mm2, mm3;
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, _mm_absmask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, _mm_signbit);
+
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+  mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  //printf("\nsin_ps: mm2:mm3 = "); print2i(mm2); print2i(mm3); printf("\n");
+  //printf("sin_ps: y="); print4(y); printf("\n");
+
+
+  /* get the swap sign flag */
+  mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
+  mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf swap_sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = _ps_minus_cephes_DP1;
+  xmm2 = _ps_minus_cephes_DP2;
+  xmm3 = _ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = _ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, _ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, _ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, _mm_one);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = _ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, _ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, _ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+  _mm_empty(); /* good-bye mmx */
+  return y;
+}
+
+sse_t Manta::cos4(sse_t x) {
+  typedef __m128 v4sf;
+  typedef __m64 v2si;
+
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+  v2si mm0, mm1, mm2, mm3;
+  /* take the absolute value */
+  x = _mm_and_ps(x, _mm_absmask);
+
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+  mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *((v2si*)&_mm_two_si128));
+  mm3 = _mm_sub_pi32(mm3, *((v2si*)&_mm_two_si128));
+
+  /* get the swap sign flag in mm0:mm1 and the
+     polynom selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *((v2si*)&_mm_four_si128));
+  mm1 = _mm_andnot_si64(mm3, *((v2si*)&_mm_four_si128));
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = _ps_minus_cephes_DP1;
+  xmm2 = _ps_minus_cephes_DP2;
+  xmm3 = _ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = _ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, _ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, _ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, _mm_one);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = _ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, _ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, _ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  _mm_empty(); /* good-bye mmx */
+  return y;
+}
+
+void Manta::sincos4(sse_t x, sse_t* s, sse_t* c) {
+  typedef __m128 v4sf;
+  typedef __m64 v2si;
+
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, _mm_absmask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_signbit);
+
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+  /* store the integer part of y in mm0:mm1 */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+  mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+  mm4 = mm2;
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine */
+  mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
+  mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  v4sf swap_sign_bit_sin;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynom selection mask for the sine */
+
+  mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+  mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf poly_mask;
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = _ps_minus_cephes_DP1;
+  xmm2 = _ps_minus_cephes_DP2;
+  xmm3 = _ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+
+  /* get the sign flag for the cosine */
+
+  mm4 = _mm_sub_pi32(mm4, *((v2si*)&_mm_two_si128));
+  mm5 = _mm_sub_pi32(mm5, *((v2si*)&_mm_two_si128));
+  mm4 = _mm_andnot_si64(mm4, *((v2si*)&_mm_four_si128));
+  mm5 = _mm_andnot_si64(mm5, *((v2si*)&_mm_four_si128));
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  v4sf sign_bit_cos;
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  v4sf z = _mm_mul_ps(x,x);
+  y = _ps_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, _ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, _ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, _mm_one);
+
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = _ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, _ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, _ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */
+  xmm3 = poly_mask;
+  v4sf ysin2 = _mm_and_ps(xmm3, y2);
+  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+
+  xmm1 = _mm_add_ps(ysin1,ysin2);
+  xmm2 = _mm_add_ps(y,y2);
+
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+}
+
+#endif

Added: trunk/Core/Math/TrigSSE.h
==============================================================================
--- (empty file)
+++ trunk/Core/Math/TrigSSE.h   Tue Sep 18 14:13:07 2007
@@ -0,0 +1,48 @@
+/*
+  For more information, please see: http://software.sci.utah.edu
+
+  The MIT License
+
+  Copyright (c) 2005-2007
+  Scientific Computing and Imaging Institute, University of Utah
+
+  License for the specific language governing rights and limitations under
+  Permission is hereby granted, free of charge, to any person obtaining a
+  copy of this software and associated documentation files (the "Software"),
+  to deal in the Software without restriction, including without limitation
+  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+  and/or sell copies of the Software, and to permit persons to whom the
+  Software is furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included
+  in all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+  DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef Manta_Core_TrigSSE_H__
+#define Manta_Core_TrigSSE_H__
+
+#include <MantaSSE.h>
+#include <Core/Math/SSEDefs.h>
+
+namespace Manta {
+
+#ifdef MANTA_SSE
+  sse_t sin4(sse_t x);
+  sse_t cos4(sse_t x);
+  // Computes both the sin and cos of a given value.  This is useful
+  // if you want both value, because many of the intermediate
+  // computation can be shared.
+  void sincos4(sse_t x, sse_t* s, sse_t* c);
+#endif
+
+} // end namespace Manta
+
+#endif // #ifndef Manta_Core_TrigSSE_H__

Modified: trunk/Model/Cameras/ThinLensCamera.cc
==============================================================================
--- trunk/Model/Cameras/ThinLensCamera.cc       (original)
+++ trunk/Model/Cameras/ThinLensCamera.cc       Tue Sep 18 14:13:07 2007
@@ -12,6 +12,7 @@
#include <Core/Geometry/AffineTransform.h>
#include <Core/Math/MiscMath.h>
#include <Core/Math/Trig.h>
+#include <Core/Math/TrigSSE.h>
#include <Core/Util/Assert.h>
#include <MantaSSE.h>
#include <iostream>
@@ -82,7 +83,7 @@
       haveCamera = true;
     } else if(arg == "-focal_length"){
       if(!getArg<Real>(i, args, focal_length))
-        throw IllegalArgument("ThinLensCamera -focal_distance", i, args);
+        throw IllegalArgument("ThinLensCamera -focal_length", i, args);
       haveCamera = true;
     } else {
       throw IllegalArgument("ThinLensCamera", i, args);

[Manta] r1726 - in trunk: Core Core/Math Model/Cameras, bigler, 09/18/2007

Archive powered by MHonArc 2.6.16.