Text archives Help
- From: bigler@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [Manta] r1723 - in trunk: Core Core/Math Engine/PixelSamplers
- Date: Wed, 12 Sep 2007 07:46:56 -0600 (MDT)
Author: bigler
Date: Wed Sep 12 07:46:52 2007
New Revision: 1723
Added:
trunk/Core/Math/ExponSSE.cc
trunk/Core/Math/ExponSSE.h
Modified:
trunk/Core/CMakeLists.txt
trunk/Core/Math/SSEDefs.cc
trunk/Core/Math/SSEDefs.h
trunk/Engine/PixelSamplers/JitterSampler.cc
Log:
Core/CMakeLists.txt
Core/Math/ExponSSE.cc
Core/Math/ExponSSE.h
Added Exponent functions: Pow, Exp, Log, Exp2, Log2. There are
single scalar and sse versions of all functions. There are also
versions that call math.h in between unpacking and packing to SSE.
Core/Math/SSEDefs.cc
Core/Math/SSEDefs.h
Move the implementations of sin4, cos4, and sincos4 to the .cc file.
Engine/PixelSamplers/JitterSampler.cc
Delete the set of random number generators before allocating more.
Modified: trunk/Core/CMakeLists.txt
==============================================================================
--- trunk/Core/CMakeLists.txt (original)
+++ trunk/Core/CMakeLists.txt Wed Sep 12 07:46:52 2007
@@ -45,6 +45,8 @@
Math/CatmullRomInterpolator.h
Math/CheapRNG.cc
Math/CheapRNG.h
+ Math/ExponSSE.cc
+ Math/ExponSSE.h
Math/HaltonSequence.cc
Math/HaltonSequence.h
Math/KorobovRNG.cc
Added: trunk/Core/Math/ExponSSE.cc
==============================================================================
--- (empty file)
+++ trunk/Core/Math/ExponSSE.cc Wed Sep 12 07:46:52 2007
@@ -0,0 +1,463 @@
+#include <Core/Math/ExponSSE.h>
+
+// Don't set this outside the range [1,5]
+#ifndef SERIES_ITERATIONS
+#define SERIES_ITERATIONS 3
+#endif
+
+#ifndef NUM_LN_SERIES
+#define NUM_LN_SERIES SERIES_ITERATIONS
+#endif
+
+#ifndef NUM_EXP_SERIES
+#define NUM_EXP_SERIES SERIES_ITERATIONS
+#endif
+
+
+// Math taken from:
+//
+//
http://en.wikipedia.org/wiki/Natural_log#Numerical_value
+//
+// We are computing ln (x) as:
+//
+// 2*( (z-1)/(z+1) + 1/3 * (z-1)/(z+1)^3 + 1/5 * (z-1)/(z+1)^5 ... )
+//
+// The first (z-1)/(z+1) is factored out leaving:
+//
+// 2*(z-1)/(z+1)*( 1 + 1/3 * (z-1)/(z+1)^2 + 1/5 * (z-1)/(z+1)^4 ... )
+//
+float Manta::Logf(float val, int series)
+{
+ float zOverZ = (val-1)/(val+1);
+ float sum = 1;
+ if (series > 1) {
+ float zOverZ2 = zOverZ*zOverZ;
+ float mult = 1;
+ for(int s = 2; s <= series; ++s) {
+ mult *= zOverZ2;
+ sum += 1.0f/(s*2-1) * mult;
+ }
+ }
+ return 2.f * zOverZ * sum;
+}
+
+float Manta::Logf(float val)
+{
+ float zOverZ = (val-1)/(val+1);
+ float sum = 1;
+#if (NUM_LN_SERIES >= 2)
+ float zOverZ2 = zOverZ*zOverZ;
+ float mult = zOverZ2;
+ sum += 0.33333333f * mult; // (1/3) * zOverZ^2
+#endif
+#if (NUM_LN_SERIES >= 3)
+ mult *= zOverZ2;
+ sum += 0.2f * mult; // (1/5) * zOverZ^4
+#endif
+#if (NUM_LN_SERIES >= 4)
+ mult *= zOverZ2;
+ sum += 0.14285714f * mult; // (1/7) * zOverZ^6
+#endif
+#if (NUM_LN_SERIES >= 5)
+ mult *= zOverZ2;
+ sum += 0.11111111f * mult; // (1/9) * zOverZ^8
+#endif
+ return 2.f * zOverZ * sum;
+}
+
+#ifdef MANTA_SSE
+__m128 Manta::LogSSE(__m128 val)
+{
+ // float zOverZ = (val-1)/(val+1);
+ __m128 zOverZ = _mm_div_ps(_mm_sub_ps(val, _mm_set_ps1(1.f)),
+ _mm_add_ps(val, _mm_set_ps1(1.f)));
+ // float sum = 1;
+ __m128 sum = _mm_set_ps1(1.f);
+#if (NUM_LN_SERIES >= 2)
+ // float zOverZ2 = zOverZ*zOverZ;
+ __m128 zOverZ2 = _mm_mul_ps(zOverZ, zOverZ);
+ // float mult = zOverZ2;
+ __m128 mult = zOverZ2;
+ // sum += 0.33333333f * mult; // (1/3) * zOverZ^2
+ sum = _mm_add_ps(sum, _mm_mul_ps(mult, _mm_set_ps1(0.333333333f)));
+#endif
+#if (NUM_LN_SERIES >= 3)
+ // mult *= zOverZ2;
+ mult = _mm_mul_ps(mult, zOverZ2);
+ // sum += 0.2f * mult; // (1/5) * zOverZ^4
+ sum = _mm_add_ps(sum, _mm_mul_ps(mult, _mm_set_ps1(0.2f)));
+#endif
+#if (NUM_LN_SERIES >= 4)
+ // mult *= zOverZ2;
+ mult = _mm_mul_ps(mult, zOverZ2);
+ // sum += 0.14285714f * mult; // (1/7) * zOverZ^6
+ sum = _mm_add_ps(sum, _mm_mul_ps(mult, _mm_set_ps1(0.14285714f)));
+#endif
+#if (NUM_LN_SERIES >= 5)
+ // mult *= zOverZ2;
+ mult = _mm_mul_ps(mult, zOverZ2);
+ // sum += 0.11111111f * mult; // (1/9) * zOverZ^8
+ sum = _mm_add_ps(sum, _mm_mul_ps(mult, _mm_set_ps1(0.11111111f)));
+#endif
+ // return 2.f * zOverZ * sum;
+
+ // I'm arranging the multiplies, so that 2*zOverZ could be scheduled
+ // earlier, since it is invariant on sum. (2*(zOverZ*sum)) would
+ // have to wait for sum to be computed before it could do
+ // multiply by 2.
+ return _mm_mul_ps(_mm_mul_ps(_mm_set_ps1(2.f), zOverZ), sum);
+}
+#endif // #ifdef MANTA_SSE
+
+// The math for all of this came from this web page:
+//
+//
http://en.wikipedia.org/wiki/Exponential_function#Formal_definition
+//
+// e^x = sum(n=0..inf)((x^n)/n!)
+//
+// This series doesn't converge very quickly for numbers greater
+// than 1, so you should use the Expf function below.
+float Manta::ExpSeries(float val, int series)
+{
+ float sum = 1;
+ float xpow = 1;
+ float fact = 1;
+ for(int i = 1; i < series; ++i) {
+ fact *= i;
+ xpow *= val;
+ sum += xpow/fact;
+ }
+ return sum;
+}
+
+float Manta::ExpSeries(float val)
+{
+ float sum = 1 + val; // series 0, 1
+#if (NUM_EXP_SERIES >= 2)
+ float xpow = val*val;
+ sum += xpow * 0.5f; // x^2/2!
+#endif
+#if (NUM_EXP_SERIES >= 3)
+ xpow *= val;
+ sum += xpow * 0.16666666667f; // x^3/3!
+#endif
+#if (NUM_EXP_SERIES >= 4)
+ xpow *= val;
+ sum += xpow * 0.041666666667f; // x^4/4!
+#endif
+#if (NUM_EXP_SERIES >= 5)
+ xpow *= val;
+ sum += xpow * 0.0083333333333f; // x^5/5!
+#endif
+ return sum;
+}
+
+#ifdef MANTA_SSE
+__m128 Manta::ExpSeriesSSE(__m128 val)
+{
+ // float sum = 1 + val; // series 0, 1
+ __m128 sum = _mm_add_ps(_mm_set_ps1(1.f), val);
+#if (NUM_EXP_SERIES >= 2)
+ // float xpow = val * val;
+ __m128 xpow = _mm_mul_ps(val, val);
+ // sum += xpow * 0.5f; // x^2/2!
+ sum = _mm_add_ps(sum, _mm_mul_ps(xpow, _mm_set_ps1(0.5f)));
+#endif
+#if (NUM_EXP_SERIES >= 3)
+ // xpow *= val;
+ xpow = _mm_mul_ps(xpow, val);
+ // sum += xpow * 0.16666666667f; // x^3/3!
+ sum = _mm_add_ps(sum, _mm_mul_ps(xpow, _mm_set_ps1(0.16666666667f)));
+#endif
+#if (NUM_EXP_SERIES >= 4)
+ // xpow *= val;
+ xpow = _mm_mul_ps(xpow, val);
+ // sum += xpow * 0.041666666667f; // x^4/4!
+ sum = _mm_add_ps(sum, _mm_mul_ps(xpow, _mm_set_ps1(0.041666666667f)));
+#endif
+#if (NUM_EXP_SERIES >= 5)
+ // xpow *= val;
+ xpow = _mm_mul_ps(xpow, val);
+ // sum += xpow * 0.0083333333333f; // x^5/5!
+ sum = _mm_add_ps(sum, _mm_mul_ps(xpow, _mm_set_ps1(0.0083333333333f)));
+#endif
+ return sum;
+}
+#endif
+
+// The problem with the ExpSeries is that it is slow to converge for
+// numbers larger than one. This employs a trick that we can
+// exploit with regard to how floating point numbers are
+// represented to speed up convergence.
+//
+// Taken from:
+//
http://en.wikipedia.org/wiki/Exponential_function#Computing_exp.28x.29_for_real_x
+//
+// y = m*2^n = e^x
+// n = floor(x/log(2)) // note this is the natural log
+// u = x - n * log(2)
+
+// The number u is small and in the range of (0, log(2)), so the
+// exp series will converge quickly.
+//
+// m = e^u = ExpSeries(u)
+//
+// Since n is a power of 2 we can stuff this into the exponent
+// portion of the floating point number. We can do all this with
+// bit twiddling.
+
+float Manta::Expf(float val, int series)
+{
+ float log2_val = logf(2);
+ float n = floorf(val/log2_val);
+ float u = val - n*log2_val;
+ union {
+ float f;
+ int i;
+ } m;
+ m.f = ExpSeries(u, series);
+ int nInt = (int)n + 127;
+ // We can run into problems if the exponent is sufficiently small
+ // such that it causes an underflow. This test caps it off.
+ if (nInt < 0) nInt = 0;
+ unsigned int exp_bits = nInt << 23;
+ unsigned int exp_bits_masked = exp_bits & 0x7F800000;
+ unsigned int m_bits = m.i;
+ unsigned int m_bits_masked = m_bits & 0x807FFFFF;
+ union {
+ float f;
+ unsigned int i;
+ } exp_val2_bits;
+ exp_val2_bits.i = m_bits_masked | exp_bits_masked;
+ float exp_val2 = exp_val2_bits.f;
+ return exp_val2;
+}
+
+float Manta::Expf(float val)
+{
+ float n = floorf(val*1.44269503997304722501f); // 1/logf(2)
+ float u = val - n*0.693147181f; // (logf(2))
+ union {
+ float f;
+ int i;
+ } m;
+ m.f = ExpSeries(u);
+ int nInt = (int)n + 127;
+ if (nInt < 0) nInt = 0;
+ unsigned int exp_bits = nInt << 23;
+ unsigned int exp_bits_masked = exp_bits & 0x7F800000;
+ unsigned int m_bits = m.i;
+ unsigned int m_bits_masked = m_bits & 0x807FFFFF;
+ union {
+ float f;
+ unsigned int i;
+ } exp_val2_bits;
+ exp_val2_bits.i = m_bits_masked | exp_bits_masked;
+ float exp_val2 = exp_val2_bits.f;
+ return exp_val2;
+}
+
+#ifdef MANTA_SSE
+// Unline the Expf functions, this code contains no branches.
+__m128 Manta::ExpSSE(__m128 val)
+{
+ // float log2_val = logf(2); // = 0.693147181f
+ // float n = floorf(val/log2_val);
+ __m128 n = floorSSE(_mm_div_ps(val, _mm_set_ps1(0.693147181f)));
+ // float u = val - n*log2_val;
+ __m128 u = _mm_sub_ps(val, _mm_mul_ps(n, _mm_set_ps1(0.693147181f)));
+ __m128 m = ExpSeriesSSE(u);
+ // int nInt = (int)n + 127;
+ __m128i nInt = _mm_add_epi32(_mm_cvttps_epi32(n), _mm_set1_epi32(127));
+ // if (nInt < 0) nInt = 0;
+ __m128i lessThanZero = _mm_cmplt_epi32(nInt, _mm_setzero_si128());
+ nInt = _mm_andnot_si128(lessThanZero, nInt);
+ // unsigned int exp_bits = nInt << 23;
+ __m128i exp_bits = _mm_slli_epi32(nInt, 23);
+ // unsigned int exp_bits_masked = exp_bits & 0x7F800000;
+ __m128i exp_bits_masked =
_mm_and_si128(_mm_set1_epi32(0x7F800000),exp_bits);
+ // unsigned int m_bits = float2bits(m);
+ __m128i m_bits = _mm_castps_si128(m);
+ // unsigned int m_bits_masked = m_bits & 0x807FFFFF;
+ __m128i m_bits_masked = _mm_andnot_si128(_mm_set1_epi32(0x7F800000),
m_bits);
+ // unsigned int exp_val2_bits = m_bits_masked | exp_bits_masked;
+ __m128i exp_val2_bits = _mm_or_si128(m_bits_masked, exp_bits_masked);
+ // float exp_val2 = bits2float(exp_val2_bits);
+ __m128 exp_val2 = _mm_castsi128_ps(exp_val2_bits);
+ return exp_val2;
+}
+#endif
+
+// const float log_correction_factor = .325f;
+// const float pow_correction_factor = .33979f;
+const float log_correction_factor = 0.346607f;
+const float pow_correction_factor = 0.33971f;
+
+const bool print_stuff = false;
+
+float Manta::Log2f(float val)
+{
+ float LogBodge=log_correction_factor;
+ union {
+ float f;
+ int i;
+ } val_union;
+ val_union.f = val;
+ float x, y;
+ x = val_union.i;
+ x *= 1.0/(1<<23); //1/pow(2,23);
+ x = x-127;
+
+ y=x-floorf(x);
+ y=(y-y*y)*LogBodge;
+ // cerr << "myLog2("<<val<<") = "<<x+y<<"\n";
+ return x+y;
+}
+
+float Manta::Pow2f(float val)
+{
+ float PowBodge=pow_correction_factor;
+ union {
+ float f;
+ int i;
+ } x;
+ float y=val-floorf(val);
+ y=(y-y*y)*PowBodge;
+
+ float expon = val+127;
+ // printf("expon = %f, i = %f, y = %f\n", expon, i, y);
+ if (expon < 1) {
+ //printf("expon = %f, val = %f, y = %f\n", expon, val, y);
+ expon = 1;
+ }
+ x.f=expon-y;
+ // printf("x = "); printBinary(float2bits(x)); printf("\n");
+ x.f*= (1<<23); //pow(2,23);
+ x.i = (int)x.f;
+ if (print_stuff) printf("myPow2(%g)= %g\n", val, x.f);
+ return x.f;
+}
+
+#ifdef MANTA_SSE
+__m128 Manta::Log2SSE(__m128 val)
+{
+ __m128 LogBodge = _mm_set1_ps(log_correction_factor);
+ __m128 x = _mm_cvtepi32_ps(_mm_castps_si128(val));
+ x = _mm_mul_ps(x, _mm_set1_ps(1.f/(1<<23)));
+ x = _mm_sub_ps(x, _mm_set1_ps(127.f));
+ __m128 y = _mm_sub_ps(x, floorSSE(x));
+ y = _mm_mul_ps(LogBodge, _mm_sub_ps(y, _mm_mul_ps(y, y)));
+ return _mm_add_ps(x,y);
+}
+
+__m128 Manta::Pow2SSE(__m128 val)
+{
+ __m128 PowBodge = _mm_set1_ps(pow_correction_factor);
+ __m128 y = _mm_sub_ps(val, floorSSE(val));
+ y = _mm_mul_ps(PowBodge, _mm_sub_ps(y, _mm_mul_ps(y, y)));
+ __m128 expon = _mm_add_ps(val, _mm_set1_ps(127.f));
+ __m128 mask = _mm_cmplt_ps(expon, _mm_set1_ps(1.f));
+ expon = _mm_or_ps(_mm_and_ps (mask, _mm_set1_ps(1.f)),
+ _mm_andnot_ps(mask, expon));
+ __m128 x = _mm_sub_ps(expon, y);
+ x = _mm_mul_ps(x, _mm_set1_ps(1<<23));
+// __m128i int_result = _mm_cvttps_epi32(x);
+// __m128 result;
+// _mm_store_si128((__m128i*)&result, int_result);
+ return _mm_castsi128_ps(_mm_cvttps_epi32(x));
+}
+
+// This will unpack, call powf, and pack.
+__m128 Manta::PowSSEMathH(__m128 a, __m128 b)
+{
+ // Unpack the values
+#if 1
+ float* base_unpack = (float*)(&a);
+ float* expon_unpack = (float*)(&b);
+#else
+ MANTA_ALIGN(16) float base_unpack[4];
+ MANTA_ALIGN(16) float expon_unpack[4];
+ _mm_store_ps(base_unpack, a);
+ _mm_store_ps(expon_unpack, b);
+#endif
+ MANTA_ALIGN(16) float result_unpack[4];
+ // Do the computation
+ for(unsigned int i = 0; i < 4; ++i)
+ result_unpack[i] = powf(base_unpack[i], expon_unpack[i]);
+ // Pack the results
+ return _mm_load_ps(result_unpack);
+}
+
+// The same, but uses a mask to avoid calls to powf.
+__m128 Manta::PowSSEMathH(__m128 a, __m128 b, __m128 mask)
+{
+ // Unpack the values
+ float* base_unpack = (float*)(&a);
+ float* expon_unpack = (float*)(&b);
+ MANTA_ALIGN(16) float result_unpack[4];
+ int intmask = _mm_movemask_ps(mask);
+ // Do the computation
+ for(unsigned int i = 0; i < 4; ++i)
+ if (intmask & (1 << i))
+ result_unpack[i] = powf(base_unpack[i], expon_unpack[i]);
+ // Pack the results
+ return _mm_load_ps(result_unpack);
+}
+
+// This will unpack, call expf, and pack.
+__m128 Manta::ExpSSEMathH(__m128 val)
+{
+ // Unpack the values
+ float* val_unpack = (float*)(&val);
+ MANTA_ALIGN(16) float result_unpack[4];
+ // Do the computation
+ for(unsigned int i = 0; i < 4; ++i)
+ result_unpack[i] = expf(val_unpack[i]);
+ // Pack the results
+ return _mm_load_ps(result_unpack);
+}
+
+// The same, but uses a mask to avoid calls to expf.
+__m128 Manta::ExpSSEMathH(__m128 val, __m128 mask)
+{
+ // Unpack the values
+ float* val_unpack = (float*)(&val);
+ MANTA_ALIGN(16) float result_unpack[4];
+ // Do the computation
+ int intmask = _mm_movemask_ps(mask);
+ for(unsigned int i = 0; i < 4; ++i)
+ if (intmask & (1 << i))
+ result_unpack[i] = expf(val_unpack[i]);
+ // Pack the results
+ return _mm_load_ps(result_unpack);
+}
+
+// This will unpack, call logf, and pack.
+__m128 Manta::LogSSEMathH(__m128 val)
+{
+ // Unpack the values
+ float* val_unpack = (float*)(&val);
+ MANTA_ALIGN(16) float result_unpack[4];
+ // Do the computation
+ for(unsigned int i = 0; i < 4; ++i)
+ result_unpack[i] = logf(val_unpack[i]);
+ // Pack the results
+ return _mm_load_ps(result_unpack);
+}
+
+// The same, but uses a mask to avoid calls to logf.
+__m128 Manta::LogSSEMathH(__m128 val, __m128 mask)
+{
+ // Unpack the values
+ float* val_unpack = (float*)(&val);
+ MANTA_ALIGN(16) float result_unpack[4];
+ // Do the computation
+ int intmask = _mm_movemask_ps(mask);
+ for(unsigned int i = 0; i < 4; ++i)
+ if (intmask & (1 << i))
+ result_unpack[i] = logf(val_unpack[i]);
+ // Pack the results
+ return _mm_load_ps(result_unpack);
+}
+#endif
+
Added: trunk/Core/Math/ExponSSE.h
==============================================================================
--- (empty file)
+++ trunk/Core/Math/ExponSSE.h Wed Sep 12 07:46:52 2007
@@ -0,0 +1,78 @@
+
+#ifndef Manta_Core_ExponSSE_h
+#define Manta_Core_ExponSSE_h
+
+#include <MantaSSE.h>
+
+#ifdef MANTA_SSE
+#include <emmintrin.h>
+#include <Core/Util/Align.h>
+#include <Core/Math/SSEDefs.h>
+#endif // MANTA_SSE
+
+namespace Manta {
+
+ float Logf(float val, int series);
+ float Logf(float val);
+#ifdef MANTA_SSE
+ __m128 LogSSE(__m128 val);
+#endif
+
+ float ExpSeries(float val, int series);
+ float ExpSeries(float val);
+#ifdef MANTA_SSE
+ __m128 ExpSeriesSSE(__m128 val);
+#endif
+
+ float Expf(float val, int series);
+ float Expf(float val);
+#ifdef MANTA_SSE
+ // Unline the Expf functions, this code contains no branches.
+ __m128 ExpSSE(__m128 val);
+#endif
+
+ float Log2f(float val);
+ float Pow2f(float val);
+#ifdef MANTA_SSE
+ __m128 Log2SSE(__m128 val);
+ __m128 Pow2SSE(__m128 val);
+#endif
+
+ // Pow is computed a^b = e^(b*ln(a))
+ inline float Powf(float a, float b)
+ {
+ return Expf(b*Logf(a));
+ }
+
+ inline float Powfv2(float a, float b)
+ {
+ return Pow2f(b*Log2f(a));
+ }
+
+#ifdef MANTA_SSE
+ inline __m128 PowSSE(__m128 a, __m128 b)
+ {
+ return ExpSSE(_mm_mul_ps(b, LogSSE(a)));
+ }
+
+ inline __m128 PowSSEv2(__m128 a, __m128 b)
+ {
+ return Pow2SSE(_mm_mul_ps(b, Log2SSE(a)));
+ }
+
+ // This will unpack, call powf, and pack.
+ __m128 PowSSEMathH(__m128 a, __m128 b);
+ // The same, but uses a mask to avoid calls to powf.
+ __m128 PowSSEMathH(__m128 a, __m128 b, __m128 mask);
+ //
+ __m128 ExpSSEMathH(__m128 val);
+ __m128 ExpSSEMathH(__m128 val, __m128 mask);
+ //
+ __m128 LogSSEMathH(__m128 val);
+ __m128 LogSSEMathH(__m128 val, __m128 mask);
+
+#endif
+
+} // end namespace Manta
+
+#endif // #ifndef Manta_Core_ExponSSE_h
Modified: trunk/Core/Math/SSEDefs.cc
==============================================================================
--- trunk/Core/Math/SSEDefs.cc (original)
+++ trunk/Core/Math/SSEDefs.cc Wed Sep 12 07:46:52 2007
@@ -3,10 +3,13 @@
#ifdef MANTA_SSE
#include <iostream>
+#include <xmmintrin.h>
namespace std {
std::ostream& operator<<(std::ostream& os, __m128 t)
{
+ // If you have problems getting out the correct value look at the
+ // __m128i version for an alternative implementation.
MANTA_ALIGN(16) float f[4];
_mm_store_ps(f,t);
os << f[0] << ", " << f[1] << ", " << f[2] << ", " << f[3];
@@ -19,17 +22,373 @@
// single version.
std::ostream& operator<<(std::ostream& os, __m128i t)
{
- //MANTA_ALIGN(16) int i[4];
- //_mm_store_si128((__m128i*)&i[0],t);
- //os << "__m128i = " << i[0] << ", " << i[1] << ", " << i[2] << ", " <<
i[3];
- os << "__m128 = ";
- for (int i = 0; i < 4; i++) {
- os << ((int*)&t)[i];
- if (i != 3)
- os << ", ";
- }
+ // This code caused bad values to be printed out. Don't do it this way.
+ //
+ // MANTA_ALIGN(16) int i[4];
+ // _mm_store_si128((__m128i*)&i[0],t);
+ // os << "__m128i = " << i[0] << ", " << i[1] << ", " << i[2] << ", " <<
i[3];
+ os << "__m128i = ";
+ for (int i = 0; i < 4; i++) {
+ os << ((int*)&t)[i];
+ if (i != 3)
+ os << ", ";
+ }
return os;
}
#endif
}
+
+
+///////////////////////
+// SSE sine and cosine, based off of code available at:
+//
http://gruntthepeon.free.fr/ssemath/
+// with the following license:
+
+/* Copyright (C) 2007 Julien Pommier
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ (this is the zlib license)
+*/
+
+// end license
+////////////////////////////
+
+typedef union xmm_mm_union {
+ __m128 xmm;
+ __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
+ xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
+ }
+
+sse_t Manta::sin4(sse_t x) {
+ typedef __m128 v4sf;
+ typedef __m64 v2si;
+
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+ v2si mm0, mm1, mm2, mm3;
+ sign_bit = x;
+ /* take the absolute value */
+ x = _mm_and_ps(x, _mm_absmask);
+ /* extract the sign bit (upper one) */
+ sign_bit = _mm_and_ps(sign_bit, _mm_signbit);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm2 = _mm_movehl_ps(xmm2, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm2);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+ mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+ //printf("\nsin_ps: mm2:mm3 = "); print2i(mm2); print2i(mm3); printf("\n");
+ //printf("sin_ps: y="); print4(y); printf("\n");
+
+
+ /* get the swap sign flag */
+ mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
+ mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+
+ /* get the polynom selection mask
+ there is one polynom for 0 <= x <= Pi/4
+ and another one for Pi/4<x<=Pi/2
+
+ Both branches will be computed.
+ */
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+ v4sf swap_sign_bit, poly_mask;
+ COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+ sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = _ps_minus_cephes_DP1;
+ xmm2 = _ps_minus_cephes_DP2;
+ xmm3 = _ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ x = _mm_add_ps(x, xmm1);
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ y = _ps_coscof_p0;
+ v4sf z = _mm_mul_ps(x,x);
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, _mm_one);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = _ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, x);
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+ y = _mm_andnot_ps(xmm3, y);
+ y = _mm_add_ps(y,y2);
+ /* update the sign */
+ y = _mm_xor_ps(y, sign_bit);
+ _mm_empty(); /* good-bye mmx */
+ return y;
+}
+
+sse_t Manta::cos4(sse_t x) {
+ typedef __m128 v4sf;
+ typedef __m64 v2si;
+
+ v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+ v2si mm0, mm1, mm2, mm3;
+ /* take the absolute value */
+ x = _mm_and_ps(x, _mm_absmask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm2 = _mm_movehl_ps(xmm2, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm2);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+ mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+ mm2 = _mm_sub_pi32(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_sub_pi32(mm3, *((v2si*)&_mm_two_si128));
+
+ /* get the swap sign flag in mm0:mm1 and the
+ polynom selection mask in mm2:mm3 */
+
+ mm0 = _mm_andnot_si64(mm2, *((v2si*)&_mm_four_si128));
+ mm1 = _mm_andnot_si64(mm3, *((v2si*)&_mm_four_si128));
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+ v4sf sign_bit, poly_mask;
+ COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = _ps_minus_cephes_DP1;
+ xmm2 = _ps_minus_cephes_DP2;
+ xmm3 = _ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ x = _mm_add_ps(x, xmm1);
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ y = _ps_coscof_p0;
+ v4sf z = _mm_mul_ps(x,x);
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, _mm_one);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = _ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, x);
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+ y = _mm_andnot_ps(xmm3, y);
+ y = _mm_add_ps(y,y2);
+ /* update the sign */
+ y = _mm_xor_ps(y, sign_bit);
+
+ _mm_empty(); /* good-bye mmx */
+ return y;
+}
+
+void Manta::sincos4(sse_t x, sse_t* s, sse_t* c) {
+ typedef __m128 v4sf;
+ typedef __m64 v2si;
+
+ v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+ v2si mm0, mm1, mm2, mm3, mm4, mm5;
+ sign_bit_sin = x;
+ /* take the absolute value */
+ x = _mm_and_ps(x, _mm_absmask);
+ /* extract the sign bit (upper one) */
+ sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_signbit);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm3 = _mm_movehl_ps(xmm3, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm3);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
+ mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+ mm4 = mm2;
+ mm5 = mm3;
+
+ /* get the swap sign flag for the sine */
+ mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
+ mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+ v4sf swap_sign_bit_sin;
+ COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+ /* get the polynom selection mask for the sine */
+
+ mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
+ mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+ v4sf poly_mask;
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = _ps_minus_cephes_DP1;
+ xmm2 = _ps_minus_cephes_DP2;
+ xmm3 = _ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ x = _mm_add_ps(x, xmm1);
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+
+ /* get the sign flag for the cosine */
+
+ mm4 = _mm_sub_pi32(mm4, *((v2si*)&_mm_two_si128));
+ mm5 = _mm_sub_pi32(mm5, *((v2si*)&_mm_two_si128));
+ mm4 = _mm_andnot_si64(mm4, *((v2si*)&_mm_four_si128));
+ mm5 = _mm_andnot_si64(mm5, *((v2si*)&_mm_four_si128));
+ mm4 = _mm_slli_pi32(mm4, 29);
+ mm5 = _mm_slli_pi32(mm5, 29);
+ v4sf sign_bit_cos;
+ COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+
+ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ v4sf z = _mm_mul_ps(x,x);
+ y = _ps_coscof_p0;
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, _ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ v4sf tmp = _mm_mul_ps(z, _mm_one_half);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, _mm_one);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ v4sf y2 = _ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, _ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, x);
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ v4sf ysin2 = _mm_and_ps(xmm3, y2);
+ v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+ y2 = _mm_sub_ps(y2,ysin2);
+ y = _mm_sub_ps(y, ysin1);
+
+ xmm1 = _mm_add_ps(ysin1,ysin2);
+ xmm2 = _mm_add_ps(y,y2);
+
+ /* update the sign */
+ *s = _mm_xor_ps(xmm1, sign_bit_sin);
+ *c = _mm_xor_ps(xmm2, sign_bit_cos);
+ _mm_empty(); /* good-bye mmx */
+}
+
+
#endif // MANTA_SSE
Modified: trunk/Core/Math/SSEDefs.h
==============================================================================
--- trunk/Core/Math/SSEDefs.h (original)
+++ trunk/Core/Math/SSEDefs.h Wed Sep 12 07:46:52 2007
@@ -90,7 +90,7 @@
// I'm not sure about the high versus low, but the code does what
// GCC does (for example, when copying 64 bit pointers).
static inline
- __m128i _mm_set1_epi64x(long long val)
+ __m128i _mm_set1_epi64x(long long val)
{
const int high = (0xFFFFFFFF00000000L & val) >> 32 ;
const int low = ( 0xFFFFFFFFL & val);
@@ -453,359 +453,16 @@
return product;
#endif
}
-
- ///////////////////////
- // SSE sine and cosine, based off of code available at:
- //
http://gruntthepeon.free.fr/ssemath/
- // with the following license:
-
- /* Copyright (C) 2007 Julien Pommier
-
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the authors be held liable for any damages
- arising from the use of this software.
-
- Permission is granted to anyone to use this software for any purpose,
- including commercial applications, and to alter it and redistribute it
- freely, subject to the following restrictions:
-
- 1. The origin of this software must not be misrepresented; you must not
- claim that you wrote the original software. If you use this software
- in a product, an acknowledgment in the product documentation would be
- appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and must not
be
- misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source
distribution.
-
- (this is the zlib license)
- */
-
- // end license
- ////////////////////////////
-
- typedef union xmm_mm_union {
- __m128 xmm;
- __m64 mm[2];
- } xmm_mm_union;
-
-#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
- xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
- }
-
- inline sse_t sin4(sse_t x) {
- typedef __m128 v4sf;
- typedef __m64 v2si;
-
- v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
- v2si mm0, mm1, mm2, mm3;
- sign_bit = x;
- /* take the absolute value */
- x = _mm_and_ps(x, _mm_absmask);
- /* extract the sign bit (upper one) */
- sign_bit = _mm_and_ps(sign_bit, _mm_signbit);
-
- /* scale by 4/Pi */
- y = _mm_mul_ps(x, _ps_cephes_FOPI);
-
- /* store the integer part of y in mm0:mm1 */
- xmm2 = _mm_movehl_ps(xmm2, y);
- mm2 = _mm_cvttps_pi32(y);
- mm3 = _mm_cvttps_pi32(xmm2);
-
- /* j=(j+1) & (~1) (see the cephes sources) */
- mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
- mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
- mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
- mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
-
- y = _mm_cvtpi32x2_ps(mm2, mm3);
- //printf("\nsin_ps: mm2:mm3 = "); print2i(mm2); print2i(mm3);
printf("\n");
- //printf("sin_ps: y="); print4(y); printf("\n");
-
-
- /* get the swap sign flag */
- mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
- mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
- mm0 = _mm_slli_pi32(mm0, 29);
- mm1 = _mm_slli_pi32(mm1, 29);
-
- /* get the polynom selection mask
- there is one polynom for 0 <= x <= Pi/4
- and another one for Pi/4<x<=Pi/2
-
- Both branches will be computed.
- */
- mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
- mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-
- v4sf swap_sign_bit, poly_mask;
- COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
- sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = _ps_minus_cephes_DP1;
- xmm2 = _ps_minus_cephes_DP2;
- xmm3 = _ps_minus_cephes_DP3;
- xmm1 = _mm_mul_ps(y, xmm1);
- xmm2 = _mm_mul_ps(y, xmm2);
- xmm3 = _mm_mul_ps(y, xmm3);
- x = _mm_add_ps(x, xmm1);
- x = _mm_add_ps(x, xmm2);
- x = _mm_add_ps(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = _ps_coscof_p0;
- v4sf z = _mm_mul_ps(x,x);
-
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, _ps_coscof_p1);
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, _ps_coscof_p2);
- y = _mm_mul_ps(y, z);
- y = _mm_mul_ps(y, z);
- v4sf tmp = _mm_mul_ps(z, _mm_one_half);
- y = _mm_sub_ps(y, tmp);
- y = _mm_add_ps(y, _mm_one);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- v4sf y2 = _ps_sincof_p0;
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, _ps_sincof_p1);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, _ps_sincof_p2);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_mul_ps(y2, x);
- y2 = _mm_add_ps(y2, x);
-
- /* select the correct result from the two polynoms */
- xmm3 = poly_mask;
- y2 = _mm_and_ps(xmm3, y2); //, xmm3);
- y = _mm_andnot_ps(xmm3, y);
- y = _mm_add_ps(y,y2);
- /* update the sign */
- y = _mm_xor_ps(y, sign_bit);
- _mm_empty(); /* good-bye mmx */
- return y;
- }
-
- inline sse_t cos4(sse_t x) {
- typedef __m128 v4sf;
- typedef __m64 v2si;
-
- v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
- v2si mm0, mm1, mm2, mm3;
- /* take the absolute value */
- x = _mm_and_ps(x, _mm_absmask);
-
- /* scale by 4/Pi */
- y = _mm_mul_ps(x, _ps_cephes_FOPI);
-
- /* store the integer part of y in mm0:mm1 */
- xmm2 = _mm_movehl_ps(xmm2, y);
- mm2 = _mm_cvttps_pi32(y);
- mm3 = _mm_cvttps_pi32(xmm2);
-
- /* j=(j+1) & (~1) (see the cephes sources) */
- mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
- mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
- mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
- mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
-
- y = _mm_cvtpi32x2_ps(mm2, mm3);
-
-
- mm2 = _mm_sub_pi32(mm2, *((v2si*)&_mm_two_si128));
- mm3 = _mm_sub_pi32(mm3, *((v2si*)&_mm_two_si128));
-
- /* get the swap sign flag in mm0:mm1 and the
- polynom selection mask in mm2:mm3 */
-
- mm0 = _mm_andnot_si64(mm2, *((v2si*)&_mm_four_si128));
- mm1 = _mm_andnot_si64(mm3, *((v2si*)&_mm_four_si128));
- mm0 = _mm_slli_pi32(mm0, 29);
- mm1 = _mm_slli_pi32(mm1, 29);
-
- mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
- mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
-
- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-
- v4sf sign_bit, poly_mask;
- COPY_MM_TO_XMM(mm0, mm1, sign_bit);
- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = _ps_minus_cephes_DP1;
- xmm2 = _ps_minus_cephes_DP2;
- xmm3 = _ps_minus_cephes_DP3;
- xmm1 = _mm_mul_ps(y, xmm1);
- xmm2 = _mm_mul_ps(y, xmm2);
- xmm3 = _mm_mul_ps(y, xmm3);
- x = _mm_add_ps(x, xmm1);
- x = _mm_add_ps(x, xmm2);
- x = _mm_add_ps(x, xmm3);
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- y = _ps_coscof_p0;
- v4sf z = _mm_mul_ps(x,x);
-
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, _ps_coscof_p1);
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, _ps_coscof_p2);
- y = _mm_mul_ps(y, z);
- y = _mm_mul_ps(y, z);
- v4sf tmp = _mm_mul_ps(z, _mm_one_half);
- y = _mm_sub_ps(y, tmp);
- y = _mm_add_ps(y, _mm_one);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- v4sf y2 = _ps_sincof_p0;
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, _ps_sincof_p1);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, _ps_sincof_p2);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_mul_ps(y2, x);
- y2 = _mm_add_ps(y2, x);
-
- /* select the correct result from the two polynoms */
- xmm3 = poly_mask;
- y2 = _mm_and_ps(xmm3, y2); //, xmm3);
- y = _mm_andnot_ps(xmm3, y);
- y = _mm_add_ps(y,y2);
- /* update the sign */
- y = _mm_xor_ps(y, sign_bit);
-
- _mm_empty(); /* good-bye mmx */
- return y;
- }
-
- inline void sincos4(sse_t x, sse_t* s, sse_t* c) {
- typedef __m128 v4sf;
- typedef __m64 v2si;
-
- v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
- v2si mm0, mm1, mm2, mm3, mm4, mm5;
- sign_bit_sin = x;
- /* take the absolute value */
- x = _mm_and_ps(x, _mm_absmask);
- /* extract the sign bit (upper one) */
- sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_signbit);
-
- /* scale by 4/Pi */
- y = _mm_mul_ps(x, _ps_cephes_FOPI);
-
- /* store the integer part of y in mm0:mm1 */
- xmm3 = _mm_movehl_ps(xmm3, y);
- mm2 = _mm_cvttps_pi32(y);
- mm3 = _mm_cvttps_pi32(xmm3);
-
- /* j=(j+1) & (~1) (see the cephes sources) */
- mm2 = _mm_add_pi32(mm2, *((v2si*)&_mm_one_si128));
- mm3 = _mm_add_pi32(mm3, *((v2si*)&_mm_one_si128));
- mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_inv_one_si128));
- mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_inv_one_si128));
-
- y = _mm_cvtpi32x2_ps(mm2, mm3);
-
- mm4 = mm2;
- mm5 = mm3;
-
- /* get the swap sign flag for the sine */
- mm0 = _mm_and_si64(mm2, *((v2si*)&_mm_four_si128));
- mm1 = _mm_and_si64(mm3, *((v2si*)&_mm_four_si128));
- mm0 = _mm_slli_pi32(mm0, 29);
- mm1 = _mm_slli_pi32(mm1, 29);
- v4sf swap_sign_bit_sin;
- COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
-
- /* get the polynom selection mask for the sine */
-
- mm2 = _mm_and_si64(mm2, *((v2si*)&_mm_two_si128));
- mm3 = _mm_and_si64(mm3, *((v2si*)&_mm_two_si128));
- mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
- mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
- v4sf poly_mask;
- COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-
- /* The magic pass: "Extended precision modular arithmetic"
- x = ((x - y * DP1) - y * DP2) - y * DP3; */
- xmm1 = _ps_minus_cephes_DP1;
- xmm2 = _ps_minus_cephes_DP2;
- xmm3 = _ps_minus_cephes_DP3;
- xmm1 = _mm_mul_ps(y, xmm1);
- xmm2 = _mm_mul_ps(y, xmm2);
- xmm3 = _mm_mul_ps(y, xmm3);
- x = _mm_add_ps(x, xmm1);
- x = _mm_add_ps(x, xmm2);
- x = _mm_add_ps(x, xmm3);
-
-
- /* get the sign flag for the cosine */
-
- mm4 = _mm_sub_pi32(mm4, *((v2si*)&_mm_two_si128));
- mm5 = _mm_sub_pi32(mm5, *((v2si*)&_mm_two_si128));
- mm4 = _mm_andnot_si64(mm4, *((v2si*)&_mm_four_si128));
- mm5 = _mm_andnot_si64(mm5, *((v2si*)&_mm_four_si128));
- mm4 = _mm_slli_pi32(mm4, 29);
- mm5 = _mm_slli_pi32(mm5, 29);
- v4sf sign_bit_cos;
- COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
-
- sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-
- /* Evaluate the first polynom (0 <= x <= Pi/4) */
- v4sf z = _mm_mul_ps(x,x);
- y = _ps_coscof_p0;
-
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, _ps_coscof_p1);
- y = _mm_mul_ps(y, z);
- y = _mm_add_ps(y, _ps_coscof_p2);
- y = _mm_mul_ps(y, z);
- y = _mm_mul_ps(y, z);
- v4sf tmp = _mm_mul_ps(z, _mm_one_half);
- y = _mm_sub_ps(y, tmp);
- y = _mm_add_ps(y, _mm_one);
-
- /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
- v4sf y2 = _ps_sincof_p0;
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, _ps_sincof_p1);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_add_ps(y2, _ps_sincof_p2);
- y2 = _mm_mul_ps(y2, z);
- y2 = _mm_mul_ps(y2, x);
- y2 = _mm_add_ps(y2, x);
-
- /* select the correct result from the two polynoms */
- xmm3 = poly_mask;
- v4sf ysin2 = _mm_and_ps(xmm3, y2);
- v4sf ysin1 = _mm_andnot_ps(xmm3, y);
- y2 = _mm_sub_ps(y2,ysin2);
- y = _mm_sub_ps(y, ysin1);
-
- xmm1 = _mm_add_ps(ysin1,ysin2);
- xmm2 = _mm_add_ps(y,y2);
-
- /* update the sign */
- *s = _mm_xor_ps(xmm1, sign_bit_sin);
- *c = _mm_xor_ps(xmm2, sign_bit_cos);
- _mm_empty(); /* good-bye mmx */
- }
+ // Some Trig functions
+ sse_t sin4(sse_t x);
+ sse_t cos4(sse_t x);
+ // Computes both the sin and cos of a given value. This is useful
+ // if you want both value, because many of the intermediate
+ // computation can be shared.
+ void sincos4(sse_t x, sse_t* s, sse_t* c);
+ // For functions that compute pow, exp, log, exp2, and log2, see Expon.h
};
#endif //#ifdef MANTA_SSE
Modified: trunk/Engine/PixelSamplers/JitterSampler.cc
==============================================================================
--- trunk/Engine/PixelSamplers/JitterSampler.cc (original)
+++ trunk/Engine/PixelSamplers/JitterSampler.cc Wed Sep 12 07:46:52 2007
@@ -55,6 +55,7 @@
void JitterSampler::setupBegin(const SetupContext& context, int numChannels)
{
channelInfo.resize(numChannels);
+ if (random) delete[] random;
random = new MT_RNG[context.numProcs];
context.renderer->setupBegin(context, numChannels);
}
- [Manta] r1723 - in trunk: Core Core/Math Engine/PixelSamplers, bigler, 09/12/2007
Archive powered by MHonArc 2.6.16.