Manta Interactive Ray Tracer Development Mailing List

Text archives Help


[MANTA] r1455 - trunk/Core/Math


Chronological Thread 
  • From: bigler@sci.utah.edu
  • To: manta@sci.utah.edu
  • Subject: [MANTA] r1455 - trunk/Core/Math
  • Date: Mon, 9 Jul 2007 11:04:16 -0600 (MDT)

Author: bigler
Date: Mon Jul  9 11:04:15 2007
New Revision: 1455

Modified:
   trunk/Core/Math/Noise.cc
Log:

Core/Math/Noise.cc

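  Added a compile-time option (USE_PERMUTATION_UNPACKED) that unpacks the
  SSE integer lanes, does all of the permutation table lookups in scalar
  code, and packs the results back into SSE registers in one go.
  This is faster than Steve's ssehash on my laptop.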


Modified: trunk/Core/Math/Noise.cc
==============================================================================
--- trunk/Core/Math/Noise.cc    (original)
+++ trunk/Core/Math/Noise.cc    Mon Jul  9 11:04:15 2007
@@ -447,13 +447,25 @@
                       _mm_mul_ps(weight,
                                  _mm_sub_ps(d2, d1)));
   }
+
+#define USE_PERMUTATION_UNPACKED 1
+
+  typedef union {
+      unsigned int i[4];
+      __m128i      s;
+  } unsigned_int_ssei;
   
   __m128 ScalarNoiseSSE( const __m128& location_x,
                          const __m128& location_y,
                          const __m128& location_z)
   {
     __m128  offset_of_x  = fracSSE(location_x);
+#if USE_PERMUTATION_UNPACKED
+    unsigned_int_ssei integer_of_x;
+    integer_of_x.s = _mm_and_si128(_mm_cvttps_epi32(_mm_sub_ps(location_x, 
offset_of_x)), _mm_set1_epi32(0xFF));
+#else
     __m128i integer_of_x = _mm_cvttps_epi32(_mm_sub_ps(location_x, 
offset_of_x));
+#endif
     __m128  fade_x       = _mm_mul_ps(_mm_mul_ps(offset_of_x, offset_of_x),
                                       _mm_mul_ps(offset_of_x,
                                                  
_mm_add_ps(_mm_mul_ps(offset_of_x,
@@ -463,59 +475,94 @@
                                                             
_mm_set_ps1(10.f))));
 
     __m128  offset_of_y  = fracSSE(location_y);
+#if USE_PERMUTATION_UNPACKED
+    unsigned_int_ssei integer_of_y;
+    integer_of_y.s = _mm_cvttps_epi32(_mm_sub_ps(location_y, offset_of_y));
+#else
     __m128i integer_of_y = _mm_cvttps_epi32(_mm_sub_ps(location_y, 
offset_of_y));
+#endif
     __m128  fade_y       = _mm_mul_ps(_mm_mul_ps(offset_of_y, offset_of_y), 
_mm_mul_ps(offset_of_y, _mm_add_ps(_mm_mul_ps(offset_of_y, 
_mm_sub_ps(_mm_mul_ps(offset_of_y, _mm_set_ps1(6.f)), _mm_set_ps1(15.f))), 
_mm_set_ps1(10.f))));
 
     __m128  offset_of_z  = fracSSE(location_z);
+#if USE_PERMUTATION_UNPACKED
+    unsigned_int_ssei integer_of_z;
+    integer_of_z.s = _mm_cvttps_epi32(_mm_sub_ps(location_z, offset_of_z));
+#else
     __m128i integer_of_z = _mm_cvttps_epi32(_mm_sub_ps(location_z, 
offset_of_z));
+#endif
     __m128  fade_z       = _mm_mul_ps(_mm_mul_ps(offset_of_z, offset_of_z), 
_mm_mul_ps(offset_of_z, _mm_add_ps(_mm_mul_ps(offset_of_z, 
_mm_sub_ps(_mm_mul_ps(offset_of_z, _mm_set_ps1(6.f)), _mm_set_ps1(15.f))), 
_mm_set_ps1(10.f))));
-    
+
+#if USE_PERMUTATION_UNPACKED
+    unsigned_int_ssei hash_000, hash_001, hash_010, hash_011, hash_100, 
hash_101, hash_110, hash_111;
+    for(unsigned int i = 0; i < 4; ++i) {
+      unsigned int x = integer_of_x.i[i];
+      unsigned int y = integer_of_y.i[i];
+      unsigned int z = integer_of_z.i[i];
+      unsigned int hash_0  = NoiseXPermutationTable[(x)                   ];
+      unsigned int hash_1  = NoiseXPermutationTable[(x+1)           & 0xFF];
+      unsigned int hash_00 = NoiseXPermutationTable[(hash_0 + y)    & 0xFF];
+      unsigned int hash_01 = NoiseXPermutationTable[(hash_0 + y+1)  & 0xFF];
+      unsigned int hash_10 = NoiseXPermutationTable[(hash_1 + y)    & 0xFF];
+      unsigned int hash_11 = NoiseXPermutationTable[(hash_1 + y+1)  & 0xFF];
+      hash_000.i[i]        = NoiseXPermutationTable[(hash_00 + z)   & 0xFF];
+      hash_001.i[i]        = NoiseXPermutationTable[(hash_00 + z+1) & 0xFF];
+      hash_010.i[i]        = NoiseXPermutationTable[(hash_01 + z)   & 0xFF];
+      hash_011.i[i]        = NoiseXPermutationTable[(hash_01 + z+1) & 0xFF];
+      hash_100.i[i]        = NoiseXPermutationTable[(hash_10 + z)   & 0xFF];
+      hash_101.i[i]        = NoiseXPermutationTable[(hash_10 + z+1) & 0xFF];
+      hash_110.i[i]        = NoiseXPermutationTable[(hash_11 + z)   & 0xFF];
+      hash_111.i[i]        = NoiseXPermutationTable[(hash_11 + z+1) & 0xFF];
+    }
+
+    __m128  value_000    = grad(hash_000.s, offset_of_x, offset_of_y, 
offset_of_z);
+    __m128  value_001    = grad(hash_001.s, offset_of_x, offset_of_y, 
_mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
+    __m128  value_010    = grad(hash_010.s, offset_of_x, 
_mm_sub_ps(offset_of_y, _mm_set_ps1(1)), offset_of_z);
+    __m128  value_011    = grad(hash_011.s, offset_of_x, 
_mm_sub_ps(offset_of_y, _mm_set_ps1(1)), _mm_sub_ps(offset_of_z, 
_mm_set_ps1(1)));
+    __m128  value_100    = grad(hash_100.s, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), offset_of_y, offset_of_z);
+    __m128  value_101    = grad(hash_101.s, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), offset_of_y, _mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
+    __m128  value_110    = grad(hash_110.s, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), _mm_sub_ps(offset_of_y, _mm_set_ps1(1)), offset_of_z);
+    __m128  value_111    = grad(hash_111.s, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), _mm_sub_ps(offset_of_y, _mm_set_ps1(1)), 
_mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
+
+#else
 #if 0
     __m128i hash_0       = permutationSSE(integer_of_x);
     __m128i hash_00      = permutationSSE(_mm_add_epi32(hash_0,  
integer_of_y));
     __m128i hash_000     = permutationSSE(_mm_add_epi32(hash_00, 
integer_of_z));
     __m128i hash_001     = 
permutationSSE(_mm_add_epi32(_mm_add_epi32(hash_00, integer_of_z), 
_mm_set1_epi32(1)));
-#else
-    __m128i hash_000 = ssehash(integer_of_x, integer_of_y, integer_of_z);
-    __m128i hash_001 = ssehash(integer_of_x, integer_of_y, 
_mm_add_epi32(integer_of_z, _mm_set1_epi32(1)));
-#endif
-    __m128  value_000    = grad(hash_000, offset_of_x, offset_of_y, 
offset_of_z);
-    __m128  value_001    = grad(hash_001, offset_of_x, offset_of_y, 
_mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
 
-#if 0    
     __m128i hash_01      = permutationSSE(_mm_add_epi32(_mm_add_epi32(hash_0 
, integer_of_y), _mm_set1_epi32(1)));
     __m128i hash_010     = permutationSSE(_mm_add_epi32(hash_01, 
integer_of_z));
     __m128i hash_011     = 
permutationSSE(_mm_add_epi32(_mm_add_epi32(hash_01, integer_of_z), 
_mm_set1_epi32(1)));
-#else
-    __m128i hash_010 = ssehash(integer_of_x, _mm_add_epi32(integer_of_y, 
_mm_set1_epi32(1)), integer_of_z);
-    __m128i hash_011 = ssehash(integer_of_x, _mm_add_epi32(integer_of_y, 
_mm_set1_epi32(1)), _mm_add_epi32(integer_of_z, _mm_set1_epi32(1)));
-#endif
-    __m128  value_010    = grad(hash_010, offset_of_x, 
_mm_sub_ps(offset_of_y, _mm_set_ps1(1)), offset_of_z);
-    __m128  value_011    = grad(hash_011, offset_of_x, 
_mm_sub_ps(offset_of_y, _mm_set_ps1(1)), _mm_sub_ps(offset_of_z, 
_mm_set_ps1(1)));
 
-#if 0
     __m128i hash_1       = permutationSSE(_mm_add_epi32(integer_of_x, 
_mm_set1_epi32(1)));
     __m128i hash_10      = permutationSSE(_mm_add_epi32(hash_1,  
integer_of_y));
     __m128i hash_100     = permutationSSE(_mm_add_epi32(hash_10, 
integer_of_z));
     __m128i hash_101     = 
permutationSSE(_mm_add_epi32(_mm_add_epi32(hash_10, integer_of_z), 
_mm_set1_epi32(1)));
-#else
-    __m128i hash_100 = ssehash(_mm_add_epi32(integer_of_x, 
_mm_set1_epi32(1)), integer_of_y, integer_of_z);
-    __m128i hash_101 = ssehash(_mm_add_epi32(integer_of_x, 
_mm_set1_epi32(1)), integer_of_y, _mm_add_epi32(integer_of_z, 
_mm_set1_epi32(1)));
-#endif
-    __m128  value_100    = grad(hash_100, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), offset_of_y, offset_of_z);
-    __m128  value_101    = grad(hash_101, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), offset_of_y, _mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
 
-#if 0
     __m128i hash_11      = permutationSSE(_mm_add_epi32(_mm_add_epi32(hash_1 
, integer_of_y), _mm_set1_epi32(1)));
     __m128i hash_110     = permutationSSE(_mm_add_epi32(hash_11, 
integer_of_z));
     __m128i hash_111     = 
permutationSSE(_mm_add_epi32(_mm_add_epi32(hash_11, integer_of_z), 
_mm_set1_epi32(1)));
 #else
+    __m128i hash_000 = ssehash(integer_of_x, integer_of_y, integer_of_z);
+    __m128i hash_001 = ssehash(integer_of_x, integer_of_y, 
_mm_add_epi32(integer_of_z, _mm_set1_epi32(1)));
+    __m128i hash_010 = ssehash(integer_of_x, _mm_add_epi32(integer_of_y, 
_mm_set1_epi32(1)), integer_of_z);
+    __m128i hash_011 = ssehash(integer_of_x, _mm_add_epi32(integer_of_y, 
_mm_set1_epi32(1)), _mm_add_epi32(integer_of_z, _mm_set1_epi32(1)));
+    __m128i hash_100 = ssehash(_mm_add_epi32(integer_of_x, 
_mm_set1_epi32(1)), integer_of_y, integer_of_z);
+    __m128i hash_101 = ssehash(_mm_add_epi32(integer_of_x, 
_mm_set1_epi32(1)), integer_of_y, _mm_add_epi32(integer_of_z, 
_mm_set1_epi32(1)));
     __m128i hash_110 = ssehash(_mm_add_epi32(integer_of_x, 
_mm_set1_epi32(1)), _mm_add_epi32(integer_of_y, _mm_set1_epi32(1)), 
integer_of_z);
     __m128i hash_111 = ssehash(_mm_add_epi32(integer_of_x, 
_mm_set1_epi32(1)), _mm_add_epi32(integer_of_y, _mm_set1_epi32(1)), 
_mm_add_epi32(integer_of_z, _mm_set1_epi32(1)));
 #endif
+
+    __m128  value_000    = grad(hash_000, offset_of_x, offset_of_y, 
offset_of_z);
+    __m128  value_001    = grad(hash_001, offset_of_x, offset_of_y, 
_mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
+    __m128  value_010    = grad(hash_010, offset_of_x, 
_mm_sub_ps(offset_of_y, _mm_set_ps1(1)), offset_of_z);
+    __m128  value_011    = grad(hash_011, offset_of_x, 
_mm_sub_ps(offset_of_y, _mm_set_ps1(1)), _mm_sub_ps(offset_of_z, 
_mm_set_ps1(1)));
+    __m128  value_100    = grad(hash_100, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), offset_of_y, offset_of_z);
+    __m128  value_101    = grad(hash_101, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), offset_of_y, _mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
     __m128  value_110    = grad(hash_110, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), _mm_sub_ps(offset_of_y, _mm_set_ps1(1)), offset_of_z);
     __m128  value_111    = grad(hash_111, _mm_sub_ps(offset_of_x, 
_mm_set_ps1(1)), _mm_sub_ps(offset_of_y, _mm_set_ps1(1)), 
_mm_sub_ps(offset_of_z, _mm_set_ps1(1)));
 
+#endif // #if USE_PERMUTATION_UNPACKED
     //
     __m128  value_00     = Interpolate(value_000, value_001, fade_z);
     __m128  value_01     = Interpolate(value_010, value_011, fade_z);




  • [MANTA] r1455 - trunk/Core/Math, bigler, 07/09/2007

Archive powered by MHonArc 2.6.16.

Top of page