manta - [Manta] r1991 - trunk/Model/Instances

Closed list
Subscribers: 0
Owners

sparker

thiago

Subscribe
Unsubscribe
Info
Admin
Archive

Post

Shared documents

Manta Interactive Ray Tracer Development Mailing List

Text archives Help

[Manta] r1991 - trunk/Model/Instances

From: boulos@sci.utah.edu
To: manta@sci.utah.edu
Subject: [Manta] r1991 - trunk/Model/Instances
Date: Sat, 12 Jan 2008 19:24:11 -0700 (MST)

Author: boulos
Date: Sat Jan 12 19:24:10 2008
New Revision: 1991

Modified:
   trunk/Model/Instances/Instance.cc
Log:
Model/Instances/Instance.cc

Speeding up Instance performance by 2x (for bin/manta -scene
"lib/libscene_primtest.dylib(-array spinscale2)") by using SSE. My
seemingly great idea of using macros for the kernels seems to have
confused Shark at our debugging level (-g3, right?).

Also switching over to using the Packet class instead of a raw array
of Reals for scales and inv_scales.

Bug fix for the instance_rays flags (the flags should have been combined
with bitwise OR `|` instead of bitwise AND `&`)

Modified: trunk/Model/Instances/Instance.cc
==============================================================================
--- trunk/Model/Instances/Instance.cc   (original)
+++ trunk/Model/Instances/Instance.cc   Sat Jan 12 19:24:10 2008
@@ -85,7 +85,7 @@
Interpolable::InterpErr Instance::serialInterpolate(const
std::vector<keyframe_t> &keyframes)
{
   instance->serialInterpolate(keyframes);
-
+
   for (int r=0; r<3; ++r)
     for (int c=0; c<4; ++c)
       transform.mat[r][c] = 0;
@@ -146,6 +146,50 @@
     bbox.extendByPoint(transform.multiply_point(ibox.getCorner(i)));
}

+// Compute 4 points at once: per lane, out_r = t.mat[r][0]*x + t.mat[r][1]*y + t.mat[r][2]*z + t.mat[r][3] for rows r = 0..2.
+inline void MultiplyPointSSE(const AffineTransform& t,
+                             __m128 x_in,
+                             __m128 y_in,
+                             __m128 z_in,
+                             __m128& x_out,
+                             __m128& y_out,
+                             __m128& z_out) {
+  x_out  = _mm_mul_ps(_mm_set1_ps(t.mat[0][0]), x_in);
+  x_out  = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][1]), y_in));
+  x_out  = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][2]), z_in));
+  x_out  = _mm_add_ps(x_out, _mm_set1_ps(t.mat[0][3]));  // add the constant column-3 term of row 0
+
+  y_out  = _mm_mul_ps(_mm_set1_ps(t.mat[1][0]), x_in);
+  y_out  = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][1]), y_in));
+  y_out  = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][2]), z_in));
+  y_out  = _mm_add_ps(y_out, _mm_set1_ps(t.mat[1][3]));  // add the constant column-3 term of row 1
+
+  z_out  = _mm_mul_ps(_mm_set1_ps(t.mat[2][0]), x_in);
+  z_out  = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][1]), y_in));
+  z_out  = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][2]), z_in));
+  z_out  = _mm_add_ps(z_out, _mm_set1_ps(t.mat[2][3]));  // add the constant column-3 term of row 2
+}
+
+inline void MultiplyVectorSSE(const AffineTransform& t,  // 4 vectors at once; uses only t.mat columns 0..2 (no column-3 term is added)
+                              __m128 x_in,
+                              __m128 y_in,
+                              __m128 z_in,
+                              __m128& x_out,
+                              __m128& y_out,
+                              __m128& z_out) {
+  x_out  = _mm_mul_ps(_mm_set1_ps(t.mat[0][0]), x_in);
+  x_out  = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][1]), y_in));
+  x_out  = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][2]), z_in));
+
+  y_out  = _mm_mul_ps(_mm_set1_ps(t.mat[1][0]), x_in);
+  y_out  = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][1]), y_in));
+  y_out  = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][2]), z_in));
+
+  z_out  = _mm_mul_ps(_mm_set1_ps(t.mat[2][0]), x_in);
+  z_out  = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][1]), y_in));
+  z_out  = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][2]), z_in));
+}
+
void Instance::intersect(const RenderContext& context, RayPacket& rays) const
{
   bool debugFlag = rays.getFlag(RayPacket::DebugPacket);
@@ -169,39 +213,127 @@
   //
   // Finally, if the parent is a debug packet, we should be too.
   instance_rays.setFlag(
-    RayPacket::NormalizedDirections &
+    RayPacket::NormalizedDirections |
     (rays.getAllFlags() & (RayPacket::ConstantOrigin |
RayPacket::DebugPacket)));

-  Real scales[RayPacket::MaxSize];
-  Real inv_scales[RayPacket::MaxSize];
+  // Clears things for us, so we don't have to reproduce this code everywhere
+  instance_rays.resetHits();
+  Packet<Real> scales;
+  Packet<Real> inv_scales;

   if(rays.getFlag(RayPacket::ConstantOrigin)){
     Vector o = transform_inv.multiply_point(rays.getOrigin(rays.begin()));

+#define SCALAR_KERNEL_CONSTANT_ORIGIN \
+    Vector dir = transform_inv.multiply_vector(rays.getDirection(i)); \
+    Real length = dir.length(); \
+    inv_scales.set(i, length);  \
+    Real ilength = 1/length; \
+    scales.set(i, ilength);                  \
+    instance_rays.setRay(i, o, dir*ilength); \
+    instance_rays.overrideMinT(i, rays.getMinT(i)*length); \
+
+#define SCALAR_KERNEL_NONCONSTANT_ORIGIN \
+    Vector o = transform_inv.multiply_point(rays.getOrigin(i)); \
+    SCALAR_KERNEL_CONSTANT_ORIGIN ; \
+
+
+#define SSE_KERNEL_CONSTANT_ORIGIN \
+    __m128 dir_x, dir_y, dir_z;
\
+    MultiplyVectorSSE(transform_inv,
\
+                      _mm_load_ps(&(rays.data->direction[0][i])),
\
+                      _mm_load_ps(&(rays.data->direction[1][i])),
\
+                      _mm_load_ps(&(rays.data->direction[2][i])),
\
+                      dir_x, dir_y, dir_z);
\
+    __m128 length = _mm_add_ps(_mm_mul_ps(dir_x, dir_x),
\
+                               _mm_add_ps(_mm_mul_ps(dir_y, dir_y),
\
+                                          _mm_mul_ps(dir_z, dir_z)));
\
+    length = _mm_sqrt_ps(length);
\
+    _mm_store_ps(&(inv_scales.data[i]), length);
\
+    __m128 inv_length = _mm_div_ps(_mm_set1_ps(1.f), length);
\
+    _mm_store_ps(&(scales.data[i]), inv_length);
\
+
\
+    _mm_store_ps(&(instance_rays.data->origin[0][i]), org_x);
\
+    _mm_store_ps(&(instance_rays.data->origin[1][i]), org_y);
\
+    _mm_store_ps(&(instance_rays.data->origin[2][i]), org_z);
\
+
\
+    dir_x = _mm_mul_ps(dir_x, inv_length);
\
+    dir_y = _mm_mul_ps(dir_y, inv_length);
\
+    dir_z = _mm_mul_ps(dir_z, inv_length);
\
+    _mm_store_ps(&(instance_rays.data->direction[0][i]), dir_x);
\
+    _mm_store_ps(&(instance_rays.data->direction[1][i]), dir_y);
\
+    _mm_store_ps(&(instance_rays.data->direction[2][i]), dir_z);
\
+
\
+    __m128 new_t = _mm_mul_ps(length, _mm_load_ps(&(rays.data->minT[i])));
\
+    _mm_store_ps(&(instance_rays.data->minT[i]), new_t);
\
+
+#define SSE_KERNEL_NONCONSTANT_ORIGIN \
+    __m128 org_x, org_y, org_z;
\
+    MultiplyPointSSE(transform_inv,
\
+                     _mm_load_ps(&(rays.data->origin[0][i])),
\
+                     _mm_load_ps(&(rays.data->origin[1][i])),
\
+                     _mm_load_ps(&(rays.data->origin[2][i])),
\
+                     org_x, org_y, org_z);
\
+    SSE_KERNEL_CONSTANT_ORIGIN;
\
+
+#ifdef MANTA_SSE
+    int b = (rays.rayBegin + 3) & (~3);
+    int e = (rays.rayEnd) & (~3);
+    if (b >= e) {
+      for (int i = rays.begin(); i < rays.end(); i++) {
+        SCALAR_KERNEL_CONSTANT_ORIGIN;
+      }
+    } else {
+      for (int i = rays.begin(); i < b; i++) {
+        SCALAR_KERNEL_CONSTANT_ORIGIN;
+      }
+      __m128 org_x = _mm_set1_ps(o[0]);
+      __m128 org_y = _mm_set1_ps(o[1]);
+      __m128 org_z = _mm_set1_ps(o[2]);
+      for (int i = b; i < e; i+= 4) {
+        SSE_KERNEL_CONSTANT_ORIGIN;
+      }
+      for (int i = e; i < rays.end(); i++) {
+        SCALAR_KERNEL_CONSTANT_ORIGIN;
+      }
+    }
+#else
     for(int i = rays.begin();i<rays.end();i++){
-      Vector dir = transform_inv.multiply_vector(rays.getDirection(i));
-
-      Real length = dir.length();
-      inv_scales[i] = length;
-      Real ilength = 1/length;
-      scales[i] = ilength;
-      instance_rays.setRay(i, o, dir*ilength);
-      instance_rays.resetHit(i, rays.getMinT(i)*length);
+      SCALAR_KERNEL_CONSTANT_ORIGIN;
     }
+#endif
+
   } else {
+#ifdef MANTA_SSE
+    int b = (rays.rayBegin + 3) & (~3);
+    int e = (rays.rayEnd) & (~3);
+    if (b >= e) {
+      for (int i = rays.begin(); i < rays.end(); i++) {
+        SCALAR_KERNEL_NONCONSTANT_ORIGIN;
+      }
+    } else {
+      for (int i = rays.begin(); i < b; i++) {
+        SCALAR_KERNEL_NONCONSTANT_ORIGIN;
+      }
+      for (int i = b; i < e; i+= 4) {
+        SSE_KERNEL_NONCONSTANT_ORIGIN;
+      }
+      for (int i = e; i < rays.end(); i++) {
+        SCALAR_KERNEL_NONCONSTANT_ORIGIN;
+      }
+    }
+#else
     for(int i = rays.begin();i<rays.end();i++){
-      Vector o = transform_inv.multiply_point(rays.getOrigin(i));
-      Vector dir = transform_inv.multiply_vector(rays.getDirection(i));
-
-      Real length = dir.length();
-      inv_scales[i] = length;
-      Real ilength = 1/length;
-      scales[i] = ilength;
-      instance_rays.setRay(i, o, dir*ilength);
-      instance_rays.resetHit(i, rays.getMinT(i)*length);
+      SCALAR_KERNEL_NONCONSTANT_ORIGIN;
     }
+#endif
   }

+#undef SCALAR_KERNEL_CONSTANT_ORIGIN
+#undef SCALAR_KERNEL_NONCONSTANT_ORIGIN
+#undef SSE_KERNEL_CONSTANT_ORIGIN
+#undef SSE_KERNEL_NONCONSTANT_ORIGIN
+
   if (debugFlag) {
     cerr << "After transforming the incoming rays:" << endl;
     cerr << rays << endl;
@@ -230,7 +362,7 @@
       bool override_material = material != this;

       for (int j = i; j < end; j++) {
-        Real s = scales[j];
+        Real s = scales.get(j);
         const Material* hit_material = override_material ? material :
instance_rays.getHitMaterial(j);
         if(rays.hit(j,
                     instance_rays.getMinT(j)*s,

[Manta] r1991 - trunk/Model/Instances, boulos, 01/12/2008

Archive powered by MHonArc 2.6.16.