Text archives Help
- From: boulos@sci.utah.edu
- To: manta@sci.utah.edu
- Subject: [Manta] r1991 - trunk/Model/Instances
- Date: Sat, 12 Jan 2008 19:24:11 -0700 (MST)
Author: boulos
Date: Sat Jan 12 19:24:10 2008
New Revision: 1991
Modified:
trunk/Model/Instances/Instance.cc
Log:
Model/Instances/Instance.cc
Speeding up Instance performance by 2x (for bin/manta -scene
"lib/libscene_primtest.dylib(-array spinscale2)") by using SSE. My
seemingly great idea of using macros for the kernels seems to have
confused Shark at our debugging level (-g3 right?).
Also switching over to using the Packet class instead of a raw array
of Reals for scales and inv_scales.
Bug fix for the instance_rays flags (should have been bitwise OR `|` instead of
bitwise AND `&`)
Modified: trunk/Model/Instances/Instance.cc
==============================================================================
--- trunk/Model/Instances/Instance.cc (original)
+++ trunk/Model/Instances/Instance.cc Sat Jan 12 19:24:10 2008
@@ -85,7 +85,7 @@
Interpolable::InterpErr Instance::serialInterpolate(const
std::vector<keyframe_t> &keyframes)
{
instance->serialInterpolate(keyframes);
-
+
for (int r=0; r<3; ++r)
for (int c=0; c<4; ++c)
transform.mat[r][c] = 0;
@@ -146,6 +146,50 @@
bbox.extendByPoint(transform.multiply_point(ibox.getCorner(i)));
}
+// Compute 4 points at once
+inline void MultiplyPointSSE(const AffineTransform& t,
+ __m128 x_in,
+ __m128 y_in,
+ __m128 z_in,
+ __m128& x_out,
+ __m128& y_out,
+ __m128& z_out) {
+ x_out = _mm_mul_ps(_mm_set1_ps(t.mat[0][0]), x_in);
+ x_out = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][1]), y_in));
+ x_out = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][2]), z_in));
+ x_out = _mm_add_ps(x_out, _mm_set1_ps(t.mat[0][3]));
+
+ y_out = _mm_mul_ps(_mm_set1_ps(t.mat[1][0]), x_in);
+ y_out = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][1]), y_in));
+ y_out = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][2]), z_in));
+ y_out = _mm_add_ps(y_out, _mm_set1_ps(t.mat[1][3]));
+
+ z_out = _mm_mul_ps(_mm_set1_ps(t.mat[2][0]), x_in);
+ z_out = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][1]), y_in));
+ z_out = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][2]), z_in));
+ z_out = _mm_add_ps(z_out, _mm_set1_ps(t.mat[2][3]));
+}
+
+inline void MultiplyVectorSSE(const AffineTransform& t,
+ __m128 x_in,
+ __m128 y_in,
+ __m128 z_in,
+ __m128& x_out,
+ __m128& y_out,
+ __m128& z_out) {
+ x_out = _mm_mul_ps(_mm_set1_ps(t.mat[0][0]), x_in);
+ x_out = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][1]), y_in));
+ x_out = _mm_add_ps(x_out, _mm_mul_ps(_mm_set1_ps(t.mat[0][2]), z_in));
+
+ y_out = _mm_mul_ps(_mm_set1_ps(t.mat[1][0]), x_in);
+ y_out = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][1]), y_in));
+ y_out = _mm_add_ps(y_out, _mm_mul_ps(_mm_set1_ps(t.mat[1][2]), z_in));
+
+ z_out = _mm_mul_ps(_mm_set1_ps(t.mat[2][0]), x_in);
+ z_out = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][1]), y_in));
+ z_out = _mm_add_ps(z_out, _mm_mul_ps(_mm_set1_ps(t.mat[2][2]), z_in));
+}
+
void Instance::intersect(const RenderContext& context, RayPacket& rays) const
{
bool debugFlag = rays.getFlag(RayPacket::DebugPacket);
@@ -169,39 +213,127 @@
//
// Finally, if the parent is a debug packet, we should be too.
instance_rays.setFlag(
- RayPacket::NormalizedDirections &
+ RayPacket::NormalizedDirections |
(rays.getAllFlags() & (RayPacket::ConstantOrigin |
RayPacket::DebugPacket)));
- Real scales[RayPacket::MaxSize];
- Real inv_scales[RayPacket::MaxSize];
+ // Clears things for us, so we don't have to reproduce this code everywhere
+ instance_rays.resetHits();
+ Packet<Real> scales;
+ Packet<Real> inv_scales;
if(rays.getFlag(RayPacket::ConstantOrigin)){
Vector o = transform_inv.multiply_point(rays.getOrigin(rays.begin()));
+#define SCALAR_KERNEL_CONSTANT_ORIGIN \
+ Vector dir = transform_inv.multiply_vector(rays.getDirection(i)); \
+ Real length = dir.length(); \
+ inv_scales.set(i, length); \
+ Real ilength = 1/length; \
+ scales.set(i, ilength); \
+ instance_rays.setRay(i, o, dir*ilength); \
+ instance_rays.overrideMinT(i, rays.getMinT(i)*length); \
+
+#define SCALAR_KERNEL_NONCONSTANT_ORIGIN \
+ Vector o = transform_inv.multiply_point(rays.getOrigin(i)); \
+ SCALAR_KERNEL_CONSTANT_ORIGIN ; \
+
+
+#define SSE_KERNEL_CONSTANT_ORIGIN \
+ __m128 dir_x, dir_y, dir_z; \
+ MultiplyVectorSSE(transform_inv, \
+ _mm_load_ps(&(rays.data->direction[0][i])), \
+ _mm_load_ps(&(rays.data->direction[1][i])), \
+ _mm_load_ps(&(rays.data->direction[2][i])), \
+ dir_x, dir_y, dir_z); \
+ __m128 length = _mm_add_ps(_mm_mul_ps(dir_x, dir_x), \
+ _mm_add_ps(_mm_mul_ps(dir_y, dir_y), \
+ _mm_mul_ps(dir_z, dir_z))); \
+ length = _mm_sqrt_ps(length); \
+ _mm_store_ps(&(inv_scales.data[i]), length); \
+ __m128 inv_length = _mm_div_ps(_mm_set1_ps(1.f), length); \
+ _mm_store_ps(&(scales.data[i]), inv_length); \
+ \
+ _mm_store_ps(&(instance_rays.data->origin[0][i]), org_x); \
+ _mm_store_ps(&(instance_rays.data->origin[1][i]), org_y); \
+ _mm_store_ps(&(instance_rays.data->origin[2][i]), org_z); \
+ \
+ dir_x = _mm_mul_ps(dir_x, inv_length); \
+ dir_y = _mm_mul_ps(dir_y, inv_length); \
+ dir_z = _mm_mul_ps(dir_z, inv_length); \
+ _mm_store_ps(&(instance_rays.data->direction[0][i]), dir_x); \
+ _mm_store_ps(&(instance_rays.data->direction[1][i]), dir_y); \
+ _mm_store_ps(&(instance_rays.data->direction[2][i]), dir_z); \
+ \
+ __m128 new_t = _mm_mul_ps(length, _mm_load_ps(&(rays.data->minT[i]))); \
+ _mm_store_ps(&(instance_rays.data->minT[i]), new_t); \
+
+#define SSE_KERNEL_NONCONSTANT_ORIGIN \
+ __m128 org_x, org_y, org_z; \
+ MultiplyPointSSE(transform_inv, \
+ _mm_load_ps(&(rays.data->origin[0][i])), \
+ _mm_load_ps(&(rays.data->origin[1][i])), \
+ _mm_load_ps(&(rays.data->origin[2][i])), \
+ org_x, org_y, org_z); \
+ SSE_KERNEL_CONSTANT_ORIGIN; \
+
+#ifdef MANTA_SSE
+ int b = (rays.rayBegin + 3) & (~3);
+ int e = (rays.rayEnd) & (~3);
+ if (b >= e) {
+ for (int i = rays.begin(); i < rays.end(); i++) {
+ SCALAR_KERNEL_CONSTANT_ORIGIN;
+ }
+ } else {
+ for (int i = rays.begin(); i < b; i++) {
+ SCALAR_KERNEL_CONSTANT_ORIGIN;
+ }
+ __m128 org_x = _mm_set1_ps(o[0]);
+ __m128 org_y = _mm_set1_ps(o[1]);
+ __m128 org_z = _mm_set1_ps(o[2]);
+ for (int i = b; i < e; i+= 4) {
+ SSE_KERNEL_CONSTANT_ORIGIN;
+ }
+ for (int i = e; i < rays.end(); i++) {
+ SCALAR_KERNEL_CONSTANT_ORIGIN;
+ }
+ }
+#else
for(int i = rays.begin();i<rays.end();i++){
- Vector dir = transform_inv.multiply_vector(rays.getDirection(i));
-
- Real length = dir.length();
- inv_scales[i] = length;
- Real ilength = 1/length;
- scales[i] = ilength;
- instance_rays.setRay(i, o, dir*ilength);
- instance_rays.resetHit(i, rays.getMinT(i)*length);
+ SCALAR_KERNEL_CONSTANT_ORIGIN;
}
+#endif
+
} else {
+#ifdef MANTA_SSE
+ int b = (rays.rayBegin + 3) & (~3);
+ int e = (rays.rayEnd) & (~3);
+ if (b >= e) {
+ for (int i = rays.begin(); i < rays.end(); i++) {
+ SCALAR_KERNEL_NONCONSTANT_ORIGIN;
+ }
+ } else {
+ for (int i = rays.begin(); i < b; i++) {
+ SCALAR_KERNEL_NONCONSTANT_ORIGIN;
+ }
+ for (int i = b; i < e; i+= 4) {
+ SSE_KERNEL_NONCONSTANT_ORIGIN;
+ }
+ for (int i = e; i < rays.end(); i++) {
+ SCALAR_KERNEL_NONCONSTANT_ORIGIN;
+ }
+ }
+#else
for(int i = rays.begin();i<rays.end();i++){
- Vector o = transform_inv.multiply_point(rays.getOrigin(i));
- Vector dir = transform_inv.multiply_vector(rays.getDirection(i));
-
- Real length = dir.length();
- inv_scales[i] = length;
- Real ilength = 1/length;
- scales[i] = ilength;
- instance_rays.setRay(i, o, dir*ilength);
- instance_rays.resetHit(i, rays.getMinT(i)*length);
+ SCALAR_KERNEL_NONCONSTANT_ORIGIN;
}
+#endif
}
+#undef SCALAR_KERNEL_CONSTANT_ORIGIN
+#undef SCALAR_KERNEL_NONCONSTANT_ORIGIN
+#undef SSE_KERNEL_CONSTANT_ORIGIN
+#undef SSE_KERNEL_NONCONSTANT_ORIGIN
+
if (debugFlag) {
cerr << "After transforming the incoming rays:" << endl;
cerr << rays << endl;
@@ -230,7 +362,7 @@
bool override_material = material != this;
for (int j = i; j < end; j++) {
- Real s = scales[j];
+ Real s = scales.get(j);
const Material* hit_material = override_material ? material :
instance_rays.getHitMaterial(j);
if(rays.hit(j,
instance_rays.getMinT(j)*s,
- [Manta] r1991 - trunk/Model/Instances, boulos, 01/12/2008
Archive powered by MHonArc 2.6.16.