manta - [MANTA] r1530 - trunk/Model/Primitives

Closed list
Subscribers: 0
Owners

sparker

thiago

Subscribe
Unsubscribe
Info
Admin
Archive

Post

Shared documents

Manta Interactive Ray Tracer Development Mailing List

Text archives Help

[MANTA] r1530 - trunk/Model/Primitives

From: thiago@sci.utah.edu
To: manta@sci.utah.edu
Subject: [MANTA] r1530 - trunk/Model/Primitives
Date: Sat, 21 Jul 2007 22:28:40 -0600 (MDT)

Author: thiago
Date: Sat Jul 21 22:28:39 2007
New Revision: 1530

Modified:
   trunk/Model/Primitives/WaldTriangle.cc
Log:
There seems to be a bug in the Wald triangle SSE intersection code
that handles leading and trailing rays. For now, here's a hack that
is guaranteed to fix that. It is rather slow, though (completely
non-SSE), and does some redundant computations, so it needs to be
optimized.

Modified: trunk/Model/Primitives/WaldTriangle.cc
==============================================================================
--- trunk/Model/Primitives/WaldTriangle.cc      (original)
+++ trunk/Model/Primitives/WaldTriangle.cc      Sat Jul 21 22:28:39 2007
@@ -248,65 +248,66 @@
     }

     if (ray_begin < sse_begin) {
-      const int ray_begin_aligned = ray_begin & (~3);
-      const sse_t nd0 = add4(add4(mul4(sse_n_u,
load44(&data->direction[ku][ray_begin_aligned])),
-                                  mul4(sse_n_v,
load44(&data->direction[kv][ray_begin_aligned]))),
-                             load44(&data->direction[k][ray_begin_aligned]));
-      const sse_t nd  = oneOver(nd0);
-
-      if (!HasCommonOrigin) {
-        org_k  = load44(&data->origin[axis][ray_begin_aligned]);
-        org_ku = load44(&data->origin[ku][ray_begin_aligned]);
-        org_kv = load44(&data->origin[kv][ray_begin_aligned]);
-        f0     = sub4(set4(n_d),
-                      add4(org_k, add4(mul4(sse_n_u,
-                                            org_ku),
-                                       mul4(sse_n_v,
-                                            org_kv))));
+      //TODO: the trailing code is just copied over from the non-sse
+      //intersection method.  There is some redundant work and plenty
+      //that could still be optimized with sse.
+      const Real* const dir_k  = data->direction[axis];
+      const Real* const dir_ku = data->direction[ku];
+      const Real* const dir_kv = data->direction[kv];
+
+      float org_k, org_ku, org_kv, f0;
+
+      const bool RaysConstantOrigin = rays.getAllFlags() &
RayPacket::ConstantOrigin;
+
+      if (RaysConstantOrigin)
+      {
+        org_k  = data->origin[axis][rays.begin()];
+        org_ku = data->origin[ku][rays.begin()];
+        org_kv = data->origin[kv][rays.begin()];
+        f0 = n_d - (org_k + n_u * org_ku + n_v * org_kv);
       }

-      const sse_t f = mul4(f0, nd); // maybe these would be faster as
swizzle after load44
-      // plane test
-
-      const unsigned int active_ray_mask = ~((1<<(4-(sse_begin -
ray_begin))) - 1);
-
-      sse_t mask_test = and4( _mm_cmpnle_ps(f, set4(T_EPSILON)),
-
_mm_cmpnle_ps(load44(&data->minT[ray_begin_aligned]), f));
-      if ( (getmask4(mask_test) & active_ray_mask) == 0x0) return;
-
-      const sse_t hu = add4( mul4(f,
load44(&data->direction[ku][ray_begin_aligned])), org_ku);
-      const sse_t hv = add4( mul4(f,
load44(&data->direction[kv][ray_begin_aligned])), org_kv);
-      const sse_t lambda = add4( sse_b_d,
-                                 add4( mul4(hu, sse_b_nu),
-                                       mul4(hv, sse_b_nv)));
-      mask_test = and4(mask_test, _mm_cmpnlt_ps(lambda, zero4()));
-      if ( (getmask4(mask_test) & active_ray_mask) == 0x0) return;
+      for (int i = ray_begin; i < sse_end; i++ )
+      {
+        const float nd0 = n_u * dir_ku[i] + n_v * dir_kv[i] + dir_k[i];
+        const float nd  = 1.f/nd0;
+
+        if (!RaysConstantOrigin)
+        {
+          org_k  = data->origin[axis][i];
+          org_ku = data->origin[ku][i];
+          org_kv = data->origin[kv][i];

-      const sse_t mue = add4( sse_c_d,
-                              add4( mul4(hu, sse_c_nu),
-                                    mul4(hv, sse_c_nv)));
-
-      mask_test = and4(mask_test, and4( _mm_cmpnlt_ps(mue, zero4()),
-                                        _mm_cmpnlt_ps(set4(1.f), add4(mue,
lambda))));
+          f0 = n_d - (org_k + n_u * org_ku + n_v * org_kv);
+        }

-      const int hit_mask = getmask4(mask_test);
-      for (int ray = ray_begin; ray < sse_begin; ++ray) {
-        if ( hit_mask & (1<<(4 - (ray-ray_begin) )) ) {
-          const bool hit = rays.hit(ray, ((float*)&f)[ray-sse_end],
mesh->materials[mesh->face_material[myID]], this, this);
-          if (hit) {
-
-            float *u = &rays.getScratchpad<float>(0)[ray];
-            float *v = &rays.getScratchpad<float>(1)[ray];
-//             int *which = rays.getScratchpad<int*>(2)[ray];
-            *u = ((float*)&lambda)[ray-sse_end];
-            *v = ((float*)&mue)[ray-sse_end];
-//             *which = myID;
-          }
+        const float f = f0 * nd;
+        // plane test
+        if ( f < T_EPSILON || f > data->minT[i] )
+          continue;
+
+        const float hu = org_ku + f*dir_ku[i];
+        const float hv = org_kv + f*dir_kv[i];
+        const float lambda = b_d + hu*b_nu + hv * b_nv;
+
+        // barycentric test
+        if ( lambda < 0.f )
+          continue;
+
+        const float mue = c_d + hu * c_nu + hv * c_nv;
+        if ( mue < 0.f || mue + lambda > 1.f )
+          continue;
+
+        const bool hit = rays.hit(i, f,
mesh->materials[mesh->face_material[myID]], this, this);
+        if (hit) {
+          float *u = &rays.getScratchpad<float>(0)[i];
+          float *v = &rays.getScratchpad<float>(1)[i];
+          *u = lambda;
+          *v = mue;
         }
       }
     }

-
     for (int ray = sse_begin; ray < sse_end; ray += 4) {
       const sse_t nd0 = add4(add4(mul4(sse_n_u,
load44(&data->direction[ku][ray])),
                                   mul4(sse_n_v,
load44(&data->direction[kv][ray]))),
@@ -357,59 +358,59 @@
     }

     if (sse_end < ray_end) {
-      const sse_t nd0 = add4(add4(mul4(sse_n_u,
load44(&data->direction[ku][sse_end])),
-                                  mul4(sse_n_v,
load44(&data->direction[kv][sse_end]))),
-                             load44(&data->direction[k][sse_end]));
-      const sse_t nd  = oneOver(nd0);
-
-      if (!HasCommonOrigin) {
-        org_k  = load44(&data->origin[axis][sse_end]);
-        org_ku = load44(&data->origin[ku][sse_end]);
-        org_kv = load44(&data->origin[kv][sse_end]);
-        f0     = sub4(set4(n_d),
-                      add4(org_k, add4(mul4(sse_n_u,
-                                            org_ku),
-                                       mul4(sse_n_v,
-                                            org_kv))));
+      const Real* const dir_k  = data->direction[axis];
+      const Real* const dir_ku = data->direction[ku];
+      const Real* const dir_kv = data->direction[kv];
+
+      float org_k, org_ku, org_kv, f0;
+
+      const bool RaysConstantOrigin = rays.getAllFlags() &
RayPacket::ConstantOrigin;
+
+      if (RaysConstantOrigin)
+      {
+        org_k  = data->origin[axis][rays.begin()];
+        org_ku = data->origin[ku][rays.begin()];
+        org_kv = data->origin[kv][rays.begin()];
+        f0 = n_d - (org_k + n_u * org_ku + n_v * org_kv);
       }

-      const sse_t f = mul4(f0, nd); // maybe these would be faster as
swizzle after load44
-      // plane test
-
-      const unsigned int active_ray_mask = (1<<(ray_end - sse_end)) - 1;
-
-      sse_t mask_test = and4( _mm_cmpnle_ps(f, set4(T_EPSILON)),
-                              _mm_cmpnle_ps(load44(&data->minT[sse_end]),
f));
-
-      if ( (getmask4(mask_test) & active_ray_mask) == 0x0) return;
+      for (int i = sse_end; i < ray_end; i++ )
+      {
+        const float nd0 = n_u * dir_ku[i] + n_v * dir_kv[i] + dir_k[i];
+        const float nd  = 1.f/nd0;
+
+        if (!RaysConstantOrigin)
+        {
+          org_k  = data->origin[axis][i];
+          org_ku = data->origin[ku][i];
+          org_kv = data->origin[kv][i];

-      const sse_t hu = add4( mul4(f, load44(&data->direction[ku][sse_end])),
org_ku);
-      const sse_t hv = add4( mul4(f, load44(&data->direction[kv][sse_end])),
org_kv);
-      const sse_t lambda = add4( sse_b_d,
-                                 add4( mul4(hu, sse_b_nu),
-                                       mul4(hv, sse_b_nv)));
-      mask_test = and4(mask_test, _mm_cmpnlt_ps(lambda, zero4()));
-      if ( (getmask4(mask_test) & active_ray_mask) == 0x0) return;
-
-      const sse_t mue = add4( sse_c_d,
-                              add4( mul4(hu, sse_c_nu),
-                                    mul4(hv, sse_c_nv)));
-
-      mask_test = and4(mask_test, and4( _mm_cmpnlt_ps(mue, zero4()),
-                                        _mm_cmpnlt_ps(set4(1.f), add4(mue,
lambda))));
+          f0 = n_d - (org_k + n_u * org_ku + n_v * org_kv);
+        }

-      const int hit_mask = getmask4(mask_test);
-      for (int ray = sse_end; ray < ray_end; ++ray) {
-        if ( hit_mask & (1<<(ray-sse_end)) ) {
-          const bool hit = rays.hit(ray, ((float*)&f)[ray-sse_end],
mesh->materials[mesh->face_material[myID]], this, this);
-          if (hit) {
-            float *u = &rays.getScratchpad<float>(0)[ray];
-            float *v = &rays.getScratchpad<float>(1)[ray];
-//             int *which = rays.getScratchpad<int*>(2)[ray];
-            *u = ((float*)&lambda)[ray-sse_end];
-            *v = ((float*)&mue)[ray-sse_end];
-//             *which = myID;
-          }
+        const float f = f0 * nd;
+        // plane test
+        if ( f < T_EPSILON || f > data->minT[i] )
+          continue;
+
+        const float hu = org_ku + f*dir_ku[i];
+        const float hv = org_kv + f*dir_kv[i];
+        const float lambda = b_d + hu*b_nu + hv * b_nv;
+
+        // barycentric test
+        if ( lambda < 0.f )
+          continue;
+
+        const float mue = c_d + hu * c_nu + hv * c_nv;
+        if ( mue < 0.f || mue + lambda > 1.f )
+          continue;
+
+        const bool hit = rays.hit(i, f,
mesh->materials[mesh->face_material[myID]], this, this);
+        if (hit) {
+          float *u = &rays.getScratchpad<float>(0)[i];
+          float *v = &rays.getScratchpad<float>(1)[i];
+          *u = lambda;
+          *v = mue;
         }
       }
     }

[MANTA] r1530 - trunk/Model/Primitives, thiago, 07/22/2007

Archive powered by MHonArc 2.6.16.