manta - [MANTA] r1272 - in trunk: Interface Model/Groups/private

Closed list
Subscribers: 0
Owners

sparker

thiago

Subscribe
Unsubscribe
Info
Admin
Archive

Post

Shared documents

Manta Interactive Ray Tracer Development Mailing List

Text archives Help

[MANTA] r1272 - in trunk: Interface Model/Groups/private

From: thiago@sci.utah.edu
To: manta@sci.utah.edu
Subject: [MANTA] r1272 - in trunk: Interface Model/Groups/private
Date: Wed, 24 Jan 2007 18:23:53 -0700 (MST)

Author: thiago
Date: Wed Jan 24 18:23:52 2007
New Revision: 1272

Modified:
   trunk/Interface/Context.h
   trunk/Model/Groups/private/CGT.cc
   trunk/Model/Groups/private/CGT.h
Log:
Model/Groups/private/CGT.cc
Model/Groups/private/CGT.h :

- Added parallel build. Currently the grid performs a build every
   frame, even when it's not needed. Also, there are still some
   expensive spots that need to be parallelized (like calculating
   bounding boxes of primitives), and there is no working NUMA
   support code yet (NUMA support is very important for scalability).

- Fixed a race condition when mailboxes were being resized.

- Switched to using a different timer, so now the timing information
   is correct.

Interface/Context.h:

- Removed proc and numProcs from PreprocessContext since they do not belong here.

Modified: trunk/Interface/Context.h
==============================================================================
--- trunk/Interface/Context.h   (original)
+++ trunk/Interface/Context.h   Wed Jan 24 18:23:52 2007
@@ -185,10 +185,8 @@
   class PreprocessContext {
   public:
     PreprocessContext() {}
-    PreprocessContext(MantaInterface* manta_interface, LightSet*
globalLights,
-                      int proc=0, int numProcs=1)
-      : manta_interface(manta_interface), globalLights(globalLights),
-        proc(proc), numProcs(numProcs)
+    PreprocessContext(MantaInterface* manta_interface, LightSet*
globalLights)
+      : manta_interface(manta_interface), globalLights(globalLights)
     {
     }
     ~PreprocessContext()
@@ -197,8 +195,6 @@

     MantaInterface* manta_interface;
     LightSet* globalLights;
-    int proc;
-    int numProcs;

   private:
     PreprocessContext(const PreprocessContext&);

Modified: trunk/Model/Groups/private/CGT.cc
==============================================================================
--- trunk/Model/Groups/private/CGT.cc   (original)
+++ trunk/Model/Groups/private/CGT.cc   Wed Jan 24 18:23:52 2007
@@ -1,7 +1,9 @@
#include <Model/Groups/private/CGT.h>
-#include <SCIRun/Core/Util/Timer.h> //"base/Clock.hxx"
+#include <SCIRun/Core/Thread/Time.h>
#include <Core/Geometry/BBox.h>
#include <Model/Primitives/Triangle.h>
+#include <Interface/MantaInterface.h>
+#include <Core/Thread/Barrier.h>

#ifdef USE_NUMA
#include <numa.h>
@@ -11,7 +13,7 @@

using namespace Manta;

-float Grid::resolutionFactor = 1.0f;
+float Grid::resolutionFactor = 5.0f;

#define DISPLAY_BUILD_TIMES true

@@ -26,42 +28,60 @@
vector <mailbox_struct> mailboxes;
#endif

-//barrier to wait for NUM_BUILD_THREADS
-#define barrier(mutex, condition)                           \
-  {                                                         \
-    static int count = context.numProcs;                    \
-    pthread_mutex_lock( &mutex );                           \
-    --count;                                                \
-    if (count > 0)                                          \
-      pthread_cond_wait( &condition, &mutex );              \
-    else {                                                  \
-      pthread_cond_broadcast( &condition );                 \
-      count = context.numProcs;                             \
-    }                                                       \
-    pthread_mutex_unlock( &mutex );                         \
-  }                                                         \
-
void Grid::newFrame()
{
}

void Grid::preprocess(const PreprocessContext& context)
{
+  context.manta_interface->registerParallelAnimationCallback(
+    Callback::create(this, &Grid::build));
+
+  context.manta_interface->registerSerialPreRenderCallback(
+    Callback::create(this, &Grid::preFrameSetup));
+
+  //XXXX: temporary code...
+  for (int i = 0; i < getSize(); i++) {
+    get(i)->preprocess(context);
+  }
+}
+
+void Grid::preFrameSetup(int proc, int numProcs)
+{
+  //TODO: Make it so that mailbox memory is loaded on the thread's
+  //memory (For NUMA machines), and not all on the primary thread's
+  //node's.
+
+  if (mailboxes.empty() != true)
+    if (mailboxes[0].mailbox.size() < getSize())
+      for (size_t i=0; i < mailboxes.size(); ++i)
+        mailboxes[i].mailbox.resize(getSize(), -1);
+
+  if (mailboxes.size() < numProcs) {
+    int oldSize = mailboxes.size();
+    mailboxes.resize(numProcs);
+    for (size_t i=max(oldSize-1, 0); i < numProcs; ++i)
+      mailboxes[i].mailbox.resize(getSize(), mailboxes[i].rayID);
+  }
+}
+
+void Grid::build(int proc, int numProcs, bool &)
+{
+  //TODO: Make it so that build is only performed when required (for
+  //instance, like when the geometry changes).
+
   static bool firstTime = true;
   static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-  static pthread_mutex_t countMutex = PTHREAD_MUTEX_INITIALIZER;
-  static pthread_cond_t condition = PTHREAD_COND_INITIALIZER;
-  static pthread_mutex_t condition_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+  static SCIRun::Barrier barrier("CGT build barrier");

   CellData *myCell;

-  WallClockTimer firstTimer;
-  static WallClockTimer c;
+  double startThreadBuildTime = SCIRun::Time::currentSeconds();
+  static double startGlobalBuildTime;

   cout <<"Number of triangles is: " << getSize() <<endl;  //DEBUG

-  firstTimer.start();
-
#ifdef MACRO_CELLS
   static int oldN_mc[3] = { -1, -1, -1 };
#endif
@@ -73,10 +93,14 @@
     first = false;
     pthread_mutex_unlock(&mutex);

-    c.start();
+    startGlobalBuildTime = SCIRun::Time::currentSeconds();

     BBox bbox;
-    computeBounds(context, bbox);
+    //XXXX: this blah is a hack. Furthermore, we need to have the
+    //computation of the bounding box parallelized or already given to
+    //us...
+    PreprocessContext blah;
+    computeBounds(blah, bbox);
     bounds.min = set44(0.0f, bbox[0][2], bbox[0][1], bbox[0][0]);
     bounds.max = set44(0.0f, bbox[1][2], bbox[1][1], bbox[1][0]);

@@ -158,10 +182,9 @@
     if (firstTime)
       cout << "building grid for " << getSize() << " triangles, res = " <<
N[0] << "x" << N[1] << "x" << N[2] << endl << "bounds = " << bounds << endl;

-    firstTimer.stop();
-
     if (firstTime || DISPLAY_BUILD_TIMES)
-      cout << "first stage of grid build : " << firstTimer.time() << "ms" <<
endl;
+      cout << "first stage of grid build : "
+           << (SCIRun::Time::currentSeconds() - startThreadBuildTime)*1e3 <<
"ms" << endl;

     primitives.resize(getSize());
     for (int i=0; i<getSize(); ++i) {
@@ -169,34 +192,19 @@
       primitives[i].setMaterial(triangle->getMaterial());
       primitives[i].setPoints(triangle->p1,triangle->p2,triangle->p3);
     }
-
-    if (mailboxes.size() < 1) {
-      //make it big enough so that resize doesn't require
-      //copying. This helps ensure that if a resize occurs when a
-      //thread is accessing that mailbox (mutex is too expensive in
-      //traversal loop) then data won't get corrupted.
-      mailboxes.reserve(128);
-
-      mailboxes.resize(1);
-      mailboxes[0].mailbox.resize(getSize(), mailboxes[0].rayID);
-    }
-    if (mailboxes[0].mailbox.size() < getSize())
-      for (size_t i=0; i < mailboxes.size(); ++i)
-        mailboxes[i].mailbox.resize(getSize(), -1);
   }
   else
     pthread_mutex_unlock(&mutex);

-  barrier(condition_mutex, condition);
+  barrier.wait(numProcs);

   myCell = &cellVector[0];

-  WallClockTimer clearTimer;
-  clearTimer.start();
+  double startClearTime = SCIRun::Time::currentSeconds();

#if 0 || !defined(MACRO_CELLS)
-  const int start_z = (oldN[2] * context.proc) / context.numProcs;
-  const int end_z = (oldN[2] * (context.proc+1)) / context.numProcs;
+  const int start_z = (oldN[2] * proc) / numProcs;
+  const int end_z = (oldN[2] * (proc+1)) / numProcs;
   for (int i=oldN[0]*oldN[1]*start_z;i<oldN[0]*oldN[1]*end_z;i++)
   {
//         if (myCell[i].triIDs.size() > 500)
@@ -209,23 +217,23 @@
   //that had triangles and just clear those.
#if 1
   const int oldlength = oldN_mc[1]*oldN_mc[2] - 1;
-  const int start_z = oldlength*context.proc /
-    (oldN_mc[1]*context.numProcs);
-  const int end_z = oldlength*(context.proc+1) /
-    (oldN_mc[1]*context.numProcs);
-  const int start_y = oldlength*context.proc / context.numProcs -
+  const int start_z = oldlength*proc /
+    (oldN_mc[1]*numProcs);
+  const int end_z = oldlength*(proc+1) /
+    (oldN_mc[1]*numProcs);
+  const int start_y = oldlength*proc / numProcs -
     start_z*oldN_mc[1];
-  const int end_y = oldlength*(context.proc+1) / context.numProcs -
-    end_z*oldN_mc[1] + ((context.proc+1) == context.numProcs);
+  const int end_y = oldlength*(proc+1) / numProcs -
+    end_z*oldN_mc[1] + ((proc+1) == numProcs);

   for (int z = start_z; z<=end_z; z++) {
     for (int y = ((z==start_z)? start_y: 0);
          y < ((z==end_z)? end_y:oldN_mc[1]); y++) {
#else
-  int y = context.proc;
-  for (int z = fabs((float)context.proc/oldN_mc[1]) ; z < oldN_mc[2]; z +=
y/oldN_mc[1]) {
+  int y = proc;
+  for (int z = fabs((float)proc/oldN_mc[1]) ; z < oldN_mc[2]; z +=
y/oldN_mc[1]) {
     y %= oldN_mc[1];
-    for ( ; y < oldN_mc[1]; y += context.numProcs
+    for ( ; y < oldN_mc[1]; y += numProcs
           ) {
#endif
       for (int x=0; x < oldN_mc[0]; x++) {
@@ -255,28 +263,50 @@
   }
#endif //macro cell clear

-  clearTimer.stop();
   if (firstTime || DISPLAY_BUILD_TIMES)
   {
     char buf[128];
     sprintf(buf, "%d: time to clear old grid: %fms\n",
-           context.proc, clearTimer.time()*1e-3);
+            proc, (SCIRun::Time::currentSeconds() - startClearTime)*1e3);
     cout<<buf;
   }

-  //TODO: this assumes number of triangles does not change!
-  if (buildQueues[context.proc][0] == NULL)
-    for (int i=0; i < context.numProcs; ++i) {
+  static pthread_mutex_t buildQueue_resize_mutex = PTHREAD_MUTEX_INITIALIZER;
+  static bool buildQueue_was_resized = false;
+  pthread_mutex_lock(&buildQueue_resize_mutex);
+  if (buildQueues.size() != numProcs*numProcs) {
+    //The current method of deleting BIG arrays and recreating them
+    //whenever the number of threads change is expensive. But since we
+    //don't expect the number of threads to change very often, we'll
+    //allow this occasional slow down. The advantage of deleting them
+    //all is that when we recreate the new arrays they can be
+    //allocated on the proper node.
+    for (size_t i = 0; i < buildQueues.size(); ++i)
+      delete[] buildQueues[i];
+    buildQueues.resize(numProcs*numProcs);
+    buildQueue_sizes.resize(numProcs*numProcs);
+    buildQueue_was_resized = true;
+  }
+  pthread_mutex_unlock(&buildQueue_resize_mutex);
+
+  if (buildQueue_was_resized) {
+    for (size_t i = proc * numProcs;
+         i < (proc+1) * numProcs; ++i) {

+      //Note: we could make buildQueue contain vectors of
+      //location_primitives instead of a ptr to a dynamically created
+      //array, which would be safer. But this would prevent us from
+      //allocating the memory on each node if we want to target NUMA
+      //systems.
+
       //XXXXXX need to convert from array to vector since we do not
       //know how many tri will be in each queue since one triangle
       //can be in more than one cell.
       const int SCALER = 10;
-
#if !defined(__linux__) || !defined(USE_NUMA) || 0
-      buildQueues[context.proc][i] = new
location_primitive[SCALER*getSize()];
+      buildQueues[i] = new location_primitive[SCALER*getSize()];
#else
-      const int node =
RayTraceRenderer::getBuildPreprocessNode(context.proc);
+      const int node = RayTraceRenderer::getBuildPreprocessNode(proc);
       const int node2 = RayTraceRenderer::getBuildPreprocessNode(i);

       nodemask_t interleaveNodes;
@@ -292,42 +322,44 @@
       //total number of local misses will end up being the same.
       nodemask_set(&interleaveNodes, node2);
#endif
-      buildQueues[context.proc][i] = (location_primitive*)
+      buildQueues[i] = (location_primitive*)
         numa_alloc_interleaved_subset(SCALER*getSize() *
                                       sizeof(location_primitive),
                                       &interleaveNodes);
#endif
+
     }
+  }
+
   location_primitive **const  myBuildQueues =
-    &buildQueues[context.proc][0];
+    &buildQueues[proc*numProcs];

   //to reduce false sharing we create this buildQueue counter
   //and use it while engridding instead of the array of counters.
-  //int myBuildQueue_sizes[context.numProcs] = {0};
-  int myBuildQueue_sizes[256] = {0}; //XXXXX hard coded!!!
+  //int myBuildQueue_sizes[numProcs] = {0};
+  vector<int> myBuildQueue_sizes(numProcs, 0);

-//   printf("thread %d exited first barrier\n", context.proc);
+//   printf("thread %d exited first barrier\n", proc);
   const int numTri = getSize();
-  const int startTri = (numTri * context.proc) / context.numProcs;
-  const int endTri = (numTri * (context.proc+1)) / context.numProcs;
+  const int startTri = (numTri * proc) / numProcs;
+  const int endTri = (numTri * (proc+1)) / numProcs;

   //even though we don't really need a barrier here, this
   //ends up, I think, not slowing down the code and it produces
   //nicer more uniform and easier to understand timings.
   //Make sure to double check when getting overall best timings
   //whether this is slowing down the code...
-  barrier(condition_mutex, condition);
-
-  WallClockTimer engriddingTimer;
-  engriddingTimer.start();
+  barrier.wait(numProcs);

+  double startSortTime = SCIRun::Time::currentSeconds();

   for (int i = startTri; i < endTri; i++) {
-    get(i)->preprocess(context);
+    //get(i)->preprocess(context);

     BBox bbox;
-    get(i)->computeBounds(context, bbox);
+    PreprocessContext blah;
+    get(i)->computeBounds(blah, bbox);
     Box4 triBounds;
     triBounds.min = set44(0.0f, bbox[0][2], bbox[0][1], bbox[0][0]);
     triBounds.max = set44(0.0f, bbox[1][2], bbox[1][1], bbox[1][0]);
@@ -355,7 +387,7 @@
       const int numCells = (z1-z0+1)*(y1-y0+1)*(x1-x0+1);

       for (int z=z0;z<=z1;z++) {
-        int q = z % (context.numProcs);
+        int q = z % (numProcs);

         for (int y=y0;y<=y1;y++) {
           for (int x=x0;x<=x1;x++)
@@ -371,61 +403,55 @@
       }
     }

-  engriddingTimer.stop();
-
   if (DISPLAY_BUILD_TIMES) {
     char buf[128];
     sprintf(buf, "%d engriddingTimer time: %fms\n",
-           context.proc, engriddingTimer.time()*1e-3f);
+            proc, (SCIRun::Time::currentSeconds()-startSortTime)*1e3f);
     cout<<buf;
   }

-  for (int i=0; i < context.numProcs; ++i)
-    buildQueue_sizes[context.proc][i] = myBuildQueue_sizes[i];
+  for (int i=0; i < numProcs; ++i)
+    buildQueue_sizes[proc*numProcs+i] = myBuildQueue_sizes[i];

-  barrier(condition_mutex, condition);
+  barrier.wait(numProcs);

-  WallClockTimer mergeTimer;
-  mergeTimer.start();
+  double startMergeTime = SCIRun::Time::currentSeconds();

-  for (int i=0; i < context.numProcs; i++) {
-    for (int k=0; k < buildQueue_sizes[i][context.proc]; ++k) {
+  for (int i=0; i < numProcs; i++) {
+    for (int k=0; k < buildQueue_sizes[i*numProcs + proc]; ++k) {
       const location_primitive &prim_lp =
-        buildQueues[i][context.proc][k];
+        buildQueues[i*numProcs + proc][k];

       myCell[prim_lp.gridLocation].triIDs.push_back(prim_lp.which_primitive);
     }
   }

-  mergeTimer.stop();
-
   if (DISPLAY_BUILD_TIMES) {
     char buf[128];
     sprintf(buf, "%d merge time: %fms\n",
-           context.proc, mergeTimer.time()/1000.0f);
+            proc, (SCIRun::Time::currentSeconds() - startMergeTime)*1e3);
     cout<<buf;
   }

#ifdef MACRO_CELLS
   //cout << "macrocell-res " << N_mc[0] << "x" << N_mc[1] << "x" << N_mc[2]
<< endl;
-  if (context.proc == 0)
+  if (proc == 0)
       if (cellVector_mc.size() < N_mc[0]*N_mc[1]*N_mc[2])
           cellVector_mc.resize(N_mc[0]*N_mc[1]*N_mc[2]);

-  barrier(condition_mutex, condition);
+  barrier.wait(numProcs);

-  WallClockTimer macroBuildTimer;
-  macroBuildTimer.start();
+  double startMacroBuildTime = SCIRun::Time::currentSeconds();

   const int length = N_mc[1]*N_mc[2] - 1;
-  const int start_iz = length*context.proc /
-    (N_mc[1]*context.numProcs);
-  const int end_iz = length*(context.proc+1) /
-    (N_mc[1]*context.numProcs);
-  const int start_iy = length*context.proc / context.numProcs -
+  const int start_iz = length*proc /
+    (N_mc[1]*numProcs);
+  const int end_iz = length*(proc+1) /
+    (N_mc[1]*numProcs);
+  const int start_iy = length*proc / numProcs -
     start_iz*N_mc[1];
-  const int end_iy = length*(context.proc+1) / context.numProcs -
-    end_iz*N_mc[1] + ((context.proc+1) == context.numProcs);
+  const int end_iy = length*(proc+1) / numProcs -
+    end_iz*N_mc[1] + ((proc+1) == numProcs);

   for (int iz = start_iz; iz<=end_iz; iz++)
     for (int iy = ((iz==start_iz)? start_iy: 0);
@@ -454,11 +480,10 @@
         set_mc(ix,iy,iz,count);
       }

-  macroBuildTimer.stop();
   if (DISPLAY_BUILD_TIMES) {
     char buf[128];
     sprintf(buf, "%d macro cell build time: %fms\n",
-            context.proc, macroBuildTimer.time()/1000.0f);
+            proc, (SCIRun::Time::currentSeconds() -
startMacroBuildTime)*1e3);
     cout<<buf;
   }

@@ -467,24 +492,24 @@

   { //only let the last thread through so it can cleaup.
-    static int done_count = context.numProcs;
+    static int done_count = numProcs;
     pthread_mutex_lock(&mutex);
     if (--done_count != 0) {
       pthread_mutex_unlock(&mutex);
       return;
     }
     pthread_mutex_unlock(&mutex);
-    done_count = context.numProcs;
+    done_count = numProcs;
   }

-//   printf("thead %d has gotten to the end\n", context.proc);
+//   printf("thead %d has gotten to the end\n", proc);

   first = true;
-
-  c.stop();
+  buildQueue_was_resized = false;

   if (firstTime|| DISPLAY_BUILD_TIMES)
-    cout << "grid built in " << c.time()/1000.0f << "ms" << endl << endl;
+    cout << "grid built in "
+         << (SCIRun::Time::currentSeconds() - startGlobalBuildTime)*1e3 <<
"ms\n\n";
   firstTime = false;
};

@@ -494,25 +519,6 @@
inline void Grid::TrvMajor(RayPacket &ray, const RenderContext& context )
const
{
#ifdef MAILBOXING
-  if (mailboxes.size() < context.numProcs) {
-    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-    pthread_mutex_lock(&mutex);
-    if (mailboxes.size() < context.numProcs) {
-      //It's possible some other threads currently traversing might
-      //try accessing a mailbox while the resize is occuring. Since we
-      //don't want to make the inner traversal loop use a mutex, lets
-      //just have this thread sleep for a while and hope that by the
-      //time it wakes up all the other threads will have finished
-      //their current traversal.
-      sleep(1);
-      int oldSize = mailboxes.size();
-      mailboxes.resize(context.numProcs);
-      for (size_t i=oldSize-1; i < context.numProcs; ++i)
-        mailboxes[i].mailbox.resize(getSize(), -1);
-    }
-    pthread_mutex_unlock(&mutex);
-  }
-
   mailboxes[context.proc].rayID++;
#endif

@@ -533,7 +539,7 @@
   float minVd;
   float maxVd;

-  if (ray.getFlag(RayPacket::HaveCornerRays) &&
ray.getFlag(RayPacket::ConstantOrigin))
+  if (ray.getFlag(RayPacket::HaveCornerRays) && COMMON_ORIGIN)
   {
     printf("oops, never expected this to be needed!. Look In CGT.cc...\n");
     /*
@@ -615,9 +621,8 @@
   sse_t minMax_dudv = set44(maxDv,maxDu,minDv,minDu);
   //printf("%f %f %f %f\n", maxDv,maxDu,minDv,minDu);
   if (COMMON_ORIGIN) {
-    //XXX using 0th ray might not work if that ray is invalid
-    sse_t org4 = set44(0.0f, ray.getOrigin(0,2),
-                       ray.getOrigin(0,1), ray.getOrigin(0,0));
+    sse_t org4 = set44(0.0f, ray.getOrigin(ray.begin(),2),
+                       ray.getOrigin(ray.begin(),1),
ray.getOrigin(ray.begin(),0));
     sse_t s_org = mul4(scaleN,sub4(org4,bounds.min));

     f_orgK = ((float4&)s_org)[K];

Modified: trunk/Model/Groups/private/CGT.h
==============================================================================
--- trunk/Model/Groups/private/CGT.h    (original)
+++ trunk/Model/Groups/private/CGT.h    Wed Jan 24 18:23:52 2007
@@ -88,8 +88,8 @@
     int gridLocation;
     int which_primitive;
   };
-  location_primitive* buildQueues[NUM_BUILD_THREADS][NUM_BUILD_THREADS];
-  int buildQueue_sizes[NUM_BUILD_THREADS][NUM_BUILD_THREADS];
+  vector<location_primitive*> buildQueues;
+  vector<int>buildQueue_sizes;
#endif //NUM_BUILD_THREADS
   vector <CellData> cellVector;

@@ -159,15 +159,12 @@
   }

   Grid()
-  {
-#ifdef NUM_BUILD_THREADS
-    for (int i=0; i < NUM_BUILD_THREADS; ++i)
-      for (int k=0; i < NUM_BUILD_THREADS; ++i)
-        buildQueues[i][k] = NULL;
-#endif //NUM_BUILD_THREADS
+  {
   }

   void preprocess(const PreprocessContext&);
+  void build(int proc, int numProcs, bool &);
+  void preFrameSetup(int proc, int numProcs);

   template<bool SHADOWS_ONLY, bool COMMON_ORIGIN, bool SQUARE_PACKETS>
   void traverse(RayPacket &packet, const RenderContext& context) const;

[MANTA] r1272 - in trunk: Interface Model/Groups/private, thiago, 01/24/2007

Archive powered by MHonArc 2.6.16.