Text archives Help
- From: "James Bigler" <bigler@cs.utah.edu>
- To: manta@sci.utah.edu
- Subject: [Manta] r2136 - trunk/Interface
- Date: Fri, 29 Feb 2008 08:41:47 -0700 (MST)
Author: bigler
Date: Fri Feb 29 08:41:47 2008
New Revision: 2136
Modified:
trunk/Interface/Primitive.cc
Log:
Interface/Primitive.cc
Added horrible SSE code for copying.
On my laptop for one processor:
before: 11.51 fps
C copy: 11.61 fps
SSE copy: 11.67 fps
I wonder if the masked copies with all the loads and unloads are killing us.
Perhaps making the prologue, epilogue, and single SSE vector cases use C
copies would help. Austin wants to try memcpy. That's also worth a shot, now
that we have SSE code to compare it with.
I believe the real cost of using geometricNormals is in the forward-facing
computation, where we have to compute the dot products again and do sign
changes. That bit of code could use some SSE.
Modified: trunk/Interface/Primitive.cc
==============================================================================
--- trunk/Interface/Primitive.cc (original)
+++ trunk/Interface/Primitive.cc Fri Feb 29 08:41:47 2008
@@ -25,8 +25,82 @@
RayPacket& rays) const {
rays.computeNormals(context);
+#ifdef MANTA_SSE
+ RayPacketData* data = rays.data;
+ if((rays.rayBegin ^ (rays.rayEnd-1)) & ~3){
+ int i = rays.rayBegin & ~3;
+ // Prologue
+ if(i != rays.rayBegin){
+ __m128i ray_idx = _mm_set_epi32(3, 2, 1, 0);
+ // mask is on for active rays
+ __m128 mask = _mm_castsi128_ps(_mm_cmpgt_epi32(ray_idx,
_mm_set1_epi32(rays.rayBegin-i-1)));
+ _mm_store_ps(&data->geometricNormal[0][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[0][i]),
+ _mm_load_ps(&data->geometricNormal[0][i])));
+ _mm_store_ps(&data->geometricNormal[1][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[1][i]),
+ _mm_load_ps(&data->geometricNormal[1][i])));
+ _mm_store_ps(&data->geometricNormal[2][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[2][i]),
+ _mm_load_ps(&data->geometricNormal[2][i])));
+ }
+ // Primary loop body
+ int e = rays.rayEnd - 3;
+ for(;i<e;i+=4){
+ _mm_store_ps(&data->geometricNormal[0][i],
_mm_load_ps(&data->normal[0][i]));
+ _mm_store_ps(&data->geometricNormal[1][i],
_mm_load_ps(&data->normal[1][i]));
+ _mm_store_ps(&data->geometricNormal[2][i],
_mm_load_ps(&data->normal[2][i]));
+ }
+ // Epilogue
+ if(i != rays.rayEnd){
+ __m128i ray_idx = _mm_set_epi32(3, 2, 1, 0);
+ // mask is on for active rays
+ __m128 mask = _mm_castsi128_ps(_mm_cmplt_epi32(ray_idx,
_mm_set1_epi32(rays.rayEnd-i)));
+ _mm_store_ps(&data->geometricNormal[0][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[0][i]),
+ _mm_load_ps(&data->geometricNormal[0][i])));
+ _mm_store_ps(&data->geometricNormal[1][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[1][i]),
+ _mm_load_ps(&data->geometricNormal[1][i])));
+ _mm_store_ps(&data->geometricNormal[2][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[2][i]),
+ _mm_load_ps(&data->geometricNormal[2][i])));
+ }
+ } else {
+ // Single SSE vector
+ int i = rays.rayBegin & ~3;
+ __m128i ray_idx = _mm_set_epi32(3, 2, 1, 0);
+ __m128 mask = _mm_castsi128_ps(_mm_and_si128(_mm_cmpgt_epi32(ray_idx,
_mm_set1_epi32(rays.rayBegin-i-1)), _mm_cmplt_epi32(ray_idx,
_mm_set1_epi32(rays.rayEnd-i))));
+ _mm_store_ps(&data->geometricNormal[0][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[0][i]),
+ _mm_load_ps(&data->geometricNormal[0][i])));
+ _mm_store_ps(&data->geometricNormal[1][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[1][i]),
+ _mm_load_ps(&data->geometricNormal[1][i])));
+ _mm_store_ps(&data->geometricNormal[2][i],
+ mask4(mask,
+ _mm_load_ps(&data->normal[2][i]),
+ _mm_load_ps(&data->geometricNormal[2][i])));
+ }
+#elif 1
+ RayPacketData* data = rays.data;
+ for(int i = rays.begin(); i < rays.end(); ++i) {
+ data->geometricNormal[0][i] = data->normal[0][i];
+ data->geometricNormal[1][i] = data->normal[1][i];
+ data->geometricNormal[2][i] = data->normal[2][i];
+ }
+#else // #ifdef MANTA_SSE
for(int i = rays.begin(); i != rays.end(); ++i)
rays.setGeometricNormal(i, rays.getNormal(i));
+#endif
rays.setFlag( RayPacket::HaveGeometricNormals );
if( rays.getFlag( RayPacket::HaveUnitNormals ) )
- [Manta] r2136 - trunk/Interface, James Bigler, 02/29/2008
Archive powered by MHonArc 2.6.16.