dimanche 27 août 2017

Naive translation C++ into Julia, is there a room for optimization?

Whenever I compare the performance of Julia against the monstrous C++, I always get impressed. In my own experience, many times I find Julia on par with or even faster than C++. My last experiment was comparing Julia speed against C++ Armadillo library using their micro benchmarks. In most cases, I found Julia way faster than C++ in such numerical code (caution: these are my own claims, YMMV of course).

Now, I did a naive translation of this ray-triangle intersection C++ code into Julia and expected to get similar results as my previous experiments. Running both codes, I got the following result for C++ (compiled with: gcc -std=c++11 -lm -O3 -ffast-math):

Total intersection tests:  100,000,000
  Hits:                      4,930,610 ( 4.93%)
  Misses:                   95,069,390 (95.07%)

Total time:                       1.51 seconds
Millions of tests per second:    66.18

and for Julia, I got:

Total intersection tests:  100000000
  Hits:             4910255  ( 4.910255%)
  Misses:           95089745 (95.08974500000001%)

Total time:         2.481432264 seconds
Millions of tests per second:   40.29930675552786

The following is a line-by-line translation into Julia without any attempt to optimize. What is wrong with my code? or is Julia struct less efficient? Where does this 0.65X come from? and how can I optimize this code?

mutable struct vec3
  x::Float64
  y::Float64
  z::Float64
end

mutable struct ray  
  orig::vec3
  dir ::vec3
end

sub3(a::vec3, b::vec3) = vec3(a.x - b.x, a.y - b.y, a.z - b.z)
dot3(a::vec3, b::vec3) = a.x*b.x +  a.y*b.y + a.z*b.z
len(v::vec3) = sqrt( v.x * v.x + v.y * v.y + v.z * v.z  )
normalize(v) = begin l = length(v); vec3( v[1]/l, v[2]/l, v[3]/l ) end
cross3(a::vec3, b::vec3) = vec3( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x )

# Ray-triangle intersection routine 
function rayTriangleIntersect(r::ray, v0::vec3, v1::vec3, v2::vec3)::Float64
  v0v1 = sub3(v1, v0);
  v0v2 = sub3(v2, v0);
  pvec = cross3(r.dir, v0v2);
  detc = dot3(v0v1, pvec);

  (detc < 0.000001) && return -Inf

  invDet = 1.0 / detc;
  tvec = sub3(r.orig, v0);
  u = dot3(tvec, pvec) * invDet;

  (u < 0 || u > 1) && return -Inf

  qvec = cross3(tvec, v0v1);
  v = dot3(r.dir, qvec) * invDet;

  (v < 0 || u + v > 1) && return -Inf 

  return dot3( v0v2, qvec ) * invDet;
end

function randomSphere()::vec3
  r1  = rand();
  r2  = rand();
  lat = acos(2r1 - 1) - pi/2;
  lon = 2pi * r2;
  return vec3( cos(lat)cos(lon), cos(lat)sin(lon), sin(lat) )
end    

function generateRandomTriangles(numTriangles::Int64)#::Array{vec3}(3numTriangles)
  vertices = Array{vec3}(3numTriangles)    
  for i = 1:3numTriangles
    vertices[i] = randomSphere();
  end
  return vertices
end

function main_ray()
  const NUM_RAYS = 1000;
  const NUM_TRIANGLES = 100 * 1000;

  vertices = generateRandomTriangles(NUM_TRIANGLES);
  const numVertices = NUM_TRIANGLES * 3;

  numHit  = 0;
  numMiss = 0;

  o  = vec3(0.0, 0.0, 0.0)
  d  = vec3(0.0, 0.0, 0.0)
  p1 = vec3(0.0, 0.0, 0.0)
  r  = ray(o,d)

  tTotal = @elapsed for i = 1:NUM_RAYS
    r.orig   = randomSphere();
    p1       = randomSphere();
    r.dir    = sub3(p1,r.orig); 

    for j = 0:div(numVertices,3) - 1
      t = rayTriangleIntersect(r, vertices[j*3 + 1],
                                  vertices[j*3 + 2],
                                  vertices[j*3 + 3]);
      t >= 0? numHit+=1 : numMiss+=1
    end
  end

  numTests = NUM_RAYS * NUM_TRIANGLES;
  hitPerc  = numHit / numTests * 100.0
  missPerc = numMiss / numTests * 100.0
  mTestsPerSecond = numTests / tTotal / 10^6

  println("Total intersection tests:  $numTests");
  println("  Hits:\t\t\t    $numHit  ( $hitPerc%)");
  println("  Misses:\t\t    $numMiss ($missPerc%)\n");
  println("Total time:\t\t\t$tTotal seconds");
  println("Millions of tests per second:\t$mTestsPerSecond\n");    
end

main_ray();

Aucun commentaire:

Enregistrer un commentaire