vendredi 30 janvier 2015

Modifying a function to use SSE intrinsics

I am trying to calculate the approximate value of the radical: sqrt(i + sqrt(i + sqrt(i + ...))) using SSE in order to get a speedup from vectorization (I also read that the SIMD square-root function runs approximately 4.7x faster than the innate FPU square-root function). However, I am having problems getting the same functionality in the vectorized version; I am getting the incorrect value and I'm not sure


My original function is this:



template <typename T>
T CalculateRadical( T tValue, T tEps = std::numeric_limits<T>::epsilon() )
{
static std::unordered_map<T,T> setResults;

auto it = setResults.find( tValue );
if( it != setResults.end() )
{
return it->second;
}

T tPrev = std::sqrt(tValue + std::sqrt(tValue)), tCurr = std::sqrt(tValue + tPrev);

// Keep iterating until we get convergence:
while( std::abs( tPrev - tCurr ) > tEps )
{
tPrev = tCurr;
tCurr = std::sqrt(tValue + tPrev);
}

setResults.insert( std::make_pair( tValue, tCurr ) );
return tCurr;
}


And the SIMD equivalent (when this template function is instantiated with T = float and given tEps = 0.0005f) I have written is:



// SSE intrinsics hard-coded function:
__m128 CalculateRadicals( __m128 values )
{
static std::unordered_map<float, __m128> setResults;

// Store our epsilon as a vector for quick comparison:
__declspec(align(16)) float flEps[4] = { 0.0005f, 0.0005f, 0.0005f, 0.0005f };
__m128 eps = _mm_load_ps( flEps );

union U {
__m128 vec;
float flArray[4];
};

U u;
u.vec = values;

float flFirstVal = u.flArray[0];
auto it = setResults.find( flFirstVal );
if( it != setResults.end( ) )
{
return it->second;
}

__m128 prev = _mm_sqrt_ps( _mm_add_ps( values, _mm_sqrt_ps( values ) ) );
__m128 curr = _mm_sqrt_ps( _mm_add_ps( values, prev ) );

while( _mm_movemask_ps( _mm_cmplt_ps( _mm_sub_ps( curr, prev ), eps ) ) != 0xF )
{
prev = curr;
curr = _mm_sqrt_ps( _mm_add_ps( values, prev ) );
}

setResults.insert( std::make_pair( flFirstVal, curr ) );
return curr;
}


I am calling the function in a loop using the following code:



long long N;
std::cin >> N;

float flExpectation = 0.0f;
long long iMultipleOf4 = (N / 4LL) * 4LL;
for( long long i = iMultipleOf4; i > 0LL; i -= 4LL )
{
__declspec(align(16)) float flArray[4] = { static_cast<float>(i - 3), static_cast<float>(i - 2), static_cast<float>(i - 1), static_cast<float>(i) };
__m128 arg = _mm_load_ps( flArray );
__m128 vec = CalculateRadicals( arg );

float flSum = Sum( vec );
flExpectation += flSum;
}

for( long long i = iMultipleOf4; i < N; ++i )
{
flExpectation += CalculateRadical( static_cast<float>(i), 0.0005f );
}

flExpectation /= N;


I get the following outputs for input 5:



With SSE version: 2.20873
With FPU verison: 1.69647


Where does the discrepancy come from, what am I doing wrong in the SIMD equivalent?


Aucun commentaire:

Enregistrer un commentaire