std::accumulate inaccurate result for la

Forum

Forum
General C++ Programming
std::accumulate inaccurate result for la

std::accumulate inaccurate result for large container

Not really a c++ question, but more about floating point precision (I suspect).

This gives 1.82 for the average instead of 2. For 5,000,000 it works fine. So it's something to do with having a large N.

// Example program
#include <iostream>
#include <string>
#include <vector>
#include <numeric>

int main()
{
    int N = 15'000'000;
    std::vector<float> testData;
    testData.reserve(N);
    for (size_t i = 0; i < N; ++i)
        testData.push_back(float(i % 5));

    auto sum = std::accumulate(std::begin(testData), std::end(testData), 0.0f);
    std::cout << sum << ' '<< sum / N;
}

Ah never mind. The size of the fraction is only 24 bits right (23 + implied one)? That explains it. After 16m or so, it wouldn't add 1s or 3s accurately.

Last edited on

JLBorges (13770)

#include <iostream>
#include <string>
#include <vector>
#include <numeric>

template < typename T > void test()
{
    std::size_t N = 15'000'000;
    std::vector<T> testData;

    testData.reserve(N);
    for (size_t i = 0; i < N; ++i) testData.push_back( i % 5 );

    auto sum = std::accumulate( std::begin(testData), std::end(testData), T(0.0) );
    std::cout << "     simple summation: " << sum << ' ' << sum / N << '\n' ;

    T compensation = 0.0 ;
    // https://en.wikipedia.org/wiki/Kahan_summation_algorithm#The_algorithm
    const auto plus = [&compensation] ( T accumulator, T value )
    {
        const T vx = value - compensation ;
        const T ax = accumulator + vx ;
        compensation = ( ax - accumulator ) - vx ;
        return ax ;
    };

    sum = std::accumulate( std::begin(testData), std::end(testData), T(0.0), plus );
    std::cout << "compensated summation: " << sum << ' ' << sum / N << '\n' ;
}

int main()
{
    std::cout << std::fixed
              << "float:\n-----------\n" ;
    test<float>() ;

    std::cout << "\ndouble:\n----------\n" ;
    test<double>() ;
}

http://coliru.stacked-crooked.com/a/4ea66ddc68c483f4

elohssa (88)

Ah nice trick. Plus the final value that ends up in compensation can be carried over to the next set of calculations (if there were any).

Last edited on

elohssa (88)

I got another question that's kinda related to this. I'm using AVX vaddps (_mm256_add_ps) to sum up floats but it seems to actually be slower than std::accumulate, which just does a basic addss behind the scenes. Any idea as to why? Surely SIMD shouldn't be slower for this?

JLBorges (13770)

This post on stackoverflow may be of interest: https://stackoverflow.com/a/41349852

elohssa (88)

Nah in my case I only ever use 1 avx instruction really - no SSE. So doubt it's that, but might be something similar. Here is my full code, but basically if you jump to float SIMD::detail::sumFloatsAVX -> that's my only SIMD code for this test.

Timer.h

#pragma once
#include <chrono>

namespace ScopedTimer
{
    namespace detail
    {
        struct ScopedTimerBase
        {
            auto GetDuration() const
            {
                return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - mCreated);
            }

            const std::chrono::time_point<std::chrono::system_clock> mCreated = std::chrono::system_clock::now();
        };

        template<typename F>
        struct ScopedTimer : protected ScopedTimerBase
        {
            ScopedTimer(const F& func) : mFunc(func) {}
            ~ScopedTimer()
            {
                mFunc(GetDuration());
            }
        private:
            const F mFunc;
        };
    }

    template<typename F>
    static auto MakeScopedTimer(const F& func)
    {
        return detail::ScopedTimer<F>(func);
    }

    static auto MakeScopedTimer(std::chrono::milliseconds& output)
    {
        return MakeScopedTimer([&](auto& ms) { output = ms; });
    }
}

SIMD.h

#pragma once
#include <algorithm>
#include <intrin.h>
#include <vector>
#include <numeric>

namespace SIMD
{
    namespace detail
    {
        enum AlignTo : size_t
        {
            SSE = 16,
            AVX = 32
        };
        template<typename T>
        struct MemoryTypeToSIMDType {};

        template<>
        struct MemoryTypeToSIMDType<__m256>
        {
            static constexpr auto value = AlignTo::AVX;
        };

        template<>
        struct MemoryTypeToSIMDType<__m128>
        {
            static constexpr auto value = AlignTo::SSE;
        };

        template<typename T, AlignTo alignTo>
        constexpr size_t numberInSIMD = alignTo / sizeof(T);

        template<typename T>
        constexpr size_t numberInAVX = numberInSIMD<T, AlignTo::AVX>;

        template<typename T, typename TOut = T, typename MemoryType>
        static auto HorizontalSum(const MemoryType& data)
        {
            // these are usually done at the end of tight loops, so performance isn't really important. Something generic is better.
            constexpr auto alignType = MemoryTypeToSIMDType<MemoryType>::value;
            auto& values = reinterpret_cast<const float(&)[numberInSIMD<T, alignType>]>(data);
            return std::accumulate(std::begin(values), std::end(values), TOut());
        }

        float sumFloatsAVX(const float* arr, size_t arrSize);

        template<AlignTo alignTo, typename T, typename F>
        static auto ProcessScalar(T* arr, size_t arrSize, const F& func)
        {
            typedef decltype(func(arr, arrSize)) ReturnValue;
            struct ProcessScalarResult
            {
                ReturnValue mBeforeArrayStart;
                ReturnValue mAfterArrayEnd;
                T* const mContinueFrom;
                size_t mRemainingElements;
            };
            // assuming array is sizeof(T) aligned
            constexpr auto alignToMask = alignTo - 1;
            const auto addressStart = reinterpret_cast<size_t>(arr);
            const auto elementsNeededToAlign = (alignTo - (addressStart & alignToMask)) / sizeof(T);
            const auto addressEnd = addressStart + arrSize * sizeof(T);
            const auto elementsAtEnd = (addressEnd & alignToMask) / sizeof(T);
            return ProcessScalarResult
            {
                func(arr, elementsNeededToAlign),
                func(arr + arrSize - elementsAtEnd, elementsAtEnd),
                arr + elementsNeededToAlign,
                arrSize - elementsNeededToAlign - elementsAtEnd
            };
        }
    }

    float average(const float* arr, size_t arrSize);

    template<size_t N>
    inline float average(const float(&arr)[N])
    {
        return average(arr, N);
    }

    inline float average(std::vector<float> data)
    {
        return average(&data[0], data.size());
    }
}

SIMD.cpp

#include "SIMD.h"

float SIMD::average(const float* arr, size_t arrSize)
{
    const auto calculateSum = [&](const float* value, size_t arrSize) { return std::accumulate(value, value + arrSize, 0.0f); };
    const auto averageScalar = detail::ProcessScalar<detail::AVX>(arr, arrSize, calculateSum);
    const auto totalVectorized = averageScalar.mRemainingElements ? detail::sumFloatsAVX(averageScalar.mContinueFrom, averageScalar.mRemainingElements) : 0.0f;
    return (totalVectorized + averageScalar.mBeforeArrayStart + averageScalar.mAfterArrayEnd) / arrSize;
}

float SIMD::detail::sumFloatsAVX(const float* arr, size_t arrSize)
{
    __m256 result = {};

    for (size_t i = 0; i < arrSize; i += numberInAVX<float>)
        result = _mm256_add_ps(result, *reinterpret_cast<const __m256*>(arr + i));

    return HorizontalSum<float>(result);
}

Main.cpp

#include "stdafx.h"
#include "SIMD.h"
#include "Timer.h"
#include <iostream>
#include <numeric>

template<size_t N>
static void testAverage()
{    
    std::vector<float> testData;
    testData.reserve(N);
    for (size_t i = 0; i < N; ++i)
        testData.push_back(float(i % 5));

    std::chrono::milliseconds timeSTL;
    float resultSTL = 0.0f;
    std::chrono::milliseconds timeSIMD;
    float resultSIMD = 0.0f;
    
    {
        auto timer = ScopedTimer::MakeScopedTimer(timeSTL);
        resultSTL = std::accumulate(std::begin(testData), std::end(testData), 0.0f) / N;
    }
    {
        auto timer = ScopedTimer::MakeScopedTimer(timeSIMD);
        resultSIMD = SIMD::average(testData);
    }

    std::cout << resultSTL << " (" << timeSTL.count() << ") " << resultSIMD << " (" << timeSIMD.count() << ")";
}

int main()
{
    testAverage<5'000'000>();
}

SIMD is 2.25 times slower.

Last edited on

Topic archived. No new replies allowed.

C++

Forum

std::accumulate inaccurate result for large container