std::accumulate inaccurate result for large container

Not really a C++ question — more one about floating-point precision, I suspect.

This gives 1.82 for the average instead of 2. For 5,000,000 it works fine. So it's something to do with having a large N.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Example program
//
// Demonstrates why summing many floats needs care: a float significand
// has only 24 bits, so once the running sum passes 2^24 (~16.7M) the
// small addends (0..4) are rounded away.  Accumulating in double keeps
// every partial sum exact here (max 3e7, well within 53 bits).
#include <iostream>
#include <string>
#include <vector>
#include <numeric>

int main()
{
    // Element count, large enough that a float running sum would
    // exceed the 24-bit significand limit.
    const std::size_t N = 15'000'000;
    std::vector<float> testData;
    testData.reserve(N);  // single allocation up front
    for (std::size_t i = 0; i < N; ++i)
        testData.push_back(float(i % 5));  // repeating 0,1,2,3,4 -> average 2

    // Accumulate in double (init value 0.0, not 0.0f): every addition is
    // exact for this data, so the printed average is exactly 2.
    const double sum = std::accumulate(std::begin(testData), std::end(testData), 0.0);
    std::cout << sum << ' ' << sum / N;
}


Ah, never mind. The significand is only 24 bits (23 stored bits plus the implied leading one), right? That explains it. Once the running sum grows past 2^24 (about 16.7 million), adding 1s or 3s no longer registers accurately.
Last edited on
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#include <iostream>
#include <string>
#include <vector>
#include <numeric>

// Compares plain left-to-right summation against Kahan (compensated)
// summation over a vector of N values of type T repeating 0,1,2,3,4
// (true average: 2).  Prints "sum average" for each method.
// N is now a parameter (defaulting to the original hard-coded size) so
// the demonstration can be run at any scale.
template < typename T > void test( std::size_t N = 15'000'000 )
{
    std::vector<T> testData;

    testData.reserve(N);
    for (size_t i = 0; i < N; ++i) testData.push_back( i % 5 );

    // Naive summation: with T == float the running sum saturates once it
    // exceeds 2^24 and small addends are rounded away.
    auto sum = std::accumulate( std::begin(testData), std::end(testData), T(0.0) );
    std::cout << "     simple summation: " << sum << ' ' << sum / N << '\n' ;

    // Kahan summation: 'compensation' carries the low-order bits lost by
    // each addition into the next one.
    // https://en.wikipedia.org/wiki/Kahan_summation_algorithm#The_algorithm
    T compensation = 0.0 ;
    const auto plus = [&compensation] ( T accumulator, T value )
    {
        const T vx = value - compensation ;         // re-inject previously lost bits
        const T ax = accumulator + vx ;             // low bits of vx may be lost here...
        compensation = ( ax - accumulator ) - vx ;  // ...and are recovered here
        return ax ;
    };

    sum = std::accumulate( std::begin(testData), std::end(testData), T(0.0), plus );
    std::cout << "compensated summation: " << sum << ' ' << sum / N << '\n' ;
}

int main()
{
    // Fixed-point notation so the float and double results line up.
    std::cout << std::fixed;

    std::cout << "float:\n-----------\n";
    test<float>();

    std::cout << "\ndouble:\n----------\n";
    test<double>();
}

http://coliru.stacked-crooked.com/a/4ea66ddc68c483f4
Ah nice trick. Plus the final value that ends up in compensation can be carried over to the next set of calculations (if there were any).
Last edited on
I've got another question that's somewhat related to this. I'm using the AVX vaddps instruction (_mm256_add_ps) to sum up floats, but it actually seems to be slower than std::accumulate, which just does a basic addss behind the scenes. Any idea why? Surely SIMD shouldn't be slower for this?
This post on stackoverflow may be of interest: https://stackoverflow.com/a/41349852
Nah in my case I only ever use 1 avx instruction really - no SSE. So doubt it's that, but might be something similar. Here is my full code, but basically if you jump to float SIMD::detail::sumFloatsAVX -> that's my only SIMD code for this test.

Timer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#pragma once
#include <chrono>

namespace ScopedTimer
{
    namespace detail
    {
        // Base class that records its construction time.
        struct ScopedTimerBase
        {
            // steady_clock is monotonic: unlike system_clock it never jumps
            // (e.g. on NTP adjustment), so measured intervals are always
            // non-negative and meaningful for benchmarking.
            using Clock = std::chrono::steady_clock;

            // Elapsed time since construction, truncated to milliseconds.
            auto GetDuration() const
            {
                return std::chrono::duration_cast<std::chrono::milliseconds>(Clock::now() - mCreated);
            }

            const std::chrono::time_point<Clock> mCreated = Clock::now();
        };

        // RAII timer: invokes 'func' with the elapsed duration when the
        // timer goes out of scope.
        template<typename F>
        struct ScopedTimer : protected ScopedTimerBase
        {
            ScopedTimer(const F& func) : mFunc(func) {}
            ~ScopedTimer()
            {
                mFunc(GetDuration());
            }
        private:
            const F mFunc;
        };
    }

    // Creates a timer that calls 'func(elapsed_ms)' at scope exit.
    template<typename F>
    static auto MakeScopedTimer(const F& func)
    {
        return detail::ScopedTimer<F>(func);
    }

    // Convenience overload: writes the elapsed time into 'output' at
    // scope exit.  The lambda takes the duration by value: GetDuration()
    // returns a prvalue, which cannot bind to the non-const lvalue
    // reference (auto&) the original code used.
    static auto MakeScopedTimer(std::chrono::milliseconds& output)
    {
        return MakeScopedTimer([&](auto ms) { output = ms; });
    }
}


SIMD.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#pragma once
#include <algorithm>
#include <intrin.h>
#include <vector>
#include <numeric>

namespace SIMD
{
    namespace detail
    {
        // SIMD register widths in bytes; doubles as the required memory
        // alignment for aligned loads of that register type.
        enum AlignTo : size_t
        {
            SSE = 16,
            AVX = 32
        };
        // Trait mapping an intrinsic register type (__m128 / __m256) to
        // its width/alignment category.  Unspecialized version is empty
        // so unsupported types fail to compile.
        template<typename T>
        struct MemoryTypeToSIMDType {};

        template<>
        struct MemoryTypeToSIMDType<__m256>
        {
            static constexpr auto value = AlignTo::AVX;
        };

        template<>
        struct MemoryTypeToSIMDType<__m128>
        {
            static constexpr auto value = AlignTo::SSE;
        };

        // Number of elements of type T that fit in one register of the
        // given width (e.g. 8 floats per 32-byte AVX register).
        template<typename T, AlignTo alignTo>
        constexpr size_t numberInSIMD = alignTo / sizeof(T);

        // Shorthand for the AVX case.
        template<typename T>
        constexpr size_t numberInAVX = numberInSIMD<T, AlignTo::AVX>;

        // Sums the lanes of a SIMD register 'data' by reinterpreting it
        // as a plain array and folding it left-to-right into a TOut.
        // NOTE(review): the cast reads the lanes as float regardless of T,
        // so this effectively assumes T is float — confirm before using
        // with other element types.
        template<typename T, typename TOut = T, typename MemoryType>
        static auto HorizontalSum(const MemoryType& data)
        {
            // these are usually done at the end of tight loops, so performance isn't really important. Something generic is better.
            constexpr auto alignType = MemoryTypeToSIMDType<MemoryType>::value;
            auto& values = reinterpret_cast<const float(&)[numberInSIMD<T, alignType>]>(data);
            return std::accumulate(std::begin(values), std::end(values), TOut());
        }

        // Sums arrSize floats starting at arr with AVX.  arr must be
        // 32-byte aligned and arrSize a multiple of 8 (see SIMD.cpp).
        float sumFloatsAVX(const float* arr, size_t arrSize);

        // Splits [arr, arr + arrSize) into three parts for SIMD use:
        //   - an unaligned head before the first alignTo-byte boundary,
        //   - an aligned middle whose byte size is a multiple of alignTo,
        //   - an unaligned tail after the last alignTo-byte boundary.
        // 'func(ptr, count)' is applied to head and tail immediately; the
        // middle is returned via mContinueFrom / mRemainingElements for
        // the vectorized code path.
        template<AlignTo alignTo, typename T, typename F>
        static auto ProcessScalar(T* arr, size_t arrSize, const F& func)
        {
            typedef decltype(func(arr, arrSize)) ReturnValue;
            struct ProcessScalarResult
            {
                ReturnValue mBeforeArrayStart;
                ReturnValue mAfterArrayEnd;
                T* const mContinueFrom;
                size_t mRemainingElements;
            };
            // assuming array is sizeof(T) aligned
            constexpr auto alignToMask = alignTo - 1;
            const auto addressStart = reinterpret_cast<size_t>(arr);
            // misalignment == 0 means the start is already aligned, in
            // which case the head must be empty (not a full alignTo block).
            const auto misalignment = addressStart & alignToMask;
            auto elementsNeededToAlign = misalignment ? (alignTo - misalignment) / sizeof(T) : size_t(0);
            // Clamp: an array too small to reach an aligned boundary is
            // processed entirely as the head.
            if (elementsNeededToAlign > arrSize)
                elementsNeededToAlign = arrSize;
            const auto addressEnd = addressStart + arrSize * sizeof(T);
            auto elementsAtEnd = (addressEnd & alignToMask) / sizeof(T);
            // Clamp so head and tail never overlap; otherwise
            // mRemainingElements (size_t) would underflow and func would
            // read out of bounds for small unaligned inputs.
            if (elementsAtEnd > arrSize - elementsNeededToAlign)
                elementsAtEnd = arrSize - elementsNeededToAlign;
            return ProcessScalarResult
            {
                func(arr, elementsNeededToAlign),
                func(arr + arrSize - elementsAtEnd, elementsAtEnd),
                arr + elementsNeededToAlign,
                arrSize - elementsNeededToAlign - elementsAtEnd
            };
        }
    }

    // Arithmetic mean of arrSize floats starting at arr; the bulk of the
    // data goes through AVX (defined in SIMD.cpp).
    float average(const float* arr, size_t arrSize);

    // Convenience overload for C arrays.
    template<size_t N>
    inline float average(const float(&arr)[N])
    {
        return average(arr, N);
    }

    // Convenience overload for vectors.  Takes the vector by const
    // reference — the previous by-value parameter copied the entire
    // buffer on every call — and returns 0 for an empty vector instead
    // of dereferencing element 0 (undefined behaviour) and dividing by
    // zero.
    inline float average(const std::vector<float>& data)
    {
        return data.empty() ? 0.0f : average(data.data(), data.size());
    }
}


SIMD.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include "SIMD.h"

// Arithmetic mean of arrSize floats: scalar-sums the unaligned head and
// tail, AVX-sums the 32-byte-aligned middle, then combines the three.
float SIMD::average(const float* arr, size_t arrSize)
{
    if (arrSize == 0)
        return 0.0f; // avoid division by zero below

    // Scalar left-to-right fold, used for the unaligned head/tail parts.
    const auto calculateSum = [&](const float* value, size_t arrSize) { return std::accumulate(value, value + arrSize, 0.0f); };
    const auto scalarParts = detail::ProcessScalar<detail::AVX>(arr, arrSize, calculateSum);
    // Only the aligned middle — a whole number of AVX registers — goes
    // through the vectorized path.
    const auto totalVectorized = scalarParts.mRemainingElements ? detail::sumFloatsAVX(scalarParts.mContinueFrom, scalarParts.mRemainingElements) : 0.0f;
    return (totalVectorized + scalarParts.mBeforeArrayStart + scalarParts.mAfterArrayEnd) / arrSize;
}

// Sums arrSize floats starting at arr using AVX.
// Preconditions (guaranteed by ProcessScalar in SIMD::average):
//   - arr is 32-byte aligned,
//   - arrSize is a multiple of numberInAVX<float> (8).
//
// Uses four independent accumulators: with a single accumulator every
// vaddps depends on the result of the previous one, so the loop runs at
// the instruction's full latency instead of its throughput — which is
// why the one-accumulator version could lose to plain std::accumulate.
// Note: grouping the additions differently can change the low-order
// bits of the float result versus a strict left-to-right sum.
float SIMD::detail::sumFloatsAVX(const float* arr, size_t arrSize)
{
    constexpr size_t step = numberInAVX<float>;
    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();
    __m256 acc2 = _mm256_setzero_ps();
    __m256 acc3 = _mm256_setzero_ps();

    size_t i = 0;
    // Main unrolled loop: four independent adds per iteration.
    for (; i + 4 * step <= arrSize; i += 4 * step)
    {
        acc0 = _mm256_add_ps(acc0, _mm256_load_ps(arr + i));
        acc1 = _mm256_add_ps(acc1, _mm256_load_ps(arr + i + step));
        acc2 = _mm256_add_ps(acc2, _mm256_load_ps(arr + i + 2 * step));
        acc3 = _mm256_add_ps(acc3, _mm256_load_ps(arr + i + 3 * step));
    }
    // Remaining zero to three full registers.
    for (; i < arrSize; i += step)
        acc0 = _mm256_add_ps(acc0, _mm256_load_ps(arr + i));

    const __m256 result = _mm256_add_ps(_mm256_add_ps(acc0, acc1), _mm256_add_ps(acc2, acc3));
    return HorizontalSum<float>(result);
}


Main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#include "stdafx.h"
#include "SIMD.h"
#include "Timer.h"
#include <iostream>
#include <numeric>

// Benchmarks std::accumulate against SIMD::average over N floats
// (repeating 0..4) and prints both averages with their timings in ms.
template<size_t N>
static void testAverage()
{
    std::vector<float> samples;
    samples.reserve(N);
    for (size_t idx = 0; idx < N; ++idx)
        samples.push_back(float(idx % 5));

    float stlAverage = 0.0f;
    float simdAverage = 0.0f;
    std::chrono::milliseconds stlElapsed;
    std::chrono::milliseconds simdElapsed;

    {
        // Time the scalar standard-library path.
        auto timer = ScopedTimer::MakeScopedTimer(stlElapsed);
        stlAverage = std::accumulate(std::begin(samples), std::end(samples), 0.0f) / N;
    }
    {
        // Time the AVX path.
        auto timer = ScopedTimer::MakeScopedTimer(simdElapsed);
        simdAverage = SIMD::average(samples);
    }

    std::cout << stlAverage << " (" << stlElapsed.count() << ") "
              << simdAverage << " (" << simdElapsed.count() << ")";
}

int main()
{
    // 5'000'000 elements: the poster reports the float accumulator is
    // still accurate at this size (the sum stays below 2^24), so both
    // paths should print the same average; only the timings differ.
    testAverage<5'000'000>();
}


SIMD is 2.25 times slower.
Last edited on
Registered users can post here. Sign in or register to post.