From 5de12ee2f722dc51e4db62a5bad3311f180d12b5 Mon Sep 17 00:00:00 2001
From: "A. R. Shajii" <ars@ars.me>
Date: Wed, 25 Jan 2023 13:01:22 -0500
Subject: [PATCH] Add math.fsum() (#182)

---
 stdlib/math.codon           |  92 ++++++++++++++++++++++++++
 test/stdlib/math_test.codon | 125 ++++++++++++++++++++++++++++++++++++
 2 files changed, 217 insertions(+)

diff --git a/stdlib/math.codon b/stdlib/math.codon
index 61938bf9..61780dd5 100644
--- a/stdlib/math.codon
+++ b/stdlib/math.codon
@@ -587,6 +587,98 @@ def isclose(a: float, b: float, rel_tol: float = 1e-09, abs_tol: float = 0.0) ->
         diff <= abs_tol
     )
 
+def fsum(seq):
+    def _fsum_realloc(p: Ptr[float], ps: Ptr[float], n: int, m: int):
+        from internal.gc import realloc, sizeof
+        v = Ptr[float]()
+        m += m
+        if n < m:
+            if p == ps:
+                v = Ptr[float](m)
+                str.memcpy(v.as_byte(), ps.as_byte(), n * sizeof(float))
+            else:
+                v = Ptr[float](realloc(p.as_byte(), m * sizeof(float), n * sizeof(float)))
+        return v, m
+
+    _NUM_PARTIALS: Static[int] = 32
+    ps_arr = __array__[float](_NUM_PARTIALS)
+    ps = ps_arr.ptr
+    p = ps
+    n, m = 0, _NUM_PARTIALS
+    xsave, special_sum, inf_sum = 0.0, 0.0, 0.0
+    hi, yr, lo = 0.0, 0.0, 0.0
+
+    for item in seq:
+        x = float(item)
+        xsave = x
+        i = 0
+
+        for j in range(n):  # for y in partials
+            y = p[j]
+            if fabs(x) < fabs(y):
+                x, y = y, x
+            hi = x + y
+            yr = hi - x
+            lo = y - yr
+            if lo != 0.0:
+                p[i] = lo
+                i += 1
+            x = hi
+
+        n = i
+        if x != 0.0:
+            if not isfinite(x):
+                # a nonfinite x could arise either as
+                # a result of intermediate overflow, or
+                if isfinite(xsave):
+                    raise OverflowError("intermediate overflow in fsum")
+                if isinf(xsave):
+                    inf_sum += xsave
+                special_sum += xsave
+                # reset partials
+                n = 0
+            else:
+                if n >= m:
+                    p, m = _fsum_realloc(p, ps, n, m)
+                p[n] = x
+                n += 1
+
+    if special_sum != 0.0:
+        if isnan(inf_sum):
+            raise ValueError("-inf + inf in fsum")
+        else:
+            return special_sum
+
+    hi = 0.0
+    if n > 0:
+        hi = p[n - 1]
+        n -= 1
+        # sum_exact(ps, hi) from the top, stop when the sum becomes inexact
+        while n > 0:
+            x = hi
+            y = p[n - 1]
+            n -= 1
+            # assert fabs(y) < fabs(x)
+            hi = x + y
+            yr = hi - x
+            lo = y - yr
+            if lo != 0.0:
+                break
+
+        # Make half-even rounding work across multiple partials.
+        # Needed so that sum([1e-16, 1, 1e16]) will round-up the last
+        # digit to two instead of down to zero (the 1e-16 makes the 1
+        # slightly closer to two).  With a potential 1 ULP rounding
+        # error fixed-up, math.fsum() can guarantee commutativity.
+        if n > 0 and ((lo < 0.0 and p[n-1] < 0.0) or (lo > 0.0 and p[n-1] > 0.0)):
+            y = lo * 2.0
+            x = hi + y
+            yr = x - hi
+            if y == yr:
+                hi = x
+
+    return hi
+
 # 32-bit float ops
 
 e32 = float32(e)
diff --git a/test/stdlib/math_test.codon b/test/stdlib/math_test.codon
index c72bd53f..c27d8f06 100644
--- a/test/stdlib/math_test.codon
+++ b/test/stdlib/math_test.codon
@@ -461,6 +461,130 @@ def test_isclose():
     assert math.isclose(INF, NINF) == False
 
 
+@test
+def test_fsum():
+    assert math.fsum((42,)) == 42.0
+    assert math.fsum((1,2,3)) == 6.0
+    assert math.fsum((1,2,-3)) == 0.0
+    assert math.fsum(()) == 0.0
+    assert math.fsum([.1] * 10) == 1.0
+
+    mant_dig = 53
+    etiny = -1074
+
+    def sum_exact(p):
+        n = len(p)
+        hi = 0.0
+        if n > 0:
+            hi = p[n-1]
+            n -= 1
+            while n > 0:
+                x = hi
+                y = p[n-1]
+                n -= 1
+                hi = x + y
+                yr = hi - x
+                lo = y - yr
+                if lo != 0.0:
+                    break
+
+            if n > 0 and ((lo < 0 and p[n-1] < 0) or (lo > 0 and p[n-1] > 0)):
+                y = lo * 2
+                x = hi + y
+                yr = x - hi
+                if y == yr:
+                    hi = x
+        return hi
+
+    def msum(iterable):
+        "Full precision summation using multiple floats for intermediate values"
+        # Rounded x+y stored in hi with the round-off stored in lo.  Together
+        # hi+lo are exactly equal to x+y.  The inner loop applies hi/lo summation
+        # to each partial so that the list of partial sums remains exact.
+        # Depends on IEEE-754 arithmetic guarantees.  See proof of correctness at:
+        # www-2.cs.cmu.edu/afs/cs/project/quake/public/papers/robust-arithmetic.ps
+
+        partials = []               # sorted, non-overlapping partial sums
+        for x in iterable:
+            i = 0
+            for y in partials:
+                if abs(x) < abs(y):
+                    x, y = y, x
+                hi = x + y
+                lo = y - (hi - x)
+                if lo:
+                    partials[i] = lo
+                    i += 1
+                x = hi
+            partials[i:] = [x]
+        return sum_exact(partials)
+
+    @pure
+    @llvm
+    def float1() -> float:
+        ret double 0x401DF11F45F4E61A
+
+    @pure
+    @llvm
+    def float2() -> float:
+        ret double 0xBFE62A2AF1BD3624
+
+    @pure
+    @llvm
+    def float3() -> float:
+        ret double 0x7C95555555555555
+
+    test_values = [
+        ([], 0.0),
+        ([0.0], 0.0),
+        ([1e100, 1.0, -1e100, 1e-100, 1e50, -1.0, -1e50], 1e-100),
+        ([2.0**53, -0.5, -2.0**-54], 2.0**53-1.0),
+        ([2.0**53, 1.0, 2.0**-100], 2.0**53+2.0),
+        ([2.0**53+10.0, 1.0, 2.0**-100], 2.0**53+12.0),
+        ([2.0**53-4.0, 0.5, 2.0**-54], 2.0**53-3.0),
+        ([1./n for n in range(1, 1001)], float1()),
+        ([(-1.)**n/n for n in range(1, 1001)], float2()),
+        ([1e16, 1., 1e-16], 10000000000000002.0),
+        ([1e16-2., 1.-2.**-53, -(1e16-2.), -(1.-2.**-53)], 0.0),
+        # exercise code for resizing partials array
+        ([2.**n - 2.**(n+50) + 2.**(n+52) for n in range(-1074, 972, 2)] +
+         [-2.**1022],
+         float3()),
+        ]
+
+    # Telescoping sum, with exact differences (due to Sterbenz)
+    terms = [1.7**i for i in range(1001)]
+    test_values.append((
+        [terms[i+1] - terms[i] for i in range(1000)] + [-terms[1000]],
+        -terms[0]
+    ))
+
+    for i, (vals, expected) in enumerate(test_values):
+        try:
+            actual = math.fsum(vals)
+        except OverflowError:
+            # self.fail("test %d failed: got OverflowError, expected %r "
+            #           "for math.fsum(%.100r)" % (i, expected, vals))
+            assert False
+        except ValueError:
+            # self.fail("test %d failed: got ValueError, expected %r "
+            #           "for math.fsum(%.100r)" % (i, expected, vals))
+            assert False
+
+        assert actual == expected
+
+    from random import random, gauss, shuffle
+    for j in range(10000):
+        vals = [7, 1e100, -7, -1e100, -9e-20, 8e-20] * 10
+        s = 0.
+        for i in range(200):
+            v = gauss(0, random()) ** 7 - s
+            s += v
+            vals.append(v)
+        shuffle(vals)
+        assert msum(vals) == math.fsum(vals)
+
+
 test_isnan()
 test_isinf()
 test_isfinite()
@@ -504,6 +628,7 @@ test_gcd()
 test_frexp()
 test_modf()
 test_isclose()
+test_fsum()
 
 
 # 32-bit float ops