Backport PR #45646: Revert "PERF: nancorr pearson (#42761)" (#45649)

meeseeksmachine · phofl · web-flow · commit 2fcb0cd1e9fc · 2022-01-26T20:13:41.000-05:00
Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v1.4.1.rst b/doc/source/whatsnew/v1.4.1.rst
@@ -32,7 +32,7 @@ Bug fixes
 
 Other
 ~~~~~
--
+- Reverted performance speedup of :meth:`DataFrame.corr` for ``method=pearson`` to fix precision regression (:issue:`45640`, :issue:`42761`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -329,12 +329,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
         Py_ssize_t i, j, xi, yi, N, K
         bint minpv
         float64_t[:, ::1] result
-        # Initialize to None since we only use in the no missing value case
-        float64_t[::1] means=None, ssqds=None
         ndarray[uint8_t, ndim=2] mask
-        bint no_nans
         int64_t nobs = 0
-        float64_t mean, ssqd, val
         float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
 
     N, K = (<object>mat).shape
@@ -346,57 +342,25 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
 
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)
-    no_nans = mask.all()
-
-    # Computing the online means and variances is expensive - so if possible we can
-    # precompute these and avoid repeating the computations each time we handle
-    # an (xi, yi) pair
-    if no_nans:
-        means = np.empty(K, dtype=np.float64)
-        ssqds = np.empty(K, dtype=np.float64)
-
-        with nogil:
-            for j in range(K):
-                ssqd = mean = 0
-                for i in range(N):
-                    val = mat[i, j]
-                    dx = val - mean
-                    mean += 1 / (i + 1) * dx
-                    ssqd += (val - mean) * dx
-
-                means[j] = mean
-                ssqds[j] = ssqd
 
     with nogil:
         for xi in range(K):
             for yi in range(xi + 1):
-                covxy = 0
-                if no_nans:
-                    for i in range(N):
+                # Welford's method for the variance-calculation
+                # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
+                for i in range(N):
+                    if mask[i, xi] and mask[i, yi]:
                         vx = mat[i, xi]
                         vy = mat[i, yi]
-                        covxy += (vx - means[xi]) * (vy - means[yi])
-
-                    ssqdmx = ssqds[xi]
-                    ssqdmy = ssqds[yi]
-                    nobs = N
-
-                else:
-                    nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
-                    for i in range(N):
-                        # Welford's method for the variance-calculation
-                        # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-                        if mask[i, xi] and mask[i, yi]:
-                            vx = mat[i, xi]
-                            vy = mat[i, yi]
-                            nobs += 1
-                            dx = vx - meanx
-                            dy = vy - meany
-                            meanx += 1 / nobs * dx
-                            meany += 1 / nobs * dy
-                            ssqdmx += (vx - meanx) * dx
-                            ssqdmy += (vy - meany) * dy
-                            covxy += (vx - meanx) * dy
+                        nobs += 1
+                        dx = vx - meanx
+                        dy = vy - meany
+                        meanx += 1 / nobs * dx
+                        meany += 1 / nobs * dy
+                        ssqdmx += (vx - meanx) * dx
+                        ssqdmy += (vy - meany) * dy
+                        covxy += (vx - meanx) * dy
 
                 if nobs < minpv:
                     result[xi, yi] = result[yi, xi] = NaN
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -337,6 +337,13 @@ def test_corrwith_dup_cols(self):
         expected = Series(np.ones(4), index=[0, 0, 1, 2])
         tm.assert_series_equal(result, expected)
 
+    def test_corr_numerical_instabilities(self):
+        # GH#45640
+        df = DataFrame([[0.2, 0.4], [0.4, 0.2]])
+        result = df.corr()
+        expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
+        tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)
+
     @td.skip_if_no_scipy
     def test_corrwith_spearman(self):
         # GH#21925

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ Bug fixes`
`32`	`32`
`33`	`33`	`Other`
`34`	`34`	`~~~~~`
`35`		`--`
	`35`	+- Reverted performance speedup of :meth:`DataFrame.corr` for ``method=pearson`` to fix precision regression (:issue:`45640`, :issue:`42761`)
`36`	`36`	`-`
`37`	`37`
`38`	`38`	`.. ---------------------------------------------------------------------------`