Skip to content

Commit cb18046

Browse files
committed
changed to sample covariance
1 parent f2e862b commit cb18046

File tree

5 files changed

+7
-7
lines changed

5 files changed

+7
-7
lines changed

python/pyspark/sql/dataframe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -877,8 +877,8 @@ def fillna(self, value, subset=None):
877877

878878
def cov(self, col1, col2):
879879
"""
880-
Calculate the covariance for the given columns, specified by their names as a double value.
881-
:func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.
880+
Calculate the sample covariance for the given columns, specified by their names, as a
881+
double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.
882882
883883
:param col1: The name of the first column
884884
:param col2: The name of the second column

python/pyspark/sql/tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def test_aggregator(self):
390390
def test_cov(self):
391391
df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
392392
cov = df.stat.cov("a", "b")
393-
self.assertTrue(abs(cov - 16.5) < 1e-6)
393+
self.assertTrue(abs(cov - 55.0 / 3) < 1e-6)
394394

395395
def test_math_functions(self):
396396
df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()

sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
6767
}
6868

6969
/**
70-
* Calculate the covariance of two numerical columns of a DataFrame.
70+
* Calculate the sample covariance of two numerical columns of a DataFrame.
7171
* @param col1 the name of the first column
7272
* @param col2 the name of the second column
7373
* @return the sample covariance of the two columns.

sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ private[sql] object StatFunctions {
4949
count = totalCount
5050
this
5151
}
52-
// return the covariance for the observed examples
53-
def cov: Double = Ck / count
52+
// return the sample covariance for the observed examples
53+
def cov: Double = Ck / (count - 1)
5454
}
5555

5656
/**

sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ class DataFrameStatSuite extends FunSuite {
5151
val df = sqlCtx.sparkContext.parallelize(rows).toDF("singles", "doubles", "letters")
5252

5353
val results = df.stat.cov("singles", "doubles")
54-
assert(math.abs(results - 16.5) < 1e-6)
54+
assert(math.abs(results - 55.0 / 3) < 1e-6)
5555
intercept[IllegalArgumentException] {
5656
df.stat.cov("singles", "letters") // doesn't accept non-numerical dataTypes
5757
}

0 commit comments

Comments
 (0)