calculate hash in Python

davies · davies · commit ded624f11ece · 2014-08-28T12:19:49.000-07:00
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
@@ -2008,9 +2008,6 @@ def countApproxDistinct(self, relativeSD=0.05):
         of The Art Cardinality Estimation Algorithm", available
         <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
 
-        This support all the types of objects, which is supported by
-        Pyrolite, nearly all builtin types.
-
         @param relativeSD Relative accuracy. Smaller values create
                            counters that require more space.
                            It must be greater than 0.000017.
@@ -2026,7 +2023,13 @@ def countApproxDistinct(self, relativeSD=0.05):
             raise ValueError("relativeSD should be greater than 0.000017")
         if relativeSD > 0.37:
             raise ValueError("relativeSD should be smaller than 0.37")
-        return self._to_java_object_rdd().countApproxDistinct(relativeSD)
+        hashRDD = self.map(lambda x: portable_hash(x) % sys.maxint)
+        c = hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
+        # range of hash is [0, sys.maxint - 1], since `% sys.maxint` is non-negative
+        if c > sys.maxint / 30:
+            # correction for hash collision in Python
+            c = -sys.maxint * log(1 - float(c) / sys.maxint)
+        return int(c)
 
 
 class PipelinedRDD(RDD):
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
@@ -409,13 +409,13 @@ def test_count_approx_distinct(self):
         self.assertTrue(950 < rdd.countApproxDistinct(0.04) < 1050)
         self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.04) < 1050)
         self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.04) < 1050)
-        self.assertTrue(950 < rdd.map(lambda x: set([x, -x])).countApproxDistinct(0.04) < 1050)
+        self.assertTrue(950 < rdd.map(lambda x: (x, -x)).countApproxDistinct(0.04) < 1050)
 
         rdd = self.sc.parallelize([i % 20 for i in range(1000)], 7)
         self.assertTrue(18 < rdd.countApproxDistinct() < 22)
         self.assertTrue(18 < rdd.map(float).countApproxDistinct() < 22)
         self.assertTrue(18 < rdd.map(str).countApproxDistinct() < 22)
-        self.assertTrue(18 < rdd.map(lambda x: set([x, -x])).countApproxDistinct() < 22)
+        self.assertTrue(18 < rdd.map(lambda x: (x, -x)).countApproxDistinct() < 22)
 
         self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001))
         self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5))