Skip to content

Commit 3398d62

Browse files
committed
set FPGrowthModel minConfidence
1 parent 376d782 commit 3398d62

File tree

2 files changed

+49
-19
lines changed

2 files changed

+49
-19
lines changed

mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,14 +204,25 @@ class FPGrowthModel private[ml] (
204204
@Since("2.2.0")
205205
def setPredictionCol(value: String): this.type = set(predictionCol, value)
206206

207+
@transient private var _cachedMinConf: Double = Double.NaN
208+
209+
@transient private var _cachedRules: DataFrame = null
210+
207211
/**
208212
* Get association rules fitted by AssociationRules using the minConfidence. Returns a dataframe
209213
* with three fields, "antecedent", "consequent" and "confidence", where "antecedent" and
210214
* "consequent" are Array[T] and "confidence" is Double.
211215
*/
212216
@Since("2.2.0")
213-
@transient lazy val associationRules: DataFrame = {
214-
AssociationRules.getAssociationRulesFromFP(freqItemsets, "items", "freq", $(minConfidence))
217+
@transient def associationRules: DataFrame = {
218+
if ($(minConfidence) == _cachedMinConf) {
219+
_cachedRules
220+
} else {
221+
_cachedRules = AssociationRules
222+
.getAssociationRulesFromFP(freqItemsets, "items", "freq", $(minConfidence))
223+
_cachedMinConf = $(minConfidence)
224+
_cachedRules
225+
}
215226
}
216227

217228
/**

mllib/src/test/scala/org/apache/spark/ml/fpm/FPGrowthSuite.scala

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,36 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
8585
assert(prediction.select("prediction").where("id=3").first().getSeq[String](0).isEmpty)
8686
}
8787

88+
test("FPGrowth prediction should not contain duplicates") {
89+
// This should generate rule 1 -> 3, 2 -> 3
90+
val dataset = spark.createDataFrame(Seq(
91+
Array("1", "3"),
92+
Array("2", "3")
93+
).map(Tuple1(_))).toDF("features")
94+
val model = new FPGrowth().fit(dataset)
95+
96+
val prediction = model.transform(
97+
spark.createDataFrame(Seq(Tuple1(Array("1", "2")))).toDF("features")
98+
).first().getAs[Seq[String]]("prediction")
99+
100+
assert(prediction === Seq("3"))
101+
}
102+
103+
test("FPGrowthModel setMinConfidence should affect rules generation and transform") {
104+
val model = new FPGrowth().setMinSupport(0.1).setMinConfidence(0.1).fit(dataset)
105+
val oldRulesNum = model.associationRules.count()
106+
assert(oldRulesNum == model.associationRules.count())
107+
val oldPredict = model.transform(dataset)
108+
109+
model.setMinConfidence(0.1)
110+
assert(oldRulesNum === model.associationRules.count())
111+
assert(model.transform(dataset).collect().toSet.equals(oldPredict.collect().toSet))
112+
113+
model.setMinConfidence(0.8765)
114+
assert(oldRulesNum > model.associationRules.count())
115+
assert(!model.transform(dataset).collect().toSet.equals(oldPredict.collect().toSet))
116+
}
117+
88118
test("FPGrowth parameter check") {
89119
val fpGrowth = new FPGrowth().setMinSupport(0.4567)
90120
val model = fpGrowth.fit(dataset)
@@ -95,28 +125,17 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
95125

96126
test("read/write") {
97127
def checkModelData(model: FPGrowthModel, model2: FPGrowthModel): Unit = {
98-
assert(model.freqItemsets.sort("items").collect() ===
99-
model2.freqItemsets.sort("items").collect())
128+
assert(model.freqItemsets.collect().toSet.equals(
129+
model2.freqItemsets.collect().toSet))
130+
assert(model.associationRules.collect().toSet.equals(
131+
model2.associationRules.collect().toSet))
132+
assert(model.setMinConfidence(0.9).associationRules.collect().toSet.equals(
133+
model2.setMinConfidence(0.9).associationRules.collect().toSet))
100134
}
101135
val fPGrowth = new FPGrowth()
102136
testEstimatorAndModelReadWrite(fPGrowth, dataset, FPGrowthSuite.allParamSettings,
103137
FPGrowthSuite.allParamSettings, checkModelData)
104138
}
105-
106-
test("FPGrowth prediction should not contain duplicates") {
107-
// This should generate rule 1 -> 3, 2 -> 3
108-
val dataset = spark.createDataFrame(Seq(
109-
Array("1", "3"),
110-
Array("2", "3")
111-
).map(Tuple1(_))).toDF("features")
112-
val model = new FPGrowth().fit(dataset)
113-
114-
val prediction = model.transform(
115-
spark.createDataFrame(Seq(Tuple1(Array("1", "2")))).toDF("features")
116-
).first().getAs[Seq[String]]("prediction")
117-
118-
assert(prediction === Seq("3"))
119-
}
120139
}
121140

122141
object FPGrowthSuite {

0 commit comments

Comments (0)