
Commit f1114b9

Author: Feynman Liang (committed)
Add -1 delimiter

1 parent 00fe756 · commit f1114b9

3 files changed: 32 additions (+), 17 deletions (−)


mllib/src/main/scala/org/apache/spark/mllib/fpm/LocalPrefixSpan.scala
(7 additions, 4 deletions)

@@ -25,7 +25,7 @@ import org.apache.spark.Logging
  * Calculate all patterns of a projected database in local.
  */
 private[fpm] object LocalPrefixSpan extends Logging with Serializable {
-
+  import PrefixSpan._
   /**
    * Calculate all patterns of a projected database.
    * @param minCount minimum count
@@ -43,7 +43,9 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
       database: Iterable[Array[Int]]): Iterator[(List[Int], Long)] = {
     if (prefixes.length == maxPatternLength || database.isEmpty) return Iterator.empty
     val frequentItemAndCounts = getFreqItemAndCounts(minCount, database)
-    val filteredDatabase = database.map(x => x.filter(frequentItemAndCounts.contains))
+    val filteredDatabase = database.map { suffix =>
+      suffix.filter(item => item == DELIMITER || frequentItemAndCounts.contains(item))
+    }
     frequentItemAndCounts.iterator.flatMap { case (item, count) =>
       val newPrefixes = item :: prefixes
       val newProjected = project(filteredDatabase, item)
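
The new filter keeps DELIMITER tokens, so itemset boundaries survive projection; the old x.filter(frequentItemAndCounts.contains) would strip the markers along with infrequent items. A minimal standalone sketch with made-up data (not from the commit):

    object FilterSketch extends App {
      val DELIMITER = -1
      val frequent = Set(1, 3)                           // hypothetical frequent items
      val suffix = Array(1, 2, DELIMITER, 3, DELIMITER)  // encodes <(1 2) (3)>
      // Keep delimiters in addition to frequent items:
      val filtered = suffix.filter(item => item == DELIMITER || frequent.contains(item))
      println(filtered.mkString(", "))  // 1, -1, 3, -1 : infrequent 2 gone, boundaries kept
    }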
@@ -63,7 +65,8 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
     if (index == -1) {
       Array()
     } else {
-      sequence.drop(index + 1)
+      // drop until we get to the next delimiter (or end of sequence)
+      sequence.drop(index).dropWhile(_ != DELIMITER).drop(1)
     }
   }

@@ -89,6 +92,6 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
         counts(item) += 1L
       }
     }
-    counts.filter(_._2 >= minCount)
+    counts.filter { case (item, count) => (count >= minCount) && (item != DELIMITER) }
   }
 }
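
With delimiters present, getSuffix must skip the remainder of the itemset containing the matched item before projecting. A self-contained sketch mirroring the changed lines (the object name and main harness are illustrative, not part of the commit):

    object SuffixSketch {
      val DELIMITER = -1  // same sentinel the commit introduces in object PrefixSpan

      // Suffix of `sequence` after the first `item`, starting at the next itemset.
      def getSuffix(item: Int, sequence: Array[Int]): Array[Int] = {
        val index = sequence.indexOf(item)
        if (index == -1) {
          Array()
        } else {
          // drop until we get to the next delimiter (or end of sequence)
          sequence.drop(index).dropWhile(_ != DELIMITER).drop(1)
        }
      }

      def main(args: Array[String]): Unit = {
        val seq = Array(1, 2, DELIMITER, 3, DELIMITER)  // <(1 2) (3)>
        println(getSuffix(1, seq).mkString(", "))       // 3, -1  (rest of (1 2) skipped)
        println(getSuffix(7, seq).mkString(", "))       // empty: 7 never occurs
      }
    }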

mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
(12 additions, 6 deletions)

@@ -44,6 +44,7 @@ import org.apache.spark.storage.StorageLevel
 class PrefixSpan private (
     private var minSupport: Double,
     private var maxPatternLength: Int) extends Logging with Serializable {
+  import PrefixSpan._

   /**
    * The maximum number of items allowed in a projected database before local processing. If a
@@ -90,12 +91,14 @@ class PrefixSpan private (

   /**
    * Find the complete set of sequential patterns in the input sequences.
-   * @param sequences input data set, contains a set of sequences,
-   *                  a sequence is an ordered list of elements.
+   * @param sequences a dataset of sequences. Items in a sequence are represented by non-negative
+   *                  integers and delimited by [[DELIMITER]]. Non-temporal sequences
+   *                  are supported by placing more than one item between delimiters.
    * @return a set of sequential pattern pairs,
    *         the key of pair is pattern (a list of elements),
    *         the value of pair is the pattern's count.
    */
+  // TODO: generalize to arbitrary item-types and use mapping to Ints for internal algorithm
   def run(sequences: RDD[Array[Int]]): RDD[(Array[Int], Long)] = {
     val sc = sequences.sparkContext

@@ -110,14 +113,14 @@ class PrefixSpan private (
     val freqItemCounts = sequences
       .flatMap(seq => seq.distinct.map(item => (item, 1L)))
       .reduceByKey(_ + _)
-      .filter(_._2 >= minCount)
+      .filter { case (item, count) => (count >= minCount) && (item != DELIMITER) }
       .collect()

     // Pairs of (length 1 prefix, suffix consisting of frequent items)
     val itemSuffixPairs = {
       val freqItems = freqItemCounts.map(_._1).toSet
       sequences.flatMap { seq =>
-        val filteredSeq = seq.filter(freqItems.contains(_))
+        val filteredSeq = seq.filter(item => item == DELIMITER || freqItems.contains(item))
         freqItems.flatMap { item =>
           val candidateSuffix = LocalPrefixSpan.getSuffix(item, filteredSeq)
           candidateSuffix match {
@@ -127,7 +130,6 @@ class PrefixSpan private (
         }
       }
     }
-
     // Accumulator for the computed results to be returned, initialized to the frequent items (i.e.
     // frequent length-one prefixes)
     var resultsAccumulator = freqItemCounts.map(x => (List(x._1), x._2))
@@ -197,7 +199,7 @@ class PrefixSpan private (
     val prefixItemPairAndCounts = prefixSuffixPairs
       .flatMap { case (prefix, suffix) => suffix.distinct.map(y => ((prefix, y), 1L)) }
       .reduceByKey(_ + _)
-      .filter(_._2 >= minCount)
+      .filter { case (item, count) => (count >= minCount) && (item != DELIMITER) }

     // Map from prefix to set of possible next items from suffix
     val prefixToNextItems = prefixItemPairAndCounts
@@ -247,3 +249,7 @@ class PrefixSpan private (
   }
 }
+
+object PrefixSpan {
+  val DELIMITER = -1
+}
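
Per the updated Scaladoc, callers encode each sequence of itemsets as a flat Array[Int], terminating every itemset with DELIMITER. A REPL-style sketch of the encoding (example data only):

    import org.apache.spark.mllib.fpm.PrefixSpan

    // The sequence of itemsets <(12) (10 20) (30)> flattens to:
    val encoded: Array[Int] =
      Array(12, PrefixSpan.DELIMITER, 10, 20, PrefixSpan.DELIMITER, 30, PrefixSpan.DELIMITER)
    // == Array(12, -1, 10, 20, -1, 30, -1); the two items between the first and
    // second delimiter form the "non-temporal" group the Scaladoc mentions.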

mllib/src/test/scala/org/apache/spark/mllib/fpm/PrefixSpanSuite.scala
(13 additions, 7 deletions)

@@ -40,7 +40,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
       Array(2, 4, 1),
       Array(3, 1, 3, 4, 5),
       Array(3, 4, 4, 3),
-      Array(6, 5, 3))
+      Array(6, 5, 3)).map(insertDelimiter)

     val rdd = sc.parallelize(sequences, 2).cache()

@@ -69,7 +69,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
       (Array(4, 5), 2L),
       (Array(5), 3L)
     )
-    assert(compareResults(expectedValue1, result1.collect()))
+    compareResults(expectedValue1, result1.collect())

     prefixspan.setMinSupport(0.5).setMaxPatternLength(50)
     val result2 = prefixspan.run(rdd)
@@ -80,7 +80,7 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
       (Array(4), 4L),
       (Array(5), 3L)
     )
-    assert(compareResults(expectedValue2, result2.collect()))
+    compareResults(expectedValue2, result2.collect())

     prefixspan.setMinSupport(0.33).setMaxPatternLength(2)
     val result3 = prefixspan.run(rdd)
@@ -100,14 +100,20 @@ class PrefixSpanSuite extends SparkFunSuite with MLlibTestSparkContext {
       (Array(4, 5), 2L),
       (Array(5), 3L)
     )
-    assert(compareResults(expectedValue3, result3.collect()))
+    compareResults(expectedValue3, result3.collect())
   }

   private def compareResults(
       expectedValue: Array[(Array[Int], Long)],
-      actualValue: Array[(Array[Int], Long)]): Boolean = {
-    expectedValue.map(x => (x._1.toSeq, x._2)).toSet ==
-      actualValue.map(x => (x._1.toSeq, x._2)).toSet
+      actualValue: Array[(Array[Int], Long)]): Unit = {
+    assert(expectedValue.map(x => (x._1.toSeq, x._2)).toSet ===
+      actualValue.map(x => (x._1.toSeq, x._2)).toSet)
+  }
+
+  private def insertDelimiter(sequence: Array[Int]): Array[Int] = {
+    sequence.zip(Seq.fill(sequence.length)(PrefixSpan.DELIMITER)).map { case (a, b) =>
+      List(a, b)
+    }.flatten
   }

 }
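
The new insertDelimiter helper turns every item of a flat test sequence into its own single-element itemset, e.g. Array(1, 3, 5) becomes Array(1, -1, 3, -1, 5, -1). The suite's zip/flatten is equivalent to this more direct flatMap (a sketch, not the committed code):

    private def insertDelimiter(sequence: Array[Int]): Array[Int] =
      sequence.flatMap(item => Array(item, PrefixSpan.DELIMITER))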
