@@ -44,6 +44,7 @@ import org.apache.spark.storage.StorageLevel
 class PrefixSpan private (
     private var minSupport: Double,
     private var maxPatternLength: Int) extends Logging with Serializable {
+  import PrefixSpan._

   /**
    * The maximum number of items allowed in a projected database before local processing. If a
@@ -90,12 +91,14 @@ class PrefixSpan private (

   /**
    * Find the complete set of sequential patterns in the input sequences.
-   * @param sequences input data set, contains a set of sequences,
-   *                  a sequence is an ordered list of elements.
+   * @param sequences a dataset of sequences. Items in a sequence are represented by non-negative
+   *                  integers and delimited by [[DELIMITER]]. Non-temporal sequences
+   *                  are supported by placing more than one item between delimiters.
    * @return a set of sequential pattern pairs,
    *         the key of pair is pattern (a list of elements),
    *         the value of pair is the pattern's count.
    */
+  // TODO: generalize to arbitrary item-types and use mapping to Ints for internal algorithm
   def run(sequences: RDD[Array[Int]]): RDD[(Array[Int], Long)] = {
     val sc = sequences.sparkContext

@@ -110,14 +113,14 @@ class PrefixSpan private (
     val freqItemCounts = sequences
       .flatMap(seq => seq.distinct.map(item => (item, 1L)))
       .reduceByKey(_ + _)
-      .filter(_._2 >= minCount)
+      .filter { case (item, count) => (count >= minCount) && (item != DELIMITER) }
       .collect()

     // Pairs of (length 1 prefix, suffix consisting of frequent items)
     val itemSuffixPairs = {
       val freqItems = freqItemCounts.map(_._1).toSet
       sequences.flatMap { seq =>
-        val filteredSeq = seq.filter(freqItems.contains(_))
+        val filteredSeq = seq.filter(item => item == DELIMITER || freqItems.contains(item))
         freqItems.flatMap { item =>
           val candidateSuffix = LocalPrefixSpan.getSuffix(item, filteredSeq)
           candidateSuffix match {
@@ -127,7 +130,6 @@ class PrefixSpan private (
         }
       }
     }
-
     // Accumulator for the computed results to be returned, initialized to the frequent items (i.e.
     // frequent length-one prefixes)
     var resultsAccumulator = freqItemCounts.map(x => (List(x._1), x._2))
@@ -197,7 +199,7 @@ class PrefixSpan private (
     val prefixItemPairAndCounts = prefixSuffixPairs
       .flatMap { case (prefix, suffix) => suffix.distinct.map(y => ((prefix, y), 1L)) }
       .reduceByKey(_ + _)
-      .filter(_._2 >= minCount)
+      .filter { case ((_, item), count) => (count >= minCount) && (item != DELIMITER) }

     // Map from prefix to set of possible next items from suffix
     val prefixToNextItems = prefixItemPairAndCounts
@@ -247,3 +249,7 @@ class PrefixSpan private (
     }
   }
 }
+
+object PrefixSpan {
+  val DELIMITER = -1
+}
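
For reference, a minimal usage sketch of the input encoding described in the updated scaladoc: items are non-negative Ints and itemsets are separated by PrefixSpan.DELIMITER (-1). The no-argument constructor and the setMinSupport/setMaxPatternLength setters used below are assumed to exist elsewhere in the class (they are not part of this diff), and whether sequences also need leading or trailing delimiters is likewise an assumption.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD

val sc = new SparkContext(
  new SparkConf().setAppName("PrefixSpanSketch").setMaster("local[2]"))

// <(1)(2 3)(3)> encoded with DELIMITER = -1 between itemsets; the two items
// between delimiters (2, 3) form one non-temporal itemset.
val sequences: RDD[Array[Int]] = sc.parallelize(Seq(
  Array(1, -1, 2, 3, -1, 3),
  Array(1, -1, 3, -1, 2, 3),
  Array(2, -1, 3)
))

val prefixSpan = new PrefixSpan()   // assumed auxiliary no-arg constructor
  .setMinSupport(0.5)               // assumed setter: minimum fraction of sequences
  .setMaxPatternLength(5)           // assumed setter: cap on pattern length

// run() has the signature shown in this diff: RDD[Array[Int]] in,
// (pattern, count) pairs out.
val patterns: RDD[(Array[Int], Long)] = prefixSpan.run(sequences)
patterns.collect().foreach { case (pattern, count) =>
  println(pattern.mkString("[", ",", "]") + " -> " + count)
}

Using a reserved negative sentinel keeps the internal algorithm on flat Int arrays of non-negative items; the TODO in the diff notes the plan to generalize to arbitrary item types via a mapping to Ints.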