@@ -41,166 +41,54 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
4141 maxPatternLength : Int ,
4242 prefixes : List [Int ],
4343 database : Iterable [Array [Int ]]): Iterator [(List [Int ], Long )] = {
44- if (prefixes.count(_ != - 1 ) == maxPatternLength || database.isEmpty) return Iterator .empty
45- val frequentPrefixAndCounts = getFreqPrefixAndCounts(minCount, prefixes, database)
46- frequentPrefixAndCounts.iterator.flatMap { case (prefix, count) =>
47- val newProjected = project(database, prefix)
48- Iterator .single((prefix, count)) ++
49- run(minCount, maxPatternLength, prefix, newProjected)
44+ if (prefixes.length == maxPatternLength || database.isEmpty) return Iterator .empty
45+ val frequentItemAndCounts = getFreqItemAndCounts(minCount, database)
46+ val filteredDatabase = database.map(x => x.filter(frequentItemAndCounts.contains))
47+ frequentItemAndCounts.iterator.flatMap { case (item, count) =>
48+ val newPrefixes = item :: prefixes
49+ val newProjected = project(filteredDatabase, item)
50+ Iterator .single((newPrefixes, count)) ++
51+ run(minCount, maxPatternLength, newPrefixes, newProjected)
5052 }
5153 }
5254
5355 /**
54- * Calculate suffix sequence immediately after the first occurrence of a prefix .
55- * @param prefix prefix to get suffix after
56+ * Calculate suffix sequence immediately after the first occurrence of an item .
57+ * @param item item to get suffix after
5658 * @param sequence sequence to extract suffix from
5759 * @return suffix sequence
5860 */
59- def getSuffix (prefix : List [Int ], sequence : Array [Int ]): (Boolean , Array [Int ]) = {
60- val element = getLastElement(prefix)
61- if (sequence.apply(0 ) != - 3 ) {
62- if (element.length == 1 ) {
63- getSingleItemElementSuffix(element, sequence)
64- } else {
65- getMultiItemsElementSuffix(element, sequence)
66- }
67- } else {
68- if (element.length == 1 ) {
69- val firstElemPos = sequence.indexOf(- 1 )
70- if (firstElemPos == - 1 ) {
71- (false , Array ())
72- } else {
73- getSingleItemElementSuffix(element, sequence.drop(firstElemPos + 1 ))
74- }
75- } else {
76- val newSequence = element.take(element.length - 1 ) ++ sequence.drop(1 )
77- getMultiItemsElementSuffix(element, newSequence)
78- }
79- }
80- }
81-
82- private def getLastElement (prefix : List [Int ]): Array [Int ] = {
83- val pos = prefix.indexOf(- 1 )
84- if (pos == - 1 ) {
85- prefix.reverse.toArray
86- } else {
87- prefix.take(pos).reverse.toArray
88- }
89- }
90-
91- private def getSingleItemElementSuffix (
92- element : Array [Int ],
93- sequence : Array [Int ]): (Boolean , Array [Int ]) = {
94- val index = sequence.indexOf(element.apply(0 ))
61+ def getSuffix (item : Int , sequence : Array [Int ]): Array [Int ] = {
62+ val index = sequence.indexOf(item)
9563 if (index == - 1 ) {
96- (false , Array ())
97- } else if (index == sequence.length - 1 ) {
98- (true , Array ())
99- } else if (sequence.apply(index + 1 ) == - 1 ) {
100- (true , sequence.drop(index + 2 ))
101- } else {
102- (true , - 3 +: sequence.drop(index + 1 ))
103- }
104- }
105-
106- private def getMultiItemsElementSuffix (
107- element : Array [Int ],
108- sequence : Array [Int ]): (Boolean , Array [Int ]) = {
109- var seqPos = 0
110- var found = false
111- while (seqPos < sequence.length && ! found) {
112- var elemPos = 0
113- while (! found && elemPos < element.length &&
114- seqPos < sequence.length && sequence.apply(seqPos) != - 1 ) {
115- if (element.apply(elemPos) == sequence.apply(seqPos)) {
116- elemPos += 1
117- seqPos += 1
118- } else {
119- seqPos += 1
120- }
121- found = elemPos == element.length
122- }
123- if (! found) seqPos += 1
124- }
125- if (found) {
126- if (sequence.apply(seqPos) == - 1 ) {
127- (true , sequence.drop(seqPos + 1 ))
128- } else {
129- (true , - 3 +: sequence.drop(seqPos))
130- }
64+ Array ()
13165 } else {
132- ( false , Array () )
66+ sequence.drop(index + 1 )
13367 }
13468 }
13569
136- def project (database : Iterable [Array [Int ]], prefix : List [ Int ] ): Iterable [Array [Int ]] = {
70+ def project (database : Iterable [Array [Int ]], prefix : Int ): Iterable [Array [Int ]] = {
13771 database
138- .map(getSuffix(prefix, _)._2 )
72+ .map(getSuffix(prefix, _))
13973 .filter(_.nonEmpty)
14074 }
14175
14276 /**
143- * Generates frequent prefix by filtering the input data using minimal count level.
77+ * Generates frequent items by filtering the input data using minimal count level.
14478 * @param minCount the minimum count for an item to be frequent
145- * @param prefix the minimum count for an item to be frequent
14679 * @param database database of sequences
14780 * @return freq item to count map
14881 */
149- private def getFreqPrefixAndCounts (
82+ private def getFreqItemAndCounts (
15083 minCount : Long ,
151- prefix : List [Int ],
152- database : Iterable [Array [Int ]]): mutable.Map [List [Int ], Long ] = {
84+ database : Iterable [Array [Int ]]): mutable.Map [Int , Long ] = {
15385 // TODO: use PrimitiveKeyOpenHashMap
154-
155- // get frequent items
156- val freqItems = database
157- .flatMap(_.distinct.filter(x => x != - 1 && x != - 3 ))
158- .groupBy(x => x)
159- .mapValues(_.size)
160- .filter(_._2 >= minCount)
161- .map(_._1)
162- if (freqItems.isEmpty) return mutable.Map [List [Int ], Long ]()
163-
164- // get prefixes and counts
165- val singleItemCounts = mutable.Map [Int , Long ]().withDefaultValue(0L )
166- val multiItemsCounts = mutable.Map [Int , Long ]().withDefaultValue(0L )
167- val prefixLastElement = getLastElement(prefix)
86+ val counts = mutable.Map [Int , Long ]().withDefaultValue(0L )
16887 database.foreach { sequence =>
169- if (sequence.apply(0 ) != - 3 ) {
170- freqItems.foreach { item =>
171- if (getSingleItemElementSuffix(Array (item), sequence)._1) {
172- singleItemCounts(item) += 1
173- }
174- if (prefixLastElement.nonEmpty &&
175- getMultiItemsElementSuffix(prefixLastElement :+ item, sequence)._1) {
176- multiItemsCounts(item) += 1
177- }
178- }
179- } else {
180- val firstElemPos = sequence.indexOf(- 1 )
181- if (firstElemPos != - 1 ) {
182- val newSequence = sequence.drop(firstElemPos + 1 )
183- freqItems.foreach { item =>
184- if (getSingleItemElementSuffix(Array (item), newSequence)._1) {
185- singleItemCounts(item) += 1
186- }
187- }
188- }
189- val newSequence = prefixLastElement ++ sequence.drop(1 )
190- freqItems.foreach { item =>
191- if (prefixLastElement.nonEmpty &&
192- getMultiItemsElementSuffix(prefixLastElement :+ item, newSequence)._1) {
193- multiItemsCounts(item) += 1
194- }
195- }
88+ sequence.distinct.foreach { item =>
89+ counts(item) += 1L
19690 }
19791 }
198-
199- if (prefix.nonEmpty) {
200- singleItemCounts.filter(_._2 >= minCount).map(x => (x._1 :: (- 1 :: prefix), x._2)) ++
201- multiItemsCounts.filter(_._2 >= minCount).map(x => (x._1 :: prefix, x._2))
202- } else {
203- singleItemCounts.filter(_._2 >= minCount).map(x => (List (x._1), x._2))
204- }
92+ counts.filter(_._2 >= minCount)
20593 }
20694}
0 commit comments