From 2a21d6d4ab294c515ec23980cd9eda4508576a05 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 8 Feb 2015 22:11:22 +0000 Subject: [PATCH 1/2] Initial draft of FeatureAttributes class --- .../apache/spark/ml/attribute/Attribute.scala | 64 ++++++++++++ .../ml/attribute/CategoricalAttribute.scala | 59 +++++++++++ .../ml/attribute/ContinuousAttribute.scala | 58 +++++++++++ .../ml/attribute/FeatureAttributes.scala | 97 +++++++++++++++++++ .../spark/ml/attribute/FeatureType.scala | 31 ++++++ 5 files changed, 309 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala new file mode 100644 index 0000000000000..8d41187b3e917 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.ml.attribute.FeatureType.FeatureType +import org.apache.spark.sql.types.{MetadataBuilder, Metadata} + +abstract class Attribute(val index: Int, + val name: Option[String], + val dimension: Int) { + + require(index >= 0) + require(dimension >= 1) + + def featureType: FeatureType + + def toMetadata(): Metadata + + private[attribute] def toBaseMetadata(): MetadataBuilder = { + val builder = new MetadataBuilder() + builder.putLong("index", index) + if (name.isDefined) { + builder.putString("name", name.get) + } + if (dimension > 1) { + builder.putLong("dimension", dimension) + } + builder + } + +} + +object Attribute { + + def fromMetadata(metadata: Metadata): Attribute = { + FeatureType.withName(metadata.getString("type")) match { + case FeatureType.CATEGORICAL => CategoricalAttribute.fromMetadata(metadata) + case FeatureType.CONTINUOUS => ContinuousAttribute.fromMetadata(metadata) + } + } + + private[attribute] def parseBaseMetadata(metadata: Metadata): (Int, Option[String], Int) = { + val index = metadata.getLong("index").toInt + val name = if (metadata.contains("name")) Some(metadata.getString("name")) else None + val dimension = if (metadata.contains("dimension")) metadata.getLong("dimension").toInt else 1 + (index, name, dimension) + } + +} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala new file mode 100644 index 0000000000000..2b317d735a906 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.ml.attribute.FeatureType.FeatureType +import org.apache.spark.sql.types.Metadata + +class CategoricalAttribute private ( + override val index: Int, + override val name: Option[String], + override val dimension: Int, + val categories: Option[Array[String]]) extends Attribute(index, name, dimension) { + + require(!categories.isDefined || categories.get.nonEmpty) + + override def featureType: FeatureType = FeatureType.CATEGORICAL + + def numCategories: Option[Int] = + if (categories.isDefined) Some(categories.get.length) else None + + override def toMetadata(): Metadata = { + val builder = toBaseMetadata() + if (categories.isDefined) { + builder.putStringArray("categories", categories.get) + } + builder.build() + } + +} + +private[attribute] object CategoricalAttribute { + + def fromMetadata(metadata: Metadata): CategoricalAttribute = { + val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) + val categories = + if (metadata.contains("categories")) { + Some(metadata.getStringArray("categories")) + } else { + None + } + new CategoricalAttribute(index, name, dimension, categories) + } + +} \ No newline at end of file diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala new file mode 100644 index 0000000000000..f56ae57c315e7 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.ml.attribute.FeatureType.FeatureType +import org.apache.spark.sql.types.Metadata + +class ContinuousAttribute private ( + override val index: Int, + override val name: Option[String], + override val dimension: Int, + val min: Option[Double], + val max: Option[Double]) extends Attribute(index, name, dimension) { + + if (min.isDefined && max.isDefined) { + require(min.get <= max.get) + } + + override def featureType(): FeatureType = FeatureType.CONTINUOUS + + override def toMetadata(): Metadata = { + val builder = toBaseMetadata() + if (min.isDefined) { + builder.putDouble("min", min.get) + } + if (max.isDefined) { + builder.putDouble("max", max.get) + } + builder.build() + } + +} + +private[attribute] object ContinuousAttribute { + + def fromMetadata(metadata: Metadata): ContinuousAttribute = { + val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) + val min = if (metadata.contains("min")) Some(metadata.getDouble("min")) else None + val max = if (metadata.contains("max")) Some(metadata.getDouble("max")) else None + new ContinuousAttribute(index, name, dimension, min, max) + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala new file mode 100644 index 0000000000000..6a947a3d50dc8 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.sql.types.{MetadataBuilder, Metadata} + +/** + * Wrapper around [[Metadata]] with specialized methods for accessing information about + * data as machine learning features, and their associated attributes, like: + * + * - type (continuous, categorical, etc.) as [[FeatureType]] + * - for categorical features, the category values + * + * This information is stored as a [[Metadata]] under key "features", and contains an array of + * [[Metadata]] inside that for each feature for which metadata is defined. Example: + * + * {{{ + * { + * ... + * "features" : [ + * { + * "index": 0, + * "name": "age", + * "type": "CONTINUOUS", + * "min": 0 + * }, + * { + * "index": 5, + * "name": "gender", + * "type": "CATEGORICAL", + * "categories" : [ "male", "female" ] + * }, + * { + * "index": 7, + * "name": "percentAllocations", + * "type": "CONTINUOUS", + * "dimension": 10, + * "min": 0, + * "max": 1 + * } ], + * "producer": "..." + * ...
+ * } + * }}} + */ +class FeatureAttributes private (val attributes: Array[Attribute], + val producer: Option[String]) { + + private val nameToIndex: Map[String,Int] = + attributes.filter(_.name.isDefined).map(att => (att.name.get, att.index)).toMap + private val indexToAttribute: Map[Int,Attribute] = + attributes.map(att => (att.index, att)).toMap + private val categoricalIndices: Array[Int] = + attributes.filter(_.featureType == FeatureType.CATEGORICAL).map(_.index) + + def getFeatureAttribute(index: Int): Option[Attribute] = indexToAttribute.get(index) + + def getFeatureIndex(featureName: String): Option[Int] = nameToIndex.get(featureName) + + def categoricalFeatureIndices(): Array[Int] = categoricalIndices + + def toMetadata(): Metadata = { + val builder = new MetadataBuilder() + builder.putMetadataArray("features", attributes.map(_.toMetadata())) + if (producer.isDefined) { + builder.putString("producer", producer.get) + } + builder.build() + } + +} + +object FeatureAttributes { + + def fromMetadata(metadata: Metadata): FeatureAttributes = { + val attributes = metadata.getMetadataArray("features").map(Attribute.fromMetadata(_)) + val producer = + if (metadata.contains("producer")) Some(metadata.getString("producer")) else None + new FeatureAttributes(attributes, producer) + } + +} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala new file mode 100644 index 0000000000000..d3f23ef7cf12a --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +/** + * Enumeration of machine learning feature types. + */ +object FeatureType extends Enumeration { + + type FeatureType = Value + + // CATEGORICAL = discrete, unordered value + // CONTINUOUS = ordered numeric value; also used for discrete numeric values now + val CATEGORICAL, CONTINUOUS = Value + +} From 7c944da8b2a1aa9ec9d70fabe77b17d54c08b291 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 21 Feb 2015 12:54:31 +0000 Subject: [PATCH 2/2] Add FeatureType hierarchy and categorical cardinality --- .../apache/spark/ml/attribute/Attribute.scala | 7 ++-- .../ml/attribute/CategoricalAttribute.scala | 34 ++++++++++++++----- .../ml/attribute/ContinuousAttribute.scala | 3 +- .../ml/attribute/FeatureAttributes.scala | 18 ++++++++-- .../spark/ml/attribute/FeatureType.scala | 23 ++++++++----- 5 files changed, 59 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala index 8d41187b3e917..afb03b488ac56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.attribute -import org.apache.spark.ml.attribute.FeatureType.FeatureType import org.apache.spark.sql.types.{MetadataBuilder, 
Metadata} abstract class Attribute(val index: Int, @@ -48,9 +47,9 @@ abstract class Attribute(val index: Int, object Attribute { def fromMetadata(metadata: Metadata): Attribute = { - FeatureType.withName(metadata.getString("type")) match { - case FeatureType.CATEGORICAL => CategoricalAttribute.fromMetadata(metadata) - case FeatureType.CONTINUOUS => ContinuousAttribute.fromMetadata(metadata) + FeatureTypes.withName(metadata.getString("type")) match { + case Categorical => CategoricalAttribute.fromMetadata(metadata) + case Continuous => ContinuousAttribute.fromMetadata(metadata) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala index 2b317d735a906..4c54839863211 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala @@ -17,27 +17,28 @@ package org.apache.spark.ml.attribute -import org.apache.spark.ml.attribute.FeatureType.FeatureType import org.apache.spark.sql.types.Metadata class CategoricalAttribute private ( override val index: Int, override val name: Option[String], override val dimension: Int, - val categories: Option[Array[String]]) extends Attribute(index, name, dimension) { + val categories: Option[Array[String]], + val cardinality: Option[Int]) extends Attribute(index, name, dimension) { require(!categories.isDefined || categories.get.nonEmpty) + require(!cardinality.isDefined || cardinality.get > 0) - override def featureType: FeatureType = FeatureType.CATEGORICAL - - def numCategories: Option[Int] = - if (categories.isDefined) Some(categories.get.length) else None + override def featureType: FeatureType = Categorical override def toMetadata(): Metadata = { val builder = toBaseMetadata() if (categories.isDefined) { builder.putStringArray("categories", categories.get) } + if (cardinality.isDefined) { + 
builder.putLong("cardinality", cardinality.get) + } builder.build() } @@ -47,13 +48,28 @@ private[attribute] object CategoricalAttribute { def fromMetadata(metadata: Metadata): CategoricalAttribute = { val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) - val categories = + + var cardinality: Option[Int] = + if (metadata.contains("cardinality")) { + Some(metadata.getLong("cardinality").toInt) + } else { + None + } + + val categories: Option[Array[String]] = if (metadata.contains("categories")) { - Some(metadata.getStringArray("categories")) + val theCategories = Some(metadata.getStringArray("categories")) + if (cardinality.isDefined) { + require(theCategories.get.size <= cardinality.get) + } else { + cardinality = Some(theCategories.get.size) + } + theCategories } else { None } - new CategoricalAttribute(index, name, dimension, categories) + + new CategoricalAttribute(index, name, dimension, categories, cardinality) } } \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala index f56ae57c315e7..137cc62921863 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.attribute -import org.apache.spark.ml.attribute.FeatureType.FeatureType import org.apache.spark.sql.types.Metadata class ContinuousAttribute private ( @@ -31,7 +30,7 @@ class ContinuousAttribute private ( require(min.get <= max.get) } - override def featureType(): FeatureType = FeatureType.CONTINUOUS + override def featureType(): FeatureType = Continuous override def toMetadata(): Metadata = { val builder = toBaseMetadata() diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala 
index 6a947a3d50dc8..fe69a6dc27203 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala @@ -20,11 +20,14 @@ package org.apache.spark.ml.attribute import org.apache.spark.sql.types.{MetadataBuilder, Metadata} /** - * Wrapper around [[Metadata]] with specialized methods for accessing information about - * data as machine learning features, and their associated attributes, like: + * Representation of the information in a [[Metadata]] that describes + * data as machine learning features, with methods to access the features' attributes, such as: * * - type (continuous, categorical, etc.) as [[FeatureType]] + * - optional feature name * - for categorical features, the category values + * - for continuous features, the minimum and maximum values + * - dimension for vector-valued features * * This information is stored as a [[Metadata]] under key "features", and contains an array of * [[Metadata]] inside that for each feature for which metadata is defined.
Example: @@ -46,6 +49,12 @@ import org.apache.spark.sql.types.{MetadataBuilder, Metadata} * "categories" : [ "male", "female" ] * }, * { + * "index": 6, + * "name": "customerType", + * "type": "CATEGORICAL", + * "cardinality": 10 + * }, + * { * "index": 7, * "name": "percentAllocations", * "type": "CONTINUOUS", @@ -66,7 +75,10 @@ class FeatureAttributes private (val attributes: Array[Attribute], private val indexToAttribute: Map[Int,Attribute] = attributes.map(att => (att.index, att)).toMap private val categoricalIndices: Array[Int] = - attributes.filter(_.featureType == FeatureType.CATEGORICAL).map(_.index) + attributes.filter(_.featureType match { + case c: CategoricalFeatureType => true + case _ => false + }).map(_.index) def getFeatureAttribute(index: Int): Option[Attribute] = indexToAttribute.get(index) diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala index d3f23ef7cf12a..1ec9599be4696 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala @@ -17,15 +17,22 @@ package org.apache.spark.ml.attribute -/** - * Enumeration of machine learning feature types. 
- */ -object FeatureType extends Enumeration { +sealed trait FeatureType - type FeatureType = Value +sealed trait ContinuousFeatureType extends FeatureType +sealed trait CategoricalFeatureType extends FeatureType +sealed trait DiscreteFeatureType extends ContinuousFeatureType - // CATEGORICAL = discrete, unordered value - // CONTINUOUS = ordered numeric value; also used for discrete numeric values now - val CATEGORICAL, CONTINUOUS = Value +case object Continuous extends ContinuousFeatureType +case object Categorical extends CategoricalFeatureType +case object Discrete extends DiscreteFeatureType +case object Binary extends DiscreteFeatureType with CategoricalFeatureType +object FeatureTypes { + def withName(name: String): FeatureType = name match { + case "CONTINUOUS" => Continuous + case "CATEGORICAL" => Categorical + case "DISCRETE" => Discrete + case "BINARY" => Binary + } }