Skip to content

Commit 3a16cc2

Browse files
committed
refine comments and names
1 parent ac77859 commit 3a16cc2

File tree

2 files changed

+30
-18
lines changed

2 files changed

+30
-18
lines changed

mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,21 @@ import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
3232
* `Bucketizer` maps a column of continuous features to a column of feature buckets.
3333
*/
3434
@AlphaComponent
35-
private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer])
35+
final class Bucketizer private[ml] (override val parent: Estimator[Bucketizer])
3636
extends Model[Bucketizer] with HasInputCol with HasOutputCol {
3737

3838
def this() = this(null)
3939

4040
/**
4141
* Parameter for mapping continuous features into buckets. With n splits, there are n+1 buckets.
42-
* A bucket defined by splits x,y holds values in the range [x,y).
42+
* A bucket defined by splits x,y holds values in the range [x,y). Note that the splits should be
43+
* strictly increasing.
4344
* @group param
4445
*/
4546
val splits: Param[Array[Double]] = new Param[Array[Double]](this, "splits",
46-
"Split points for mapping continuous features into buckets. With n splits, there are n+1" +
47-
" buckets. A bucket defined by splits x,y holds values in the range [x,y).",
47+
"Split points for mapping continuous features into buckets. With n splits, there are n+1 " +
48+
"buckets. A bucket defined by splits x,y holds values in the range [x,y). The splits " +
49+
"should be strictly increasing.",
4850
Bucketizer.checkSplits)
4951

5052
/** @group getParam */
@@ -53,9 +55,15 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer])
5355
/** @group setParam */
5456
def setSplits(value: Array[Double]): this.type = set(splits, value)
5557

56-
/** @group Param */
58+
/**
59+
* An indicator of the inclusiveness of negative infinite. If true, then use implicit bin
60+
* (-inf, getSplits.head). If false, then throw exception if values < getSplits.head are
61+
* encountered.
62+
* @group Param */
5763
val lowerInclusive: BooleanParam = new BooleanParam(this, "lowerInclusive",
58-
"An indicator of the inclusiveness of negative infinite.")
64+
"An indicator of the inclusiveness of negative infinite. If true, then use implicit bin " +
65+
"(-inf, getSplits.head). If false, then throw exception if values < getSplits.head are " +
66+
"encountered.")
5967
setDefault(lowerInclusive -> true)
6068

6169
/** @group getParam */
@@ -64,9 +72,15 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer])
6472
/** @group setParam */
6573
def setLowerInclusive(value: Boolean): this.type = set(lowerInclusive, value)
6674

67-
/** @group Param */
75+
/**
76+
* An indicator of the inclusiveness of positive infinite. If true, then use implicit bin
77+
* [getSplits.last, inf). If false, then throw exception if values > getSplits.last are
78+
* encountered.
79+
* @group Param */
6880
val upperInclusive: BooleanParam = new BooleanParam(this, "upperInclusive",
69-
"An indicator of the inclusiveness of positive infinite.")
81+
"An indicator of the inclusiveness of positive infinite. If true, then use implicit bin " +
82+
"[getSplits.last, inf). If false, then throw exception if values > getSplits.last are " +
83+
"encountered.")
7084
setDefault(upperInclusive -> true)
7185

7286
/** @group getParam */
@@ -93,9 +107,7 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer])
93107
}
94108

95109
private def prepOutputField(schema: StructType): StructField = {
96-
val attr = new NominalAttribute(
97-
name = Some($(outputCol)),
98-
isOrdinal = Some(true),
110+
val attr = new NominalAttribute(name = Some($(outputCol)), isOrdinal = Some(true),
99111
values = Some($(splits).map(_.toString)))
100112

101113
attr.toStructField()
@@ -109,7 +121,7 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer])
109121
}
110122
}
111123

112-
object Bucketizer {
124+
private[feature] object Bucketizer {
113125
/**
114126
* The given splits should match 1) its size is larger than zero; 2) it is ordered in a strictly
115127
* increasing way.
@@ -137,8 +149,8 @@ object Bucketizer {
137149
lowerInclusive: Boolean,
138150
upperInclusive: Boolean): Double = {
139151
if ((feature < splits.head && !lowerInclusive) || (feature > splits.last && !upperInclusive)) {
140-
throw new Exception(s"Feature $feature out of bound, check your features or loose the" +
141-
s" lower/upper bound constraint.")
152+
throw new RuntimeException(s"Feature $feature out of bound, check your features or loosen " +
153+
s"the lower/upper bound constraint.")
142154
}
143155
var left = 0
144156
var right = splits.length - 2
@@ -153,6 +165,6 @@ object Bucketizer {
153165
left = mid + 1
154166
}
155167
}
156-
throw new Exception(s"Failed to find a bucket for feature $feature.")
168+
throw new RuntimeException(s"Unexpected error: failed to find a bucket for feature $feature.")
157169
}
158170
}

mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,15 @@ class BucketizerSuite extends FunSuite with MLlibTestSparkContext {
3131
test("Bucket continuous features with setter") {
3232
val sqlContext = new SQLContext(sc)
3333
val data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4, -0.9)
34-
val buckets = Array(-0.5, 0.0, 0.5)
34+
val splits = Array(-0.5, 0.0, 0.5)
3535
val bucketizedData = Array(2.0, 1.0, 2.0, 1.0, 3.0, 3.0, 1.0, 1.0, 0.0)
3636
val dataFrame: DataFrame = sqlContext.createDataFrame(
3737
data.zip(bucketizedData)).toDF("feature", "expected")
3838

3939
val bucketizer: Bucketizer = new Bucketizer()
4040
.setInputCol("feature")
4141
.setOutputCol("result")
42-
.setSplits(buckets)
42+
.setSplits(splits)
4343

4444
bucketizer.transform(dataFrame).select("result", "expected").collect().foreach {
4545
case Row(x: Double, y: Double) =>
@@ -58,7 +58,7 @@ class BucketizerSuite extends FunSuite with MLlibTestSparkContext {
5858
}
5959
}
6060

61-
object BucketizerSuite {
61+
private object BucketizerSuite {
6262
private def linearSearchForBuckets(splits: Array[Double], feature: Double): Double = {
6363
var i = 0
6464
while (i < splits.size) {

0 commit comments

Comments (0)