From a5ebe405a4457570b494935550b5fbf0804f6e95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Antonio?=
Date: Fri, 24 Jun 2016 14:58:08 -0400
Subject: [PATCH] [MLlib] org.apache.spark.mllib.util.SVMDataGenerator generates ArrayIndexOutOfBoundsException.

I found the bug and tested the fix. Line 66:

  val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1

crashes because ddot is called with n = trueWeights.length, which is
nfeatures + 1, while x has only nfeatures elements; the two arrays should
have the same length. The fix is to make trueWeights the same length as x.

I recompiled the project with this change and verified the generator:

  [spark-1.6.1]$ spark-submit --master local[*] --class org.apache.spark.mllib.util.SVMDataGenerator mllib/target/spark-mllib_2.11-1.6.1.jar local /home/user/test

It now writes the generated data to the specified folder.
---
 .../scala/org/apache/spark/mllib/util/SVMDataGenerator.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
index cde5979396178..c9468606544db 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala
@@ -55,7 +55,7 @@ object SVMDataGenerator {
     val sc = new SparkContext(sparkMaster, "SVMGenerator")
 
     val globalRnd = new Random(94720)
-    val trueWeights = Array.fill[Double](nfeatures + 1)(globalRnd.nextGaussian())
+    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())
 
     val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
       val rnd = new Random(42 + idx)
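
A minimal standalone sketch (not part of the patch) reproducing the failure
mode described above. It assumes the pure-Java F2jBLAS backend that
netlib-java falls back to when no native BLAS is available; the object name
and array sizes are illustrative:

  import com.github.fommil.netlib.F2jBLAS

  object DdotMismatchDemo {
    def main(args: Array[String]): Unit = {
      val blas = new F2jBLAS
      val nfeatures = 3
      // Mirrors the pre-patch code: x is one element shorter than trueWeights.
      val x = Array.fill[Double](nfeatures)(1.0)
      val trueWeights = Array.fill[Double](nfeatures + 1)(1.0)
      // n = trueWeights.length = 4, but x has only 3 elements, so ddot
      // reads past the end of x and throws ArrayIndexOutOfBoundsException.
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1)
      println(yD)
    }
  }

With the patch applied, both arrays have length nfeatures, so the same call
computes the dot product without reading out of bounds.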