Commit aa3de46

Author: Davies Liu (committed)
Merge branch 'master' of github.com:apache/spark into semijoin
2 parents: 7590a25 + d5a9af3

43 files changed (+961, -626 lines)

This is a large commit, so only a subset of the 43 changed files is shown below.

core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java

Lines changed: 20 additions & 16 deletions
@@ -191,24 +191,29 @@ public void spill() throws IOException {
       spillWriters.size(),
       spillWriters.size() > 1 ? " times" : " time");
 
-    final UnsafeSorterSpillWriter spillWriter =
-      new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics,
-        inMemSorter.numRecords());
-    spillWriters.add(spillWriter);
-    final UnsafeSorterIterator sortedRecords = inMemSorter.getSortedIterator();
-    while (sortedRecords.hasNext()) {
-      sortedRecords.loadNext();
-      final Object baseObject = sortedRecords.getBaseObject();
-      final long baseOffset = sortedRecords.getBaseOffset();
-      final int recordLength = sortedRecords.getRecordLength();
-      spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix());
+    // We only write out contents of the inMemSorter if it is not empty.
+    if (inMemSorter.numRecords() > 0) {
+      final UnsafeSorterSpillWriter spillWriter =
+        new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics,
+          inMemSorter.numRecords());
+      spillWriters.add(spillWriter);
+      final UnsafeSorterIterator sortedRecords = inMemSorter.getSortedIterator();
+      while (sortedRecords.hasNext()) {
+        sortedRecords.loadNext();
+        final Object baseObject = sortedRecords.getBaseObject();
+        final long baseOffset = sortedRecords.getBaseOffset();
+        final int recordLength = sortedRecords.getRecordLength();
+        spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix());
+      }
+      spillWriter.close();
     }
-    spillWriter.close();
+
     final long spillSize = freeMemory();
     // Note that this is more-or-less going to be a multiple of the page size, so wasted space in
     // pages will currently be counted as memory spilled even though that space isn't actually
     // written to disk. This also counts the space needed to store the sorter's pointer array.
     taskContext.taskMetrics().incMemoryBytesSpilled(spillSize);
+
     initializeForWriting();
   }
 
@@ -505,12 +510,11 @@ public UnsafeSorterIterator getSortedIterator() throws IOException {
     final UnsafeSorterSpillMerger spillMerger =
       new UnsafeSorterSpillMerger(recordComparator, prefixComparator, numIteratorsToMerge);
     for (UnsafeSorterSpillWriter spillWriter : spillWriters) {
-      spillMerger.addSpill(spillWriter.getReader(blockManager));
+      spillMerger.addSpillIfNotEmpty(spillWriter.getReader(blockManager));
     }
     spillWriters.clear();
-    if (inMemoryIterator.hasNext()) {
-      spillMerger.addSpill(inMemoryIterator);
-    }
+    spillMerger.addSpillIfNotEmpty(inMemoryIterator);
+
     return spillMerger.getSortedIterator();
   }
 }
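
The new guard means spill() creates and registers a spill writer only when the in-memory sorter actually holds records, so a zero-record spill file can never reach the merge step. A minimal standalone sketch of the same idea, using made-up Scala types rather than Spark's classes:

import scala.collection.mutable.ArrayBuffer

final class InMemorySorter {
  private val records = ArrayBuffer.empty[Long]
  def insert(r: Long): Unit = records += r
  def numRecords: Int = records.length
  def sortedRecords: Iterator[Long] = records.sorted.iterator
  def reset(): Unit = records.clear()
}

final class Spiller(sorter: InMemorySorter) {
  // Stand-in for the on-disk spill files produced by UnsafeSorterSpillWriter.
  val spills = ArrayBuffer.empty[Vector[Long]]

  def spill(): Unit = {
    // Only write a spill when there is something to write (the guard added by this commit).
    if (sorter.numRecords > 0) {
      spills += sorter.sortedRecords.toVector
    }
    // Memory is released either way, mirroring freeMemory() and initializeForWriting().
    sorter.reset()
  }
}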

core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java

Lines changed: 10 additions & 2 deletions
@@ -47,11 +47,19 @@ public int compare(UnsafeSorterIterator left, UnsafeSorterIterator right) {
     priorityQueue = new PriorityQueue<UnsafeSorterIterator>(numSpills, comparator);
   }
 
-  public void addSpill(UnsafeSorterIterator spillReader) throws IOException {
+  /**
+   * Add an UnsafeSorterIterator to this merger
+   */
+  public void addSpillIfNotEmpty(UnsafeSorterIterator spillReader) throws IOException {
     if (spillReader.hasNext()) {
+      // We only add the spillReader to the priorityQueue if it is not empty. We do this to
+      // make sure the hasNext method of UnsafeSorterIterator returned by getSortedIterator
+      // does not return wrong result because hasNext will returns true
+      // at least priorityQueue.size() times. If we allow n spillReaders in the
+      // priorityQueue, we will have n extra empty records in the result of the UnsafeSorterIterator.
       spillReader.loadNext();
+      priorityQueue.add(spillReader);
     }
-    priorityQueue.add(spillReader);
   }
 
   public UnsafeSorterIterator getSortedIterator() throws IOException {
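
The comment added above is the heart of this fix: every iterator placed in the priority queue has already had loadNext() called, so each queued entry is expected to contribute at least one record to the merged output. A self-contained Scala sketch of that invariant, using hypothetical Run and Merger classes rather than Spark's Java types:

import java.util.{Comparator, PriorityQueue}

// A sorted run of records; current is only valid after loadNext() has been called.
final class Run(values: Array[Long]) {
  private var pos = -1
  def hasNext: Boolean = pos + 1 < values.length
  def loadNext(): Unit = { pos += 1 }
  def current: Long = values(pos)
}

final class Merger {
  private val queue = new PriorityQueue[Run](11, new Comparator[Run] {
    override def compare(a: Run, b: Run): Int = java.lang.Long.compare(a.current, b.current)
  })

  // Mirrors addSpillIfNotEmpty: enqueue a run only if it holds a record, and call
  // loadNext() first so that current is valid when the comparator runs.
  def addRunIfNotEmpty(run: Run): Unit = {
    if (run.hasNext) {
      run.loadNext()
      queue.add(run)
    }
  }

  // Every poll yields one real record; an empty run in the queue would instead surface
  // a spurious entry, which is exactly what the guard above prevents.
  def merged(): Iterator[Long] = new Iterator[Long] {
    override def hasNext: Boolean = !queue.isEmpty
    override def next(): Long = {
      val run = queue.poll()
      val value = run.current
      if (run.hasNext) { run.loadNext(); queue.add(run) }
      value
    }
  }
}

val m = new Merger
m.addRunIfNotEmpty(new Run(Array(1L, 4L)))
m.addRunIfNotEmpty(new Run(Array.empty[Long]))  // skipped: contributes nothing to the merge
m.addRunIfNotEmpty(new Run(Array(2L, 3L)))
println(m.merged().toList)  // List(1, 2, 3, 4)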

core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java

Lines changed: 13 additions & 8 deletions
@@ -101,7 +101,7 @@ public OutputStream apply(OutputStream stream) {
   public void setUp() {
     MockitoAnnotations.initMocks(this);
     sparkConf = new SparkConf();
-    tempDir = new File(Utils.createTempDir$default$1());
+    tempDir = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "unsafe-test");
     shuffleMemoryManager = new ShuffleMemoryManager(Long.MAX_VALUE);
     spillFilesCreated.clear();
     taskContext = mock(TaskContext.class);
@@ -143,13 +143,18 @@ public DiskBlockObjectWriter answer(InvocationOnMock invocationOnMock) throws Th
 
   @After
   public void tearDown() {
-    long leakedUnsafeMemory = taskMemoryManager.cleanUpAllAllocatedMemory();
-    if (shuffleMemoryManager != null) {
-      long leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask();
-      shuffleMemoryManager = null;
-      assertEquals(0L, leakedShuffleMemory);
+    try {
+      long leakedUnsafeMemory = taskMemoryManager.cleanUpAllAllocatedMemory();
+      if (shuffleMemoryManager != null) {
+        long leakedShuffleMemory = shuffleMemoryManager.getMemoryConsumptionForThisTask();
+        shuffleMemoryManager = null;
+        assertEquals(0L, leakedShuffleMemory);
+      }
+      assertEquals(0, leakedUnsafeMemory);
+    } finally {
+      Utils.deleteRecursively(tempDir);
+      tempDir = null;
     }
-    assertEquals(0, leakedUnsafeMemory);
   }
 
   private void assertSpillFilesWereCleanedUp() {
@@ -234,7 +239,7 @@ public void testSortingEmptyArrays() throws Exception {
   public void spillingOccursInResponseToMemoryPressure() throws Exception {
     shuffleMemoryManager = new ShuffleMemoryManager(pageSizeBytes * 2);
     final UnsafeExternalSorter sorter = newSorter();
-    final int numRecords = 100000;
+    final int numRecords = (int) pageSizeBytes / 4;
     for (int i = 0; i <= numRecords; i++) {
       insertNumber(sorter, numRecords - i);
     }

core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala

Lines changed: 4 additions & 4 deletions
@@ -93,8 +93,8 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually with Priva
       publicAddress = ""
     )
 
-    val (rpcEnv, uiPort, restPort) =
-      Master.startRpcEnvAndEndpoint("127.0.0.1", 7077, 8080, conf)
+    val (rpcEnv, _, _) =
+      Master.startRpcEnvAndEndpoint("127.0.0.1", 0, 0, conf)
 
     try {
       rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, rpcEnv.address, Master.ENDPOINT_NAME)
@@ -343,8 +343,8 @@ class MasterSuite extends SparkFunSuite with Matchers with Eventually with Priva
 
   private def makeMaster(conf: SparkConf = new SparkConf): Master = {
     val securityMgr = new SecurityManager(conf)
-    val rpcEnv = RpcEnv.create(Master.SYSTEM_NAME, "localhost", 7077, conf, securityMgr)
-    val master = new Master(rpcEnv, rpcEnv.address, 8080, securityMgr, conf)
+    val rpcEnv = RpcEnv.create(Master.SYSTEM_NAME, "localhost", 0, conf, securityMgr)
+    val master = new Master(rpcEnv, rpcEnv.address, 0, securityMgr, conf)
     master
   }
 

mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.param.{IntArrayParam, ParamMap, StringArrayParam}
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils, SchemaUtils}
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.StructType
+
+/**
+ * :: Experimental ::
+ * This class takes a feature vector and outputs a new feature vector with a subarray of the
+ * original features.
+ *
+ * The subset of features can be specified with either indices ([[setIndices()]])
+ * or names ([[setNames()]]). At least one feature must be selected. Duplicate features
+ * are not allowed, so there can be no overlap between selected indices and names.
+ *
+ * The output vector will order features with the selected indices first (in the order given),
+ * followed by the selected names (in the order given).
+ */
+@Experimental
+final class VectorSlicer(override val uid: String)
+  extends Transformer with HasInputCol with HasOutputCol {
+
+  def this() = this(Identifiable.randomUID("vectorSlicer"))
+
+  /**
+   * An array of indices to select features from a vector column.
+   * There can be no overlap with [[names]].
+   * @group param
+   */
+  val indices = new IntArrayParam(this, "indices",
+    "An array of indices to select features from a vector column." +
+      " There can be no overlap with names.", VectorSlicer.validIndices)
+
+  setDefault(indices -> Array.empty[Int])
+
+  /** @group getParam */
+  def getIndices: Array[Int] = $(indices)
+
+  /** @group setParam */
+  def setIndices(value: Array[Int]): this.type = set(indices, value)
+
+  /**
+   * An array of feature names to select features from a vector column.
+   * These names must be specified by ML [[org.apache.spark.ml.attribute.Attribute]]s.
+   * There can be no overlap with [[indices]].
+   * @group param
+   */
+  val names = new StringArrayParam(this, "names",
+    "An array of feature names to select features from a vector column." +
+      " There can be no overlap with indices.", VectorSlicer.validNames)
+
+  setDefault(names -> Array.empty[String])
+
+  /** @group getParam */
+  def getNames: Array[String] = $(names)
+
+  /** @group setParam */
+  def setNames(value: Array[String]): this.type = set(names, value)
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  override def validateParams(): Unit = {
+    require($(indices).length > 0 || $(names).length > 0,
+      s"VectorSlicer requires that at least one feature be selected.")
+  }
+
+  override def transform(dataset: DataFrame): DataFrame = {
+    // Validity checks
+    transformSchema(dataset.schema)
+    val inputAttr = AttributeGroup.fromStructField(dataset.schema($(inputCol)))
+    inputAttr.numAttributes.foreach { numFeatures =>
+      val maxIndex = $(indices).max
+      require(maxIndex < numFeatures,
+        s"Selected feature index $maxIndex invalid for only $numFeatures input features.")
+    }
+
+    // Prepare output attributes
+    val inds = getSelectedFeatureIndices(dataset.schema)
+    val selectedAttrs: Option[Array[Attribute]] = inputAttr.attributes.map { attrs =>
+      inds.map(index => attrs(index))
+    }
+    val outputAttr = selectedAttrs match {
+      case Some(attrs) => new AttributeGroup($(outputCol), attrs)
+      case None => new AttributeGroup($(outputCol), inds.length)
+    }
+
+    // Select features
+    val slicer = udf { vec: Vector =>
+      vec match {
+        case features: DenseVector => Vectors.dense(inds.map(features.apply))
+        case features: SparseVector => features.slice(inds)
+      }
+    }
+    dataset.withColumn($(outputCol),
+      slicer(dataset($(inputCol))).as($(outputCol), outputAttr.toMetadata()))
+  }
+
+  /** Get the feature indices in order: indices, names */
+  private def getSelectedFeatureIndices(schema: StructType): Array[Int] = {
+    val nameFeatures = MetadataUtils.getFeatureIndicesFromNames(schema($(inputCol)), $(names))
+    val indFeatures = $(indices)
+    val numDistinctFeatures = (nameFeatures ++ indFeatures).distinct.length
+    lazy val errMsg = "VectorSlicer requires indices and names to be disjoint" +
+      s" sets of features, but they overlap." +
+      s" indices: ${indFeatures.mkString("[", ",", "]")}." +
+      s" names: " +
+      nameFeatures.zip($(names)).map { case (i, n) => s"$i:$n" }.mkString("[", ",", "]")
+    require(nameFeatures.length + indFeatures.length == numDistinctFeatures, errMsg)
+    indFeatures ++ nameFeatures
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT)
+
+    if (schema.fieldNames.contains($(outputCol))) {
+      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
+    }
+    val numFeaturesSelected = $(indices).length + $(names).length
+    val outputAttr = new AttributeGroup($(outputCol), numFeaturesSelected)
+    val outputFields = schema.fields :+ outputAttr.toStructField()
+    StructType(outputFields)
+  }
+
+  override def copy(extra: ParamMap): VectorSlicer = defaultCopy(extra)
+}
+
+private[feature] object VectorSlicer {
+
+  /** Return true if given feature indices are valid */
+  def validIndices(indices: Array[Int]): Boolean = {
+    if (indices.isEmpty) {
+      true
+    } else {
+      indices.length == indices.distinct.length && indices.forall(_ >= 0)
+    }
+  }
+
+  /** Return true if given feature names are valid */
+  def validNames(names: Array[String]): Boolean = {
+    names.forall(_.nonEmpty) && names.length == names.distinct.length
+  }
+}
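
A usage sketch for the new transformer, in the Spark 1.5-era shell style. The SparkContext sc, SQLContext sqlContext, column names, and sample values below are illustrative assumptions, not part of the commit:

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType

// Name the input features via ML attributes so that name-based slicing works.
val attrs: Array[Attribute] =
  Array("f1", "f2", "f3").map(name => NumericAttribute.defaultAttr.withName(name): Attribute)
val attrGroup = new AttributeGroup("userFeatures", attrs)

val rows = sc.parallelize(Seq(Row(Vectors.dense(-2.0, 2.3, 0.0))))
val dataset = sqlContext.createDataFrame(rows, StructType(Array(attrGroup.toStructField())))

// Select feature index 1 plus the feature named "f3"; indices come first in the output.
val slicer = new VectorSlicer()
  .setInputCol("userFeatures")
  .setOutputCol("features")
  .setIndices(Array(1))
  .setNames(Array("f3"))

slicer.transform(dataset).show()
// Expected "features" value for this row: [2.3, 0.0]

The transformer rejects overlapping index and name selections and copies the selected attributes into the output column's metadata, as shown in getSelectedFeatureIndices and transform above.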

mllib/src/main/scala/org/apache/spark/ml/util/MetadataUtils.scala

Lines changed: 17 additions & 0 deletions
@@ -20,6 +20,7 @@ package org.apache.spark.ml.util
 import scala.collection.immutable.HashMap
 
 import org.apache.spark.ml.attribute._
+import org.apache.spark.mllib.linalg.VectorUDT
 import org.apache.spark.sql.types.StructField
 
 
@@ -74,4 +75,20 @@ private[spark] object MetadataUtils {
     }
   }
 
+  /**
+   * Takes a Vector column and a list of feature names, and returns the corresponding list of
+   * feature indices in the column, in order.
+   * @param col Vector column which must have feature names specified via attributes
+   * @param names List of feature names
+   */
+  def getFeatureIndicesFromNames(col: StructField, names: Array[String]): Array[Int] = {
+    require(col.dataType.isInstanceOf[VectorUDT], s"getFeatureIndicesFromNames expected column $col"
+      + s" to be Vector type, but it was type ${col.dataType} instead.")
+    val inputAttr = AttributeGroup.fromStructField(col)
+    names.map { name =>
+      require(inputAttr.hasAttr(name),
+        s"getFeatureIndicesFromNames found no feature with name $name in column $col.")
+      inputAttr.getAttr(name).index.get
+    }
+  }
 }
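
Since MetadataUtils is private[spark], this helper is only reachable from Spark's own packages (VectorSlicer above is its caller). A sketch of what it resolves, with made-up feature and column names, runnable only from within an org.apache.spark package such as a test suite:

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.util.MetadataUtils

val attrs: Array[Attribute] =
  Array("f1", "f2", "f3").map(name => NumericAttribute.defaultAttr.withName(name): Attribute)
val field = new AttributeGroup("userFeatures", attrs).toStructField()

val idx = MetadataUtils.getFeatureIndicesFromNames(field, Array("f3", "f1"))
// idx should be Array(2, 0): names resolve to attribute positions, in the order the names were given.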

mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ class PrefixSpan private (
   /**
    * Gets the maximal pattern length (i.e. the length of the longest sequential pattern to consider.
    */
-  def getMaxPatternLength: Double = maxPatternLength
+  def getMaxPatternLength: Int = maxPatternLength
 
   /**
   * Sets maximal pattern length (default: `10`).
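
A small sketch of the corrected signature in use, assuming the usual PrefixSpan builder setters (setMinSupport, setMaxPatternLength); the values are illustrative:

import org.apache.spark.mllib.fpm.PrefixSpan

val prefixSpan = new PrefixSpan()
  .setMinSupport(0.1)
  .setMaxPatternLength(5)
val maxLen: Int = prefixSpan.getMaxPatternLength  // now typed as Int rather than Double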

mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala

Lines changed: 24 additions & 0 deletions
@@ -766,6 +766,30 @@ class SparseVector(
       maxIdx
     }
   }
+
+  /**
+   * Create a slice of this vector based on the given indices.
+   * @param selectedIndices Unsorted list of indices into the vector.
+   *                        This does NOT do bound checking.
+   * @return New SparseVector with values in the order specified by the given indices.
+   *
+   * NOTE: The API needs to be discussed before making this public.
+   *       Also, if we have a version assuming indices are sorted, we should optimize it.
+   */
+  private[spark] def slice(selectedIndices: Array[Int]): SparseVector = {
+    var currentIdx = 0
+    val (sliceInds, sliceVals) = selectedIndices.flatMap { origIdx =>
+      val iIdx = java.util.Arrays.binarySearch(this.indices, origIdx)
+      val i_v = if (iIdx >= 0) {
+        Iterator((currentIdx, this.values(iIdx)))
+      } else {
+        Iterator()
+      }
+      currentIdx += 1
+      i_v
+    }.unzip
+    new SparseVector(selectedIndices.length, sliceInds.toArray, sliceVals.toArray)
+  }
 }
 
 object SparseVector {
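
The new slice keeps only the selected indices that are non-zero in the input and re-indexes them by their position in selectedIndices. Because the method is private[spark], the following is a standalone re-implementation of the same per-index binary search, as a sketch of the expected output; the function name and values are made up:

import java.util.Arrays

def sliceSparse(
    indices: Array[Int],
    values: Array[Double],
    selected: Array[Int]): (Array[Int], Array[Double]) = {
  var currentIdx = 0
  val (outInds, outVals) = selected.flatMap { origIdx =>
    val i = Arrays.binarySearch(indices, origIdx)
    // Keep the pair (output position, value) only when the selected index is a stored nonzero.
    val hit = if (i >= 0) Iterator((currentIdx, values(i))) else Iterator.empty
    currentIdx += 1
    hit
  }.unzip
  (outInds, outVals)
}

// A 5-element sparse vector with nonzeros {1 -> 10.0, 3 -> 30.0}, sliced by indices (3, 0, 1):
val (inds, vals) = sliceSparse(Array(1, 3), Array(10.0, 30.0), Array(3, 0, 1))
// inds == Array(0, 2) and vals == Array(30.0, 10.0), i.e. the 3-element result
// Vectors.sparse(3, Array(0, 2), Array(30.0, 10.0)): output positions follow the
// selection order, and selected indices that are zero in the input are dropped.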
