diff --git a/core/pom.xml b/core/pom.xml
index 2a81f6df289c..cd058de0bc9a 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -85,7 +85,6 @@
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-math3</artifactId>
-      <version>3.3</version>
       <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/mllib/pom.xml b/mllib/pom.xml
index a5eeef88e9d6..e0d4215463b6 100644
--- a/mllib/pom.xml
+++ b/mllib/pom.xml
@@ -71,6 +71,11 @@
       </exclusions>
     </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-math3</artifactId>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.binary.version}</artifactId>
       <scope>test</scope>
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
new file mode 100644
index 000000000000..0dcb94439c47
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -0,0 +1,409 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.clustering
+
+import java.util.Random
+
+import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
+
+import org.apache.spark.Logging
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+
+case class Document(docId: Int, content: Array[Int]) {
+  private[spark] var topics: Array[Int] = null
+  private[spark] var topicsDist: BV[Double] = null
+}
+
+class TopicModel private[spark](
+    private[spark] val topicCounts_ : BDV[Double],
+    private[spark] val topicTermCounts_ : Array[BSV[Double]],
+    private[spark] val alpha: Double,
+    private[spark] val beta: Double
+  ) extends Serializable {
+
+  def this(topicCounts_ : SDV, topicTermCounts_ : Array[SSV], alpha: Double, beta: Double) {
+    this(new BDV[Double](topicCounts_.toArray), topicTermCounts_.map(t =>
+      new BSV(t.indices, t.values, t.size)), alpha, beta)
+  }
+
+  def topicCounts = Vectors.dense(topicCounts_.toArray)
+
+  def topicTermCounts = topicTermCounts_.map(t => Vectors.sparse(t.size, t.activeIterator.toSeq))
+
+  def update(term: Int, topic: Int, inc: Int) = {
+    topicCounts_(topic) += inc
+    topicTermCounts_(topic)(term) += inc
+    this
+  }
+
+  def merge(other: TopicModel) = {
+    topicCounts_ += other.topicCounts_
+    var i = 0
+    while (i < topicTermCounts_.length) {
+      topicTermCounts_(i) += other.topicTermCounts_(i)
+      i += 1
+    }
+    this
+  }
+
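+  /**
+   * Infers the topic distribution of a document by running Gibbs sampling against this
+   * model's counts, then averaging the per-document topic counts over the iterations
+   * that follow the burn-in period.
+   */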
+  def infer(
+      doc: Document, totalIter: Int = 10,
+      burnInIter: Int = 5, rand: Random = new Random()
+    ): SV = {
+    require(totalIter > burnInIter, "totalIter must be greater than burnInIter")
+    require(totalIter > 0, "totalIter must be greater than 0")
+    require(burnInIter > 0, "burnInIter must be greater than 0")
+    val numTopics = topicCounts_.size
+    val topicThisTerm = BDV.zeros[Double](numTopics)
+    val topicThisDoc = BSV.zeros[Double](numTopics)
+
+    for (i <- 1 to totalIter) {
+      generateTopicDistForDocument(doc, topicThisTerm, rand)
+      if (i > burnInIter) {
+        topicThisDoc :+= doc.topicsDist
+      }
+    }
+    topicThisDoc :/= (totalIter - burnInIter).toDouble
+
+    new SDV(topicThisDoc.toArray)
+  }
+
+  /**
+   * Computes the topic distribution of a term after removing its current topic assignment
+   * from the counts (the "drop one" step), and samples a new topic from it. This is the
+   * essential step of collapsed Gibbs sampling for LDA; see Heinrich, "Parameter
+   * estimation for text analysis", for the derivation.
+   */
+  private[spark] def dropOneDistSampler(
+      docTopicCount: BV[Double],
+      topicThisTerm: BDV[Double],
+      rand: Random, term: Int,
+      currentTopic: Int
+    ): Int = {
+    val ttm = generateTopicDistForTerm(docTopicCount, topicThisTerm, term,
+      currentTopic, isTrainModel = true)
+    CGS.multinomialDistSampler(rand, ttm)
+  }
+
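+  /**
+   * Computes the (unnormalized) Gibbs full conditional for each topic k:
+   *
+   *   p(z = k | rest) ∝ (n(k, term) + beta) / (n(k) + numTerms * beta) * (n(doc, k) + alpha)
+   *
+   * When isTrainModel is true, the counts of currentTopic are first decremented by one so
+   * that the token being resampled is not counted against itself.
+   */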
+  private[spark] def generateTopicDistForTerm(
+      docTopicCount: BV[Double],
+      topicThisTerm: BDV[Double],
+      term: Int,
+      currentTopic: Int = -1,
+      isTrainModel: Boolean = false
+    ): BDV[Double] = {
+    val numTopics = topicCounts_.size
+    val numTerms = topicTermCounts_.head.size
+    var i = 0
+    while (i < numTopics) {
+      val adjustment = if (isTrainModel && i == currentTopic) -1 else 0
+      topicThisTerm(i) = (topicTermCounts_(i)(term) + adjustment + beta) /
+        (topicCounts_(i) + adjustment + (numTerms * beta)) *
+        (docTopicCount(i) + adjustment + alpha)
+      i += 1
+    }
+    topicThisTerm
+  }
+
+  private[spark] def generateTopicDistForDocument(doc: Document,
+      topicThisTerm: BDV[Double], rand: Random): Unit = {
+    val content = doc.content
+    val numTopics = topicCounts_.size
+    var currentTopicDist = doc.topicsDist
+    var currentTopics = doc.topics
+    if (currentTopicDist != null) {
+      for (i <- 0 until content.length) {
+        val term = content(i)
+        val topic = currentTopics(i)
+        val dist = generateTopicDistForTerm(currentTopicDist, topicThisTerm, term)
+        val lastTopic = CGS.multinomialDistSampler(rand, dist)
+        if (lastTopic != topic) {
+          currentTopics(i) = lastTopic
+          currentTopicDist(topic) += -1
+          currentTopicDist(lastTopic) += 1
+        }
+      }
+    } else {
+      currentTopicDist = BSV.zeros[Double](numTopics)
+      currentTopics = content.map { term =>
+        val lastTopic = CGS.uniformDistSampler(rand, numTopics)
+        currentTopicDist(lastTopic) += 1
+        lastTopic
+      }
+      doc.topicsDist = currentTopicDist
+      doc.topics = currentTopics
+    }
+  }
+
+  private[spark] def phi(topic: Int, term: Int): Double = {
+    val numTerms = topicTermCounts_.head.size
+    (topicTermCounts_(topic)(term) + beta) / (topicCounts_(topic) + numTerms * beta)
+  }
+}
+
+object TopicModel {
+  def apply(numTopics: Int, numTerms: Int, alpha: Double = 0.1, beta: Double = 0.01) = {
+    new TopicModel(
+      BDV.zeros[Double](numTopics),
+      Array(0 until numTopics: _*).map(_ => BSV.zeros[Double](numTerms)),
+      alpha, beta)
+  }
+}
+
+class LDA private(
+    var numTerms: Int,
+    var numTopics: Int,
+    totalIter: Int,
+    burnInIter: Int,
+    var alpha: Double,
+    var beta: Double
+  ) extends Serializable with Logging {
+  def run(input: RDD[Document]): (TopicModel, RDD[Document]) = {
+    val initModel = TopicModel(numTopics, numTerms, alpha, beta)
+    CGS.runGibbsSampling(input, initModel, totalIter, burnInIter)
+  }
+}
+
+object LDA extends Logging {
+  def train(
+      data: RDD[Document],
+      numTopics: Int,
+      totalIter: Int = 150,
+      burnInIter: Int = 5,
+      alpha: Double = 0.1,
+      beta: Double = 0.01): (TopicModel, RDD[Document]) = {
+    val numTerms = data.flatMap(_.content).max() + 1
+    val lda = new LDA(numTerms, numTopics, totalIter, burnInIter, alpha, beta)
+    lda.run(data)
+  }
+
+  /**
+   * Perplexity is a standard evaluation metric for LDA. It is usually computed on held-out
+   * data, but applying it to the training documents is also valid. When evaluating unseen
+   * data, run at least one iteration of Gibbs sampling on it before calling this method.
+   * Lower perplexity indicates a better model.
+   */
+  def perplexity(data: RDD[Document], computedModel: TopicModel): Double = {
+    val broadcastModel = data.context.broadcast(computedModel)
+    val (termProb, totalNum) = data.mapPartitions { docs =>
+      val model = broadcastModel.value
+      val numTopics = model.topicCounts_.size
+      val numTerms = model.topicTermCounts_.head.size
+      val rand = new Random
+      val alpha = model.alpha
+      val currentTheta = BDV.zeros[Double](numTerms)
+      val topicThisTerm = BDV.zeros[Double](numTopics)
+      docs.flatMap { doc =>
+        val content = doc.content
+        if (doc.topicsDist != null) {
+          model.generateTopicDistForDocument(doc, topicThisTerm, rand)
+        }
+        content.foreach(term => currentTheta(term) = 0)
+        content.foreach { term =>
+          (0 until numTopics).foreach { topic =>
+            currentTheta(term) += model.phi(topic, term) * ((doc.topicsDist(topic) + alpha) /
+              (content.size + alpha * numTopics))
+          }
+        }
+        content.map(x => (math.log(currentTheta(x)), 1))
+      }
+    }.reduce { (lhs, rhs) =>
+      (lhs._1 + rhs._1, lhs._2 + rhs._2)
+    }
+    math.exp(-1 * termProb / totalNum)
+  }
+
+}
+
+/**
+ * Collapsed Gibbs sampling for Latent Dirichlet Allocation.
+ */
+private[spark] object CGS extends Logging {
+
+  /**
+   * Runs the complete Gibbs sampling procedure in two phases: initialization of the topic
+   * assignments, followed by the actual sampling iterations.
+   */
+  def runGibbsSampling(
+      data: RDD[Document], initModel: TopicModel,
+      totalIter: Int, burnInIter: Int): (TopicModel, RDD[Document]) = {
+    require(totalIter > burnInIter, "totalIter must be greater than burnInIter")
+    require(totalIter > 0, "totalIter must be greater than 0")
+    require(burnInIter > 0, "burnInIter must be greater than 0")
+
+    val (numTopics, numTerms, alpha, beta) = (initModel.topicCounts_.size,
+      initModel.topicTermCounts_.head.size,
+      initModel.alpha, initModel.beta)
+    val probModel = TopicModel(numTopics, numTerms, alpha, beta)
+
+    logInfo("Start initialization")
+    var (topicModel, corpus) = sampleTermAssignment(data, initModel)
+
+    for (iter <- 1 to totalIter) {
+      logInfo("Start Gibbs sampling (Iteration %d/%d)".format(iter, totalIter))
+      val broadcastModel = data.context.broadcast(topicModel)
+      val previousCorpus = corpus
+      corpus = sampleTopicCounters(corpus, broadcastModel, numTerms,
+        numTopics).setName(s"LDA-$iter").persist(StorageLevel.MEMORY_AND_DISK)
+      if (iter % 5 == 0 && data.context.getCheckpointDir.isDefined) {
+        corpus.checkpoint()
+      }
+      topicModel = collectTopicCounters(corpus, numTerms, numTopics, alpha, beta)
+      if (iter > burnInIter) {
+        probModel.merge(topicModel)
+      }
+      previousCorpus.unpersist()
+      broadcastModel.unpersist()
+    }
+    val burnIn = (totalIter - burnInIter).toDouble
+    probModel.topicCounts_ :/= burnIn
+    probModel.topicTermCounts_.foreach(_ :/= burnIn)
+    (probModel, corpus)
+  }
+
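+  /**
+   * Performs one sweep over the corpus: for each token, the current topic assignment is
+   * dropped from the counts, a new topic is drawn from the full conditional, and the
+   * document and model counters are updated in place.
+   */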
+  private def sampleTopicCounters(corpus: RDD[Document], broadcast: Broadcast[TopicModel],
+      numTerms: Int, numTopics: Int): RDD[Document] = {
+    corpus.mapPartitions { docs =>
+      val rand = new Random
+      val currentModel = broadcast.value
+      val topicThisTerm = BDV.zeros[Double](numTopics)
+      docs.map { doc =>
+        val content = doc.content
+        val topics = doc.topics
+        val topicsDist = doc.topicsDist
+        for (i <- 0 until content.length) {
+          val term = content(i)
+          val topic = topics(i)
+          val chosenTopic = currentModel.dropOneDistSampler(topicsDist, topicThisTerm,
+            rand, term, topic)
+          if (topic != chosenTopic) {
+            topics(i) = chosenTopic
+            topicsDist(topic) += -1
+            topicsDist(chosenTopic) += 1
+            currentModel.update(term, topic, -1)
+            currentModel.update(term, chosenTopic, 1)
+          }
+        }
+        doc
+      }
+    }
+  }
+
+  private def collectTopicCounters(docTopics: RDD[Document], numTerms: Int,
+      numTopics: Int, alpha: Double, beta: Double): TopicModel = {
+    docTopics.mapPartitions { iter =>
+      val topicCounters = TopicModel(numTopics, numTerms, alpha, beta)
+      iter.foreach { doc =>
+        val content = doc.content
+        val topics = doc.topics
+        for (i <- 0 until doc.content.length) {
+          topicCounters.update(content(i), topics(i), 1)
+        }
+      }
+      Iterator(topicCounters)
+    }.fold(TopicModel(numTopics, numTerms, alpha, beta)) { (thatOne, otherOne) =>
+      thatOne.merge(otherOne)
+    }
+  }
+
+  /**
+   * Initial step of Gibbs sampling, which supports incremental LDA.
+   */
+  private def sampleTermAssignment(data: RDD[Document], topicModel: TopicModel):
+      (TopicModel, RDD[Document]) = {
+    val (numTopics, numTerms, alpha, beta) = (topicModel.topicCounts_.size,
+      topicModel.topicTermCounts_.head.size,
+      topicModel.alpha, topicModel.beta)
+    val broadcast = data.context.broadcast(topicModel)
+    val initialDocs = if (topicModel.topicCounts_.norm(2) == 0) {
+      uniformDistAssignment(data, numTopics, numTerms)
+    } else {
+      multinomialDistAssignment(data, broadcast, numTopics, numTerms)
+    }
+    initialDocs.persist(StorageLevel.MEMORY_AND_DISK)
+    val initialModel = collectTopicCounters(initialDocs, numTerms, numTopics, alpha, beta)
+    broadcast.unpersist()
+    (initialModel, initialDocs)
+  }
+
+  private def multinomialDistAssignment(data: RDD[Document], broadcast: Broadcast[TopicModel],
+      numTopics: Int, numTerms: Int): RDD[Document] = {
+    data.mapPartitions { docs =>
+      val rand = new Random
+      val currentParams = broadcast.value
+      val topicThisTerm = BDV.zeros[Double](numTopics)
+      docs.map { doc =>
+        val content = doc.content
+        val topicDist = doc.topicsDist
+        val lastDocTopicCount = BDV.zeros[Double](numTopics)
+        val lastTopics = content.map { term =>
+          val dist = currentParams.generateTopicDistForTerm(topicDist, topicThisTerm, term)
+          val lastTopic = multinomialDistSampler(rand, dist)
+          lastDocTopicCount(lastTopic) += 1
+          lastTopic
+        }
+        doc.topicsDist = lastDocTopicCount
+        doc.topics = lastTopics
+        doc
+      }
+    }
+  }
+
+  private def uniformDistAssignment(data: RDD[Document],
+      numTopics: Int, numTerms: Int): RDD[Document] = {
+    data.mapPartitions { docs =>
+      val rand = new Random
+      docs.map { doc =>
+        val content = doc.content
+        val lastDocTopicCount = BDV.zeros[Double](numTopics)
+        val lastTopics = content.map { term =>
+          val topic = uniformDistSampler(rand, numTopics)
+          lastDocTopicCount(topic) += 1
+          topic
+        }
+        doc.topicsDist = lastDocTopicCount
+        doc.topics = lastTopics
+        doc
+      }
+    }
+  }
+
+  /**
+   * A uniform distribution sampler, which is only used for initialization.
+   */
+  def uniformDistSampler(rand: Random, dimension: Int): Int = {
+    rand.nextInt(dimension)
+  }
+
+  /**
+   * Samples an index from a multinomial distribution using the roulette-wheel method.
+   * For example, for dist = (1.0, 2.0, 1.0) the total mass is 4.0, so indices 0, 1 and 2
+   * are returned with probability 0.25, 0.5 and 0.25 respectively.
+   */
+  def multinomialDistSampler(rand: Random, dist: BDV[Double]): Int = {
+    val distSum = rand.nextDouble() * breeze.linalg.sum[BDV[Double], Double](dist)
+
+    def loop(index: Int, accum: Double): Int = {
+      if (index == dist.length) return dist.length - 1
+      val sum = accum - dist(index)
+      if (sum <= 0) index else loop(index + 1, sum)
+    }
+
+    loop(0, distSum)
+  }
+}
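For reference, a minimal usage sketch of the API added above; the SparkContext sc, the input file, and its assumed format (one document per line as whitespace-separated integer term ids) are illustrative assumptions, not part of this patch:

import org.apache.spark.mllib.clustering.{Document, LDA}

val corpus = sc.textFile("docs.txt").zipWithIndex().map { case (line, id) =>
  // Assumed preprocessing: each line holds the document's integer term ids.
  Document(id.toInt, line.split(" ").map(_.toInt))
}
val (model, docsWithTopics) = LDA.train(corpus, numTopics = 10,
  totalIter = 150, burnInIter = 5, alpha = 0.1, beta = 0.01)
model.topicTermCounts.take(2).foreach(println)  // per-topic term counts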
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
new file mode 100644
index 000000000000..403f84df5adc
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.clustering
+
+import org.scalatest.FunSuite
+
+import java.util.Random
+
+import breeze.linalg.{DenseVector => BDV}
+import breeze.stats.distributions.Poisson
+
+import org.apache.spark.mllib.util.LocalSparkContext
+
+class LDASuite extends FunSuite with LocalSparkContext {
+
+  import LDASuite._
+
+  test("LDA || Gibbs sampling") {
+    val model = generateRandomLDAModel(numTopics, numTerms)
+    val corpus = sampleCorpus(model, numDocs, numTerms, numTopics)
+    var data = sc.parallelize(corpus, 2)
+    var computedModel = TopicModel(numTopics, numTerms, alpha, beta)
+    val pps = new Array[Double](incrementalLearning)
+
+    // val m = LDA.train(data, numTopics, 10, 2, alpha, beta)
+    // println(LDA.perplexity(m._2, m._1))
+    var i = 0
+    val startedAt = System.currentTimeMillis()
+    while (i < incrementalLearning) {
+      val m = CGS.runGibbsSampling(data, computedModel, totalIterations, burnInIterations)
+      data = m._2
+      computedModel = m._1
+      pps(i) = LDA.perplexity(data, computedModel)
+      i += 1
+    }
+    println((System.currentTimeMillis() - startedAt) / 1e3)
+
+    pps.foreach(println)
+    val ppsDiff = pps.init.zip(pps.tail).map { case (lhs, rhs) => lhs - rhs }
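+    // Perplexity should decrease in most of the incremental-learning rounds, and the
+    // final model should improve on the first one.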
+    assert(ppsDiff.count(_ > 0).toDouble / ppsDiff.size > 0.6)
+    assert(pps.head - pps.last > 0)
+  }
+}
+
+object LDASuite {
+  val numTopics = 5
+  val numTerms = 1000
+  val numDocs = 100
+  val expectedDocLength = 300
+  val alpha = 0.01
+  val beta = 0.01
+  val totalIterations = 2
+  val burnInIterations = 1
+  val incrementalLearning = 10
+
+  /**
+   * Generate a random LDA model, i.e. the topic-term matrix.
+   */
+  def generateRandomLDAModel(numTopics: Int, numTerms: Int): Array[BDV[Double]] = {
+    val model = new Array[BDV[Double]](numTopics)
+    val width = numTerms * 1.0 / numTopics
+    var topic = 0
+    var i = 0
+    while (topic < numTopics) {
+      val topicCentroid = width * (topic + 1)
+      model(topic) = BDV.zeros[Double](numTerms)
+      i = 0
+      while (i < numTerms) {
+        // Treat the term list as a circle, so the distance between the first term and
+        // the last term is 1, not numTerms - 1.
+        val distance = Math.abs(topicCentroid - i) % (numTerms / 2)
+        // The probability of a term decays with its distance from the topic centroid.
+        model(topic)(i) = 1.0 / (1 + Math.abs(distance))
+        i += 1
+      }
+      topic += 1
+    }
+    model
+  }
+
+  /**
+   * Sample one document given the topic-term matrix.
+   */
+  def ldaSampler(
+      model: Array[BDV[Double]],
+      topicDist: BDV[Double],
+      numTermsPerDoc: Int): Array[Int] = {
+    val samples = new Array[Int](numTermsPerDoc)
+    val rand = new Random()
+    (0 until numTermsPerDoc).foreach { i =>
+      samples(i) = CGS.multinomialDistSampler(
+        rand,
+        model(CGS.multinomialDistSampler(rand, topicDist))
+      )
+    }
+    samples
+  }
+
+  /**
+   * Sample a corpus (many documents) from a given topic-term matrix.
+   */
+  def sampleCorpus(
+      model: Array[BDV[Double]],
+      numDocs: Int,
+      numTerms: Int,
+      numTopics: Int): Array[Document] = {
+    (0 until numDocs).map { i =>
+      val rand = new Random()
+      val numTermsPerDoc = Poisson.distribution(expectedDocLength).sample()
+      val numTopicsPerDoc = rand.nextInt(numTopics / 2) + 1
+      val topicDist = BDV.zeros[Double](numTopics)
+      (0 until numTopicsPerDoc).foreach { _ =>
+        topicDist(rand.nextInt(numTopics)) += 1
+      }
+      Document(i, ldaSampler(model, topicDist, numTermsPerDoc))
+    }.toArray
+  }
+}
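Finally, a hedged sketch of the held-out evaluation protocol described in the perplexity() doc comment: run a short round of Gibbs sampling on the unseen documents against the trained model before scoring them. Here trainDocs and heldOutDocs are assumed RDD[Document] values, and since CGS is package-private this would live in org.apache.spark code such as the test suite above:

val (model, _) = LDA.train(trainDocs, numTopics = 10)
// One short sampling round assigns topics to the unseen docs (totalIter > burnInIter).
val (_, inferredDocs) = CGS.runGibbsSampling(heldOutDocs, model,
  totalIter = 2, burnInIter = 1)
println(s"held-out perplexity: ${LDA.perplexity(inferredDocs, model)}")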