 package org.apache.spark.examples.streaming
 
 import java.nio.ByteBuffer
-
 import scala.util.Random
-
 import org.apache.spark.Logging
 import org.apache.spark.SparkConf
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.Milliseconds
 import org.apache.spark.streaming.StreamingContext
 import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions
 import org.apache.spark.streaming.kinesis.KinesisUtils
-
 import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
 import com.amazonaws.services.kinesis.AmazonKinesisClient
 import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
 import com.amazonaws.services.kinesis.model.PutRecordRequest
+import org.apache.log4j.Logger
+import org.apache.log4j.Level
 
 /**
  * Kinesis Spark Streaming WordCount example.
@@ -72,9 +71,7 @@ import com.amazonaws.services.kinesis.model.PutRecordRequest
  */
 object KinesisWordCountASL extends Logging {
   def main(args: Array[String]) {
-    /**
-     * Check that all required args were passed in.
-     */
+    /* Check that all required args were passed in. */
     if (args.length < 2) {
       System.err.println(
         """
@@ -87,57 +84,57 @@ object KinesisWordCountASL extends Logging {
     }
 
     StreamingExamples.setStreamingLogLevels()
-
-    /** Populate the appropriate variables from the given args */
+
+    /* Populate the appropriate variables from the given args */
     val Array(streamName, endpointUrl) = args
 
-    /** Determine the number of shards from the stream */
+    /* Determine the number of shards from the stream */
     val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain())
     kinesisClient.setEndpoint(endpointUrl)
     val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards()
       .size()
 
-    /** In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard. */
+    /* In this example, we're going to create 1 Kinesis Worker/Receiver/DStream for each shard. */
     val numStreams = numShards
 
-    /**
+    /*
      * numSparkThreads should be 1 more thread than the number of receivers.
      * This leaves one thread available for actually processing the data.
      */
     val numSparkThreads = numStreams + 1
 
-    /** Setup the and SparkConfig and StreamingContext */
-    /** Spark Streaming batch interval */
+    /* Setup the SparkConfig and StreamingContext */
+    /* Spark Streaming batch interval */
     val batchInterval = Milliseconds(2000)
     val sparkConfig = new SparkConf().setAppName("KinesisWordCount")
       .setMaster(s"local[$numSparkThreads]")
     val ssc = new StreamingContext(sparkConfig, batchInterval)
-    /** Setup the checkpoint directory used by Spark Streaming */
+    /* Setup the checkpoint directory used by Spark Streaming */
     ssc.checkpoint("/tmp/checkpoint");
 
-    /** Kinesis checkpoint interval. Same as batchInterval for this example. */
+    /* Kinesis checkpoint interval. Same as batchInterval for this example. */
     val kinesisCheckpointInterval = batchInterval
 
-    /** Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */
+    /* Create the same number of Kinesis DStreams/Receivers as Kinesis stream's shards */
     val kinesisStreams = (0 until numStreams).map { i =>
       KinesisUtils.createStream(ssc, streamName, endpointUrl, kinesisCheckpointInterval,
         InitialPositionInStream.LATEST, StorageLevel.MEMORY_AND_DISK_2)
     }
 
-    /** Union all the streams */
+    /* Union all the streams */
     val unionStreams = ssc.union(kinesisStreams)
 
-    /** Convert each line of Array[Byte] to String, split into words, and count them */
+    /* Convert each line of Array[Byte] to String, split into words, and count them */
     val words = unionStreams.flatMap(byteArray => new String(byteArray)
       .split(" "))
 
-    /** Map each word to a (word, 1) tuple so we can reduce/aggregate by key. */
+    /* Map each word to a (word, 1) tuple so we can reduce/aggregate by key. */
     val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
 
-    /** Print the first 10 wordCounts by key */
+    /* Print the first 10 wordCounts by key */
     wordCounts.print()
 
-    /** Start the streaming context and await termination */
+    /* Start the streaming context and await termination */
     ssc.start()
     ssc.awaitTermination()
   }
@@ -169,13 +166,13 @@ object KinesisWordCountProducerASL {
 
     StreamingExamples.setStreamingLogLevels()
 
-    /** Populate the appropriate variables from the given args */
+    /* Populate the appropriate variables from the given args */
     val Array(stream, endpoint, recordsPerSecond, wordsPerRecord) = args
 
-    /** Generate the records and return the totals */
+    /* Generate the records and return the totals */
     val totals = generate(stream, endpoint, recordsPerSecond.toInt, wordsPerRecord.toInt)
 
-    /** Print the array of (index, total) tuples */
+    /* Print the array of (index, total) tuples */
     println("Totals")
     totals.foreach(total => println(total.toString()))
   }
@@ -187,51 +184,70 @@ object KinesisWordCountProducerASL {
 
     val MaxRandomInts = 10
 
-    /** Create the Kinesis client */
+    /* Create the Kinesis client */
     val kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain())
     kinesisClient.setEndpoint(endpoint)
 
     println(s"Putting records onto stream $stream and endpoint $endpoint at a rate of " +
         s"$recordsPerSecond records per second and $wordsPerRecord words per record");
 
     val totals = new Array[Int](MaxRandomInts)
-    /** Put String records onto the stream per the given recordPerSec and wordsPerRecord */
+    /* Put String records onto the stream per the given recordsPerSecond and wordsPerRecord */
     for (i <- 1 to 5) {
 
-      /** Generate recordsPerSec records to put onto the stream */
+      /* Generate recordsPerSec records to put onto the stream */
       val records = (1 to recordsPerSecond.toInt).map { recordNum =>
-        /**
+        /*
          * Randomly generate each wordsPerRec words between 0 (inclusive)
          * and MAX_RANDOM_INTS (exclusive)
          */
        val data = (1 to wordsPerRecord.toInt).map(x => {
-          /** Generate the random int */
+          /* Generate the random int */
          val randomInt = Random.nextInt(MaxRandomInts)
 
-          /** Keep track of the totals */
+          /* Keep track of the totals */
          totals(randomInt) += 1
 
          randomInt.toString()
        }).mkString(" ")
 
-        /** Create a partitionKey based on recordNum */
+        /* Create a partitionKey based on recordNum */
        val partitionKey = s"partitionKey-$recordNum"
 
-        /** Create a PutRecordRequest with an Array[Byte] version of the data */
+        /* Create a PutRecordRequest with an Array[Byte] version of the data */
        val putRecordRequest = new PutRecordRequest().withStreamName(stream)
            .withPartitionKey(partitionKey)
            .withData(ByteBuffer.wrap(data.getBytes()));
 
-        /** Put the record onto the stream and capture the PutRecordResult */
+        /* Put the record onto the stream and capture the PutRecordResult */
        val putRecordResult = kinesisClient.putRecord(putRecordRequest);
      }
 
-      /** Sleep for a second */
+      /* Sleep for a second */
      Thread.sleep(1000)
      println("Sent " + recordsPerSecond + " records")
    }
 
-    /** Convert the totals to (index, total) tuple */
+    /* Convert the totals to (index, total) tuple */
    (0 to (MaxRandomInts - 1)).zip(totals)
  }
+
+/**
+ * Utility functions for Spark Streaming examples.
+ * This has been lifted from the examples/ project to remove the circular dependency.
+ */
+object StreamingExamples extends Logging {
+
+  /** Set reasonable logging levels for streaming if the user has not configured log4j. */
+  def setStreamingLogLevels() {
+    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
+    if (!log4jInitialized) {
+      // We first log something to initialize Spark's default logging, then we override the
+      // logging level.
+      logInfo("Setting log level to [WARN] for streaming example." +
+        " To override add a custom log4j.properties to the classpath.")
+      Logger.getRootLogger.setLevel(Level.WARN)
+    }
+  }
+}
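
To exercise the word count logic without a Kinesis stream or AWS credentials, here is a minimal local sketch. It assumes the same Spark 1.x streaming API used above; the object name LocalWordCountSketch and the hard-coded test records are hypothetical and not part of this commit. A queueStream of pre-built RDD[Array[Byte]] stands in for the unioned Kinesis receivers, so the same flatMap/map/reduceByKey chain can run as-is:

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Milliseconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions

/* Hypothetical local harness -- not part of the commit above. */
object LocalWordCountSketch {
  def main(args: Array[String]) {
    val sparkConfig = new SparkConf().setAppName("LocalWordCountSketch").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConfig, Milliseconds(2000))

    /* Stand-in for the unioned Kinesis DStreams: a queue of pre-built RDDs
     * of Array[Byte] records, mimicking what the receivers would deliver. */
    val rddQueue = new Queue[RDD[Array[Byte]]]()
    rddQueue += ssc.sparkContext.makeRDD(Seq("1 2 3".getBytes(), "2 3 4".getBytes()))
    val byteStream = ssc.queueStream(rddQueue)

    /* Same transformation chain as KinesisWordCountASL */
    val words = byteStream.flatMap(byteArray => new String(byteArray).split(" "))
    val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

Because queueStream feeds the DStream from the driver with no receiver threads, local[2] is sufficient here and the numSparkThreads = numStreams + 1 accounting above does not apply.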