
Commit 7a46336 (initial commit)

map-reduce using scala and hadoop

File tree: 9 files changed, +215 −0 lines

.github/workflows/blank.yml

Lines changed: 30 additions & 0 deletions

```yaml
# This is a basic workflow to help you get started with Actions

name: Practice-Scala-CI

# Controls when the action will run.
on:
  # Triggers the workflow on push or pull request events but only for the master branch
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v2

      # Runs a single command using the runner's shell
      - name: Run assembly
        run: sbt assembly
```
.gitignore

Lines changed: 48 additions & 0 deletions

```
*~
*#
src_managed
activemq-data
project/plugins/project
project/boot/*
*/project/build/target
*/project/boot
lib_managed
etags
tags
TAGS
reports
dist
bin
target
deploy/*.jar
data
out
logs
.#*
.codefellow
storage
.ensime
_dump
.manager
manifest.mf
semantic.cache
tm*.log
tm*.lck
tm.out
*.tm.epoch
.DS_Store
*.iws
*.ipr
*.iml
run-codefellow
.project
.settings
.classpath
.idea
.scala_dependencies
.target
.cache
multiverse.log
.eprj
/lib
.bsp
```
README.md

Lines changed: 27 additions & 0 deletions

## MapReduce using Scala

Run this command to generate the fat jar `WordCountUsingHadoop-1.0.jar`:

```sbt assembly```

Demo run:

```
Demo-WordCountOnHadoop:]$ echo "Hello world hello hello" > input.txt
Demo-WordCountOnHadoop:]$ cat input.txt
Hello world hello hello
Demo-WordCountOnHadoop:]$ docker exec -it hadoop-namenode bash
(docker) [root@hadoop-namenode /]# hadoop fs -ls /user/root/
(docker) [root@hadoop-namenode /]# hadoop fs -copyFromLocal input.txt /user/root/input
(docker) [root@hadoop-namenode /]# hadoop jar WordCountUsingHadoop-1.0.jar /user/root/input output
(docker) [root@hadoop-namenode /]# hadoop fs -ls /user/root/
Found 2 items
-rw-r--r--   2 root supergroup         24 2021-05-24 05:53 /user/root/input
drwxr-xr-x   - root supergroup          0 2021-05-24 05:57 /user/root/output
(docker) [root@hadoop-namenode /]# hadoop fs -ls /user/root/output
Found 2 items
-rw-r--r--   2 root supergroup          0 2021-05-24 05:57 /user/root/output/_SUCCESS
-rw-r--r--   2 root supergroup         24 2021-05-24 05:57 /user/root/output/part-r-00000
(docker) [root@hadoop-namenode /]# hadoop fs -cat /user/root/output/part-r-00000
Hello	1
hello	2
world	1
```
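To make the demo concrete: the job splits each line on spaces, emits `(word, 1)` pairs, groups equal words, and sums the ones. Note the counting is case-sensitive, which is why `Hello` and `hello` appear separately above. A minimal illustrative sketch (the `WordCountLocal` object is hypothetical, not part of this commit) of the same computation in plain Scala collections:

```scala
// Sketch only: the word count the Hadoop job computes, expressed locally.
object WordCountLocal extends App {
  val lines = Seq("Hello world hello hello")
  val counts = lines
    .flatMap(_.split(" "))                          // "map" phase: one token per word
    .groupBy(identity)                              // shuffle: group equal words together
    .map { case (word, occs) => word -> occs.size } // "reduce" phase: count occurrences
  counts.foreach { case (word, n) => println(s"$word\t$n") }
  // Prints (order may vary): Hello 1, world 1, hello 2, matching the demo above.
}
```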

build.sbt

Lines changed: 22 additions & 0 deletions

```scala
ThisBuild / scalaVersion := "2.13.3"

lazy val root = (project in file(".")).
  settings(
    name := "MapReduce Using Scala and Hadoop",
    version := "1.0",
    mainClass in Compile := Some("com.mapreduce.hadoop.WordCountUsingHadoop"),
    mainClass in assembly := Some("com.mapreduce.hadoop.WordCountUsingHadoop")
  )

libraryDependencies ++= Seq(
  "org.apache.hadoop" % "hadoop-core" % "1.2.1",
  "org.scalatest" %% "scalatest" % "3.0.8" % "test"
)

assemblyJarName in assembly := "WordCountUsingHadoop-1.0.jar"

// Discard duplicate META-INF entries when merging dependency jars;
// keep the first copy of everything else.
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case x                             => MergeStrategy.first
}
```
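One possible refinement, not part of this commit: since the README launches the jar with `hadoop jar`, the cluster already supplies the Hadoop classes at runtime, so the dependency could be scoped `provided`, which sbt-assembly excludes from the fat jar by default. A hedged sketch:

```scala
// Sketch only (an assumption, not in the commit): "provided" keeps
// hadoop-core out of the fat jar, because `hadoop jar` puts the cluster's
// own Hadoop classes on the classpath at runtime.
libraryDependencies ++= Seq(
  "org.apache.hadoop" % "hadoop-core" % "1.2.1" % "provided",
  "org.scalatest" %% "scalatest" % "3.0.8" % "test"
)
```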

project/assembly.sbt

Whitespace-only changes.

project/build.properties

Whitespace-only changes.

project/plugins.sbt

Lines changed: 1 addition & 0 deletions

```scala
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
```
src/main/scala/com/mapreduce/hadoop/WordCountUsingHadoop.scala

Lines changed: 61 additions & 0 deletions

```scala
package com.mapreduce.hadoop

import java.lang

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, Mapper, Reducer}

/**
 * Created by Saurav Sahu : 23 May 2021
 */
object WordCountUsingHadoop {

  val wordDelimiter = " "

  def getWordsInLine(value: Text) = value.toString.split(wordDelimiter)

  type Mapper_T = Mapper[Object /* InputData */, Text /* Line */, Text /* Word */, IntWritable /* OccurrenceCount */]

  // Emits (word, 1) for every token in the input line.
  class LineTokenizerMapper extends Mapper_T {
    val word = new Text
    val singleOccurrence = new IntWritable(1)
    override def map(key: Object, value: Text, context: Mapper_T#Context): Unit = {
      getWordsInLine(value).foreach(token => {
        word.set(token)
        context.write(word, singleOccurrence)
      })
    }
  }

  // Sums the counts that Hadoop groups together under one word.
  def calculateSum(values: lang.Iterable[IntWritable]) = {
    val iterator = values.iterator
    var total = 0
    while (iterator.hasNext) total += iterator.next.get()
    total
  }

  type Reducer_T = Reducer[Text /* Word */, IntWritable /* Occurrence */, Text /* Word */, IntWritable /* Occurrences */]

  // Emits (word, total occurrences) for each distinct word.
  class OccurrencesReducer extends Reducer_T {
    val totalOccurrences = new IntWritable(1)
    override def reduce(key: Text, values: lang.Iterable[IntWritable], context: Reducer_T#Context): Unit = {
      totalOccurrences.set(calculateSum(values))
      context.write(key, totalOccurrences)
    }
  }

  def main(args: Array[String]): Unit = {
    val mapReduceHadoopJob = Job.getInstance(new Configuration, "Word Count Using Hadoop")
    mapReduceHadoopJob.setJarByClass(this.getClass)
    mapReduceHadoopJob.setMapperClass(classOf[LineTokenizerMapper])
    mapReduceHadoopJob.setReducerClass(classOf[OccurrencesReducer])
    mapReduceHadoopJob.setOutputKeyClass(classOf[Text])
    mapReduceHadoopJob.setOutputValueClass(classOf[IntWritable])
    FileInputFormat.addInputPath(mapReduceHadoopJob, new Path(args(0)))
    FileOutputFormat.setOutputPath(mapReduceHadoopJob, new Path(args(1)))
    // Exit 0 on success, 1 on failure (clearer than comparing Booleans).
    System.exit(if (mapReduceHadoopJob.waitForCompletion(true)) 0 else 1)
  }
}
```
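Because `OccurrencesReducer` just sums counts, an associative and commutative operation, the same class could also be registered as a combiner to pre-aggregate on the map side before the shuffle. A one-line sketch of the optional addition to `main` (not part of this commit):

```scala
// Optional, not in the commit: reuse the reducer as a combiner so each
// mapper pre-sums its (word, 1) pairs before they cross the network.
mapReduceHadoopJob.setCombinerClass(classOf[OccurrencesReducer])
```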
src/test/scala/com/mapreduce/hadoop/WordCountUsingHadoopTest.scala

Lines changed: 26 additions & 0 deletions

```scala
package com.mapreduce.hadoop

import java.util

import org.apache.hadoop.io.{IntWritable, Text}
import org.scalatest.FunSuite

/**
 * Created by Saurav 24 May 2021
 */

class WordCountUsingHadoopTest extends FunSuite {
  test("calculate sum of iterables") {
    val iterable = new util.ArrayList[IntWritable](util.Arrays.asList(
      new IntWritable(1),
      new IntWritable(2),
      new IntWritable(1)))

    assert(WordCountUsingHadoop.calculateSum(iterable) == 4)
  }

  test("test getWordsInLine") {
    val line = "Hello World, how are you"
    assert(WordCountUsingHadoop.getWordsInLine(new Text(line)).sameElements(Array("Hello", "World,", "how", "are", "you")))
  }
}
```
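The suite covers the two pure helpers. One edge case it could also pin down (a suggested addition, not in the commit) is that `calculateSum` returns 0 for an empty iterable, since its loop body never runs:

```scala
// Suggested extra test, not in the commit: the sum of no values is 0.
test("calculate sum of an empty iterable") {
  assert(WordCountUsingHadoop.calculateSum(new util.ArrayList[IntWritable]()) == 0)
}
```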
