
Commit da979ee

Bugfix/48 no storer write (#51)
* #48 no storer write fix
  - hadoopfs default storer is used
  - hdfs test enabled for build, while s3 ignored
  - readme update
  - implicit saving test adding 1 (loader non-"", storer = "")
  - explicit saving test adding (loader non-"", storer = defined)
1 parent af7ff2c commit da979ee
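
In user-facing terms, the fixed scenario is: control measures tracking is enabled with a source _INFO file only, no storer is registered, and the _INFO file should still be written next to the DataFrame output on save. A rough sketch of that flow, based on the README example in this commit (the AtumImplicits import and the paths are assumptions for illustration, not part of the diff):

```scala
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.SparkSession
import za.co.absa.atum.AtumImplicits._   // assumed import providing enableControlMeasuresTracking

object NoStorerWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("No-storer write sketch").getOrCreate()

    // Loader is defined (sourceInfoFile non-empty), but no storer is registered.
    implicit val fs: FileSystem = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    spark.enableControlMeasuresTracking(sourceInfoFile = "data/input/_INFO")

    // With this fix, saving the DataFrame also writes data/output/_INFO
    // through the default Hadoop FS path inferred from the write.
    spark.read.parquet("data/input").write.parquet("data/output")
  }
}
```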

14 files changed, +187 -29 lines changed

README.md

Lines changed: 24 additions & 4 deletions
@@ -168,7 +168,7 @@ object ExampleSparkJob {
     import spark.implicits._
 
     // implicit FS is needed for enableControlMeasuresTracking, setCheckpoint calls, e.g. standard HDFS here:
-    implicit val localHdfs = FileSystem.get(new Configuration)
+    implicit val localHdfs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
 
     // Initializing library to hook up to Apache Spark
     spark.enableControlMeasuresTracking(sourceInfoFile = "data/input/_INFO")
@@ -196,8 +196,28 @@ in 'data/input/_INFO'. Two checkpoints are created. Any business logic can be in
 and saving it to Parquet format.
 
 ### Storing Measurements in AWS S3
-Starting with version 3.0.0, persistence support for AWS S3 has been added.
-AWS S3 can be both used for loading the measurement data from as well as saving the measurements back to.
+
+#### AWS S3 via Hadoop FS API
+Since version 3.1.0, persistence support for AWS S3 via Hadoop FS API is available. The usage is the same as with
+regular HDFS with the exception of providing a different file system, e.g.:
+```scala
+import java.net.URI
+import org.apache.hadoop.fs.FileSystem
+import org.apache.spark.sql.SparkSession
+
+val spark = SparkSession
+  .builder()
+  .appName("Example Spark Job")
+  .getOrCreate()
+
+val s3Uri = new URI("s3://my-awesome-bucket")
+implicit val fs = FileSystem.get(s3Uri, spark.sparkContext.hadoopConfiguration)
+
+```
+The rest of the usage is the same in the example listed above.
+
+#### AWS S3 via AWS SDK for S3
+Starting with version 3.0.0, there is also persistence support for AWS S3 via AWS SDK S3.
 
 The following example demonstrates the setup:
 ```scala
@@ -238,7 +258,7 @@ object S3Example {
 }
 
 ```
-The rest of the processing logic and programatic approach to the library remains unchanged.
+The rest of the processing logic and programmatic approach to the library remains unchanged.
 
 
 ## Atum library routines
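
For completeness, a sketch of how the S3-backed Hadoop FileSystem from the new README section plugs into the rest of the existing example. The bucket, paths, and checkpoint name are illustrative, and the AtumImplicits import and setCheckpoint call follow the usage the README comments describe, so treat this as a hedged continuation rather than library documentation:

```scala
import java.net.URI
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.SparkSession
import za.co.absa.atum.AtumImplicits._   // assumed import providing the implicits used in the README

object S3ViaHadoopFsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Example Spark Job").getOrCreate()

    // S3 over the Hadoop FS API, as in the new README snippet
    val s3Uri = new URI("s3://my-awesome-bucket")
    implicit val fs: FileSystem = FileSystem.get(s3Uri, spark.sparkContext.hadoopConfiguration)

    // From here on, usage matches the HDFS example earlier in the README
    spark.enableControlMeasuresTracking(sourceInfoFile = "s3://my-awesome-bucket/input/_INFO")

    val measured = spark.read.parquet("s3://my-awesome-bucket/input")
      .setCheckpoint("after load")                          // checkpoint name is illustrative
    measured.write.parquet("s3://my-awesome-bucket/output") // _INFO is written next to the output
  }
}
```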

atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala

Lines changed: 18 additions & 13 deletions
@@ -17,14 +17,15 @@ package za.co.absa.atum.core
 
 import java.io.{PrintWriter, StringWriter}
 
-import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.execution.QueryExecution
 import org.apache.spark.sql.util.QueryExecutionListener
 import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider
 import software.amazon.awssdk.regions.Region
-import za.co.absa.atum.persistence.{HadoopFsControlMeasuresStorer, S3ControlMeasuresStorer, S3KmsSettings}
+import za.co.absa.atum.persistence.{S3ControlMeasuresStorer, S3KmsSettings}
 import za.co.absa.atum.utils.ExecutionPlanUtils._
-import za.co.absa.atum.utils.S3Utils
+import za.co.absa.atum.utils.{InfoFile, S3Utils}
 
 /**
  * The class is responsible for listening to DataSet save events and outputting corresponding control measurements.
@@ -39,12 +40,9 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut
         Atum.log.debug(s"SparkQueryExecutionListener.onSuccess for S3ControlMeasuresStorer: writing to ${s3storer.outputLocation.s3String}")
         writeInfoFileForQueryForSdkS3(qe, s3storer.outputLocation.region, s3storer.kmsSettings)(s3storer.credentialsProvider)
 
-      case Some(hadoopStorer: HadoopFsControlMeasuresStorer) =>
-        Atum.log.debug(s"SparkQueryExecutionListener.onSuccess: writing to Hadoop FS")
-        writeInfoFileForQuery(qe)(hadoopStorer.outputFs)
-
       case _ =>
-        Atum.log.info("No usable storer is set, therefore no data will be written the automatically with DF-save to an _INFO file.")
+        Atum.log.debug(s"SparkQueryExecutionListener.onSuccess: writing to Hadoop FS")
+        writeInfoFileForQuery(qe)
     }
 
     // Notify listeners
@@ -64,14 +62,21 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut
   }
 
   /** Write _INFO file with control measurements to the output directory based on the query plan */
-  private def writeInfoFileForQuery(qe: QueryExecution)(implicit outputFs: FileSystem): Unit = {
-    val infoFilePath = inferOutputInfoFileName(qe, cf.outputInfoFileName)
+  private[core] def writeInfoFileForQuery(qe: QueryExecution)(): Unit = {
+    val infoFileDir: Option[String] = inferOutputInfoFileDir(qe)
+
+    implicit val hadoopConf: Configuration = qe.sparkSession.sparkContext.hadoopConfiguration
+    val fsWithDir = infoFileDir
+      .map(InfoFile)
+      .flatMap(_.toOptFsPath) // path + FS based on HDFS or S3 over hadoopFS
 
     // Write _INFO file to the output directory
-    infoFilePath.foreach(path => {
+    fsWithDir.foreach { case (fs, dir) => {
+      val path = new Path(dir, cf.outputInfoFileName)
+
       Atum.log.info(s"Inferred _INFO Path = ${path.toUri.toString}")
-      cf.storeCurrentInfoFile(path)
-    })
+      cf.storeCurrentInfoFile(path)(fs)
+    }}
 
     // Write _INFO file to a registered storer
     if (cf.accumulator.isStorerLoaded) {
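
The fallback branch above hands the inferred output directory to InfoFile, which (per the inline comment) resolves a path plus FileSystem for either HDFS or S3 over the Hadoop FS API. A minimal sketch of that resolution using only the public Hadoop API; the object and method names here are illustrative, not the library's internals:

```scala
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object InfoFsResolutionSketch {
  // Given an inferred output directory (an HDFS path, a local path, or an s3/s3a URI),
  // pick the matching Hadoop FileSystem and build the _INFO path inside it.
  def resolveInfoFile(outputDir: String, infoFileName: String)
                     (implicit hadoopConf: Configuration): (FileSystem, Path) = {
    val fs = FileSystem.get(new URI(outputDir), hadoopConf)
    (fs, new Path(outputDir, infoFileName))
  }
}
```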

atum/src/main/scala/za/co/absa/atum/utils/ExecutionPlanUtils.scala

Lines changed: 13 additions & 2 deletions
@@ -97,11 +97,22 @@ object ExecutionPlanUtils {
    * @return The inferred output control measurements file path of the source dataset
    */
   def inferOutputInfoFileName(qe: QueryExecution, infoFileName: String = Constants.DefaultInfoFileName): Option[Path] = {
+    inferOutputInfoFileDir(qe).map { dir =>
+      new Path(dir, infoFileName)
+    }
+  }
+
+  /**
+   * Based on the `qe` supplied, output _INFO file path is inference is attempted
+   * @param qe QueryExecution - path inference basis
+   * @return optional inferred _INFO file path
+   */
+  def inferOutputInfoFileDir(qe: QueryExecution): Option[String] = {
     qe.analyzed match {
       case s: SaveIntoDataSourceCommand =>
-        Some(new Path(s.options("path"), infoFileName))
+        Some(s.options("path"))
       case h: InsertIntoHadoopFsRelationCommand =>
-        Some(new Path(h.outputPath, infoFileName))
+        Some(h.outputPath.toString)
       case a =>
         log.warn(s"Logical plan: ${qe.logical.treeString}")
         log.warn(s"Analyzed plan: ${qe.analyzed.treeString}")

atum/src/main/scala/za/co/absa/atum/utils/InfoFile.scala

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ private[atum] case class InfoFile(infoFile: String) {
 
   private val validatedInfoFile: Option[String] = if (infoFile.isEmpty) None else Some(infoFile)
 
-  private def toOptFsPath(implicit hadoopConfiguration: Configuration): Option[(FileSystem, Path)] = {
+  def toOptFsPath(implicit hadoopConfiguration: Configuration): Option[(FileSystem, Path)] = {
     validatedInfoFile.map { definedInfoFile =>
       definedInfoFile.toS3Location match {

atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala

Lines changed: 4 additions & 3 deletions
@@ -4,16 +4,17 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.scalatest.flatspec.AnyFlatSpec
 import org.scalatest.matchers.should.Matchers
+import za.co.absa.atum.model.ControlMeasure
 import za.co.absa.atum.persistence.TestResources
 import za.co.absa.atum.utils.{FileUtils, HdfsFileUtils}
 
 class ControlMeasuresHdfsStorerJsonSpec extends AnyFlatSpec with Matchers {
 
   val expectedFilePath: String = TestResources.InputInfo.localPath
-  val inputControlMeasure = TestResources.InputInfo.controlMeasure
+  val inputControlMeasure: ControlMeasure = TestResources.InputInfo.controlMeasure
 
-  val hadoopConfiguration = new Configuration()
-  implicit val fs = FileSystem.get(hadoopConfiguration)
+  val hadoopConfiguration: Configuration = new Configuration()
+  implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration)
 
   "ControlMeasuresHdfsStorerJsonFile" should "store json file to HDFS" in {

build-all.sh

File mode changed from 100755 to 100644.

examples/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@
         <artifactId>scalatest-maven-plugin</artifactId>
         <version>${scalatest.maven.version}</version>
         <configuration>
-          <skipTests>true</skipTests>
+          <skipTests>false</skipTests>
         </configuration>
       </plugin>
       <!-- Uber jar generation -->

examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements1.scala

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ object SampleMeasurements1 {
     import spark.implicits._
 
     val hadoopConfiguration = spark.sparkContext.hadoopConfiguration
-    implicit val fs = FileSystem.get(hadoopConfiguration)
+    implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration)
 
     // Initializing library to hook up to Apache Spark
     spark.enableControlMeasuresTracking(sourceInfoFile = "data/input/wikidata.csv.info")

examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements2.scala

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ object SampleMeasurements2 {
     import spark.implicits._
 
     val hadoopConfiguration = spark.sparkContext.hadoopConfiguration
-    implicit val fs = FileSystem.get(hadoopConfiguration)
+    implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration)
 
     // Initializing library to hook up to Apache Spark
     // No need to specify datasetName and datasetVersion as it is stage 2 and it will be determined automatically

examples/src/main/scala/za/co/absa/atum/examples/SampleSdkS3Measurements1.scala

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ object SampleSdkS3Measurements1 {
     import spark.implicits._
 
     val hadoopConfiguration = spark.sparkContext.hadoopConfiguration
-    implicit val fs = FileSystem.get(hadoopConfiguration)
+    implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration)
 
     // This sample example relies on local credentials profile named "saml" with access to the s3 location defined below
     implicit val samlCredentialsProvider = S3Utils.getLocalProfileCredentialsProvider("saml")
