Bugfix/48 no storer write #51
Changes from all commits: 601c217, 5c8057d, 6872dfa, c140413, a051dfa, a40d819, 61adf96
New file, `HdfsInfoIntegrationSuite` (@@ -0,0 +1,76 @@):

```scala
package za.co.absa.atum

import org.apache.hadoop.fs.FileSystem
import org.apache.log4j.LogManager
import org.apache.spark.sql.{DataFrame, SaveMode}
import org.scalatest.BeforeAndAfterAll
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers
import za.co.absa.atum.model.{Checkpoint, Measurement}
import za.co.absa.atum.persistence.ControlMeasuresParser
import za.co.absa.atum.utils.SparkTestBase

class HdfsInfoIntegrationSuite extends AnyFlatSpec with SparkTestBase with Matchers with BeforeAndAfterAll {

  private val log = LogManager.getLogger(this.getClass)
  val tempDir: String = LocalFsTestUtils.createLocalTemporaryDirectory("hdfsTestOutput")

  override def afterAll: Unit = {
    LocalFsTestUtils.safeDeleteTestDir(tempDir)
  }

  private val inputCsv = "data/input/wikidata.csv"
  private def readSparkInputCsv(inputCsvPath: String): DataFrame = spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(inputCsvPath)

  private def writeSparkData(df: DataFrame, outputPath: String): Unit =
    df.write.mode(SaveMode.Overwrite)
      .parquet(outputPath)

  {
    val outputPath = s"$tempDir/outputCheck1"
    // The implicit variant writes only to the _INFO path derived from outputPath;
    // the explicit variant writes to both the derived path and the explicit one.
    Seq(
      ("implicit output _INFO path only", "", Seq(s"$outputPath/_INFO")),
      ("implicit & explicit output _INFO path", s"$outputPath/extra/_INFO2", Seq(s"$outputPath/_INFO", s"$outputPath/extra/_INFO2"))
    ).foreach { case (testCaseName, destinationInfoFilePath, expectedPaths) =>

      "_INFO" should s"be written on spark.write ($testCaseName)" in {
        import spark.implicits._
        import za.co.absa.atum.AtumImplicits._

        val hadoopConfiguration = spark.sparkContext.hadoopConfiguration
        implicit val fs: FileSystem = FileSystem.get(hadoopConfiguration)

        // Initialize the library to hook up to Apache Spark
        spark.enableControlMeasuresTracking(sourceInfoFile = "data/input/wikidata.csv.info", destinationInfoFile = destinationInfoFilePath)
          .setControlMeasuresWorkflow("Job 1")

        val df1 = readSparkInputCsv(inputCsv)
        df1.setCheckpoint("Checkpoint0")
        val filteredDf1 = df1.filter($"total_response_size" > 1000)
        filteredDf1.setCheckpoint("Checkpoint1") // stateful, the return value is not needed
        writeSparkData(filteredDf1, outputPath) // the implicit output _INFO file path is derived from the path passed to spark.write

        spark.disableControlMeasuresTracking()

        expectedPaths.foreach { expectedPath =>
          log.info(s"Checking $expectedPath to contain expected values")

          val infoContentJson = LocalFsTestUtils.readFileAsString(expectedPath)
          val infoControlMeasures = ControlMeasuresParser.fromJson(infoContentJson)

          infoControlMeasures.checkpoints.map(_.name) shouldBe Seq("Source", "Raw", "Checkpoint0", "Checkpoint1")
          val checkpoint0 = infoControlMeasures.checkpoints.collectFirst { case c: Checkpoint if c.name == "Checkpoint0" => c }.get
          checkpoint0.controls should contain(Measurement("recordCount", "count", "*", "5000"))

          val checkpoint1 = infoControlMeasures.checkpoints.collectFirst { case c: Checkpoint if c.name == "Checkpoint1" => c }.get
          checkpoint1.controls should contain(Measurement("recordCount", "count", "*", "4964"))
        }
      }
    }
  }

}
```

Review discussion on the standalone `{ ... }` block in the class body:

- "I am not sure I get how this is styled. Shouldn't there be some keyword here?"
- "what what?"
- "Ah, interesting, haven't thought about it that way. I have never seen a standalone block like this in Scala, so it was weird to me."
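For readers unfamiliar with the construct discussed above: a bare `{ ... }` block in a Scala class body needs no keyword. It is evaluated as part of the primary constructor and simply opens a local scope, so the `should ... in` calls inside it still register tests as usual. A minimal sketch of the same pattern, with made-up names (not taken from this PR):

```scala
import org.scalatest.flatspec.AnyFlatSpec

// Hypothetical illustration of a standalone block in a class body.
// The braces only introduce a local scope; their contents run when the
// suite is instantiated, so the test registrations behave normally.
class StandaloneBlockExampleSpec extends AnyFlatSpec {

  {
    val sharedPrefix = "shared" // visible only inside this block

    Seq("A", "B").foreach { caseName =>
      "the standalone block" should s"register a test for $sharedPrefix-$caseName" in {
        assert(caseName.nonEmpty)
      }
    }
  }
}
```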
New file, `LocalFsTestUtils` (@@ -0,0 +1,43 @@):

```scala
package za.co.absa.atum

import java.io.File
import java.nio.file.Files

import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager

import scala.io.Source
import scala.util.control.NonFatal

object LocalFsTestUtils {
  private val log = LogManager.getLogger(this.getClass)

  /**
   * Creates a temporary directory in the local filesystem.
   *
   * @param prefix A prefix to use for the temporary directory.
   * @return A path to a temporary directory.
   */
  def createLocalTemporaryDirectory(prefix: String): String = {
    val tmpPath = Files.createTempDirectory(prefix)
    tmpPath.toAbsolutePath.toString
  }

  def safeDeleteTestDir(path: String): Unit = {
    try {
      FileUtils.deleteDirectory(new File(path))
    } catch {
      case NonFatal(_) => log.warn(s"Unable to delete a test directory $path")
    }
  }

  def readFileAsString(filename: String, lineSeparator: String = "\n"): String = {
    val sourceFile = Source.fromFile(filename)
    try {
      sourceFile.getLines().mkString(lineSeparator)
    } finally {
      sourceFile.close()
    }
  }

}
```
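A minimal usage sketch of these helpers (hypothetical object and file names, not part of the PR), assuming it is placed where `LocalFsTestUtils` is visible, e.g. in the `za.co.absa.atum` test sources:

```scala
import java.io.{File, PrintWriter}

// Hypothetical example: create an isolated temp directory, write a file,
// read it back, and clean up, using only the helpers defined above.
object LocalFsTestUtilsUsageExample {
  def main(args: Array[String]): Unit = {
    val tempDir = LocalFsTestUtils.createLocalTemporaryDirectory("usageExample")
    try {
      // write a small file into the temporary directory...
      val writer = new PrintWriter(new File(s"$tempDir/example.txt"))
      try writer.write("line1\nline2") finally writer.close()

      // ...and read it back as a single string joined with "\n"
      val content = LocalFsTestUtils.readFileAsString(s"$tempDir/example.txt")
      assert(content == "line1\nline2")
    } finally {
      // best-effort cleanup; only logs a warning if deletion fails
      LocalFsTestUtils.safeDeleteTestDir(tempDir)
    }
  }
}
```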
Changed file, `SampleMeasurementsS3RunnerSpec` (renamed to `SampleMeasurementsS3RunnerExampleSpec`):

```diff
@@ -15,10 +15,12 @@
 
 package za.co.absa.atum.examples
 
+import org.scalatest.Ignore
 import org.scalatest.funsuite.AnyFunSuite
 import za.co.absa.atum.utils._
 
-class SampleMeasurementsS3RunnerSpec extends AnyFunSuite
+@Ignore
+class SampleMeasurementsS3RunnerExampleSpec extends AnyFunSuite
   with SparkJobRunnerMethods
   with SparkLocalMaster {
 
```

Review discussion on the added `@Ignore`:

- "Why is this here?"
- "Because unlike the hadoop-fs tests, these tests should not be run against actual S3. Thus, they now: […]"
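As background for the `@Ignore` question above: annotating a whole ScalaTest suite with `org.scalatest.Ignore` keeps it compiling and visible in the test report, but all of its tests are reported as ignored instead of being executed. A minimal sketch with a hypothetical suite (not from this PR):

```scala
import org.scalatest.Ignore
import org.scalatest.funsuite.AnyFunSuite

// Hypothetical example: the suite compiles, but its tests are skipped
// during a normal build, which is the effect applied to the S3 runner
// example above.
@Ignore
class ManualOnlyExampleSuite extends AnyFunSuite {
  test("talks to an external system, so it is not run automatically") {
    succeed // never executed while the @Ignore annotation is present
  }
}
```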
Review comment: Missing the info file write here was the main cause.