From b6abfb81866786cb9b755cbf7b755028d2206332 Mon Sep 17 00:00:00 2001 From: "Mark A. Matney, Jr" Date: Wed, 5 May 2021 15:26:01 -0700 Subject: [PATCH 1/3] Add script chaining A/V Pairtree, Metagetter, and Festerize --- src/main/scripts/README.md | 53 +++++++++++++ src/main/scripts/avpt_data_pipeline.sh | 103 +++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 src/main/scripts/README.md create mode 100755 src/main/scripts/avpt_data_pipeline.sh diff --git a/src/main/scripts/README.md b/src/main/scripts/README.md new file mode 100644 index 0000000..534e23e --- /dev/null +++ b/src/main/scripts/README.md @@ -0,0 +1,53 @@ +# A/V Pairtree Scripts + +Various scripts for use with the A/V Pairtree application. + +## avpt_data_pipeline.sh + +This script constructs a data processing pipeline consisting of A/V Pairtree, Metagetter, and Festerize (in that order), in which the output CSV files of each component application are passed to the next one for further processing. 
+ +### Installation + +Dependencies: +- GNU bash (written for version 4.2.46(2)-release (x86_64-redhat-linux-gnu)) +- GNU coreutils +- curl +- ffmpeg +- [inotifywait](https://github.com/inotify-tools/inotify-tools) +- [UCLALibrary/services-metagetter](https://github.com/UCLALibrary/services-metagetter) +- [UCLALibrary/festerize](https://github.com/UCLALibrary/festerize) + +### Usage + +The following environment variables must be set: + +Environment variable|Description +---|--- +AVPTDP_INPUT_DIRECTORY|directory where A/V Pairtree puts .out files; this is the input directory for the pipeline (and thus, for Metagetter) +AVPTDP_FESTERIZE_OUTPUT_DIRECTORY|directory where Festerize puts .csv files +AVPTDP_METAGETTER_MEDIA_DIRECTORY|directory where Metagetter will search for A/V media files +AVPTDP_METAGETTER_OUTPUT_DIRECTORY|directory where Metagetter puts .out files (which are then renamed as .csv); this is the input directory for Festerize +AVPTDP_SLACK_WEBHOOK_URL|URL of the webhook for posting to Slack + +The script takes a single optional positional argument: an alias for the ingest Fester instance to Festerize the data with. If omitted, or if an unknown alias is used, the script will point Festerize at http://localhost:8888. 
+
+Known aliases:
+
+Argument|Description
+---|---
+prod|https://ingest.iiif.library.ucla.edu
+test|https://test-iiif.library.ucla.edu
+
+For example:
+
+```bash
+#!/bin/bash
+
+export AVPTDP_INPUT_DIRECTORY="avpt_output/"
+export AVPTDP_FESTERIZE_OUTPUT_DIRECTORY="festerize_output/"
+export AVPTDP_METAGETTER_MEDIA_DIRECTORY="metagetter_media/"
+export AVPTDP_METAGETTER_OUTPUT_DIRECTORY="metagetter_output/"
+export AVPTDP_SLACK_WEBHOOK_URL="https://hooks.slack.com/services/0123456789"
+
+./avpt_data_pipeline.sh prod
+```
diff --git a/src/main/scripts/avpt_data_pipeline.sh b/src/main/scripts/avpt_data_pipeline.sh
new file mode 100755
index 0000000..ba9aa07
--- /dev/null
+++ b/src/main/scripts/avpt_data_pipeline.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+function get_av_metadata {
+    # Runs the CSV at the path provided via $1 through services-metagetter (discarding all of its output) and, only if that succeeds, outputs the path of the result CSV
+    2>/dev/null 1>&2 \
+    java -jar UCLALibrary/services-metagetter/target/build-artifact/services-metagetter-0.0.1-SNAPSHOT.jar \
+        "$1" "${AVPTDP_METAGETTER_MEDIA_DIRECTORY}" "$(command -v ffprobe)" "${AVPTDP_METAGETTER_OUTPUT_DIRECTORY}" &&
+    echo "$(strip_trailing_slash "${AVPTDP_METAGETTER_OUTPUT_DIRECTORY}")/$(basename "$1")"
+}
+
+function change_filename_extension {
+    # Renames the path read from stdin from .out to .csv (festerize only looks at .csv files) and outputs the new path;
+    # returns non-zero and outputs nothing when no path arrives or the rename fails, so that later stages are not fed an empty line
+    IFS= read -r filename_dot_out && [[ -n "${filename_dot_out}" ]] || return
+    filename_dot_csv=$(sed -e 's/\.out$/.csv/' <<< "${filename_dot_out}")
+    mv -- "${filename_dot_out}" "${filename_dot_csv}" || return
+    echo "${filename_dot_csv}"
+}
+
+function festerize_ {
+    # Runs the CSV at the path read from stdin through festerize (using the base URL provided via $1), with an endless
+    # stream of "y" on its stdin, and outputs the path of the result CSV only if festerize exits successfully.
+    # "yes" is attached via process substitution instead of "yes |": under "set -o pipefail" the SIGPIPE exit status of yes would make the step look failed
+    IFS= read -r csv_filename && [[ -n "${csv_filename}" ]] &&
+    2>/dev/null 1>&2 \
+    festerize --iiif-api-version 3 --server "$1" --out "${AVPTDP_FESTERIZE_OUTPUT_DIRECTORY}" "${csv_filename}" < <(yes) &&
+    echo "$(strip_trailing_slash "${AVPTDP_FESTERIZE_OUTPUT_DIRECTORY}")/$(basename "${csv_filename}")"
+}
+
+function send_slack_notification {
+    # Posts a Slack notification about the input CSV ($1), the ingest Fester base URL ($2) and the output CSV path (read from stdin), then outputs the message; returns non-zero without posting when no path arrives on stdin
+    # The payload is double-quoted so that ${message} is expanded, and sed backslash-escapes any \ or " in it to keep the JSON valid
+    IFS= read -r csv_filename && [[ -n "${csv_filename}" ]] || return
+    message="Input CSV $1 was updated successfully, and after Festerizing with $2 is now available at ${csv_filename}."
+    curl -s -X POST -H 'Content-type: application/json' --data "{\"text\":\"$(sed -e 's/[\\"]/\\&/g' <<< "${message}")\"}" "${AVPTDP_SLACK_WEBHOOK_URL}"
+    echo "${message}"
+}
+
+function get_ingest_fester_base_url {
+    # Outputs the base URL of the ingest Fester instance associated with the alias in $1 (localhost for anything else)
+    case $1 in
+        prod)
+            echo "https://ingest.iiif.library.ucla.edu"
+            ;;
+        test)
+            echo "https://test-iiif.library.ucla.edu"
+            ;;
+        *)
+            echo "http://localhost:8888"
+            ;;
+    esac
+}
+
+function strip_trailing_slash {
+    # Outputs the path provided via $1 with one trailing slash removed (quoted so whitespace in the path is preserved)
+    sed -e "s/\/$//" <<< "$1"
+}
+
+# Check if the required env vars are set; complaints go to stderr like the script's other diagnostics
+if [ -z "${AVPTDP_INPUT_DIRECTORY}" ]
+then
+    >&2 echo "The env var AVPTDP_INPUT_DIRECTORY must be set."
+    exit 1
+elif [ -z "${AVPTDP_FESTERIZE_OUTPUT_DIRECTORY}" ]
+then
+    >&2 echo "The env var AVPTDP_FESTERIZE_OUTPUT_DIRECTORY must be set."
+    exit 1
+elif [ -z "${AVPTDP_METAGETTER_MEDIA_DIRECTORY}" ]
+then
+    >&2 echo "The env var AVPTDP_METAGETTER_MEDIA_DIRECTORY must be set."
+    exit 1
+elif [ -z "${AVPTDP_METAGETTER_OUTPUT_DIRECTORY}" ]
+then
+    >&2 echo "The env var AVPTDP_METAGETTER_OUTPUT_DIRECTORY must be set."
+    exit 1
+elif [ -z "${AVPTDP_SLACK_WEBHOOK_URL}" ]
+then
+    >&2 echo "The env var AVPTDP_SLACK_WEBHOOK_URL must be set."
+    exit 1
+fi
+
+ingest_fester_base_url=`get_ingest_fester_base_url $1`
+>&2 echo "Using Fester instance at ${ingest_fester_base_url} for ingest."
+
+inotifywait -mr \
+    --timefmt '%d/%m/%y %H:%M' --format '%T %w %f' \
+    -e close_write \
+    "${AVPTDP_INPUT_DIRECTORY}" |
+while read -r date time dir file; do
+    # Each event line is "<date> <time> <dir> <file>" (see --format above); only files with a ".out" filename extension are processed
+    case ${file} in
+        *.out)
+            abs_path=${dir}${file}
+            # Expansions are quoted so that a filename containing spaces or glob characters reaches each stage as one argument
+            get_av_metadata "${abs_path}" |
+            change_filename_extension |
+            festerize_ "${ingest_fester_base_url}" |
+            send_slack_notification "${abs_path}" "${ingest_fester_base_url}"
+            ;;
+        *)
+            ;;
+    esac
+done
From 7b9c030fc562b2ac6297ee57ce183f5a389cee63 Mon Sep 17 00:00:00 2001 From: "Mark A. Matney, Jr" Date: Thu, 6 May 2021 09:56:03 -0700 Subject: [PATCH 2/3] Fix first-time build instructions in README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e98c3dc..cd4c5e1 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ There are two sets of build instructions: one for systems with [Maven](https://m To build the project the first time, type: - ./mvnw validate verify + ./mvnw validate && ./mvnw verify To run the service locally, type: @@ -26,7 +26,7 @@ To process one of the test CSVs, you can copy a CSV file from `src/test/resource To build the project the first time, type: - mvn validate verify + mvn validate && mvn verify To run the service locally, type: @@ -38,7 +38,7 @@ To process one of the test CSVs, you can copy a CSV file from `src/test/resource ## Additional instructions -The `validate` argument only needs to be supplied to the mvn(w) command on the first run. After that, `mvn(w) verify` (or `mvn(w) package`) will work fine. Also, the build automatically happens when you run `mvn(w) -Plive test` so you don't need to repeat both steps just to run a test after the initial run. +`mvn(w) validate` only needs to be run once, in order to build the project for the first time. After that, `mvn(w) verify` (or `mvn(w) package`) will work fine. 
Also, the build automatically happens when you run `mvn(w) -Plive test` so you don't need to repeat both steps just to run a test after the initial run. ## Running in production From b5b0e6268898d2f9d91b482dd76b152b1604f96b Mon Sep 17 00:00:00 2001 From: "Mark A. Matney, Jr" Date: Thu, 6 May 2021 11:55:20 -0700 Subject: [PATCH 3/3] set -o pipefail --- src/main/scripts/avpt_data_pipeline.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/scripts/avpt_data_pipeline.sh b/src/main/scripts/avpt_data_pipeline.sh index ba9aa07..fd155c9 100755 --- a/src/main/scripts/avpt_data_pipeline.sh +++ b/src/main/scripts/avpt_data_pipeline.sh @@ -82,6 +82,9 @@ fi ingest_fester_base_url=`get_ingest_fester_base_url $1` >&2 echo "Using Fester instance at ${ingest_fester_base_url} for ingest." +# Get a more informative return status from our pipeline in the main loop +set -o pipefail + inotifywait -mr \ --timefmt '%d/%m/%y %H:%M' --format '%T %w %f' \ -e close_write \