metafacture · TobiasNx · May 29, 2024 · May 29, 2024 · May 29, 2024 · May 29, 2024
diff --git a/Concordance-RVK-Verbundbibliothek/bulk.csv b/Concordance-RVK-Verbundbibliothek/bulk.csv
@@ -0,0 +1,3 @@
+"HT013166356","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381"
+"HT018625006","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381"
+"TT000577460","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381"
diff --git a/Concordance-RVK-Verbundbibliothek/bulk.json b/Concordance-RVK-Verbundbibliothek/bulk.json
@@ -0,0 +1 @@
+"HT013166356, HT018625006, TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381"
diff --git a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux
@@ -0,0 +1,28 @@
+// This flux uses fix-cg-to-es.fix to select records that carry a
+// (DE-605) identifier in field 035 from the culturegraph aggregate MARCXML.
+// It then builds a concordance hbz ID <-> RVK and writes it as CSV,
+// one row per hbz ID (see bulk.csv).
+// NOTE: the snippet and curl call below show the Elasticsearch bulk JSON format, not this CSV output:
+//
+//{"index":{"_index":"cgrvk","_type":"rvk"}}
+//{"rvk":["CI 1100","5,1"],"hbzId":"HT018839495, HT018625006"}
+//
+// Use curl to bulk load the file:
+//
+//  curl -XPOST --header 'Content-Type: application/x-ndjson' -d @bulk.ndjson 'http://localhost:9200/_bulk'
+
+default outfile = FLUX_DIR + "bulk.csv";
+default infile = FLUX_DIR + "aggregate_auslieferung_20191212.small.marcxml.gz";
+default fixfile = FLUX_DIR + "fix-cg-to-es.fix";
+
+// Pipeline: parse the MARCXML, apply the fix, then re-read its JSON output with recordPath="records" (presumably one record per array entry - confirm) and write CSV.
+infile
+| open-file
+| decode-xml
+| handle-marcxml
+| fix(fixfile)
+| encode-json
+| decode-json(recordPath="records")
+| encode-csv
+| write(outfile)
+;
diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix
@@ -1,33 +1,42 @@
-set_array("rvk[]")
+set_array("records[]")
+set_array("@id[]")
+set_array("@rvk")
 
 do list(path: "084??", "var": "$i")
     if any_match("$i.2", "rvk")
-        copy_field("$i.a","rvk[].$append")
+        copy_field("$i.a","@rvk.$append")
     end
 end
 
-set_array("id")
+uniq("@rvk")
+join_field("@rvk",",")
+
+# Collect the (DE-605) identifiers from 035 $a into @id[] and strip the ISIL prefix.
 do list(path: "035??", "var": "$i")
     if any_match("$i.a", "^\\(DE-605\\)(.*)")
-        copy_field("$i.a","id.$append")
+        copy_field("$i.a","@id[].$append")
     end
 end
-replace_all("id.*","^\\(DE-605\\)(.*)","$1")
-join_field("id",", ")
+replace_all("@id[].*","^\\(DE-605\\)(.*)","$1")
+# Emit one entry per id in records[], each paired with the joined RVK string.
+do list(path: "@id[]", "var": "$i")
+    copy_field("$i","records[].$append.id")
+    copy_field("@rvk","records[].$last.rvk")
+end
+replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1")
 
-retain("rvk[]","id")
 vacuum()
 
 # Filter records without RVK
-unless exists("rvk[]")
+unless exists("@rvk")
     reject()
 end
 
 # Filter records without hbz ids
-unless exists("id")
+unless exists("@id[]")
     reject()
 end
 
-
+retain("records[]")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"HT013166356, HT018625006, TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381"