2626Should a more complex query be necessary, an outer, wrapping SELECT query would let this script continue to function.
2727"""
2828
29- __version__ = "0.4.4 "
29+ __version__ = "0.5.0 "
3030
3131import argparse
3232import binascii
3333import logging
3434import os
3535import sys
36+ import typing
3637
3738import pandas as pd # type: ignore
3839import rdflib .plugins .sparql
4849_logger = logging .getLogger (os .path .basename (__file__ ))
4950
5051
51- def main () -> None :
52- parser = argparse .ArgumentParser ()
53-
54- # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
55- logging .basicConfig (
56- level = logging .DEBUG
57- if ("--debug" in sys .argv or "-d" in sys .argv )
58- else logging .INFO
59- )
60-
61- parser .add_argument ("-d" , "--debug" , action = "store_true" )
62- parser .add_argument (
63- "--built-version" ,
64- choices = tuple (built_version_choices_list ),
65- default = "case-" + CURRENT_CASE_VERSION ,
66- help = "Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis." ,
67- )
68- parser .add_argument (
69- "--disallow-empty-results" ,
70- action = "store_true" ,
71- help = "Raise error if no results are returned for query." ,
72- )
73- parser .add_argument (
74- "out_table" ,
75- help = "Expected extensions are .html for HTML tables or .md for Markdown tables." ,
76- )
77- parser .add_argument (
78- "in_sparql" ,
79- help = "File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs." ,
80- )
81- parser .add_argument ("in_graph" , nargs = "+" )
82- args = parser .parse_args ()
def query_text_to_variables(select_query_text: str) -> typing.List[str]:
    """
    Extract the projected variable names from a SPARQL SELECT query's text.

    The parse is intentionally simple: the first line beginning with
    ``SELECT `` (uppercase, at column 0) is taken as the projection clause,
    the ``SELECT`` and optional ``DISTINCT`` keywords are stripped, and the
    remainder is split on single spaces.  Variable names keep their leading
    ``?``.

    :param select_query_text: Full text of a SPARQL SELECT query.
    :returns: List of variable tokens, in projection order.
    :raises IndexError: If no line starts with ``SELECT ``.
    """
    # Build columns list from SELECT line.
    select_lines = [
        line for line in select_query_text.split("\n") if line.startswith("SELECT ")
    ]
    # Indexing [0] preserves the historical IndexError on queries with no
    # recognizable SELECT line.
    select_line = select_lines[0]
    variables = select_line.replace(" DISTINCT", "").replace("SELECT ", "").split(" ")
    return variables
8360
84- graph = rdflib .Graph ()
85- for in_graph_filename in args .in_graph :
86- graph .parse (in_graph_filename )
8761
62+ def graph_and_query_to_data_frame (
63+ graph : rdflib .Graph ,
64+ select_query_text : str ,
65+ * args : typing .Any ,
66+ built_version : str = "case-" + CURRENT_CASE_VERSION ,
67+ disallow_empty_results : bool = False ,
68+ use_prefixes : bool = False ,
69+ ** kwargs : typing .Any ,
70+ ) -> pd .DataFrame :
8871 # Inherit prefixes defined in input context dictionary.
8972 nsdict = {k : v for (k , v ) in graph .namespace_manager .namespaces ()}
9073
91- select_query_text = None
92- with open (args .in_sparql , "r" ) as in_fh :
93- select_query_text = in_fh .read ().strip ()
94- _logger .debug ("select_query_text = %r." % select_query_text )
95-
74+ # Avoid side-effects on input parameter.
9675 if "subClassOf" in select_query_text :
97- case_utils .ontology .load_subclass_hierarchy (
98- graph , built_version = args .built_version
99- )
76+ _graph = rdflib .Graph ()
77+ _graph += graph
78+ case_utils .ontology .load_subclass_hierarchy (_graph , built_version = built_version )
79+ else :
80+ _graph = graph
10081
101- # Build columns list from SELECT line.
102- select_query_text_lines = select_query_text .split ("\n " )
103- select_line = [
104- line for line in select_query_text_lines if line .startswith ("SELECT " )
105- ][0 ]
106- variables = select_line .replace (" DISTINCT" , "" ).replace ("SELECT " , "" ).split (" " )
82+ variables = query_text_to_variables (select_query_text )
10783
10884 tally = 0
10985 records = []
11086 select_query_object = rdflib .plugins .sparql .processor .prepareQuery (
11187 select_query_text , initNs = nsdict
11288 )
113- for (row_no , row ) in enumerate (graph .query (select_query_object )):
89+ for (row_no , row ) in enumerate (_graph .query (select_query_object )):
11490 tally = row_no + 1
11591 record = []
11692 for (column_no , column ) in enumerate (row ):
@@ -124,35 +100,241 @@ def main() -> None:
124100 # The render to ASCII is in support of this script rendering results for website viewing.
125101 # .decode() is because hexlify returns bytes.
126102 column_value = binascii .hexlify (column .toPython ()).decode ()
103+ elif isinstance (column , rdflib .URIRef ):
104+ if use_prefixes :
105+ column_value = graph .namespace_manager .qname (column .toPython ())
106+ else :
107+ column_value = column .toPython ()
127108 else :
128109 column_value = column .toPython ()
129110 if row_no == 0 :
130111 _logger .debug ("row[0]column[%d] = %r." % (column_no , column_value ))
131112 record .append (column_value )
132113 records .append (record )
114+
133115 if tally == 0 :
134- if args . disallow_empty_results :
116+ if disallow_empty_results :
135117 raise ValueError ("Failed to return any results." )
136118
137119 df = pd .DataFrame (records , columns = variables )
120+ return df
121+
122+
def data_frame_to_table_text(
    df: pd.DataFrame,
    *args: typing.Any,
    json_indent: typing.Optional[int] = None,
    json_orient: str,
    output_mode: str,
    use_header: bool,
    use_index: bool,
    **kwargs: typing.Any,
) -> str:
    """
    Render a Pandas DataFrame as table text in the requested format.

    :param df: Table of query results to serialize.
    :param json_indent: Indentation width for JSON output; None emits compact JSON.
        Only applicable when output_mode is "json".
    :param json_orient: Pandas ``DataFrame.to_json`` ``orient`` value.  Only
        applicable when output_mode is "json".
    :param output_mode: One of "csv", "html", "json", "md", or "tsv".
    :param use_header: Whether to emit column labels.
    :param use_index: Whether to emit the auto-incrementing row index.
    :returns: Serialized table text.  CSV and TSV renderings end with a
        trailing newline; other modes might not.
    :raises NotImplementedError: If output_mode is not recognized.
    """
    table_text: str

    # Set up kwargs dicts. One kwarg behaves slightly differently for Markdown vs. other formats.
    general_kwargs: typing.Dict[str, typing.Any] = dict()
    md_kwargs: typing.Dict[str, typing.Any] = dict()

    # Note some output modes will drop 'header' from general_kwargs, due to alternate support or lack of support.
    general_kwargs["header"] = use_header
    if not use_header:
        # tabulate suppresses headers via an empty 'headers' sequence rather than a boolean.
        md_kwargs["headers"] = tuple()

    general_kwargs["index"] = use_index

    if output_mode in {"csv", "tsv"}:
        # Both delimiter-separated modes share DataFrame.to_csv; only the
        # separator differs.
        sep = "," if output_mode == "csv" else "\t"
        table_text = df.to_csv(sep=sep, **general_kwargs)
    elif output_mode == "html":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_html.html
        # Add CSS classes for CASE website Bootstrap support.
        table_text = df.to_html(
            classes=("table", "table-bordered", "table-condensed"), **general_kwargs
        )
    elif output_mode == "json":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

        # Drop unsupported kwarg.
        del general_kwargs["header"]

        table_text = df.to_json(
            indent=json_indent, orient=json_orient, date_format="iso", **general_kwargs
        )
    elif output_mode == "md":
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_markdown.html
        # https://pypi.org/project/tabulate/
        # Assume Github-flavored Markdown.

        # Drop unsupported kwarg.
        del general_kwargs["header"]

        table_text = df.to_markdown(tablefmt="github", **general_kwargs, **md_kwargs)
    else:
        # (Previously this branch re-tested 'table_text is None', which was
        # always true here, followed by an unreachable assert.)
        raise NotImplementedError("Unimplemented output mode: %r." % output_mode)

    return table_text
189+
190+
def main() -> None:
    """
    Command-line entry point.

    Parses arguments, loads the input graphs and SELECT query, runs the query
    via graph_and_query_to_data_frame, renders the result with
    data_frame_to_table_text, and writes the table to the output file (with a
    guaranteed trailing newline).
    """
    parser = argparse.ArgumentParser()

    # Configure debug logging before running parse_args, because there could be an error raised before the construction of the argument parser.
    logging.basicConfig(
        level=logging.DEBUG
        if ("--debug" in sys.argv or "-d" in sys.argv)
        else logging.INFO
    )

    parser.add_argument("-d", "--debug", action="store_true")
    parser.add_argument(
        "--built-version",
        choices=tuple(built_version_choices_list),
        default="case-" + CURRENT_CASE_VERSION,
        help="Ontology version to use to supplement query, such as for subclass querying. Does not require networking to use. Default is most recent CASE release. Passing 'none' will mean no pre-built CASE ontology versions accompanying this tool will be included in the analysis.",
    )
    parser.add_argument(
        "--disallow-empty-results",
        action="store_true",
        help="Raise error if no results are returned for query.",
    )
    parser.add_argument(
        "--json-indent",
        type=int,
        help="Number of whitespace characters to use for indentation. Only applicable for JSON output.",
    )
    parser.add_argument(
        "--json-orient",
        default="columns",
        choices=("columns", "index", "records", "split", "table", "values"),
        help="Orientation to use for Pandas DataFrame JSON output. Only applicable for JSON output.",
    )
    parser.add_argument(
        "--use-prefixes",
        action="store_true",
        help="Abbreviate node IDs according to graph's encoded prefixes. (This will use prefixes in the graph, not the query.)",
    )
    parser.add_argument(
        "out_table",
        help="Expected extensions are .html for HTML tables, .json for JSON tables, .md for Markdown tables, .csv for comma-separated values, and .tsv for tab-separated values. Note that JSON is a Pandas output JSON format (chosen by '--json-orient'), and not JSON-LD.",
    )
    parser.add_argument(
        "in_sparql",
        help="File containing a SPARQL SELECT query. Note that prefixes not mapped with a PREFIX statement will be mapped according to their first occurrence among input graphs.",
    )

    parser_header_group = parser.add_mutually_exclusive_group(required=False)
    parser_header_group.add_argument(
        "--header",
        action="store_true",
        help="Print column labels. This is the default behavior.",
    )
    parser_header_group.add_argument(
        "--no-header",
        action="store_true",
        help="Do not print column labels.",
    )

    parser_index_group = parser.add_mutually_exclusive_group(required=False)
    parser_index_group.add_argument(
        "--index",
        action="store_true",
        help="Print index (auto-incrementing row labels as left untitled column). This is the default behavior.",
    )
    parser_index_group.add_argument(
        "--no-index",
        action="store_true",
        help="Do not print index. If output is JSON, --json-orient must be 'split' or 'table'.",
    )

    parser.add_argument("in_graph", nargs="+")
    args = parser.parse_args()

    # Map output file extension to serialization mode; unknown extensions are
    # an error before any graph parsing is attempted.
    extension_to_mode = {
        ".csv": "csv",
        ".html": "html",
        ".json": "json",
        ".md": "md",
        ".tsv": "tsv",
    }
    output_mode: str
    for extension, mode in extension_to_mode.items():
        if args.out_table.endswith(extension):
            output_mode = mode
            break
    else:
        raise NotImplementedError("Output file extension not implemented.")

    graph = rdflib.Graph()
    for in_graph_filename in args.in_graph:
        graph.parse(in_graph_filename)

    with open(args.in_sparql, "r") as in_fh:
        # read() always returns a str, so no None-check is needed.
        select_query_text = in_fh.read().strip()
    _logger.debug("select_query_text = %r." % select_query_text)

    # Process --header and --no-header.  The mutually exclusive argument group
    # guarantees at most one flag is set; printing the header is the default.
    # (Previously an 'if args.header' assignment was immediately shadowed by
    # the --no-header if/else - dead code that this form removes.)
    use_header: bool = not args.no_header

    # Process --index and --no-index.  Printing the index is the default.
    use_index: bool = not args.no_index

    if (
        output_mode == "json"
        and use_index is False
        and args.json_orient not in {"split", "table"}
    ):
        raise ValueError(
            "For JSON output, --no-index flag requires --json-orient to be either 'split' or 'table'."
        )

    df = graph_and_query_to_data_frame(
        graph,
        select_query_text,
        built_version=args.built_version,
        disallow_empty_results=args.disallow_empty_results is True,
        use_prefixes=args.use_prefixes is True,
    )

    table_text = data_frame_to_table_text(
        df,
        json_indent=args.json_indent,
        json_orient=args.json_orient,
        output_mode=output_mode,
        use_header=use_header,
        use_index=use_index,
    )
    with open(args.out_table, "w") as out_fh:
        out_fh.write(table_text)
        if table_text[-1] != "\n":
            # End file with newline. CSV and TSV modes end with a built-in newline.
            out_fh.write("\n")
156338
157339
# Script entry point; the call to main() was garbled in the pasted diff and is
# restored here.
if __name__ == "__main__":
    main()