 import datetime
 import hashlib
 import os
+import typing
+import warnings
 
-import rdflib
+import rdflib  # type: ignore
 
 import case_utils
 
 NS_UCO_VOCABULARY = rdflib.Namespace("https://unifiedcyberontology.org/ontology/uco/vocabulary#")
 NS_XSD = rdflib.XSD
 
-def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX, disable_hashes=False, disable_mtime=False):
+# Shortcut syntax for defining an immutable named tuple is noted here:
+# https://docs.python.org/3/library/typing.html#typing.NamedTuple
+# via the "See also" box here: https://docs.python.org/3/library/collections.html#collections.namedtuple
+class HashDict(typing.NamedTuple):
+    filesize: int
+    md5: str
+    sha1: str
+    sha256: str
+    sha512: str
+
+def create_file_node(
+    graph: rdflib.Graph,
+    filepath: str,
+    node_iri: typing.Optional[str] = None,
+    node_prefix: str = DEFAULT_PREFIX,
+    disable_hashes: bool = False,
+    disable_mtime: bool = False
+) -> rdflib.URIRef:
     r"""
     This function characterizes the file at filepath.
 
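Note on the HashDict named tuple introduced above: typing.NamedTuple instances compare field-by-field, which is what lets the retry loop later in this change detect two consecutive identical hashing passes. A minimal sketch of that behavior (the digest strings are shortened placeholders, not real hash values):

    a = HashDict(4, "d41d", "da39", "e3b0", "cf83")
    b = HashDict(4, "d41d", "da39", "e3b0", "cf83")
    assert a == b                    # named tuples compare by value, field by field
    assert a.filesize == a[0]        # fields are readable by name or by index
    assert a._fields == ("filesize", "md5", "sha1", "sha256", "sha512")
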
@@ -119,20 +138,20 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
         ))
 
         # Compute hashes until they are re-computed and match once. (This is a lesson learned from working with a NAS that had a subtly faulty network cable.)
-        successful_hashdict = None
-        last_hashdict = dict()
+
+        successful_hashdict: typing.Optional[HashDict] = None
+        last_hashdict: typing.Optional[HashDict] = None
         for attempt_no in [0, 1, 2, 3]:
-            current_hashdict = dict()
             # Hash file's contents.
             # This hashing logic was partially copied from DFXML's walk_to_dfxml.py.
             md5obj = hashlib.md5()
             sha1obj = hashlib.sha1()
             sha256obj = hashlib.sha256()
             sha512obj = hashlib.sha512()
             stashed_error = None
+            byte_tally = 0
             with open(filepath, "rb") as in_fh:
                 chunk_size = 2 ** 22
-                byte_tally = 0
                 while True:
                     buf = b""
                     try:
@@ -147,13 +166,15 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
                     sha1obj.update(buf)
                     sha256obj.update(buf)
                     sha512obj.update(buf)
-            current_hashdict["filesize"] = byte_tally
             if not stashed_error is None:
                 raise stashed_error
-            current_hashdict["md5"] = md5obj.hexdigest()
-            current_hashdict["sha1"] = sha1obj.hexdigest()
-            current_hashdict["sha256"] = sha256obj.hexdigest()
-            current_hashdict["sha512"] = sha512obj.hexdigest()
+            current_hashdict = HashDict(
+                byte_tally,
+                md5obj.hexdigest(),
+                sha1obj.hexdigest(),
+                sha256obj.hexdigest(),
+                sha512obj.hexdigest()
+            )
             if last_hashdict == current_hashdict:
                 successful_hashdict = current_hashdict
                 break
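The loop in this hunk re-reads and re-hashes the file until two consecutive passes produce the same HashDict, guarding against transient read faults (the "subtly faulty network cable" lesson noted in the comment above). A standalone sketch of the same pattern, with hash_one_pass standing in as a hypothetical helper for one full read-and-hash pass:

    def confirm_hashes(filepath: str, max_attempts: int = 4) -> HashDict:
        # Re-hash until two consecutive passes agree.
        last: typing.Optional[HashDict] = None
        for _ in range(max_attempts):
            current = hash_one_pass(filepath)  # hypothetical helper
            if last == current:
                return current
            last = current
        raise ValueError("Failed to confirm hashes of file %r." % filepath)
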
@@ -163,22 +184,23 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
         del current_hashdict
         if successful_hashdict is None:
             raise ValueError("Failed to confirm hashes of file %r." % filepath)
-        if successful_hashdict["filesize"] != file_stat.st_size:
+        if successful_hashdict.filesize != file_stat.st_size:
             # TODO - Discuss with AC whether this should be something stronger, like an assertion error.
-            _logger.warning(
-                "Inode file size and hashed file sizes disagree: %d vs. %d.",
-                file_stat.st_size,
-                successful_hashdict["filesize"]
+            warnings.warn(
+                "Inode file size and hashed file sizes disagree: %d vs. %d." % (
+                    file_stat.st_size,
+                    successful_hashdict.filesize
+                )
             )
         # TODO - Discuss whether this property should be recorded even if hashes are not attempted.
         graph.add((
             n_contentdata_facet,
             NS_UCO_OBSERVABLE.sizeInBytes,
-            rdflib.Literal(successful_hashdict["filesize"])
+            rdflib.Literal(successful_hashdict.filesize)
         ))
 
         # Add confirmed hashes into graph.
-        for key in successful_hashdict:
+        for key in successful_hashdict._fields:
             if not key in ("md5", "sha1", "sha256", "sha512"):
                 continue
             n_hash = rdflib.BNode()
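This hunk also swaps _logger.warning for warnings.warn when the hashed byte tally disagrees with the inode size. One practical consequence is that callers can escalate the mismatch to an exception without touching this module; a small caller-side sketch, assuming an rdflib.Graph named graph already exists and using a made-up file path:

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("error")  # turn any warning into an exception
        n_file = create_file_node(graph, "/tmp/sample.bin")
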
@@ -197,15 +219,16 @@ def create_file_node(graph, filepath, node_iri=None, node_prefix=DEFAULT_PREFIX,
                 NS_UCO_TYPES.hashMethod,
                 rdflib.Literal(key.upper(), datatype=NS_UCO_VOCABULARY.HashNameVocab)
             ))
+            hash_value = getattr(successful_hashdict, key)
             graph.add((
                 n_hash,
                 NS_UCO_TYPES.hashValue,
-                rdflib.Literal(successful_hashdict[key].upper(), datatype=NS_XSD.hexBinary)
+                rdflib.Literal(hash_value.upper(), datatype=NS_XSD.hexBinary)
             ))
 
     return n_file
 
-def main():
+def main() -> None:
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument("--base-prefix", default=DEFAULT_PREFIX)
@@ -234,7 +257,7 @@ def main():
     else:
         output_format = args.output_format
 
-    serialize_kwargs = {
+    serialize_kwargs: typing.Dict[str, typing.Any] = {
         "format": output_format
     }
     if output_format == "json-ld":
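The explicit typing.Dict[str, typing.Any] annotation on serialize_kwargs matters under mypy: a dict literal whose only value is a string would otherwise be inferred as Dict[str, str], and adding a non-string value later (for example, a JSON-LD context dictionary) would be rejected. A minimal illustration, assuming mypy is the type checker in use and the "context" entry is only an example value:

    kwargs = {"format": "json-ld"}        # inferred as Dict[str, str]
    kwargs["context"] = {"kb": "..."}     # mypy error: value is not a str

    kwargs2: typing.Dict[str, typing.Any] = {"format": "json-ld"}
    kwargs2["context"] = {"kb": "..."}    # accepted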