diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..32298e55 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +include = tests/* + bin/* + setup.py diff --git a/.gitignore b/.gitignore index 0d20b648..52671470 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ *.pyc +.cache/ +.coverage +*.egg-info/ diff --git a/.travis.yml b/.travis.yml index 648d768c..6b5fdf21 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,9 @@ language: python python: - 2.7 - - 3.3 - 3.4 - 3.5 + - 3.6 cache: - apt @@ -17,17 +17,25 @@ env: before_install: # The ultimate one-liner setup for NeuroDebian repository - # which might be needed later for tools - #- bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - #- travis_retry sudo apt-get update -qq - # for now even remove requirements.txt since dependencies aren't avail - - echo '' > requirements.txt + - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) + - travis_retry sudo apt-get update -qq + - travis_retry sudo apt-get install git-annex-standalone dcm2niix + # Install in our own virtualenv + - python -m pip install --upgrade pip + - pip install --upgrade virtualenv + - virtualenv --python=python venv + - source venv/bin/activate + - python --version # just to check - pip install -r dev-requirements.txt - - pip install codecov + - pip install datalad + - pip install codecov pytest +install: + - git config --global user.email "test@travis.land" + - git config --global user.name "Travis Almighty" script: - - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . --logging-level=INFO tests + - coverage run `which py.test` -s -v tests heuristics after_success: - codecov diff --git a/Dockerfile b/Dockerfile index b58d8487..b852c8a2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,10 +3,31 @@ FROM continuumio/miniconda MAINTAINER -RUN apt-get update && apt-get upgrade -y && apt-get install -y g++ && apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y -RUN cd /tmp && git clone https://github.com/neurolabusc/dcm2niix.git && cd dcm2niix/console/ && git checkout 60bab318ee738b644ebb1396bbb8cbe1b006218f && g++ -O3 -I. 
main_console.cpp nii_dicom.cpp jpg_0XC3.cpp ujpeg.cpp nifti1_io_core.cpp nii_ortho.cpp nii_dicom_batch.cpp nii_foreign.cpp -o dcm2niix -DmyDisableOpenJPEG -DmyDisableJasper && cp dcm2niix /usr/local/bin/ -RUN conda install -y -c conda-forge nipype && pip install https://github.com/moloney/dcmstack/archive/c12d27d2c802d75a33ad70110124500a83e851ee.zip && pip install https://github.com/nipy/nipype/archive/dd1ed4f0d5735c69c1743f29875acf09d23a62e0.zip -RUN curl -O https://raw.githubusercontent.com/nipy/heudiconv/master/bin/heudiconv && chmod +x heudiconv && cp heudiconv /usr/local/bin/ -RUN curl -O https://raw.githubusercontent.com/nipy/heudiconv/master/heuristics/convertall.py && chmod +x convertall.py +RUN apt-get update && apt-get upgrade -y && \ + apt-get install -y g++ pkg-config make && \ + apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y +RUN (wget -O- http://neuro.debian.net/lists/jessie.us-nh.full | tee /etc/apt/sources.list.d/neurodebian.sources.list) && \ + apt-key adv --recv-keys --keyserver hkp://pool.sks-keyservers.net:80 0xA5D32F012649A5A9 && \ + apt-get update -qq && apt-get install -y git-annex-standalone && \ + apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y +RUN conda install -y -c conda-forge nipype && \ + conda install cmake && \ + pip install https://github.com/moloney/dcmstack/archive/c12d27d2c802d75a33ad70110124500a83e851ee.zip && \ + pip install datalad && \ + conda clean -tipsy && rm -rf ~/.pip/ +RUN cd /tmp && git clone https://github.com/neurolabusc/dcm2niix.git && \ + cd dcm2niix && \ + git checkout 60bab318ee738b644ebb1396bbb8cbe1b006218f && \ + mkdir build && cd build && cmake -DBATCH_VERSION=ON .. && \ + make && make install && \ + cd / && rm -rf /tmp/dcm2niix + +COPY bin/heudiconv /usr/local/bin/heudiconv +RUN chmod +x /usr/local/bin/heudiconv +RUN mkdir /heuristics +COPY heuristics/convertall.py /heuristics +RUN chmod +x /heuristics/convertall.py +RUN git config --global user.email "test@docker.land" && \ + git config --global user.name "Docker Almighty" ENTRYPOINT ["/usr/local/bin/heudiconv"] diff --git a/README.md b/README.md index f35db48a..e85f53dc 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ function `infotodict`, which takes a single argument `seqinfo`. * total_files_till_now * example_dcm_file -* series_number +* series_id * dcm_dir_name * unspecified2 * unspecified3 diff --git a/bin/heudiconv b/bin/heudiconv index ff7c2a8c..43502da7 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1,32 +1,55 @@ #!/usr/bin/env python -"""Convert DICOM TimTrio dirs based on heuristic info +"""Convert DICOM dirs based on heuristic info -This function uses DicomStack and Dcm2niix to convert Siemens -TrioTim/Prisma directories. It proceeds by extracting dicominfo from each -subject and writing a config file $subject_id/$subject_id.auto.txt in -the output directory. Users can create a copy of the file called -$subject_id.edit.txt and modify it to change the files that are -converted. This edited file will always overwrite the original file. If -there is a need to revert to original state, please delete this edit.txt -file and rerun the conversion +This script uses the dcmstack package and dcm2niix tool to convert DICOM +directories or tarballs into collections of NIfTI files following pre-defined +heuristic(s). +It has multiple modes of operation + +- If subject ID(s) is specified, it proceeds by extracting dicominfo from each + subject and writing a config file $subject_id/$subject_id.auto.txt in + the output directory. 
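  (Editor's illustration, not part of the patch: the .auto.txt/.edit.txt file
  holds a pretty-printed form of the dictionary returned by the heuristic's
  infotodict, roughly
      {('sub-{subject}/anat/sub-{subject}_T1w', ('nii.gz',), None): ['2-anat_mprage'], ...}
  so editing the lists on the right-hand side controls which series get
  converted.)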
Users can create a copy of the file called + $subject_id.edit.txt and modify it to change the files that are + converted. This edited file will always overwrite the original file. If + there is a need to revert to original state, please delete this edit.txt + file and rerun the conversion + +- If no subject specified, all files under specified directory are scanned, + DICOMs are sorted based on study UID, and layed out using specified heuristic """ -__version__ = '0.1' +__version__ = '0.3' import argparse from glob import glob +import csv +import dicom as dcm +import dcmstack as ds import inspect import json import os -import shutil import sys +import re +import shutil from tempfile import mkdtemp import tarfile +from copy import deepcopy from collections import namedtuple +from collections import defaultdict +from collections import OrderedDict as ordereddict +from datetime import datetime from os.path import isdir +from os.path import basename +from os.path import dirname +from os.path import lexists, exists +from os.path import join as pjoin + +from random import sample + +PY3 = sys.version_info[0] >= 3 import logging lgr = logging.getLogger('heudiconv') @@ -38,26 +61,122 @@ logging.basicConfig( ) lgr.debug("Starting the abomination") # just to "run-test" logging -seqinfo_fields = ['total_files_till_now', # 0 - 'example_dcm_file', # 1 - 'series_number', # 2 - 'dcm_dir_name', # 3 - 'unspecified2', # 4 - 'unspecified3', # 5 - 'dim1', 'dim2', 'dim3', 'dim4', # 6, 7, 8, 9 - 'TR', 'TE', # 10, 11 - 'protocol_name', # 12 - 'is_motion_corrected', # 13 - # Introduced with namedtuple - 'is_derived', # 14 - 'patient_id', # 15 - 'study_description', # 16 - 'referring_physician_name', # 17 - 'series_description', # 18 - 'image_type', # 19 - ] - -SeqInfo = namedtuple('SeqInfo', seqinfo_fields) +# bloody git screws things up +# https://github.com/gitpython-developers/GitPython/issues/600 +# +## for nipype, we would like to lower level unless we are at debug level +#try: +# import nipype +# for l in nipype.logging.loggers.values(): +# if lgr.getEffectiveLevel() > 10: +# l.setLevel(logging.WARN) +#except ImportError: +# pass +# +#try: +# # Set datalad's logger to our handler +# import datalad +# datalad_lgr = logging.getLogger('datalad') +# datalad_lgr.handlers = lgr.handlers +#except ImportError: +# pass + +global_options = { + 'overwrite': False # overwrite existing files +} + +SeqInfo = namedtuple( + 'SeqInfo', + ['total_files_till_now', # 0 + 'example_dcm_file', # 1 + 'series_id', # 2 + 'unspecified1', # 3 + 'unspecified2', # 4 + 'unspecified3', # 5 + 'dim1', 'dim2', 'dim3', 'dim4', # 6, 7, 8, 9 + 'TR', 'TE', # 10, 11 + 'protocol_name', # 12 + 'is_motion_corrected', # 13 + # Introduced with namedtuple + 'is_derived', + 'patient_id', + 'study_description', + 'referring_physician_name', + 'series_description', + 'image_type', + 'accession_number', + 'patient_age', + 'patient_sex', + 'date' + ] +) + +StudySessionInfo = namedtuple( + 'StudySessionInfo', + [ + 'locator', # possible prefix identifying the study, e.g. PI/dataset or just a dataset or empty (default) + # Note that ATM there should be no multiple DICOMs with the same + # StudyInstanceUID which would collide, i.e point to the same + # subject/session. 
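# ---------------------------------------------------------------------------
# Editor's note, not part of the patch: a minimal sketch of the heuristic
# module whose `infotodict` consumes the SeqInfo entries defined above.  The
# only interface visible in this patch is that infotodict(seqinfo) returns a
# dict keyed by (template, outtype, annotation_classes) tuples holding lists
# of series_id values; the create_key helper, the templates and the field
# tests below are illustrative assumptions.
def create_key(template, outtype=('nii.gz',), annotation_classes=None):
    if not template:
        raise ValueError('Template must be a valid format string')
    return (template, outtype, annotation_classes)

def infotodict(seqinfo):
    t1 = create_key('sub-{subject}/anat/sub-{subject}_T1w')
    rest = create_key('sub-{subject}/func/sub-{subject}_task-rest_run-{item:02d}_bold')
    info = {t1: [], rest: []}
    for s in seqinfo:
        if 'mprage' in s.protocol_name.lower():
            info[t1].append(s.series_id)
        elif s.dim4 > 100 and not s.is_motion_corrected:
            info[rest].append(s.series_id)
    return info
# ---------------------------------------------------------------------------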
+ # So 'locator' is pretty much an assignment from StudyInstanceUID + # into some place within hierarchy + 'session', # could be None + 'subject', # should be some ID defined either in cmdline or deduced + ] +) + + +# TODO: RF to avoid package-level global structure, and be more friendly in +# case of refactoring of heudiconv into a proper Python package/module +class TempDirs(object): + """A helper to centralize handling and cleanup of dirs""" + + def __init__(self): + self.dirs = [] + self.exists = os.path.exists + + def __call__(self, prefix=None): + tmpdir = mkdtemp(prefix=prefix) + self.dirs.append(tmpdir) + return tmpdir + + def __del__(self): + try: + self.cleanup() + except AttributeError: + # we are too late to the show + pass + + def cleanup(self): + lgr.debug("Removing %d temporary directories", len(self.dirs)) + for t in self.dirs[:]: + lgr.debug("Removing %s", t) + if self: + self.rmtree(t) + self.dirs = [] + + def rmtree(self, tmpdir): + if self.exists(tmpdir): + shutil.rmtree(tmpdir) + if tmpdir in self.dirs: + self.dirs.remove(tmpdir) + +tempdirs = TempDirs() + + +def _canonical_dumps(json_obj, **kwargs): + """ Dump `json_obj` to string, allowing for Python newline bug + + Runs ``json.dumps(json_obj, \*\*kwargs), then removes trailing whitespaces + added when doing indent in some Python versions. See + https://bugs.python.org/issue16333. Bug seems to be fixed in 3.4, for now + fixing manually not only for aestetics but also to guarantee the same + result across versions of Python. + """ + out = json.dumps(json_obj, **kwargs) + if 'indent' in kwargs: + out = out.replace(' \n', '\n') + return out def save_json(filename, data): @@ -72,7 +191,49 @@ def save_json(filename, data): """ with open(filename, 'w') as fp: - json.dump(data, fp, sort_keys=True, indent=4) + fp.write(_canonical_dumps(data, sort_keys=True, indent=4)) + + +def slim_down_info(j): + """Given an aggregated info structure, removes excessive details + + such as Csa fields, and SourceImageSequence which on Siemens files could be + huge and not providing any additional immediately usable information. + If needed, could be recovered from stored DICOMs + """ + j = deepcopy(j) # we will do in-place modification on a copy + dicts = [] + # poor man programming for now + if 'const' in j.get('global', {}): + dicts.append(j['global']['const']) + if 'samples' in j.get('time', {}): + dicts.append(j['time']['samples']) + for d in dicts: + for k in list(d.keys()): + if k.startswith('Csa') or k.lower() in {'sourceimagesequence'}: + del d[k] + return j + + +def json_dumps_pretty(j, indent=2, sort_keys=True): + """Given a json structure, pretty print it by colliding numeric arrays into a line + + If resultant structure differs from original -- throws exception + """ + js = _canonical_dumps(j, indent=indent, sort_keys=sort_keys) + # trim away \n and spaces between entries of numbers + js_ = re.sub( + '[\n ]+("?[-+.0-9e]+"?,?) 
*\n(?= *"?[-+.0-9e]+"?)', r' \1', + js, flags=re.MULTILINE) + # uniform no spaces before ] + js_ = re.sub(" *\]", "]", js_) + # uniform spacing before numbers + js_ = re.sub(' *("?[-+.0-9e]+"?)[ \n]*', r' \1', js_) + # no spaces after [ + js_ = re.sub('\[ ', '[', js_) + j_ = json.loads(js_) + assert(j == j_) + return js_ def load_json(filename): @@ -101,7 +262,6 @@ def load_json(filename): # find_files utility copied/borrowed from DataLad (Copyright 2016 DataLad developers, MIT license) # -import re from os.path import sep as dirsep from os.path import curdir from os.path import join as opj @@ -140,92 +300,134 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): - """Process list of dicoms and return seqinfo and file group +def group_dicoms_into_seqinfos( + files, file_filter=None, dcmfilter=None, grouping='studyUID' +): + """Process list of dicoms and return seqinfo and file group + + `seqinfo` contains per-sequence extract of fields from DICOMs which + will be later provided into heuristics to decide on filenames Parameters ---------- - fl : list of str + files : list of str List of files to consider - dcmsession : list of int, optional - List of session ids (?) for each file if `fl` - corresponds to multiple sessions - basedir : str, optional - Base directory relative to which filenames are provided in `fl` + file_filter : callable, optional + Applied to each item of filenames. Should return True if file needs to be + kept, False otherwise. dcmfilter : callable, optional - If called on dcm_data and returns True, it is used to set - series_id + If called on dcm_data and returns True, it is used to set series_id + grouping : {'studyUID', 'accession_number', None}, optional + what to group by: studyUID or accession_number Returns ------- - seqinfo, filegrp : list of list, dict - `seqinfo` is a list of info entries per each sequence (some entry + seqinfo : list of list + `seqinfo` is a list of info entries per each sequence (some entry there defines a key for `filegrp`) - `filegrp` is a dictionary with files groupped per each sequence. + filegrp : dict + `filegrp` is a dictionary with files groupped per each sequence """ + allowed_groupings = ['studyUID', 'accession_number', None] + if grouping not in allowed_groupings: + raise ValueError('I do not know how to group by {0}'.format(grouping)) + per_studyUID = grouping == 'studyUID' + per_accession_number = grouping == 'accession_number' + lgr.info("Analyzing %d dicoms", len(files)) import dcmstack as ds import dicom as dcm - if dcmsession is None: - multi_session = False - dcmsession = [0] * len(fl) - else: - multi_session = True - groups = [[], []] mwgroup = [] - for fidx, filename in enumerate(fl): - if not basedir is None: - filename = os.path.join(basedir, filename) + + studyUID = None + # for sanity check that all DICOMs came from the same + # "study". If not -- what is the use-case? (interrupted acquisition?) 
+ # and how would then we deal with series numbers + # which would differ already + if file_filter: + nfl_before = len(files) + files = list(filter(file_filter, files)) + nfl_after = len(files) + lgr.info('Filtering out {0} dicoms based on their filename'.format( + nfl_before-nfl_after)) + for fidx, filename in enumerate(files): + # TODO after getting a regression test check if the same behavior + # with stop_before_pixels=True mw = ds.wrapper_from_data(dcm.read_file(filename, force=True)) + + for f in ('iop', 'ICE_Dims', 'SequenceName'): + try: + del mw.series_signature[f] + except: + pass + try: - del mw.series_signature['iop'] - except: - pass - try: - del mw.series_signature['ICE_Dims'] - except: - pass - try: - del mw.series_signature['SequenceName'] - except: - pass + file_studyUID = mw.dcm_data.StudyInstanceUID + except AttributeError: + lgr.info("File %s is missing any StudyInstanceUID" % filename) + file_studyUID = None + #continue + try: - series_id = (dcmsession[fidx], - mw.dcm_data.SeriesNumber, + series_id = (int(mw.dcm_data.SeriesNumber), mw.dcm_data.ProtocolName) - except AttributeError: + file_studyUID = mw.dcm_data.StudyInstanceUID + + if not per_studyUID: + # verify that we are working with a single study + if studyUID is None: + studyUID = file_studyUID + elif not per_accession_number: + assert studyUID == file_studyUID + except AttributeError as exc: + lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', + filename, exc) # not a normal DICOM -> ignore - series_id = (dcmsession[fidx], -1, 'none') + series_id = (-1, 'none') + file_studyUID = None + if not series_id[0] < 0: if dcmfilter is not None and dcmfilter(mw.dcm_data): - series_id = (dcmsession[fidx], -1, mw.dcm_data.ProtocolName) + series_id = (-1, mw.dcm_data.ProtocolName) + if not groups: + raise RuntimeError("Yarik really thinks this is never ran!") + # if I was wrong -- then per_studyUID might need to go above + # yoh: I don't think this would ever be executed! 
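# ---------------------------------------------------------------------------
# Editor's note, not part of the patch: the file_filter / dcmfilter hooks used
# above are looked up on the heuristic module as `filter_files` and
# `filter_dicom` elsewhere in this script.  A sketch (the concrete tests are
# illustrative assumptions):
def filter_files(fname):
    # file_filter: return True to keep the file for grouping
    return not fname.endswith(('.txt', '.nii', '.nii.gz'))

def filter_dicom(dcm_data):
    # dcmfilter: return True to push the file into the "unwanted" series
    # (series number -1), which is skipped when seqinfo is assembled below
    return getattr(dcm_data, 'SeriesDescription', '').startswith('Phoenix')
# ---------------------------------------------------------------------------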
mwgroup.append(mw) groups[0].append(series_id) groups[1].append(len(mwgroup) - 1) continue - N = len(mwgroup) # filter out unwanted non-image-data DICOMs by assigning # a series number < 0 (see test below) - if not series_id[1] < 0 and mw.dcm_data[0x0008, 0x0016].repval in ( + if not series_id[0] < 0 and mw.dcm_data[0x0008, 0x0016].repval in ( 'Raw Data Storage', 'GrayscaleSoftcopyPresentationStateStorage'): - series_id = (dcmsession[fidx], -1, mw.dcm_data.ProtocolName) + series_id = (-1, mw.dcm_data.ProtocolName) + + if per_studyUID: + series_id = series_id + (file_studyUID,) + + #print fidx, N, filename ingrp = False - for idx in range(N): + for idx in range(len(mwgroup)): same = mw.is_same_series(mwgroup[idx]) #print idx, same, groups[idx][0] if same: + # the same series should have the same study uuid + assert mwgroup[idx].dcm_data.get('StudyInstanceUID', None) == file_studyUID ingrp = True - if series_id[1] >= 0: - series_id = (dcmsession[fidx], - mwgroup[idx].dcm_data.SeriesNumber, + if series_id[0] >= 0: + series_id = (mwgroup[idx].dcm_data.SeriesNumber, mwgroup[idx].dcm_data.ProtocolName) + if per_studyUID: + series_id = series_id + (file_studyUID,) groups[0].append(series_id) groups[1].append(idx) + if not ingrp: mwgroup.append(mw) groups[0].append(series_id) @@ -234,12 +436,12 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): group_map = dict(zip(groups[0], groups[1])) total = 0 - filegroup = {} - seqinfo = [] + seqinfo = ordereddict() + # for the next line to make any sense the series_id needs to # be sortable in a way that preserves the series order - for series, mwidx in sorted(group_map.items()): - if series[1] < 0: + for series_id, mwidx in sorted(group_map.items()): + if series_id[0] < 0: # skip our fake series with unwanted files continue mw = mwgroup[mwidx] @@ -248,15 +450,17 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): # nothing to see here, just move on continue dcminfo = mw.dcm_data - files = [fl[i] for i, s in enumerate(groups[0]) if s == series] + series_files = [files[i] for i, s in enumerate(groups[0]) if s == series_id] # turn the series_id into a human-readable string -- string is needed # for JSON storage later on - if multi_session: - series = '%i-%i-%s' % series - else: - series = '%i-%s' % series[1:] - filegroup[series] = files - size = list(mw.image_shape) + [len(files)] + if per_studyUID: + studyUID = series_id[2] + series_id = series_id[:2] + accession_number = dcminfo.get('AccessionNumber') + + series_id = '-'.join(map(str, series_id)) + + size = list(mw.image_shape) + [len(series_files)] total += size[-1] if len(size) < 4: size.append(1) @@ -269,48 +473,77 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): except AttributeError: TE = -1 try: - refphys = dcminfo.ReferringPhysicianName + refphys = str(dcminfo.ReferringPhysicianName) except AttributeError: refphys = '-' + image_type = tuple(dcminfo.ImageType) + motion_corrected = 'MoCo' in dcminfo.SeriesDescription \ + or 'MOCO' in image_type info = SeqInfo( total, - os.path.split(files[0])[1], - series, - os.path.basename(os.path.dirname(files[0])), + os.path.split(series_files[0])[1], + series_id, + os.path.basename(os.path.dirname(series_files[0])), '-', '-', size[0], size[1], size[2], size[3], TR, TE, dcminfo.ProtocolName, - 'MoCo' in dcminfo.SeriesDescription, + motion_corrected, # New ones by us 'derived' in [x.lower() for x in dcminfo.get('ImageType', [])], - dcminfo.PatientID, - dcminfo.StudyDescription, + 
dcminfo.get('PatientID'), + dcminfo.get('StudyDescription'), refphys, - dcminfo.SeriesDescription, - dcminfo.ImageType, + dcminfo.get('SeriesDescription'), + image_type, + accession_number, + # For demographics to populate BIDS participants.tsv + dcminfo.get('PatientAge'), + dcminfo.get('PatientSex'), + dcminfo.get('AcquisitionDate'), ) # candidates # dcminfo.AccessionNumber # len(dcminfo.ReferencedImageSequence) # len(dcminfo.SourceImageSequence) # FOR demographics - # dcminfo.PatientsAge - # dcminfo.PatientsSex - lgr.debug("%30s %27s %5s nref=%-2d nsrc=%-2d %s" % ( - info.series_number, + if per_studyUID: + key = studyUID.split('.')[-1] + elif per_accession_number: + key = accession_number + else: + key = '' + lgr.debug("%30s %30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" % ( + key, + info.series_id, dcminfo.SeriesDescription, + dcminfo.ProtocolName, info.is_derived, len(dcminfo.get('ReferencedImageSequence', '')), len(dcminfo.get('SourceImageSequence', '')), info.image_type )) - #if dcminfo.SeriesDescription == 'ep2d_bold_moco_p2': - # import pdb; pdb.set_trace() - seqinfo.append(info) - lgr.info("Generated sequence info with %d entries", len(info)) - return seqinfo, filegroup + if per_studyUID: + if studyUID not in seqinfo: + seqinfo[studyUID] = ordereddict() + seqinfo[studyUID][info] = series_files + elif per_accession_number: + if accession_number not in seqinfo: + seqinfo[accession_number] = ordereddict() + seqinfo[accession_number][info] = series_files + else: + seqinfo[info] = series_files + + if per_studyUID: + lgr.info("Generated sequence info for %d studies with %d entries total", + len(seqinfo), sum(map(len, seqinfo.values()))) + elif per_accession_number: + lgr.info("Generated sequence info for %d accession numbers with %d entries total", + len(seqinfo), sum(map(len, seqinfo.values()))) + else: + lgr.info("Generated sequence info with %d entries", len(seqinfo)) + return seqinfo def write_config(outfile, info): @@ -325,13 +558,15 @@ def read_config(infile): return info -def conversion_info(subject, outdir, info, filegroup, basedir=None, ses=None): +def conversion_info(subject, outdir, info, filegroup, ses=None): convert_info = [] for key, items in info.items(): if not items: continue template = key[0] outtype = key[1] + # So no annotation_classes of any kind! so if not used -- what was the + # intension???? XXX outpath = outdir for idx, itemgroup in enumerate(items): if not isinstance(itemgroup, list): @@ -361,35 +596,58 @@ def conversion_info(subject, outdir, info, filegroup, basedir=None, ses=None): try: files = filegroup[item] except KeyError: - files = filegroup[unicode(item)] - - if basedir is not None: - files = [os.path.join(basedir, f) for f in files] - + files = filegroup[(str if PY3 else unicode)(item)] outprefix = template.format(**parameters) convert_info.append((os.path.join(outpath, outprefix), outtype, files)) - return convert_info def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False, min_meta=False): + """ + + If `niftifile` doesn't exist, it gets created out of the `dcmfiles` stack, + and json representation of its meta_ext is returned (bug since should return + both niftifile and infofile?) 
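# ---------------------------------------------------------------------------
# Editor's note, not part of the patch: the (seqinfo, filegroup) pair returned
# by the old process_dicoms is folded by group_dicoms_into_seqinfos above into
# a single ordered mapping, nested per study when grouping='studyUID'.  A
# consumption sketch, assuming `all_dicoms` is a list of DICOM paths:
seqinfos = group_dicoms_into_seqinfos(all_dicoms, grouping='studyUID')
for study_uid, series in seqinfos.items():
    for seq, dicom_files in series.items():   # seq is a SeqInfo namedtuple
        print(study_uid, seq.series_id, seq.protocol_name, len(dicom_files))
# With grouping=None the mapping is flat: {SeqInfo(...): [list of files]}.
# ---------------------------------------------------------------------------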
+ + if `niftifile` exists, its affine's orientation information is used while + establishing new `NiftiImage` out of dicom stack and together with `bids_info` + (if provided) is dumped into json `infofile` + + Parameters + ---------- + dcmfiles + niftifile + infofile + bids_info + force + min_meta + + Returns + ------- + niftifile, infofile + + """ import nibabel as nb import os import json import re + meta_info = {} if not min_meta: import dcmstack as ds stack = ds.parse_and_stack(dcmfiles, force=force).values() if len(stack) > 1: raise ValueError('Found multiple series') stack = stack[0] - # Create the nifti image using the data array + + #Create the nifti image using the data array if not os.path.exists(niftifile): nifti_image = stack.to_nifti(embed_meta=True) nifti_image.to_filename(niftifile) return ds.NiftiWrapper(nifti_image).meta_ext.to_json() + orig_nii = nb.load(niftifile) - ornt = nb.orientations.io_orientation(orig_nii.affine) + aff = orig_nii.get_affine() + ornt = nb.orientations.io_orientation(aff) axcodes = nb.orientations.ornt2axcodes(ornt) new_nii = stack.to_nifti(voxel_order=''.join(axcodes), embed_meta=True) meta = ds.NiftiWrapper(new_nii).meta_ext.to_json() @@ -408,47 +666,161 @@ def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False, min_ json.dump(meta_info, fp, indent=3, sort_keys=True) return niftifile, infofile -def compress_dicoms(dicom_list, prefix, sourcedir): + +def compress_dicoms(dicom_list, out_prefix): + """Archives DICOMs into a tarball + + Also tries to do it reproducibly, so takes the date for files + and target tarball based on the series time (within the first file) + + Parameters + ---------- + dicom_list : list of str + list of dicom files + out_prefix : str + output path prefix, including the portion of the output file name + before .dicom.tgz suffix + + Returns + ------- + filename : str + Result tarball + """ tmpdir = mkdtemp(prefix='dicomtar') - outtar = os.path.join(sourcedir, prefix + '.dicom.tgz') - with tarfile.open(outtar, 'w:gz', dereference=True) as tar: - for filename in dicom_list: - outfile = os.path.join(tmpdir, os.path.basename(filename)) - if not os.path.islink(outfile): - os.symlink(filename, outfile) - tar.add(outfile, recursive=False) - tar.close() + outtar = out_prefix + '.dicom.tgz' + if os.path.exists(outtar) and not global_options['overwrite']: + raise RuntimeError("File %s already exists, will not override" + % outtar) + # tarfile encodes current time.time inside making those non-reproducible + # so we should choose which date to use. 
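# (Editor's note: two timestamps need pinning for a byte-identical archive --
#  the per-member TarInfo.mtime, set via the filter= callback below, and the
#  mtime GzipFile stamps into the .tgz header, which it takes from time.time();
#  hence the temporary monkey-patching of time.time further down.)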
+ # Solution from DataLad although ugly enough: + + dicom_list = sorted(dicom_list) + dcm_time = get_dicom_series_time(dicom_list) + + def _assign_dicom_time(ti): + # Reset the date to match the one of the last commit, not from the + # filesystem since git doesn't track those at all + ti.mtime = dcm_time + return ti + + # poor man mocking since can't rely on having mock + try: + import time + _old_time = time.time + time.time = lambda: dcm_time + if os.path.lexists(outtar): + os.unlink(outtar) + with tarfile.open(outtar, 'w:gz', dereference=True) as tar: + for filename in dicom_list: + outfile = os.path.join(tmpdir, os.path.basename(filename)) + if not os.path.islink(outfile): + os.symlink(os.path.realpath(filename), outfile) + # place into archive stripping any lead directories and + # adding the one corresponding to prefix + tar.add(outfile, + arcname=opj(basename(out_prefix), + os.path.basename(outfile)), + recursive=False, + filter=_assign_dicom_time) + finally: + time.time = _old_time + shutil.rmtree(tmpdir) + return outtar + + +def get_dicom_series_time(dicom_list): + """Get time in seconds since epoch from dicom series date and time + + Primarily to be used for reproducible time stamping + """ + import time + import calendar + import dicom as dcm + + dcm = dcm.read_file(dicom_list[0], stop_before_pixels=True) + dcm_date = dcm.SeriesDate # YYYYMMDD + dcm_time = dcm.SeriesTime # HHMMSS.MICROSEC + dicom_time_str = dcm_date + dcm_time.split('.', 1)[0] # YYYYMMDDHHMMSS + # convert to epoch + return calendar.timegm(time.strptime(dicom_time_str, '%Y%m%d%H%M%S')) + + +def safe_copyfile(src, dest): + """Copy file but blow if destination name already exists + """ + if os.path.isdir(dest): + dest = os.path.join(dest, os.path.basename(src)) + if os.path.lexists(dest): + if not global_options['overwrite']: + raise ValueError("was asked to copy %s but destination already exists: %s" + % (src, dest)) + else: + # to make sure we can write there ... still fail if it is entire directory ;) + os.unlink(dest) + shutil.copyfile(src, dest) def convert(items, symlink=True, converter=None, scaninfo_suffix='.json', custom_callable=None, with_prov=False, - is_bids=False, sourcedir=None, min_meta=False): + is_bids=False, sourcedir=None, outdir=None, min_meta=False): + """Perform actual conversion (calls to converter etc) given info from + heuristic's `infotodict` + + Parameters + ---------- + items + symlink + converter + scaninfo_suffix + custom_callable + with_prov + is_bids + sourcedir + outdir + min_meta + + Returns + ------- + None + """ prov_files = [] - tmpdir = mkdtemp(prefix='heudiconvtmp') - for item in items: - if isinstance(item[1], (list, tuple)): - outtypes = item[1] - else: - outtypes = [item[1]] - prefix = item[0] - print('Converting %s' % prefix) - dirname = os.path.dirname(prefix + '.ext') - print(dirname) - if not os.path.exists(dirname): - os.makedirs(dirname) + tmpdir = mkdtemp(prefix='heudiconvdcm') + for item_idx, item in enumerate(items): + prefix, outtypes, item_dicoms = item[:3] + if not isinstance(outtypes, (list, tuple)): + outtypes = [outtypes] + + prefix_dirname = os.path.dirname(prefix + '.ext') + prov_file = None + outname_bids = prefix + '.json' + outname_bids_files = [] # actual bids files since dcm2niix might generate multiple ATM + lgr.info('Converting %s (%d DICOMs) -> %s . ' + 'Converter: %s . 
Output types: %s', + prefix, len(item_dicoms), prefix_dirname, converter, outtypes) + if not os.path.exists(prefix_dirname): + os.makedirs(prefix_dirname) for outtype in outtypes: - item_dicoms = item[2] - lgr.info("Processing %d dicoms", len(item_dicoms)) - lgr.debug(" those dicoms are: %s", item_dicoms) + lgr.debug("Processing %d dicoms for output type %s", + len(item_dicoms), outtype) + lgr.log(1, " those dicoms are: %s", item_dicoms) + + seqtype = basename(dirname(prefix)) if is_bids else None + if outtype == 'dicom': if is_bids: - if not os.path.exists(sourcedir): - os.makedirs(sourcedir) - dicom_list = [] - for filename in item_dicoms: - dicom_list.append(filename) - compress_dicoms(dicom_list, os.path.basename(prefix), sourcedir) + # mimic the same hierarchy location as the prefix + # although it could all have been done probably + # within heuristic really + sourcedir_ = os.path.join( + sourcedir, + os.path.dirname( + os.path.relpath(prefix, outdir))) + if not os.path.exists(sourcedir_): + os.makedirs(sourcedir_) + compress_dicoms(item_dicoms, + opj(sourcedir_, os.path.basename(prefix))) else: dicomdir = prefix + '_dicom' if os.path.exists(dicomdir): @@ -470,319 +842,657 @@ def convert(items, symlink=True, converter=None, config.enable_provenance() from nipype import Function, Node from nipype.interfaces.base import isdefined - print(converter) - # deprecated - if converter == 'mri_convert': - lgr.warn('Deprecated - use dcm2niix for future conversions') - from nipype.interfaces.freesurfer.preprocess import MRIConvert - convertnode = Node(MRIConvert(), name='convert') - convertnode.base_dir = tmpdir - if outtype == 'nii.gz': - convertnode.inputs.out_type = 'niigz' - convertnode.inputs.in_file = item_dicoms[0] - convertnode.inputs.out_file = os.path.abspath(outname) - #cmd = 'mri_convert %s %s' % (item[2][0], outname) - #print(cmd) - #os.system(cmd) - res = convertnode.run() - elif converter in {'dcm2nii', 'dcm2niix'}: + if converter == 'dcm2niix': from nipype.interfaces.dcm2nii import Dcm2nii, Dcm2niix - convertnode = Node( - {'dcm2nii': Dcm2nii, - 'dcm2niix': Dcm2niix}[converter](), - name='convert') + convertnode = Node(Dcm2niix(), name='convert') convertnode.base_dir = tmpdir + # need to be abspaths! + item_dicoms = list(map(os.path.abspath, item_dicoms)) convertnode.inputs.source_names = item_dicoms if converter == 'dcm2nii': convertnode.inputs.gzip_output = outtype == 'nii.gz' else: if not is_bids: convertnode.inputs.bids_format = False - convertnode.inputs.out_filename = os.path.basename( - dirname +'_%e') + convertnode.inputs.out_filename = os.path.basename(prefix_dirname) convertnode.inputs.terminal_output = 'allatonce' res = convertnode.run() - if isinstance(res.outputs.converted_files, list): - lgr.warning( - "Following series likely has multiple orientations: ", - item_dicoms - ) - for fl in res.outputs.converted_files: - idx = fl.split('_')[-1][0:-7] # get echo number, remove .nii.gz - outname = prefix + '-' + str(idx) + '.' 
+ outtype - shutil.copyfile(fl, outname) - else: - shutil.copyfile(res.outputs.converted_files, outname) + if isdefined(res.outputs.bvecs): outname_bvecs = prefix + '.bvec' outname_bvals = prefix + '.bval' - shutil.copyfile(res.outputs.bvecs, outname_bvecs) - shutil.copyfile(res.outputs.bvals, outname_bvals) - - if converter == 'dcm2niix' \ - and isdefined(res.outputs.bids): - ### extract bids - if isinstance(res.outputs.bids, list): - lgr.warning("There was someone catching lists!") - for fl in res.outputs.bids: - idx = fl.split('_')[-1][0:-5] # get echo number, remove .json - scaninfo = prefix + '_' + str(idx) + '.json' - shutil.copyfile(fl, scaninfo) + safe_copyfile(res.outputs.bvecs, outname_bvecs) + safe_copyfile(res.outputs.bvals, outname_bvals) + + res_files = res.outputs.converted_files + if isinstance(res_files, list): + # TODO: move into a function + # by default just suffix them up + suffixes = None + # we should provide specific handling for fmap, + # dwi etc which might spit out multiple files + if is_bids: + if seqtype == 'fmap': + # expected! + suffixes = ["%d" % (i+1) for i in range(len(res_files))] + if not suffixes: + lgr.warning( + "Following series files likely have " + "multiple (%d) volumes (orientations?) " + "generated: %s ...", + len(res_files), item_dicoms[0] + ) + suffixes = ['-%d' % (i+1) for i in range(len(res_files))] + + # Also copy BIDS files although they might need to be merged/postprocessed later + if converter == 'dcm2niix' and isdefined(res.outputs.bids): + assert(len(res.outputs.bids) == len(res_files)) + bids_files = res.outputs.bids else: - shutil.copyfile(res.outputs.bids, scaninfo) + bids_files = [None] * len(res_files) + + for fl, suffix, bids_file in zip(res_files, suffixes, bids_files): + outname = "%s%s.%s" % (prefix, suffix, outtype) + safe_copyfile(fl, outname) + if bids_file: + outname_bids_file = "%s%s.json" % (prefix, suffix) + safe_copyfile(bids_file, outname_bids_file) + outname_bids_files.append(outname_bids_file) + + else: + safe_copyfile(res_files, outname) + if converter == 'dcm2niix' and isdefined(res.outputs.bids): + try: + safe_copyfile(res.outputs.bids, outname_bids) + outname_bids_files.append(outname_bids) + except TypeError as exc: ##catch lists + lgr.warning( + "There was someone catching lists!: %s", exc + ) + continue + + # save acquisition time information if it's BIDS + # at this point we still have acquisition date + if is_bids: + save_scans_key(items, outname_bids_files) + # Fix up and unify BIDS files + tuneup_bids_json_files(outname_bids_files) + # we should provide specific handling for fmap, + # dwi etc .json of which should get merged to satisfy + # BIDS. BUT wer might be somewhat not in time for a + # party here since we sorted into multiple seqinfo + # (e.g. 
magnitude, phase for fmap so we might want + # to sort them into a single one) if with_prov: prov_file = prefix + '_prov.ttl' - shutil.copyfile(os.path.join(convertnode.base_dir, + safe_copyfile(os.path.join(convertnode.base_dir, convertnode.name, 'provenance.ttl'), prov_file) prov_files.append(prov_file) - from nipype import Node, Function - embedfunc = Node(Function(input_names=['dcmfiles', - 'niftifile', - 'infofile', - 'bids_info', - 'force', - 'min_meta'], - output_names=['outfile', - 'meta'], - function=embed_nifti), - name='embedder') - embedfunc.inputs.dcmfiles = item_dicoms - embedfunc.inputs.niftifile = os.path.abspath(outname) - embedfunc.inputs.infofile = os.path.abspath(scaninfo) - embedfunc.inputs.min_meta = min_meta - if is_bids and (converter == 'dcm2niix'): - embedfunc.inputs.bids_info = load_json(os.path.abspath(scaninfo)) - embedfunc.inputs.force = True - embedfunc.base_dir = tmpdir - cwd = os.getcwd() - try: - res = embedfunc.run() - os.chmod(scaninfo, 0o0440) - if with_prov: - g = res.provenance.rdf() - g.parse(prov_file, - format='turtle') - g.serialize(prov_file, format='turtle') - os.chmod(prov_file, 0o0440) - except: - os.chdir(cwd) + if len(outname_bids_files) > 1: + lgr.warning( + "For now not embedding BIDS and info generated .nii.gz itself since sequence produced multiple files") + else: + #if not is_bids or converter != 'dcm2niix': ##uses dcm2niix's infofile + embed_metadata_from_dicoms(converter, is_bids, item_dicoms, + outname, outname_bids, prov_file, + scaninfo, tmpdir, with_prov, + min_meta) + if exists(scaninfo): + lgr.info("Post-treating %s file", scaninfo) + treat_infofile(scaninfo) os.chmod(outname, 0o0440) - if not custom_callable is None: + if custom_callable is not None: custom_callable(*item) shutil.rmtree(tmpdir) -def convert_dicoms(subjs, dicom_dir_template, outdir, heuristic_file, converter, - queue=None, anon_sid_cmd=None, anon_outdir=None, with_prov=False, - ses=None, is_bids=False, sbatch_args='-N1 -c2 --mem=20G -t 01:00:00', - min_meta=False): +def get_formatted_scans_key_row(item): + """ + Parameters + ---------- + item - if '%s' in dicom_dir_template: - raise ValueError("Formatting DICOM search pattern with '%s' is " - "deprecated. 
Please indicate participant ID with " - "'{subject}' and (optionally) timepoint with '{session}'.") + Returns + ------- + row: list + [ISO acquisition time, performing physician name, random string] - for sid in subjs: - tmpdir = None - if queue: - # TODO This needs to be updated to better scale with additional args - progname = os.path.abspath(inspect.getfile(inspect.currentframe())) - convertcmd = ' '.join(['python', progname, '-d', dicom_dir_template, - '-o', outdir, '-f', heuristic_file, '-s', sid, - '-c', converter]) - if ses: - convertcmd += " --ses {}".format(ses) - if with_prov: - convertcmd += " --with-prov" - if is_bids: - convertcmd += " --bids" - if min_meta: - convertcmd += " --minmeta" - if ses: - script_file = 'dicom-{}_ses-{}.sh'.format(sid, ses) - else: - script_file = 'dicom-{}.sh'.format(sid) - with open(script_file, 'wt') as fp: - fp.writelines(['#!/bin/bash\n', convertcmd]) - outcmd = 'sbatch -J dicom-%s -p %s %s %s' % (sid, queue, sbatch_args, script_file) - os.system(outcmd) - continue + """ + dcm_fn = item[-1][0] + mw = ds.wrapper_from_data(dcm.read_file(dcm_fn, stop_before_pixels=True)) + # we need to store filenames and acquisition times + # parse date and time and get it into isoformat + date = mw.dcm_data.AcquisitionDate + time = mw.dcm_data.AcquisitionTime.split('.')[0] + td = time + date + acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat() + # add random string + randstr = ''.join(map(chr, sample(k=8, population=range(33, 127)))) + row = [acq_time, mw.dcm_data.PerformingPhysicianName, randstr] + return row + + +def add_rows_to_scans_keys_file(fn, newrows): + """ + Add new rows to file fn for scans key filename - # TODO: RF into a function - # expand the input template - if sid: - sdir = dicom_dir_template.format(subject=sid, session=ses) - # and see what matches - fl = sorted(glob(sdir)) - else: - # we were given no subject so we consider dicom_dir_template to be - # and actual directory to process - if not isdir(dicom_dir_template): - raise ValueError( - "No subject id was provided, and dicom_dir_template=%s is not " - "an existing directory to traverse" % str(dicom_dir_template) - ) - fl = sorted(find_files('.*', topdir=dicom_dir_template)) - - lgr.info("Processing %d dicoms", len(fl)) - dcmsessions = None - # some people keep compressed tarballs around -- be nice! 
- if len(fl) and tarfile.is_tarfile(fl[0]): - # check if we got a list of tarfiles - if not len(fl) == sum([tarfile.is_tarfile(i) for i in fl]): - raise ValueError("some but not all input files are tar files") - # tarfiles already know what they contain, and often the filenames - # are unique, or at least in a unqiue subdir per session - # strategy: extract everything in a temp dir and assemble a list - # of all files in all tarballs - content = [] - tmpdir = mkdtemp(prefix='heudiconvtmp') - # needs sorting to keep the generated "session" label deterministic - for i, t in enumerate(sorted(fl)): - tf = tarfile.open(t) - # check content and sanitize permission bits - tmembers = tf.getmembers() - for tm in tmembers: - tm.mode = 0o700 - # get all files, assemble full path in tmp dir - tf_content = [m.name for m in tmembers if m.isfile()] - content += tf_content - if len(fl) > 1: - # more than one tarball, take care of "session" indicator - if dcmsessions is None: - dcmsessions = [] - dcmsessions += ([i] * len(tf_content)) - # extract into tmp dir - tf.extractall(path=tmpdir, members=tmembers) - fl = content - - #dcmfile = dcm.read_file(fl[0], force=True) - #print sid, 'Dicom: ', dcmfile.PatientName, sid == dcmfile.PatientName - - seqinfo = None # we might need it earlier than later - if not sid: - # figure out the sid out of available information - seqinfo, filegroup = process_dicoms( - fl, dcmsessions, basedir=tmpdir, - dcmfilter=getattr(mod, 'filter_dicom', None)) - # XXX session information handling is somewhat backwards since done above - # already. Moreover above logic with .edit.txt file -- seqinfo is - # available only on initial run - TODO - if not hasattr(mod, 'get_session_subject_id'): - raise ValueError( - "%s has no get_session_subject_id needed to figure out " - "subject/session from DICOMs" % mod - ) + Parameters + ---------- + fn: filename + newrows: extra rows to add + dict fn: [acquisition time, referring physician, random string] + """ + if lexists(fn): + with open(fn, 'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + existing_rows = [row for row in reader] + # skip header + fnames2info = {row[0]: row[1:] for row in existing_rows[1:]} + + newrows_key = newrows.keys() + newrows_toadd = list(set(newrows_key) - set(fnames2info.keys())) + for key_toadd in newrows_toadd: + fnames2info[key_toadd] = newrows[key_toadd] + # remove + os.unlink(fn) + else: + fnames2info = newrows - session_subject_ids = set(mod.get_session_subject_id(s) - for s in seqinfo) - assert len(session_subject_ids) == 1, \ - "atm we support processing only 1 subject/session at a time" - sess, sid = session_subject_ids.pop() + header = ['filename', 'acq_time', 'operator', 'randstr'] + # save + with open(fn, 'a') as csvfile: + writer = csv.writer(csvfile, delimiter='\t') + writer.writerow(header) + for key in sorted(fnames2info.keys()): + writer.writerow([key] + fnames2info[key]) - if is_bids and not sid.isalnum(): # alphanumeric only - old_sid = sid - cleaner = lambda y: ''.join([x for x in y if x.isalnum()]) - sid = cleaner(sid) #strip out - lgr.warn('{0} contained nonalphanumeric character(s), subject ' - 'ID was cleaned to be {1}'.format(old_sid, sid)) +def save_scans_key(items, outname_bids_files): + """ + Parameters + ---------- + items: + outname_bids_files: - # Figure out where to stick supplemental info files - if is_bids and not sid.startswith('sub-'): - idir = os.path.join(outdir, 'sub-' + sid) - else: - idir = os.path.join(outdir, sid) - if is_bids and ses: - idir = os.path.join(idir, 
'ses-%s' % str(ses)) - # all heudiconv stuff in info folder - idir = os.path.join(idir, 'info') - if not os.path.exists(idir): - os.makedirs(idir) + Returns + ------- + + """ + rows = dict() + for item, outname_bids_file in zip(items, outname_bids_files): + # get filenames + f_name = '/'.join(outname_bids_file.split('/')[-2:]) + f_name = f_name.replace('json', 'nii.gz') + rows[f_name] = get_formatted_scans_key_row(item) + # where should we store it? + output_dir = dirname(dirname(outname_bids_file)) + # get subject info + subj_pattern = re.compile('(sub-[a-zA-Z0-9]*)') + subj = subj_pattern.findall(f_name) + assert(len(subj) >= 1) + subj = subj[0] + + # save + add_rows_to_scans_keys_file( + pjoin(output_dir, '{0}_scans.tsv'.format(subj)), + rows) + + +def tuneup_bids_json_files(json_files): + """Given a list of BIDS .json files, e.g. """ + if not json_files: + return + + # Harmonize generic .json formatting + for jsonfile in json_files: + json_ = json.load(open(jsonfile)) + # sanitize! + for f in ['AcquisitionDateTime', 'AcquisitionDate']: + json_.pop(f, None) + # TODO: should actually be placed into series file which must + # go under annex (not under git) and marked as sensitive + if 'Date' in str(json_): + # Let's hope no word 'Date' comes within a study name or smth like + # that + raise ValueError("There must be no dates in .json sidecar") + json.dump(json_, open(jsonfile, 'w'), indent=2) + + # Load the beast + seqtype = basename(dirname(jsonfile)) + + if seqtype == 'fmap': + json_basename = '_'.join(jsonfile.split('_')[:-1]) + # if we got by now all needed .json files -- we can fix them up + # unfortunately order of "items" is not guaranteed atm + if len(glob(json_basename + '*.json')) == 3: + json_phasediffname = json_basename + '_phasediff.json' + json_ = json.load(open(json_phasediffname)) + # TODO: we might want to reorder them since ATM + # the one for shorter TE is the 2nd one! 
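# (Editor's sketch, illustrative names/values: for the usual fieldmap trio
#  sub-01_magnitude1.json / sub-01_magnitude2.json / sub-01_phasediff.json the
#  loop below ends up writing
#      "EchoTime1": 0.00492, "EchoTime2": 0.00738
#  into the phasediff sidecar, which is the form BIDS expects for a
#  phase-difference fieldmap.)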
+ # For now just save truthfully by loading magnitude files + lgr.debug("Placing EchoTime fields into phasediff file") + for i in 1, 2: + try: + json_['EchoTime%d' % i] = \ + json.load(open(json_basename + '_magnitude%d.json' % i))[ + 'EchoTime'] + except IOError as exc: + lgr.error("Failed to open magnitude file: %s", exc) + + # might have been made R/O already + os.chmod(json_phasediffname, 0o0664) + json.dump(json_, open(json_phasediffname, 'w'), indent=2) + os.chmod(json_phasediffname, 0o0444) + + # phasediff one should contain two PhaseDiff's + # -- one for original amplitude and the other already replicating what is there + # so let's load json files for magnitudes and + # place them into phasediff + + +def embed_metadata_from_dicoms(converter, is_bids, item_dicoms, outname, + outname_bids, prov_file, scaninfo, tmpdir, + with_prov, min_meta): + """ + Enhance sidecar information file with more information from DICOMs + + Parameters + ---------- + converter + is_bids + item_dicoms + outname + outname_bids + prov_file + scaninfo + tmpdir + with_prov + min_meta + + Returns + ------- + + """ + from nipype import Node, Function + embedfunc = Node(Function(input_names=['dcmfiles', + 'niftifile', + 'infofile', + 'bids_info', + 'force', + 'min_meta'], + output_names=['outfile', + 'meta'], + function=embed_nifti), + name='embedder') + embedfunc.inputs.dcmfiles = item_dicoms + embedfunc.inputs.niftifile = os.path.abspath(outname) + embedfunc.inputs.infofile = os.path.abspath(scaninfo) + embedfunc.inputs.min_meta = min_meta + if is_bids and (converter == 'dcm2niix'): + embedfunc.inputs.bids_info = load_json(os.path.abspath(outname_bids)) + else: + embedfunc.inputs.bids_info = None + embedfunc.inputs.force = True + embedfunc.base_dir = tmpdir + cwd = os.getcwd() + try: + """ + Ran into +INFO: Executing node embedder in dir: /tmp/heudiconvdcm2W3UQ7/embedder +ERROR: Embedding failed: [Errno 13] Permission denied: '/inbox/BIDS/tmp/test2-jessie/Wheatley/Beau/1007_personality/sub-sid000138/fmap/sub-sid000138_3mm_run-01_phasediff.json' +while +HEUDICONV_LOGLEVEL=WARNING time bin/heudiconv -f heuristics/dbic_bids.py -c dcm2niix -o /inbox/BIDS/tmp/test2-jessie --bids --datalad /inbox/DICOM/2017/01/28/A000203 + +so it seems that there is a filename collision so it tries to save into the same file name +and there was a screw up for that A + +/mnt/btrfs/dbic/inbox/DICOM/2017/01/28/A000203 + StudySessionInfo(locator='Wheatley/Beau/1007_personality', session=None, subject='sid000138') 16 sequences + StudySessionInfo(locator='Wheatley/Beau/1007_personality', session=None, subject='a000203') 2 sequences + + +in that one though + """ + if global_options['overwrite'] and os.path.lexists(scaninfo): + # TODO: handle annexed file case + if not os.path.islink(scaninfo): + os.chmod(scaninfo, 0o0660) + res = embedfunc.run() + os.chmod(scaninfo, 0o0444) + if with_prov: + g = res.provenance.rdf() + g.parse(prov_file, + format='turtle') + g.serialize(prov_file, format='turtle') + os.chmod(prov_file, 0o0440) + except Exception as exc: + lgr.error("Embedding failed: %s", str(exc)) + os.chdir(cwd) + + +def treat_infofile(filename): + """Tune up generated .json file (slim down, pretty-print for humans). 
+ + Was difficult to do within embed_nifti since it has no access to our functions + """ + with open(filename) as f: + j = json.load(f) + + j_slim = slim_down_info(j) + j_pretty = json_dumps_pretty(j_slim, indent=2, sort_keys=True) + + os.chmod(filename, 0o0664) + with open(filename, 'wt') as fp: + fp.write(j_pretty) + os.chmod(filename, 0o0444) + + +def convert_dicoms(sid, + dicoms, + outdir, + heuristic, + converter, + anon_sid=None, anon_outdir=None, + with_prov=False, + ses=None, + is_bids=False, + seqinfo=None, + min_meta=False): + if dicoms: + lgr.info("Processing %d dicoms", len(dicoms)) + elif seqinfo: + lgr.info("Processing %d pre-sorted seqinfo entries", len(seqinfo)) + else: + raise ValueError("neither dicoms nor seqinfo dict was provided") - # - # Anonymization - # + # in this reimplementation we can have only a single session assigned + # at this point + # dcmsessions = + + if is_bids and not sid.isalnum(): # alphanumeric only + old_sid = sid + cleaner = lambda y: ''.join([x for x in y if x.isalnum()]) + sid = cleaner(sid) #strip out + lgr.warning('{0} contained nonalphanumeric character(s), subject ' + 'ID was cleaned to be {1}'.format(old_sid, sid)) + + # + # Annonimization parameters + # + if anon_sid is None: anon_sid = sid - if anon_sid_cmd is not None: - from subprocess import check_output - anon_sid = check_output([anon_sid_cmd, sid]).strip() - lgr.info("Anonymized sid %s into %s", sid, anon_sid) - else: - anon_outdir = idir - - shutil.copy(heuristic_file, idir) - path, fname = os.path.split(heuristic_file) - sys.path.append(path) - mod = __import__(fname.split('.')[0]) - - infofile = os.path.join(idir, '%s.auto.txt' % sid) - editfile = os.path.join(idir, '%s.edit.txt' % sid) - if os.path.exists(editfile): # XXX may be condition on seqinfo is None - lgr.info("Reloading existing filegroup.json because %s exists", - editfile) - info = read_config(editfile) - filegroup = load_json(os.path.join(idir, 'filegroup.json')) - else: - if seqinfo is None: - seqinfo, filegroup = process_dicoms( - fl, dcmsessions, basedir=tmpdir, - dcmfilter=getattr(mod, 'filter_dicom', None)) + if anon_outdir is None: + anon_outdir = outdir + + # Figure out where to stick supplemental info dicoms + idir = os.path.join(outdir, '.heudiconv', sid) + # THAT IS WHERE WE MUST KNOW ABOUT SESSION ALREADY! + if is_bids and ses: + idir = os.path.join(idir, 'ses-%s' % str(ses)) + # yoh: in my case if idir exists, it means that that study/subject/session + # is already processed + if anon_outdir == outdir: + # if all goes into a single dir, have a dedicated 'info' subdir + idir = os.path.join(idir, 'info') + if not os.path.exists(idir): + os.makedirs(idir) + + shutil.copy(heuristic.filename, idir) + ses_suffix = "_ses-%s" % ses if ses is not None else "" + info_file = os.path.join(idir, '%s%s.auto.txt' % (sid, ses_suffix)) + edit_file = os.path.join(idir, '%s%s.edit.txt' % (sid, ses_suffix)) + filegroup_file = os.path.join(idir, 'filegroup%s.json' % ses_suffix) + + if os.path.exists(edit_file): # XXX may be condition on seqinfo is None + lgr.info("Reloading existing filegroup.json because %s exists", + edit_file) + info = read_config(edit_file) + filegroup = load_json(filegroup_file) + # XXX Yarik finally understood why basedir was dragged along! + # So we could reuse the same PATHs definitions possibly consistent + # across re-runs... BUT that wouldn't work anyways if e.g. 
+ # DICOMs dumped with SOP UUIDs thus differing across runs etc + # So either it would need to be brought back or reconsidered altogether + # (since no sample data to test on etc) + else: + # TODO -- might have been done outside already! + if dicoms: + seqinfo = group_dicoms_into_seqinfos( + dicoms, + file_filter=getattr(heuristic, 'filter_files', None), + dcmfilter=getattr(heuristic, 'filter_dicom', None), + grouping=None, # no groupping + ) + seqinfo_list = list(seqinfo.keys()) + + filegroup = {si.series_id: x for si, x in seqinfo.items()} + + save_json(filegroup_file, filegroup) + dicominfo_file = os.path.join(idir, 'dicominfo%s.tsv' % ses_suffix) + with open(dicominfo_file, 'wt') as fp: + for seq in seqinfo_list: + fp.write('\t'.join([str(val) for val in seq]) + '\n') + lgr.debug("Calling out to %s.infodict", heuristic) + info = heuristic.infotodict(seqinfo_list) + write_config(info_file, info) + write_config(edit_file, info) + + # + # Conversion + # + + sourcedir = None + if is_bids: + sourcedir = os.path.join(outdir, 'sourcedata') + # the other portion of the path would mimic BIDS layout + # so we don't need to worry here about sub, ses at all + tdir = anon_outdir + else: + tdir = os.path.join(anon_outdir, anon_sid) + + if converter != 'none': + lgr.info("Doing conversion using %s", converter) + cinfo = conversion_info(anon_sid, tdir, info, filegroup, + ses=ses) + convert(cinfo, + converter=converter, + scaninfo_suffix=getattr( + heuristic, 'scaninfo_suffix', '.json'), + custom_callable=getattr( + heuristic, 'custom_callable', None), + with_prov=with_prov, + is_bids=is_bids, + sourcedir=sourcedir, + outdir=tdir) + + if is_bids: + if seqinfo: + keys = list(seqinfo) + add_participant_record( + anon_outdir, + anon_sid, + keys[0].patient_age, + keys[0].patient_sex, + ) + populate_bids_templates( + anon_outdir, + getattr(heuristic, 'DEFAULT_FIELDS', {}) + ) + + +def get_annonimized_sid(sid, anon_sid_cmd): + anon_sid = sid + if anon_sid_cmd is not None: + from subprocess import check_output + anon_sid = check_output([anon_sid_cmd, sid]).strip() + lgr.info("Annonimized sid %s into %s", sid, anon_sid) + return anon_sid + + +def get_extracted_dicoms(fl): + """Given a list of files, possibly extract some from tarballs + + For 'classical' heudiconv, if multiple tarballs are provided, they correspond + to different sessions, so here we would group into sessions and return + pairs `sessionid`, `files` with `sessionid` being None if no "sessions" + detected for that file or there was just a single tarball in the list + """ + # TODO: bring check back? 
+ # if any(not tarfile.is_tarfile(i) for i in fl): + # raise ValueError("some but not all input files are tar files") + + # tarfiles already know what they contain, and often the filenames + # are unique, or at least in a unqiue subdir per session + # strategy: extract everything in a temp dir and assemble a list + # of all files in all tarballs + tmpdir = tempdirs(prefix='heudiconvdcm') + + sessions = defaultdict(list) + session = 0 + if not isinstance(fl, (list, tuple)): + fl = list(fl) + + # needs sorting to keep the generated "session" label deterministic + for i, t in enumerate(sorted(fl)): + # "classical" heudiconv has that heuristic to handle multiple + # tarballs as providing different sessions per each tarball + if not tarfile.is_tarfile(t): + sessions[None].append(t) + continue # the rest is tarball specific + + tf = tarfile.open(t) + # check content and sanitize permission bits + tmembers = tf.getmembers() + for tm in tmembers: + tm.mode = 0o700 + # get all files, assemble full path in tmp dir + tf_content = [m.name for m in tmembers if m.isfile()] + # store full paths to each file, so we don't need to drag along + # tmpdir as some basedir + sessions[session] = [opj(tmpdir, f) for f in tf_content] + session += 1 + # extract into tmp dir + tf.extractall(path=tmpdir, members=tmembers) + + if session == 1: + # we had only 1 session, so no really multiple sessions according + # to classical 'heudiconv' assumptions, thus just move them all into + # None + sessions[None] += sessions.pop(0) + + return sessions.items() + + +def load_heuristic(heuristic_file): + """Load heuristic from the file, return the module + """ + path, fname = os.path.split(heuristic_file) + sys.path.append(path) + mod = __import__(fname.split('.')[0]) + mod.filename = heuristic_file + return mod + + +def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, + session, sids, grouping='studyUID'): + """Given options from cmdline sort files or dicom seqinfos into + study_sessions which put together files for a single session of a subject + in a study + + Two major possible workflows: + - if dicom_dir_template provided -- doesn't pre-load DICOMs and just + loads files pointed by each subject and possibly sessions as corresponding + to different tarballs + - if files_opt is provided, sorts all DICOMs it can find under those paths + """ + study_sessions = {} + if dicom_dir_template: + dicom_dir_template = os.path.abspath(dicom_dir_template) + assert not files_opt # see above TODO + assert sids + # expand the input template + if '{subject}' not in dicom_dir_template: + raise ValueError( + "dicom dir template must have {subject} as a placeholder for a " + "subject id. 
Got %r" % dicom_dir_template) + for sid in sids: + sdir = dicom_dir_template.format(subject=sid, session=session) + # and see what matches + files = sorted(glob(sdir)) + for session_, files_ in get_extracted_dicoms(files): + if session_ is not None and session: + lgr.warning( + "We had session specified (%s) but while analyzing " + "files got a new value %r (using it instead)" + % (session, session_)) + # in this setup we do not care about tracking "studies" so + # locator would be the same None + study_sessions[ + StudySessionInfo( + None, + session_ if session_ is not None else session, + sid, + )] = files_ + else: + # prep files + assert files_opt + assert not sids + files = [] + for f in files_opt: + if isdir(f): + files += sorted(find_files( + '.*', topdir=f, exclude_vcs=True, exclude="/\.datalad/")) else: - lgr.debug("DICOMS were already processed, reusing that info") - save_json(os.path.join(idir, 'filegroup.json'), filegroup) - with open(os.path.join(idir, 'dicominfo.txt'), 'wt') as fp: - fp.write('\t'.join([val for val in seqinfo_fields]) + '\n') - for seq in seqinfo: - fp.write('\t'.join([str(val) for val in seq]) + '\n') - lgr.debug("Calling out to %s.infodict", mod) - info = mod.infotodict(seqinfo) - write_config(infofile, info) - write_config(editfile, info) - - # - # Conversion - # - - sourcedir = None - if is_bids: - sourcedir = os.path.join(outdir, 'sourcedata', sid) - if ses: - sourcedir = os.path.join(sourcedir, 'ses-%s' % str(ses)) - - #tdir = os.path.join(anon_outdir, anon_sid) - if anon_sid_cmd: - tdir = os.path.join(anon_outdir, anon_sid) - else: - tdir = os.path.dirname(idir) - - if converter != 'none': - lgr.info("Doing conversion using %s", converter) - cinfo = conversion_info(anon_sid, tdir, info, filegroup, - basedir=tmpdir, - ses=ses) - convert(cinfo, - converter=converter, - scaninfo_suffix=getattr( - mod, 'scaninfo_suffix', '.json'), - custom_callable=getattr( - mod, 'custom_callable', None), - with_prov=with_prov, - is_bids=is_bids, - sourcedir=sourcedir, - min_meta=min_meta) - - # - # Cleanup - # - if tmpdir is not None: - # clean up tmp dir with extracted tarball - shutil.rmtree(tmpdir) + files.append(f) + + # in this scenario we don't care about sessions obtained this way + files_ = [] + for _, files_ex in get_extracted_dicoms(files): + files_ += files_ex + + # sort all DICOMS using heuristic + # TODO: this one is not groupping by StudyUID but may be we should! + seqinfo_dict = group_dicoms_into_seqinfos( + files_, + file_filter=getattr(heuristic, 'filter_files', None), + dcmfilter=getattr(heuristic, 'filter_dicom', None), + grouping=grouping) + + if not getattr(heuristic, 'infotoids', None): + raise NotImplementedError( + "For now, if no subj template is provided, requiring " + "heuristic to have infotoids") + + for studyUID, seqinfo in seqinfo_dict.items(): + # so we have a single study, we need to figure out its + # locator, session, subject + # TODO: Try except to ignore those we can't handle? + # actually probably there should be a dedicated exception for + # heuristics to throw if they detect that the study they are given + # is not the one they would be willing to work on + ids = heuristic.infotoids(seqinfo.keys(), outdir=outdir) + # TODO: probably infotoids is doomed to do more and possibly + # split into multiple sessions!!!! 
but then it should be provided + # full seqinfo with files which it would place into multiple groups + lgr.info("Study session for %s" % str(ids)) + study_session_info = StudySessionInfo( + ids.get('locator'), + ids.get('session', session) or session, + ids.get('subject', None)) + if study_session_info in study_sessions: + #raise ValueError( + lgr.warning( + "We already have a study session with the same value %s" + % repr(study_session_info)) + continue # skip for now + study_sessions[study_session_info] = seqinfo + + return study_sessions # # Additional handlers @@ -794,7 +1504,250 @@ def is_interactive(): return sys.stdin.isatty() and sys.stdout.isatty() and sys.stderr.isatty() -_sys_excepthook = sys.excepthook # Just in case we ever need original one +def create_file_if_missing(filename, content): + """Create file if missing, so we do not override any possibly introduced changes""" + if exists(filename): + return False + with open(filename, 'w') as f: + f.write(content) + return True + + +def populate_bids_templates(path, defaults={}): + # dataset descriptor + lgr.info("Populating template files under %s", path) + descriptor = opj(path, 'dataset_description.json') + if not exists(descriptor): + save_json(descriptor, + ordereddict([ + ('Name', "TODO: name of the dataset"), + ('BIDSVersion', "1.0.1"), + ('License', defaults.get('License', "TODO: choose a license, e.g. PDDL (http://opendatacommons.org/licenses/pddl/)")), + ('Authors', defaults.get('Authors', ["TODO:", "First1 Last1", "First2 Last2", "..."])), + ('Acknowledgements', defaults.get('Acknowledgements', 'TODO: whom you want to acknowledge')), + ('HowToAcknowledge', "TODO: describe how to acknowledge -- either cite a corresponding paper, or just in acknowledgement section"), + ('Funding', ["TODO", "GRANT #1", "GRANT #2"]), + ('ReferencesAndLinks', ["TODO", "List of papers or websites"]), + ('DatasetDOI', 'TODO: eventually a DOI for the dataset') + ])) + + sourcedata_README = opj(path, 'sourcedata', 'README') + if exists(dirname(sourcedata_README)): + create_file_if_missing( + sourcedata_README, + """\ +TODO: Provide description about source data, e.g. + +Directory below contains DICOMS compressed into tarballs per each sequence, +replicating directory hierarchy of the BIDS dataset itself. +""") + + create_file_if_missing( + opj(path, 'CHANGES'), + """\ +0.0.1 Initial data acquired + +TODOs: + - verify and possibly extend information in participants.tsv + (see for example http://datasets.datalad.org/?dir=/openfmri/ds000208) + - fill out dataset_description.json, README, sourcedata/README (if present) + - provide _events.tsv file for each _bold.nii.gz with onsets of events + (see "8.5 Task events" of BIDS specification) +""") + + create_file_if_missing( + opj(path, 'README'), + """\ +TODO: Provide description for the dataset -- basic details about the study, +possibly pointing to pre-registration (if public or embargoed) +""") + + # TODO: collect all task- .json files for func files to + tasks = {} + # way too many -- let's just collect all which are the same! 
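+    # Sketch of the intended reduction (hypothetical values, illustration
+    # only): if two task-rest runs expose
+    #   {'RepetitionTime': 2.0, 'EchoTime': 0.03} and
+    #   {'RepetitionTime': 2.0, 'EchoTime': 0.025}
+    # only RepetitionTime would survive into the shared task-rest_bold.json
+    # stub, since fields differing across runs are dropped below.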
+ # FIELDS_TO_TRACK = {'RepetitionTime', 'FlipAngle', 'EchoTime', 'Manufacturer', 'SliceTiming', ''} + for fpath in find_files('.*_task-.*\_bold\.json', topdir=path, + exclude_vcs=True, exclude="/\.(datalad|heudiconv)/"): + task = re.sub('.*_(task-[^_\.]*(_acq-[^_\.]*)?)_.*', r'\1', fpath) + j = load_json(fpath) + if task not in tasks: + tasks[task] = j + else: + rec = tasks[task] + # let's retain only those fields which have the same value + for field in sorted(rec): + if field not in j or j[field] != rec[field]: + del rec[field] + + # create a stub onsets file for each one of those + suf = '_bold.json' + assert fpath.endswith(suf) + events_file = fpath[:-len(suf)] + '_events.tsv' + lgr.debug("Generating %s", events_file) + with open(events_file, 'w') as f: + f.write("onset\tduration\ttrial_type\tTODO -- fill in rows and add more tab-separated columns if desired") + + # - extract tasks files stubs + for task_acq, fields in tasks.items(): + task_file = opj(path, task_acq + '_bold.json') + lgr.debug("Generating %s", task_file) + fields["TaskName"] = "TODO: full task name for %s" % task_acq.split('_')[0].split('-')[1] + fields["CogAtlasID"] = "TODO" + with open(task_file, 'w') as f: + f.write(json_dumps_pretty(fields, indent=2, sort_keys=True)) + + +def add_participant_record(studydir, subject, age, sex): + participants_tsv = opj(studydir, 'participants.tsv') + participant_id = 'sub-%s' % subject + + if not create_file_if_missing( + participants_tsv, + '\t'.join(['participant_id', 'age', 'sex', 'group']) + '\n' + ): + # check if may be subject record already exists + with open(participants_tsv) as f: + f.readline() + known_subjects = {l.split('\t')[0] for l in f.readlines()} + if participant_id in known_subjects: + # already there -- not adding + return + + # Add a new participant + with open(participants_tsv, 'a') as f: + f.write('\t'.join(map(str, [ + participant_id, + age.lstrip('0').rstrip('Y') if age else 'N/A', + sex, + 'control'])) + + '\n') + + +def mark_sensitive(ds, path_glob=None): + """ + + Parameters + ---------- + ds : Dataset to operate on + path_glob : str, optional + glob of the paths within dataset to work on + + Returns + ------- + None + """ + sens_kwargs = dict( + init=[('distribution-restrictions', 'sensitive')] + ) + if path_glob: + paths = glob(opj(ds.path, path_glob)) + if not paths: + return + sens_kwargs['path'] = paths + ds.metadata(**sens_kwargs) + + +def add_to_datalad(topdir, studydir, msg=None, bids=False): + """Do all necessary preparations (if were not done before) and save + """ + from datalad.api import create + from datalad.api import Dataset + from datalad.support.annexrepo import AnnexRepo + + from datalad.support.external_versions import external_versions + assert external_versions['datalad'] >= '0.5.1', "Need datalad >= 0.5.1" + + studyrelpath = os.path.relpath(studydir, topdir) + assert not studyrelpath.startswith(os.path.pardir) # so we are under + # now we need to test and initiate a DataLad dataset all along the path + curdir_ = topdir + superds = None + subdirs = [''] + studyrelpath.split(os.path.sep) + for isubdir, subdir in enumerate(subdirs): + curdir_ = opj(curdir_, subdir) + ds = Dataset(curdir_) + if not ds.is_installed(): + lgr.info("Initiating %s", ds) + # would require annex > 20161018 for correct operation on annex v6 + ds_ = create(curdir_, dataset=superds, + force=True, + no_annex=True, # need to add .gitattributes first anyways + shared_access='all', + annex_version=6) + assert ds == ds_ + assert ds.is_installed() + superds = ds + + 
create_file_if_missing( + opj(studydir, '.gitattributes'), + """\ +* annex.largefiles=(largerthan=100kb) +*.json annex.largefiles=nothing +*.txt annex.largefiles=nothing +*.tsv annex.largefiles=nothing +*.nii.gz annex.largefiles=anything +*.tgz annex.largefiles=anything +*_scans.tsv annex.largefiles=anything +""") + # so for mortals it just looks like a regular directory! + if not ds.config.get('annex.thin'): + ds.config.add('annex.thin', 'true', where='local') + # initialize annex there if not yet initialized + AnnexRepo(ds.path, init=True) + # ds might have memories of having ds.repo GitRepo + superds = None + del ds + ds = Dataset(studydir) + # Add doesn't have all the options of save such as msg and supers + ds.add('.gitattributes', to_git=True, save=False) + dsh = None + if os.path.lexists(os.path.join(ds.path, '.heudiconv')): + dsh = Dataset(opj(ds.path, '.heudiconv')) + if not dsh.is_installed(): + # we need to create it first + dsh = ds.create(path='.heudiconv', + force=True, + shared_access='all') + # Since .heudiconv could contain sensitive information + # we place all files under annex and then add + if create_file_if_missing( + opj(dsh.path, '.gitattributes'), + """* annex.largefiles=anything + """): + dsh.add('.gitattributes', message="Added gitattributes to place all content under annex") + ds.add('.', recursive=True, save=False, + # not in effect! ? + #annex_add_opts=['--include-dotfiles'] + ) + + # TODO: filter for only changed files? + # Provide metadata for sensitive information + mark_sensitive(ds, 'sourcedata') + mark_sensitive(ds, '*_scans.tsv') # top level + mark_sensitive(ds, '*/*_scans.tsv') # within subj + mark_sensitive(ds, '*/anat') # within subj + mark_sensitive(ds, '*/*/anat') # within subj/ses + if dsh: + mark_sensitive(dsh) # entire .heudiconv! + dsh.save(message=msg) + ds.save(message=msg, recursive=True, super_datasets=True) + + assert not ds.repo.dirty + # TODO: they are still appearing as native annex symlinked beasts + """ + TODOs: + it needs + - unlock (thin will be in effect) + - save/commit (does modechange 120000 => 100644 + + - could potentially somehow automate that all: + http://git-annex.branchable.com/tips/automatically_adding_metadata/ + - possibly even make separate sub-datasets for originaldata, derivatives ? + """ + + +_sys_excepthook = sys.excepthook # Just in case we ever need original one + def setup_exceptionhook(): """Overloads default sys.excepthook with our exceptionhook handler. 
@@ -805,54 +1758,240 @@ def setup_exceptionhook(): def _pdb_excepthook(type, value, tb): if is_interactive(): - import traceback, pdb + import traceback + import pdb traceback.print_exception(type, value, tb) print() pdb.post_mortem(tb) else: lgr.warn("We cannot setup exception hook since not in interactive mode") - # we are in interactive mode or we don't have a tty-like - # device, so we call the default hook - #sys.__excepthook__(type, value, tb) _sys_excepthook(type, value, tb) sys.excepthook = _pdb_excepthook -def main(args=None): +def _main(args): + """Given a structure of arguments from the parser perform computation""" + + # + # Deal with provided files or templates + # + + # + # pre-process provided list of files and possibly sort into groups/sessions + # + + # Group files per each study/sid/session + + dicom_dir_template = args.dicom_dir_template + files_opt = args.files + session = args.session + subjs = args.subjs + outdir = os.path.abspath(args.outdir) + grouping = args.grouping + + if args.command: + # custom mode of operation + if args.command == 'treat-json': + for f in files_opt: + treat_infofile(f) + elif args.command == 'ls': + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + heuristic_ls = getattr(heuristic, 'ls', None) + for f in files_opt: + study_sessions = get_study_sessions( + dicom_dir_template, [f], + heuristic, outdir, session, subjs, grouping=grouping) + print(f) + for study_session, sequences in study_sessions.items(): + suf = '' + if heuristic_ls: + suf += heuristic_ls(study_session, sequences) + print( + "\t%s %d sequences%s" + % (str(study_session), len(sequences), suf) + ) + elif args.command == 'populate-templates': + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + for f in files_opt: + populate_bids_templates( + f, + getattr(heuristic, 'DEFAULT_FIELDS', {}) + ) + elif args.command == 'sanitize-jsons': + tuneup_bids_json_files(files_opt) + else: + raise ValueError("Unknown command %s", args.command) + return + + # + # Load heuristic -- better do it asap to make sure it loads correctly + # + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + # TODO: Move into a function! + study_sessions = get_study_sessions( + dicom_dir_template, files_opt, + heuristic, outdir, session, subjs, + grouping=grouping) + # extract tarballs, and replace their entries with expanded lists of files + # TODO: we might need to sort so sessions are ordered??? + lgr.info("Need to process %d study sessions", len(study_sessions)) + + # + # processed_studydirs = set() + + for (locator, session, sid), files_or_seqinfo in study_sessions.items(): + + if not len(files_or_seqinfo): + raise ValueError("nothing to process?") + # that is how life is ATM :-/ since we don't do sorting if subj + # template is provided + if isinstance(files_or_seqinfo, dict): + assert(isinstance(list(files_or_seqinfo.keys())[0], SeqInfo)) + dicoms = None + seqinfo = files_or_seqinfo + else: + dicoms = files_or_seqinfo + seqinfo = None + + if locator == 'unknown': + lgr.warning("Skipping unknown locator dataset") + continue + + if args.queue: + if seqinfo and not dicoms: + # flatten them all and provide into batching, which again + # would group them... 
heh + dicoms = sum(seqinfo.values(), []) + # so + raise NotImplementedError( + "we already groupped them so need to add a switch to avoid " + "any groupping, so no outdir prefix doubled etc" + ) + # TODO This needs to be updated to better scale with additional args + progname = os.path.abspath(inspect.getfile(inspect.currentframe())) + convertcmd = ' '.join(['python', progname, + '-o', study_outdir, + '-f', heuristic.filename, + '-s', sid, + '--anon-cmd', args.anon_cmd, + '-c', args.converter]) + if session: + convertcmd += " --ses '%s'" % session + if args.with_prov: + convertcmd += " --with-prov" + if args.bids: + convertcmd += " --bids" + convertcmd += ["'%s'" % f for f in dicoms] + + script_file = 'dicom-%s.sh' % sid + with open(script_file, 'wt') as fp: + fp.writelines(['#!/bin/bash\n', convertcmd]) + outcmd = 'sbatch -J dicom-%s -p %s -N1 -c2 --mem=20G %s' \ + % (sid, args.queue, script_file) + os.system(outcmd) + continue + + anon_sid = get_annonimized_sid(sid, args.anon_cmd) + + study_outdir = opj(outdir, locator or '') + + anon_outdir = args.conv_outdir or outdir + anon_study_outdir = opj(anon_outdir, locator or '') + + # TODO: --datalad cmdline option, which would take care about initiating + # the outdir -> study_outdir datasets if not yet there + if args.datalad: + datalad_msg_suf = ' %s' % anon_sid + if session: + datalad_msg_suf += ", session %s" % session + if seqinfo: + datalad_msg_suf += ", %d sequences" % len(seqinfo) + datalad_msg_suf += ", %d dicoms" % ( + len(sum(seqinfo.values(), [])) if seqinfo else len(dicoms) + ) + from datalad.api import Dataset + ds = Dataset(anon_study_outdir) + if not exists(anon_outdir) or not ds.is_installed(): + add_to_datalad( + anon_outdir, anon_study_outdir, + msg="Preparing for %s" % datalad_msg_suf, + bids=args.bids) + lgr.info("PROCESSING STARTS: {0}".format( + str(dict(subject=sid, outdir=study_outdir, session=session)))) + convert_dicoms( + sid, + dicoms, + study_outdir, + heuristic=heuristic, + converter=args.converter, + anon_sid=anon_sid, + anon_outdir=anon_study_outdir, + with_prov=args.with_prov, + ses=session, + is_bids=args.bids, + seqinfo=seqinfo, + min_meta=args.minmeta) + lgr.info("PROCESSING DONE: {0}".format( + str(dict(subject=sid, outdir=study_outdir, session=session)))) + + if args.datalad: + msg = "Converted subject %s" % datalad_msg_suf + # TODO: whenever propagate to supers work -- do just + # ds.save(msg=msg) + # also in batch mode might fail since we have no locking ATM + # and theoretically no need actually to save entire study + # we just need that + add_to_datalad(outdir, study_outdir, msg=msg, bids=args.bids) + + # if args.bids: + # # Let's populate BIDS templates for folks to take care about + # for study_outdir in processed_studydirs: + # populate_bids_templates(study_outdir) + # + # # TODO: record_collection of the sid/session although that information + # # is pretty much present in .heudiconv/SUBJECT/info so we could just poke there + + tempdirs.cleanup() + + +def get_parser(): docstr = '\n'.join((__doc__, -""" - Example: + """ + Example: - heudiconv -d rawdata/%s -o . -f heuristic.py -s s1 s2 -s3 -""")) + heudiconv -d rawdata/{subject} -o . 
-f + heuristic.py -s s1 s2 + s3 + """)) parser = argparse.ArgumentParser(description=docstr) parser.add_argument('--version', action='version', version=__version__) parser.add_argument('-d', '--dicom_dir_template', dest='dicom_dir_template', - required=True, + required=False, help='''location of dicomdir that can be indexed with subject id {subject} and session {session}. Tarballs (can be compressed) are supported in addition to directory. All matching tarballs for a subject are extracted and their content processed in a single pass''') - parser.add_argument('-s', '--subjects', dest='subjs', required=True, - type=str, nargs='+', help='list of subjects') + parser.add_argument('-s', '--subjects', dest='subjs', + type=str, nargs='*', + help='list of subjects. If not provided, DICOMS would ' + 'first be "sorted" and subject IDs deduced by the ' + 'heuristic') parser.add_argument('-c', '--converter', dest='converter', - required=True, - choices=('dcm2niix', - 'none'), + default='dcm2niix', + choices=('dcm2niix', 'none'), help='''tool to use for dicom conversion. Setting to "none" disables the actual conversion step -- useful for testing heuristics.''') - parser.add_argument('-o', '--outdir', dest='outputdir', + parser.add_argument('-o', '--outdir', dest='outdir', default=os.getcwd(), help='''output directory for conversion setup (for further customization and future reference. This directory will refer to non-anonymized subject IDs''') - parser.add_argument('-a', '--conv-outdir', dest='conv_outputdir', + parser.add_argument('-a', '--conv-outdir', dest='conv_outdir', default=None, help='''output directory for converted files. By default this is identical to --outdir. This option is @@ -863,39 +2002,77 @@ s3 DICOMs to anonymmized IDs. Such command must take a single argument and return a single anonymized ID. Also see --conv-outdir''') - parser.add_argument('-f', '--heuristic', dest='heuristic_file', required=True, + parser.add_argument('-f', '--heuristic', dest='heuristic_file', + required=True, help='python script containing heuristic') parser.add_argument('-q', '--queue', dest='queue', default=None, help='''select batch system to submit jobs to instead of running the conversion serially''') parser.add_argument('-p', '--with-prov', dest='with_prov', action='store_true', - help='''Store additional provenance information. Requires python-rdflib.''') + help='''Store additional provenance information. + Requires python-rdflib.''') parser.add_argument('-ss', '--ses', dest='session', default=None, - help='''session for longitudinal studies, default is none''') + help='''session for longitudinal study_sessions, default is none''') parser.add_argument('-b', '--bids', dest='bids', action='store_true', help='''flag for output into BIDS structure''') + parser.add_argument('--overwrite', dest='overwrite', action='store_true', + help='''flag to allow overwrite existing files''') + parser.add_argument('--datalad', dest='datalad', action='store_true', + help='''Store the entire collection as DataLad dataset(s). + Small files will be committed directly to git, while large to annex. + New version (6) of annex repositories will be used in a "thin" + mode so it would look to mortals as just any other regular directory + (i.e. no symlinks to under .git/annex). 
For now just for BIDS mode.''') parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") - parser.add_argument('--sbargs', dest='sbatch_args', default='-N1 -c2 --mem=20G -t 01:00:00', - help="Additional sbatch arguments if running with queue arg") + parser.add_argument('--command', dest='command', + choices=('treat-json', 'ls', 'populate-templates'), + help='''custom actions to be performed on provided files instead of + regular operation.''') + parser.add_argument('-g', '--grouping', + default='studyUID', + choices=('studyUID', 'accession_number'), + help='''How to group dicoms (default: by studyUID)''') + parser.add_argument( + 'files', + nargs='*', + help="files (tarballs, dicoms) or directories containing files to " + "process. Specify one of the --dicom_dir_template or files " + "not both") parser.add_argument('--minmeta', dest='minmeta', action='store_true', help="Exclude dcmstack's meta information in sidecar jsons") - args = parser.parse_args(args) + return parser + + +def main(argv=None): + """Given a list of command line arguments, parse them and pass into _main + """ + parser = get_parser() + args = parser.parse_args(argv) + + # TODO: assign distribution-restrictions=sensitive + + # TODO(privat): entirety of .heudiconv/ should get under annex and be marked + # as sensitive since date incorporated in the path might leak + + # TODO: deprecate dicom_dir_template in favor of --files-templated or + # smth like that which could take {subject} {session} ... and process + # files argument(s) correspondingly before passing into group_dicoms_into_seqinfos + + if args.files and args.dicom_dir_template: + raise ValueError("Specify files or dicom_dir_template, not both") if args.debug: setup_exceptionhook() - convert_dicoms(args.subjs, os.path.abspath(args.dicom_dir_template), - os.path.abspath(args.outputdir), - heuristic_file=os.path.realpath(args.heuristic_file), - converter=args.converter, - queue=args.queue, - anon_sid_cmd=args.anon_cmd, - anon_outdir=args.conv_outputdir, - with_prov=args.with_prov, - ses=args.session, - is_bids=args.bids, - sbatch_args=args.sbatch_args, - min_meta=args.minmeta) + + orig_global_options = global_options.copy() + try: + global_options['overwrite'] = args.overwrite + return _main(args) + finally: + # reset back + for k, v in orig_global_options.items(): + global_options[k] = v if __name__ == '__main__': diff --git a/bin/heudiconv_monitor b/bin/heudiconv_monitor new file mode 100755 index 00000000..2faec825 --- /dev/null +++ b/bin/heudiconv_monitor @@ -0,0 +1,127 @@ +#!/usr/bin/env python +import logging +import os +import re +import subprocess +import time +import json + +from collections import deque, OrderedDict +from datetime import date +import inotify.adapters +from inotify.constants import IN_MODIFY, IN_CREATE, IN_ISDIR +from py.path import local as localpath +from tinydb import TinyDB + +_DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' +_LOGGER = logging.getLogger(__name__) + +MASK = (IN_MODIFY | IN_CREATE) +MASK_NEWDIR = (IN_CREATE | IN_ISDIR) +WAIT_TIME = 86400 # in seconds + + +#def _configure_logging(): +_LOGGER.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +formatter = logging.Formatter(_DEFAULT_LOG_FORMAT) +ch.setFormatter(formatter) +_LOGGER.addHandler(ch) + + +def run_heudiconv(cmd): + info_dict = dict() + proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return_code = proc.wait() + if return_code == 
0: + _LOGGER.info("Done running {0}".format(cmd)) + info_dict['success'] = 1 + else: + _LOGGER.error("{0} failed".format(cmd)) + info_dict['success'] = 0 + # get info on what we run + stdout = proc.communicate()[0].decode('utf-8') + match = re.match('INFO: PROCESSING STARTS: (.*)', stdout) + info_dict_ = json.loads(match.group(1) if match else '{}') + info_dict.update(info_dict_) + return stdout, info_dict + + +def process(paths2process, db, wait=WAIT_TIME, logdir='log'): + cmd = 'ls -l {0}' + #if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: + processed = [] + for path, mod_time in paths2process.items(): + if time.time() - mod_time > wait: + #process_me = paths2process.popleft().decode('utf-8') + process_me = path + cmd_ = cmd.format(process_me) + process_dict = {'input_path': process_me, 'accession_number': os.path.basename(process_me)} + print("Time to process {0}".format(process_me)) + stdout, run_dict = run_heudiconv(cmd_) + process_dict.update(run_dict) + db.insert(process_dict) + # save log + logdir = localpath(logdir) + log = logdir.join(process_dict['accession_number'] + '.log') + log.write(stdout) + # if we processed it, or it failed, we need to remove it to avoid running it again + processed.append(path) + for processed_path in processed: + del paths2process[processed_path] + + +def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None, wait=WAIT_TIME, logdir='log'): + # make logdir if not existant + try: + os.makedirs(parsed.logdir) + except OSError: + pass + #paths2process = deque() + paths2process = OrderedDict() + # watch only today's folder + path_re = re.compile("(%s%s)/?$" % (topdir, check_ptrn)) + i = inotify.adapters.InotifyTree(topdir.encode())#, mask=MASK) + for event in i.event_gen(): + if event is not None: + (header, type_names, watch_path, filename) = event + _LOGGER.info("WD=(%d) MASK=(%d) COOKIE=(%d) LEN=(%d) MASK->NAMES=%s " + "WATCH-PATH=[%s] FILENAME=[%s]", + header.wd, header.mask, header.cookie, header.len, type_names, + watch_path.decode('utf-8'), filename.decode('utf-8')) + if header.mask == MASK_NEWDIR and path_re.match(watch_path.decode('utf-8')): + # we got our directory, now let's do something on it + newpath2process = os.path.join(watch_path, filename).decode('utf-8') + #paths2process.append(newpath2process) + # update time + paths2process[newpath2process] = time.time() + print(newpath2process, time.time()) + # check if we need to update the time + for path in paths2process.keys(): + if path in watch_path.decode('utf-8'): + paths2process[path] = time.time() + print("Updating {0}: {1}".format(path, paths2process[path])) + + # check if there's anything to process + process(paths2process, db, wait=wait, logdir=logdir) + + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(prog='monitor.py', description='Small monitoring script to detect new directories and process them', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('path', help='Which directory to monitor') + parser.add_argument('--check_ptrn', '-p', help='regexp pattern for which subdirectories to check', default='/20../../..') + parser.add_argument('--database', '-d', help='database location', default='database.json') + parser.add_argument('--wait_time', '-w', help='After how long should we start processing datasets? 
(in seconds)', default=86400, type=float) + parser.add_argument('--logdir', '-l', help='Where should we save the logs?', default='log') + + + return parser.parse_args() + + +if __name__ == '__main__': + parsed = parse_args() + print('Got {0}'.format(parsed)) + # open database + db = TinyDB(parsed.database, default_table='heudiconv') + monitor(parsed.path, parsed.check_ptrn, db, wait=parsed.wait_time, logdir=parsed.logdir) diff --git a/bin/test_heudiconv_monitor.py b/bin/test_heudiconv_monitor.py new file mode 100644 index 00000000..78c0285d --- /dev/null +++ b/bin/test_heudiconv_monitor.py @@ -0,0 +1,98 @@ +from collections import namedtuple +import os +import pytest +from mock import patch +from monitor import monitor, process, run_heudiconv, MASK_NEWDIR +from os.path import exists +from tinydb import TinyDB, Query +from subprocess import CalledProcessError + + +class MockInotifyTree(object): + def __init__(self, events): + self.events = iter(events) + def event_gen(self): + for e in self.events: + yield e + def __call__(self, topdir): + return self + + +class MockTime(object): + def __init__(self, time): + self.time = time + def __call__(self): + return self.time + + +Header = namedtuple('header', ['wd', 'mask', 'cookie', 'len']) +header = Header(5, MASK_NEWDIR, 5, 5) +watch_path = b'WATCHME' +filename = b'FILE' +type_names = b'TYPE' + +path2 = watch_path + b'/' + filename + b'/subpath' + +my_events = [(header, type_names, watch_path, filename), + (header, type_names, path2, b'')] + +@patch('inotify.adapters.InotifyTree', MockInotifyTree(my_events)) +@patch('time.time', MockTime(42)) +def test_monitor(capsys): + monitor(watch_path.decode(), check_ptrn='') + out, err = capsys.readouterr() + desired_output = '{0}/{1} {2}\n'.format(watch_path.decode(), filename.decode(), 42) + desired_output += 'Updating {0}/{1}: {2}\n'.format(watch_path.decode(), filename.decode(), 42) + assert out == desired_output + + +@patch('time.time', MockTime(42)) +@pytest.mark.parametrize("side_effect,success", [ + (None, 1), + (CalledProcessError('mycmd', 1), 0) +]) +def test_process(tmpdir, capsys, side_effect, success): + db_fn = tmpdir.join('database.json') + log_dir = tmpdir.mkdir('log') + db = TinyDB(db_fn.strpath) + process_me = '/my/path/A12345' + accession_number = os.path.basename(process_me) + paths2process = {process_me: 42} + with patch('subprocess.Popen') as mocked_popen: + stdout = b"INFO: PROCESSING STARTS: {'just': 'a test'}" + mocked_popen_instance = mocked_popen.return_value + mocked_popen_instance.side_effect = side_effect + mocked_popen_instance.communicate.return_value = (stdout, ) + # set return value for wait + mocked_popen_instance.wait.return_value = 1 - success + # mock also communicate to get the supposed stdout + # mocked_popen.communicate = lambda: (b"INFO: PROCESSING STARTS: {'just': 'a test'}", ) + process(paths2process, db, wait=-30, logdir=log_dir.strpath) + out, err = capsys.readouterr() + log_fn = log_dir.join(accession_number + '.log') + + mocked_popen.assert_called_once() + assert log_fn.check() + assert log_fn.read() == stdout.decode('utf-8') + assert db_fn.check() + # dictionary should be empty + assert not paths2process + assert out == 'Time to process {0}\n'.format(process_me) + + # check what we have in the database + Path = Query() + query = db.get(Path.input_path == process_me) + assert len(db) == 1 + assert query + assert query['success'] == success + assert query['accession_number'] == os.path.basename(process_me) + assert query['just'] == 'a test' + + +def 
test_run_heudiconv(): + # echo should succeed always + mydict = {'key1': 'value1', 'key2': 'value2', 'success': 1} + cmd = "echo INFO: PROCESSING STARTS: {0}".format(str(mydict)) + stdout, info_dict = run_heudiconv(cmd) + assert info_dict == mydict + assert "echo " + stdout.strip() == cmd diff --git a/custom/dbic/README b/custom/dbic/README new file mode 100644 index 00000000..844e7446 --- /dev/null +++ b/custom/dbic/README @@ -0,0 +1,3 @@ +Scripts and configurations used alongside with heudiconv setup at DBIC +(Dartmouth Brain Imaging Center). Might migrate to an independent repository +eventually but for now placed here with hope they might come useful for some. diff --git a/custom/dbic/singularity-env.def b/custom/dbic/singularity-env.def new file mode 100644 index 00000000..a7722e4a --- /dev/null +++ b/custom/dbic/singularity-env.def @@ -0,0 +1,64 @@ +# Copyright (c) 2015-2016, Gregory M. Kurtzer. All rights reserved. +# +# Changes for NeuroDebian/DBIC setup are Copyright (c) 2017 Yaroslav Halchenko. +# +# The purpose of the environment is to provide a complete suite for running +# heudiconv on the INBOX server to provide conversion into BIDS layout. +# ATM it does not ship heudiconv itself which would be accessed directly +# from the main drive for now. +# +# "Singularity" Copyright (c) 2016, The Regents of the University of California, +# through Lawrence Berkeley National Laboratory (subject to receipt of any +# required approvals from the U.S. Dept. of Energy). All rights reserved. + +# +# Notes: +# - Due to https://github.com/singularityware/singularity/issues/471 +# bootstrapping leads to non-usable/non-removable-without-reboot +# image due to some rogue run away processes. +# This line could help to kill them but should be used with caution +# since could kill other unrelated processes +# +# grep -l loop /proc/*/mountinfo | sed -e 's,/proc/\(.*\)/.*,\1,g' | while read pid; do sudo kill $pid; done + +BootStrap: debootstrap +#OSVersion: stable +# needs nipype 0.12.1 but that one didn't build for stable since needs python-prov... 
+# so trying stretch +OSVersion: stretch +MirrorURL: http://ftp.us.debian.org/debian/ + +# so if image is executed we just enter the environment +%runscript + echo "Welcome to the DBIC BIDS environment" + /bin/bash + + +%post + echo "Configuring the environment" + apt-get update + apt-get -y install eatmydata + eatmydata apt-get -y install vim wget strace time ncdu gnupg curl procps + wget -q -O/tmp/nd-configurerepo https://raw.githubusercontent.com/neurodebian/neurodebian/4d26c8f30433145009aa3f74516da12f560a5a13/tools/nd-configurerepo + bash /tmp/nd-configurerepo + chmod a+r -R /etc/apt + eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix python-dcmstack python-configparser python-funcsigs python-pytest dcmtk + + # for bids-validator + curl -sL https://deb.nodesource.com/setup_6.x | bash - && \ + eatmydata apt-get install -y nodejs + npm install -g bids-validator@0.20.0 + chmod a+rX -R /usr/lib/node_modules/ + + chmod a+rX -R /etc/apt/sources.list.d + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + apt-get clean + + # and wipe out apt lists since not to be used RW for further tuning + # find /var/lib/apt/lists/ -type f -delete + # /usr/bin/find /var/lib/apt/lists/ -type f -name \*Packages\* -o -name \*Contents\* + # complicates later interrogation - thus disabled + + # Create some bind mount directories present on rolando + mkdir -p /afs /inbox + chmod a+rX /afs /inbox diff --git a/dev-requirements.txt b/dev-requirements.txt index 78e5af36..26b77f68 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,3 +1,2 @@ -r requirements.txt -six -nose +pytest diff --git a/heuristics/banda-bids.py b/heuristics/banda-bids.py index 92d8391a..946c33b7 100644 --- a/heuristics/banda-bids.py +++ b/heuristics/banda-bids.py @@ -38,9 +38,9 @@ def infotodict(seqinfo): for idx, s in enumerate(seqinfo): # T1 and T2 scans if (s.dim3 == 208) and (s.dim4 == 1) and ('T1w' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] if (s.dim3 == 208) and ('T2w' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] # diffusion scans if ('dMRI_dir9' in s.protocol_name): key = None @@ -49,7 +49,7 @@ def infotodict(seqinfo): elif (s.dim4 == 1) and ('SBRef' in s.series_description): key = dwi_sbref if key: - info[key].append({'item': s.series_number}) + info[key].append({'item': s.series_id}) # functional scans if ('fMRI' in s.protocol_name): tasktype = s.protocol_name.split('fMRI')[1].split('_')[1] @@ -65,10 +65,10 @@ def infotodict(seqinfo): if 'conflict' in tasktype: key = conflict_sbref if 'gambling' in tasktype: key = gamble_sbref if key: - info[key].append({'item': s.series_number}) + info[key].append({'item': s.series_id}) if (s.dim4 == 3) and ('SpinEchoFieldMap' in s.protocol_name): dirtype = s.protocol_name.split('_')[-1] - info[fmap].append({'item': s.series_number, 'dir': dirtype}) + info[fmap].append({'item': s.series_id, 'dir': dirtype}) # You can even put checks in place for your protocol msg = [] diff --git a/heuristics/bids_with_ses.py b/heuristics/bids_with_ses.py index 1972b51b..e8dc84d8 100644 --- a/heuristics/bids_with_ses.py +++ b/heuristics/bids_with_ses.py @@ -37,38 +37,38 @@ def infotodict(seqinfo): last_run = len(seqinfo) for s in seqinfo: if (s.dim3 == 176 or s.dim3 == 352) and (s.dim4 == 1) and ('MEMPRAGE' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] elif (s.dim4 == 1) and ('MEMPRAGE' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] elif (s.dim3 == 176 or 
s.dim3 == 352) and (s.dim4 == 1) and ('T2_SPACE' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] elif ('field_mapping_diffusion' in s.protocol_name): - info[fm_diff].append([s.series_number]) + info[fm_diff].append([s.series_id]) elif (s.dim4 >= 70) and ('DIFFUSION_HighRes_AP' in s.protocol_name): - info[dwi_ap].append([s.series_number]) + info[dwi_ap].append([s.series_id]) elif ('DIFFUSION_HighRes_PA' in s.protocol_name): - info[dwi_pa].append([s.series_number]) + info[dwi_pa].append([s.series_id]) elif ('field_mapping_resting' in s.protocol_name): - info[fm_rest].append([s.series_number]) + info[fm_rest].append([s.series_id]) elif (s.dim4 == 144) and ('resting' in s.protocol_name): if not s.is_motion_corrected: - info[rs].append([(s.series_number)]) + info[rs].append([(s.series_id)]) elif (s.dim4 == 183 or s.dim4 == 366) and ('localizer' in s.protocol_name): if not s.is_motion_corrected: - info[boldt1].append([s.series_number]) + info[boldt1].append([s.series_id]) elif (s.dim4 == 227 or s.dim4 == 454) and ('transfer1' in s.protocol_name): if not s.is_motion_corrected: - info[boldt2].append([s.series_number]) + info[boldt2].append([s.series_id]) elif (s.dim4 == 227 or s.dim4 == 454) and ('transfer2' in s.protocol_name): if not s.is_motion_corrected: - info[boldt3].append([s.series_number]) + info[boldt3].append([s.series_id]) elif (('run1' in s.protocol_name) or ('run6' in s.protocol_name)) and (s.dim4 == 159): if not s.is_motion_corrected: - info[nofb_task].append([s.series_number]) + info[nofb_task].append([s.series_id]) elif (('run2' in s.protocol_name) or ('run3' in s.protocol_name) or ('run4' in s.protocol_name) or ('run5' in s.protocol_name)) and (s.dim4 == 159): if not s.is_motion_corrected: - info[fb_task].append([s.series_number]) + info[fb_task].append([s.series_id]) else: pass return info diff --git a/heuristics/cmrr_heuristic.py b/heuristics/cmrr_heuristic.py index 79c370c1..449b5473 100644 --- a/heuristics/cmrr_heuristic.py +++ b/heuristics/cmrr_heuristic.py @@ -31,41 +31,41 @@ def infotodict(seqinfo): for idx, s in enumerate(seqinfo): if (s.dim3 == 208) and (s.dim4 == 1) and ('T1w' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] if (s.dim3 == 208) and ('T2w' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] if (s.dim4 >= 99) and (('dMRI_dir98_AP' in s.protocol_name) or ('dMRI_dir99_AP' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] + 'AP' - info[dwi].append({'item': s.series_number, 'acq': acq}) + info[dwi].append({'item': s.series_id, 'acq': acq}) if (s.dim4 >= 99) and (('dMRI_dir98_PA' in s.protocol_name) or ('dMRI_dir99_PA' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] + 'PA' - info[dwi].append({'item': s.series_number, 'acq': acq}) + info[dwi].append({'item': s.series_id, 'acq': acq}) if (s.dim4 == 1) and (('dMRI_dir98_AP' in s.protocol_name) or ('dMRI_dir99_AP' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] - info[fmap_dwi].append({'item': s.series_number, 'dir': 'AP', 'acq': acq}) + info[fmap_dwi].append({'item': s.series_id, 'dir': 'AP', 'acq': acq}) if (s.dim4 == 1) and (('dMRI_dir98_PA' in s.protocol_name) or ('dMRI_dir99_PA' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] - info[fmap_dwi].append({'item': s.series_number, 'dir': 'PA', 'acq': acq}) + info[fmap_dwi].append({'item': s.series_id, 'dir': 'PA', 'acq': acq}) if (s.dim4 == 420) and ('rfMRI_REST_AP' in 
s.protocol_name): - info[rest].append({'item': s.series_number, 'acq': 'AP'}) + info[rest].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 420) and ('rfMRI_REST_PA' in s.protocol_name): - info[rest].append({'item': s.series_number, 'acq': 'PA'}) + info[rest].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 1) and ('rfMRI_REST_AP' in s.protocol_name): if seqinfo[idx + 1][9] != 420: continue - info[fmap_rest].append({'item': s.series_number, 'dir': 'AP', 'acq': ''}) + info[fmap_rest].append({'item': s.series_id, 'dir': 'AP', 'acq': ''}) if (s.dim4 == 1) and ('rfMRI_REST_PA' in s.protocol_name): - info[fmap_rest].append({'item': s.series_number, 'dir': 'PA', 'acq': ''}) + info[fmap_rest].append({'item': s.series_id, 'dir': 'PA', 'acq': ''}) if (s.dim4 == 346) and ('tfMRI_faceMatching_AP' in s.protocol_name): - info[face].append({'item': s.series_number, 'acq': 'AP'}) + info[face].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 346) and ('tfMRI_faceMatching_PA' in s.protocol_name): - info[face].append({'item': s.series_number, 'acq': 'PA'}) + info[face].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 288) and ('tfMRI_conflict_AP' in s.protocol_name): - info[conflict].append({'item': s.series_number, 'acq': 'AP'}) + info[conflict].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 288) and ('tfMRI_conflict_PA' in s.protocol_name): - info[conflict].append({'item': s.series_number, 'acq': 'PA'}) + info[conflict].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 223) and ('tfMRI_gambling_AP' in (s.protocol_name)): - info[gamble].append({'item': s.series_number, 'acq': 'AP'}) + info[gamble].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 223) and ('tfMRI_gambling_PA' in s.protocol_name): - info[gamble].append({'item': s.series_number, 'acq': 'PA'}) + info[gamble].append({'item': s.series_id, 'acq': 'PA'}) return info diff --git a/heuristics/convertall.py b/heuristics/convertall.py index 0b264a81..eb315e49 100644 --- a/heuristics/convertall.py +++ b/heuristics/convertall.py @@ -28,7 +28,7 @@ def infotodict(seqinfo): * total_files_till_now * example_dcm_file - * series_number + * series_id * dcm_dir_name * unspecified2 * unspecified3 @@ -48,5 +48,5 @@ def infotodict(seqinfo): * image_type """ - info[data].append(s.series_number) + info[data].append(s.series_id) return info diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 2262041d..046f125b 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -1,21 +1,284 @@ import os - -from collections import defaultdict +import re +from collections import OrderedDict +import hashlib +from glob import glob import logging lgr = logging.getLogger('heudiconv') +# dictionary from accession-number to runs that need to be marked as bad +# NOTE: even if filename has number that is 0-padded, internally no padding +# is done +fix_accession2run = { + 'A000005': ['^1-'], + 'A000035': ['^8-', '^9-'], + 'A000067': ['^9-'], + 'A000072': ['^5-'], + 'A000081': ['^5-'], + 'A000082': ['^5-'], + 'A000088': ['^9-'], + 'A000090': ['^5-'], + 'A000127': ['^21-'], + 'A000130': ['^15-'], + 'A000137': ['^9-', '^11-'], + 'A000297': ['^12-'], + 'A000326': ['^15-'], + 'A000376': ['^15-'], + 'A000384': ['^8-', '^11-'], + 'A000467': ['^15-'], + 'A000490': ['^15-'], + 'A000511': ['^15-'], +} + +# dictionary containing fixes, keys are md5sum of study_description from +# dicoms, in the form of PI-Experimenter^protocolname +# values are list of tuples in the form (regex_pattern, substitution) +protocols2fix 
= { + # QA + '43b67d9139e8c7274578b7451ab21123': + [ + #('anat-scout.*', 'anat-scout_ses-{date}'), + # do not change it so we retain _ses-{date} + #('anat-scout.*', 'anat-scout'), + ('BOLD_p2_s4_3\.5mm', 'func_task-rest_acq-p2-s4-3.5mm'), + ('BOLD_p2_s4', 'func_task-rest_acq-p2-s4'), + ('BOLD_p2_noprescannormalize', 'func-bold_task-rest_acq-p2noprescannormalize'), + ('BOLD_p2', 'func-bold_task-rest_acq-p2'), + ('BOLD_', 'func_task-rest'), + ('DTI_30_p2_s4_3\.5mm', 'dwi_acq-DTI-30-p2-s4-3.5mm'), + ('DTI_30_p2_s4', 'dwi_acq-DTI-30-p2-s4'), + ('DTI_30_p2', 'dwi_acq-DTI-30-p2'), + ('_p2_s4_3\.5mm', '_acq-p2-s4-3.5mm'), + ('_p2_s4', '_acq-p2-s4'), + ('_p2', '_acq-p2'), + ], + '9d148e2a05f782273f6343507733309d': + [('anat_', 'anat-'), + ('run-life[0-9]', 'run+_task-life'), + ('scout_run\+', 'scout'), + ('T2w', 'T2w_run+'), + # substitutions for old protocol names + ('AAHead_Scout_32ch-head-coil', 'anat-scout'), + ('MPRAGE', 'anat-T1w_acq-MPRAGE_run+'), + ('gre_field_mapping_2mm', 'fmap_run+_acq-2mm'), + ('gre_field_mapping_3mm', 'fmap_run+_acq-3mm'), + ('epi_bold_sms_p2_s4_2mm_life1_748', + 'func_run+_task-life_acq-2mm748'), + ('epi_bold_sms_p2_s4_2mm_life2_692', + 'func_run+_task-life_acq-2mm692'), + ('epi_bold_sms_p2_s4_2mm_life3_754', + 'func_run+_task-life_acq-2mm754'), + ('epi_bold_sms_p2_s4_2mm_life4_824', + 'func_run+_task-life_acq-2mm824'), + ('epi_bold_p2_3mm_nofs_life1_374', + 'func_run+_task-life_acq-3mmnofs374'), + ('epi_bold_p2_3mm_nofs_life2_346', + 'func_run+_task-life_acq-3mmnofs346'), + ('epi_bold_p2_3mm_nofs_life3_377', + 'func_run+_task-life_acq-3mmnofs377'), + ('epi_bold_p2_3mm_nofs_life4_412', + 'func_run+_task-life_acq-3mmnofs412'), + ('t2_space_sag_p4_iso', 'anat-T2w_run+'), + ('gre_field_mapping_2.4mm', 'fmap_run+_acq-2.4mm'), + ('rest_p2_sms4_2.4mm_64sl_1000tr_32te_600dyn', + 'func_run+_task-rest_acq-2.4mm64sl1000tr32te600dyn'), + ('DTI_30', 'dwi_run+_acq-30'), + ('t1_space_sag_p2_iso', 'anat-T1w_acq-060mm_run+')], + '76b36c80231b0afaf509e2d52046e964': + [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')], + 'c6d8fbccc72990bee61d28e73b2618a4': + [('run=', 'run+')], + 'a751cc977f1e354fcafcb0ea2de123bd': + [ + ('_unlabeled', '_task-unlabeled'), + ('_mSense', '_acq-mSense'), + ('_p1_sms4_2.5mm', '_acq-p1-sms4-2.5mm'), + ('_p1_sms4_3mm', '_acq-p1-sms4-3mm'), + ], + 'd160113cf5ea8c5d0cbbbe14ef625e76': + [ + ('_run0', '_run-0'), + ], + '1bd62e10672fe0b435a9aa8d75b45425': + [ + # need to add incrementing session -- study should have 2 + ('scout_run\+$', 'scout_run+_ses+'), + ], + 'da218a66de902adb3ad9407d514e3639': + [ + # those sequences renamed later to include DTI- in their acq- + # so fot consistency + ('hardi_64', 'dwi_acq-DTI-hardi64'), + ('acq-hardi', 'acq-DTI-hardi'), + ], +} +keys2replace = ['protocol_name', 'series_description'] + +# list containing StudyInstanceUID to skip -- hopefully doesn't happen too often +dicoms2skip = [ + '1.3.12.2.1107.5.2.43.66112.30000016110117002435700000001', + '1.3.12.2.1107.5.2.43.66112.30000016102813152550600000004', # double scout +] + +DEFAULT_FIELDS = { + # Let it just be in each json file extracted + #'Manufacturer': "Siemens", + #'ManufacturersModelName': "Prisma", + "Acknowledgements": + "We thank Terry Sacket and the rest of the DBIC (Dartmouth Brain Imaging " + "Center) personnel for assistance in data collection, and " + "Yaroslav Halchenko and Matteo Visconti for preparing BIDS dataset. 
" + "TODO: more", +} + + +def _delete_chars(from_str, deletechars): + """ Delete characters from string allowing for Python 2 / 3 difference + """ + try: + return from_str.translate(None, deletechars) + except TypeError: + return from_str.translate(str.maketrans('', '', deletechars)) + -def create_key(subdir, file_suffix, outtype=('nii.gz',), - annotation_classes=None): +def filter_dicom(dcmdata): + """Return True if a DICOM dataset should be filtered out, else False""" + return True if dcmdata.StudyInstanceUID in dicoms2skip else False + + +def filter_files(fn): + """Return True if a file should be kept, else False. + We're using it to filter out files that do not start with a number.""" + + # do not check for these accession numbers because they haven't been + # recopied with the initial number + donotfilter = ['A000012', 'A000013', 'A000020', 'A000041'] + + split = os.path.split(fn) + split2 = os.path.split(split[0]) + sequence_dir = split2[1] + split3 = os.path.split(split2[0]) + accession_number = split3[1] + return True + if accession_number == 'A000043': + # crazy one that got copied for some runs but not for others, + # so we are going to discard those that got copied and let heudiconv + # figure out the rest + return False if re.match('^[0-9]+-', sequence_dir) else True + elif accession_number == 'unknown': + # this one had some stuff without study description, filter stuff before + # collecting info, so it doesn't crash completely + return False if re.match('^[34][07-9]-sn', sequence_dir) else True + elif accession_number in donotfilter: + return True + elif accession_number.startswith('phantom-'): + # Accessions on phantoms, e.g. in dartmouth-phantoms/bids_test4-20161014 + return True + elif accession_number.startswith('heudiconvdcm'): + # we were given some tarball with dicoms which was extracted so we + # better obey + return True + else: + return True if re.match('^[0-9]+-', sequence_dir) else False + + +def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), + annotation_classes=None, prefix=''): if not subdir: raise ValueError('subdir must be a valid format string') # may be even add "performing physician" if defined?? 
- template = "{referring_physician_name}/{study_description}/{bids_subject_session_dir}/" \ - "%s/{bids_subject_session_prefix}_%s" % (subdir, file_suffix) + template = os.path.join( + prefix, + "{bids_subject_session_dir}", + subdir, + "{bids_subject_session_prefix}_%s" % file_suffix + ) return template, outtype, annotation_classes +def md5sum(string): + """Computes md5sum of as string""" + if not string: + return "" # not None so None was not compared to strings + m = hashlib.md5(string.encode()) + return m.hexdigest() + +def get_study_description(seqinfo): + # Centralized so we could fix/override + v = get_unique(seqinfo, 'study_description') + return v + +def get_study_hash(seqinfo): + # XXX: ad hoc hack + return md5sum(get_study_description(seqinfo)) + + +def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): + """Function that adds cancelme_ to known bad runs which were forgotten + """ + accession_number = get_unique(seqinfo, 'accession_number') + if accession_number in accession2run: + lgr.info("Considering some runs possibly marked to be " + "canceled for accession %s", accession_number) + badruns = accession2run[accession_number] + badruns_pattern = '|'.join(badruns) + for i, s in enumerate(seqinfo): + if re.match(badruns_pattern, s.series_id): + lgr.info('Fixing bad run {0}'.format(s.series_id)) + fixedkwargs = dict() + for key in keys2replace: + fixedkwargs[key] = 'cancelme_' + getattr(s, key) + seqinfo[i] = s._replace(**fixedkwargs) + return seqinfo + + +def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): + """Ad-hoc fixup for existing protocols + """ + study_hash = get_study_hash(seqinfo) + + if study_hash not in subsdict: + raise ValueError("I don't know how to fix {0}".format(study_hash)) + + # need to replace both protocol_name series_description + substitutions = subsdict[study_hash] + for i, s in enumerate(seqinfo): + fixed_kwargs = dict() + for key in keys: + value = getattr(s, key) + # replace all I need to replace + for substring, replacement in substitutions: + value = re.sub(substring, replacement, value) + fixed_kwargs[key] = value + # namedtuples are immutable + seqinfo[i] = s._replace(**fixed_kwargs) + + return seqinfo + + +def fix_seqinfo(seqinfo): + """Just a helper on top of both fixers + """ + # add cancelme to known bad runs + seqinfo = fix_canceled_runs(seqinfo) + study_hash = get_study_hash(seqinfo) + if study_hash in protocols2fix: + lgr.info("Fixing up protocol for {0}".format(study_hash)) + seqinfo = fix_dbic_protocol(seqinfo) + return seqinfo + + +def ls(study_session, seqinfo): + """Additional ls output for a seqinfo""" + #assert len(sequences) <= 1 # expecting only a single study here + #seqinfo = sequences.keys()[0] + return ' study hash: %s' % get_study_hash(seqinfo) + + +# XXX we killed session indicator! what should we do now?!!! +# WE DON:T NEED IT -- it will be provided into conversion_info as `session` +# So we just need subdir and file_suffix! def infotodict(seqinfo): """Heuristic evaluator for determining which runs belong where @@ -27,130 +290,578 @@ def infotodict(seqinfo): subindex: sub index within group session: scan index for longitudinal acq """ + + seqinfo = fix_seqinfo(seqinfo) lgr.info("Processing %d seqinfo entries", len(seqinfo)) and_dicom = ('dicom', 'nii.gz') - # decide on global prefix (TODO -- manage to do it outside of here) - - # no checks for consistency for now -- assume that those fields we rely on - # ARE the same - # Actually all below is taken care of outside now! 
- # s = seqinfo[0] - # - # session_id, subject_id = get_session_subject_id(s) - # - # session_suffix = session_prefix = '' - # if session_id: - # session_suffix = "_ses-%s" % session_id - # session_prefix = "ses-%s/" % session_id - # if not subject_id: - # raise ValueError("Could not figure out the subject id") - # - # lgr.debug("Figured out subject %s. Session prefix: %s", subject_id, session_prefix) - # del subject_id # not to be used - - t1 = create_key('anat', 'T1w', outtype=and_dicom) - t2 = create_key('anat', 'T2w', outtype=and_dicom) - fm_diff = create_key('fmap', 'fieldmap-dwi') - dwi_ap = create_key('dwi', 'dir-AP_dwi', outtype=and_dicom) - dwi_pa = create_key('dwi', 'dir-PA_dwi', outtype=and_dicom) - fm_rest = create_key('fmap', 'fieldmap-rest') - rs = create_key('func', 'task-rest_run-{item:02d}_bold', outtype=and_dicom) - boldt1 = create_key('func', 'task-bird1back_run-{item:02d}_bold', outtype=and_dicom) - boldt2 = create_key('func', 'task-letter1back_run-{item:02d}_bold', outtype=and_dicom) - boldt3 = create_key('func', 'task-letter2back_run-{item:02d}_bold', outtype=and_dicom) - nofb_task = create_key('func', 'task-nofb_run-{item:02d}_bold', outtype=and_dicom) - fb_task = create_key('func', 'task-fb_run-{item:02d}_bold', outtype=and_dicom) - #info = {t1: [], t2: [], fm_diff:[], dwi_ap:[], dwi_pa:[], fm_rest:[], rs:[], - # boldt1:[], boldt2:[], boldt3:[], nofb_task:[], fb_task:[]} - info = defaultdict(list) - last_run = len(seqinfo) + info = OrderedDict() skipped, skipped_unknown = [], [] - + current_run = 0 + run_label = None # run- + image_data_type = None for s in seqinfo: + # XXX: skip derived sequences, we don't store them to avoid polluting + # the directory + if s.is_derived: + skipped.append(s.series_id) + lgr.debug("Ignoring derived data %s", s.series_id) + continue + + # possibly apply present formatting in the series_description or protocol name + for f in 'series_description', 'protocol_name': + s = s._replace(**{f: getattr(s, f).format(**s._asdict())}) + template = None suffix = '' seq = [] - # figure out type of image from s.image_info - image_dir = { - 'P': 'fmap', + # figure out type of image from s.image_info -- just for checking ATM + # since we primarily rely on encoded in the protocol name information + prev_image_data_type = image_data_type + image_data_type = s.image_type[2] + image_type_seqtype = { + 'P': 'fmap', # phase 'FMRI': 'func', 'MPR': 'anat', - 'M': 'anat', + # 'M': 'func', "magnitude" -- can be for scout, anat, bold, fmap 'DIFFUSION': 'dwi', - }.get(s.image_type[2], None) + 'MIP_SAG': 'anat', # angiography + 'MIP_COR': 'anat', # angiography + 'MIP_TRA': 'anat', # angiography + }.get(image_data_type, None) - if image_dir is None: - # must be exhaustive! 
- raise ValueError( - "Cannot figure out type of image with image_info %s" - % str(s.image_type) - ) + protocol_name_tuned = s.protocol_name + # Few common replacements + if protocol_name_tuned in {'AAHead_Scout'}: + protocol_name_tuned = 'anat-scout' - if s.is_derived: - # Let's for now stash those close to original images - image_dir += '/derivative' - # just keep it lower case and without special characters - seq.append(s.series_description.lower()) - - # analyze s.protocol_name (series_number is based on it) for full name mapping etc - if image_dir == 'func': - if '_pace_' in s.protocol_name: - suffix += '_pace' # or should it be part of seq- + regd = parse_dbic_protocol_name(protocol_name_tuned) + + if image_data_type.startswith('MIP'): + regd['acq'] = regd.get('acq', '') + sanitize_str(image_data_type) + + if not regd: + skipped_unknown.append(s.series_id) + continue + + seqtype = regd.pop('seqtype') + seqtype_label = regd.pop('seqtype_label', None) + + if image_type_seqtype and seqtype != image_type_seqtype: + lgr.warning( + "Deduced seqtype to be %s from DICOM, but got %s out of %s", + image_type_seqtype, seqtype, protocol_name_tuned) + + # if s.is_derived: + # # Let's for now stash those close to original images + # # TODO: we might want a separate tree for all of this!? + # # so more of a parameter to the create_key + # #seqtype += '/derivative' + # # just keep it lower case and without special characters + # # XXXX what for??? + # #seq.append(s.series_description.lower()) + # prefix = os.path.join('derivatives', 'scanner') + # else: + # prefix = '' + prefix = '' + + # analyze s.protocol_name (series_id is based on it) for full name mapping etc + if seqtype == 'func' and not seqtype_label: + if '_pace_' in protocol_name_tuned: + seqtype_label = 'pace' # or should it be part of seq- else: # assume bold by default - suffix += '_bold' - - # TODO run. might be needed for fieldmap - - # .series_description in case of - sdesc = s.study_description - # temporary aliases for those phantoms which we already collected - # so we rename them into this - #MAPPING - - # the idea ias to have sequence names in the format like - # bids__bidsrecord - # in bids record we could have _run[+=] - # which would say to either increment run number from already encountered - # or reuse the last one - if seq: - suffix += 'seq-%s' % ('+'.join(seq)) - - if template: - info[template].append(s.series_number) - else: - # some are ok to skip and not to whine - if "_Scout_" in s.series_description: - skipped.append(s.series_number) - lgr.debug("Ignoring %s", s.series_number) + seqtype_label = 'bold' + + if seqtype == 'fmap' and not seqtype_label: + seqtype_label = { + 'M': 'magnitude', # might want explicit {file_index} ? + 'P': 'phasediff' + }[image_data_type] + + # label for dwi as well + if seqtype == 'dwi' and not seqtype_label: + seqtype_label = 'dwi' + + run = regd.get('run') + if run is not None: + # so we have an indicator for a run + if run == '+': + # some sequences, e.g. fmap, would generate two (or more?) + # sequences -- e.g. one for magnitude(s) and other ones for + # phases. In those we must not increment run! 
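                # Illustrative walk-through of the run bookkeeping implemented
                # just below (protocol names are hypothetical; see
                # parse_dbic_protocol_name for how the run marker is extracted):
                #   func_task-a_run+   -> current_run 1 -> suffix gets 'run-01'
                #   func_task-a_run+   -> current_run 2 -> 'run-02'
                #   func_task-a_run=   -> reuse 2       -> 'run-02'
                #   func_task-a_run-05 -> explicit 5    -> 'run-05' (warns if it
                #                         goes backwards)
                # and a fieldmap's phase series ('P') that directly follows its
                # magnitude ('M') keeps the same run number instead of
                # incrementing it.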
+ if image_data_type == 'P': + if prev_image_data_type != 'M': + # XXX if we have a known earlier study, we need to always + # increase the run counter for phasediff because magnitudes + # were not acquired + if get_study_hash([s]) == '9d148e2a05f782273f6343507733309d': + current_run += 1 + else: + raise RuntimeError( + "Was expecting phase image to follow magnitude " + "image, but previous one was %r", prev_image_data_type) + # else we do nothing special + else: # and otherwise we go to the next run + current_run += 1 + elif run == '=': + if not current_run: + current_run = 1 + elif run.isdigit(): + current_run_ = int(run) + if current_run_ < current_run: + lgr.warning( + "Previous run (%s) was larger than explicitly specified %s", + current_run, current_run_) + current_run = current_run_ else: - skipped_unknown.append(s.series_number) + raise ValueError( + "Don't know how to deal with run specification %s" % repr(run)) + if isinstance(current_run, str) and current_run.isdigit(): + current_run = int(current_run) + run_label = "run-" + ("%02d" % current_run + if isinstance(current_run, int) + else current_run) + else: + # if there is no _run -- no run label addded + run_label = None + + if s.is_motion_corrected and 'rec-' in regd.get('bids', ''): + raise NotImplementedError("want to add _acq-moco but there is _acq- already") + + suffix_parts = [ + None if not regd.get('task') else "task-%s" % regd['task'], + None if not regd.get('acq') else "acq-%s" % regd['acq'], + # But we want to add an indicator in case it was motion corrected + # in the magnet. ref sample /2017/01/03/qa + None if not s.is_motion_corrected else 'rec-moco', + regd.get('bids'), + run_label, + seqtype_label, + ] + # filter tose which are None, and join with _ + suffix = '_'.join(filter(bool, suffix_parts)) + + # # .series_description in case of + # sdesc = s.study_description + # # temporary aliases for those phantoms which we already collected + # # so we rename them into this + # #MAPPING + # + # # the idea ias to have sequence names in the format like + # # bids__bidsrecord + # # in bids record we could have _run[+=] + # # which would say to either increment run number from already encountered + # # or reuse the last one + # if seq: + # suffix += 'seq-%s' % ('+'.join(seq)) + + # some are ok to skip and not to whine + + if "_Scout" in s.series_description or \ + (seqtype == 'anat' and seqtype_label and seqtype_label.startswith('scout')): + skipped.append(s.series_id) + lgr.debug("Ignoring %s", s.series_id) + else: + template = create_key(seqtype, suffix, prefix=prefix) + # we wanted ordered dict for consistent demarcation of dups + if template not in info: + info[template] = [] + info[template].append(s.series_id) - info = dict(info) # convert to dict since outside functionality depends on it being a basic dict if skipped: lgr.info("Skipped %d sequences: %s" % (len(skipped), skipped)) if skipped_unknown: lgr.warning("Could not figure out where to stick %d sequences: %s" % (len(skipped_unknown), skipped_unknown)) + + info = get_dups_marked(info) # mark duplicate ones with __dup-0x suffix + + info = dict(info) # convert to dict since outside functionality depends on it being a basic dict + return info + + +def get_dups_marked(info): + # analyze for "cancelled" runs, if run number was explicitly specified and + # thus we ended up with multiple entries which would mean that older ones + # were "cancelled" + info = info.copy() + dup_id = 0 + for template, series_ids in list(info.items()): + if len(series_ids) > 1: + 
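            # An illustrative example of the remapping done below (keys
            # abbreviated to 1-tuples as in heuristics/test_dbic_bids.py,
            # series ids hypothetical):
            #   {('func/sub-x_task-y_bold',): ['02-...', '07-...']}
            # becomes
            #   {('func/sub-x_task-y_bold__dup-01',): ['02-...'],
            #    ('func/sub-x_task-y_bold',): ['07-...']}
            # i.e. only the last (non-cancelled) series stays under the
            # original template.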
lgr.warning("Detected %d duplicated run(s) for template %s: %s", + len(series_ids) - 1, template[0], series_ids[:-1]) + # copy the duplicate ones into separate ones + for dup_series_id in series_ids[:-1]: + dup_id += 1 + dup_template = ( + '%s__dup-%02d' % (template[0], dup_id), + ) + template[1:] + # There must have not been such a beast before! + assert dup_template not in info + info[dup_template] = [dup_series_id] + info[template] = series_ids[-1:] + assert len(info[template]) == 1 return info -def get_session_subject_id(s): +def get_unique(seqinfos, attr): + """Given a list of seqinfos, which must have come from a single study + get specific attr, which must be unique across all of the entries + + If not -- fail! + + """ + values = set(getattr(si, attr) for si in seqinfos) + assert (len(values) == 1) + return values.pop() + + +# TODO: might need to do groupping per each session and return here multiple +# hits, or may be we could just somehow demarkate that it will be multisession +# one and so then later value parsed (again) in infotodict would be used??? +def infotoids(seqinfos, outdir): # decide on subjid and session based on patient_id - pid_split = s.patient_id.split('_') - if len(pid_split) == 1: - # there were no explicit session - # then it is not a multi-session study - sid = s.patient_id - session_id = None - elif len(pid_split) == 2: - sid, session_id = pid_split - elif len(pid_split) == 3: - _nonanon_sid, session_id, sid = pid_split + lgr.info("Processing sequence infos to deduce study/session") + study_description = get_study_description(seqinfos) + study_description_hash = md5sum(study_description) + subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) + # TODO: fix up subject id if missing some 0s + if study_description: + split = study_description.split('^', 1) + # split first one even more, since couldbe PI_Student or PI-Student + split = re.split('-|_', split[0], 1) + split[1:] + + # locator = study_description.replace('^', '/') + locator = '/'.join(split) else: - raise ValueError( - "No logic for more than 3 _-separated entries in patient_id. Got:" - " %s" % s.patient_id) - return session_id, sid + locator = 'unknown' + + # TODO: actually check if given study is study we would care about + # and if not -- we should throw some ???? exception + + # So -- use `outdir` and locator etc to see if for a given locator/subject + # and possible ses+ in the sequence names, so we would provide a sequence + # So might need to go through parse_dbic_protocol_name(s.protocol_name) + # to figure out presence of sessions. + ses_markers = [] + + # there might be fixups needed so we could deduce session etc + # this copy is not replacing original one, so the same fix_seqinfo + # might be called later + seqinfos = fix_seqinfo(seqinfos) + for s in seqinfos: + if s.is_derived: + continue + session_ = parse_dbic_protocol_name(s.protocol_name).get('session', None) + if session_ and '{' in session_: + # there was a marker for something we could provide from our seqinfo + # e.g. {date} + session_ = session_.format(**s._asdict()) + ses_markers.append(session_) + ses_markers = list(filter(bool, ses_markers)) # only present ones + session = None + if ses_markers: + # we have a session or possibly more than one even + # let's figure out which case we have + nonsign_vals = set(ses_markers).difference('+=') + # although we might want an explicit '=' to note the same session as + # mentioned before? 
+ if len(nonsign_vals) > 1: + lgr.warning( #raise NotImplementedError( + "Cannot deal with multiple sessions in the same study yet!" + " We will process until the end of the first session" + ) + if nonsign_vals: + # get only unique values + ses_markers = list(set(ses_markers)) + if set(ses_markers).intersection('+='): + raise NotImplementedError( + "Should not mix hardcoded session markers with incremental ones (+=)" + ) + assert len(ses_markers) == 1 + session = ses_markers[0] + else: + # TODO - I think we are doomed to go through the sequence and split + # ... actually the same as with nonsign_vals, we just would need to figure + # out initial one if sign ones, and should make use of knowing + # outdir + #raise NotImplementedError() + # we need to look at what sessions we already have + sessions_dir = os.path.join(outdir, locator, 'sub-' + subject) + prior_sessions = sorted(glob(os.path.join(sessions_dir, 'ses-*'))) + # TODO: more complicated logic + # For now just increment session if + and keep the same number if = + # and otherwise just give it 001 + # Note: this disables our safety blanket which would refuse to process + # what was already processed before since it would try to override, + # BUT there is no other way besides only if heudiconv was storing + # its info based on some UID + if ses_markers == ['+']: + session = '%03d' % (len(prior_sessions) + 1) + elif ses_markers == ['=']: + session = os.path.basename(prior_sessions[-1])[4:] if prior_sessions else '001' + else: + session = '001' + + + if study_description_hash == '9d148e2a05f782273f6343507733309d': + session = 'siemens1' + lgr.info('Imposing session {0}'.format(session)) + + return { + # TODO: request info on study from the JedCap + 'locator': locator, + # Sessions to be deduced yet from the names etc TODO + 'session': session, + 'subject': subject, + } + + +def sanitize_str(value): + """Remove illegal characters for BIDS from task/acq/etc..""" + return _delete_chars(value, '#!@$%^&.,:;_-') + + +def parse_dbic_protocol_name(protocol_name): + """Parse protocol name according to our convention with minimal set of fixups + """ + # Since Yarik didn't know better place to put it in, but could migrate outside + # at some point + protocol_name = protocol_name.replace("anat_T1w", "anat-T1w") + protocol_name = protocol_name.replace("hardi_64", "dwi_acq-hardi64") + + # Parse the name according to our convention + # https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing + # Remove possible suffix we don't care about after __ + protocol_name = protocol_name.split('__', 1)[0] + + bids = None # we don't know yet for sure + # We need to figure out if it is a valid bids + split = protocol_name.split('_') + prefix = split[0] + + # Fixups + if prefix == 'scout': + prefix = split[0] = 'anat-scout' + + if prefix != 'bids' and '-' in prefix: + prefix, _ = prefix.split('-', 1) + if prefix == 'bids': + bids = True # for sure + split = split[1:] + + def split2(s): + # split on - if present, if not -- 2nd one returned None + if '-' in s: + return s.split('-', 1) + return s, None + + # Let's analyze first element which should tell us sequence type + seqtype, seqtype_label = split2(split[0]) + if seqtype not in {'anat', 'func', 'dwi', 'behav', 'fmap'}: + # It is not something we don't consume + if bids: + lgr.warning("It was instructed to be BIDS sequence but unknown " + "type %s found", seqtype) + return {} + + regd = dict(seqtype=seqtype) + if seqtype_label: + regd['seqtype_label'] = seqtype_label + # 
now go through each to see if one which we care + bids_leftovers = [] + for s in split[1:]: + key, value = split2(s) + if value is None and key[-1] in "+=": + value = key[-1] + key = key[:-1] + + # sanitize values, which must not have _ and - is undesirable ATM as well + # TODO: BIDSv2.0 -- allows "-" so replace with it instead + value = str(value).replace('_', 'X').replace('-', 'X') + + if key in ['ses', 'run', 'task', 'acq']: + # those we care about explicitly + regd[{'ses': 'session'}.get(key, key)] = sanitize_str(value) + else: + bids_leftovers.append(s) + + if bids_leftovers: + regd['bids'] = '_'.join(bids_leftovers) + + # TODO: might want to check for all known "standard" BIDS suffixes here + # among bids_leftovers, thus serve some kind of BIDS validator + + # if not regd.get('seqtype_label', None): + # # might need to assign a default label for each seqtype if was not + # # given + # regd['seqtype_label'] = { + # 'func': 'bold' + # }.get(regd['seqtype'], None) + + return regd + + +def fixup_subjectid(subjectid): + """Just in case someone managed to miss a zero or added an extra one""" + # make it lowercase + subjectid = subjectid.lower() + reg = re.match("sid0*(\d+)$", subjectid) + if not reg: + # some completely other pattern + # just filter out possible _- in it + return re.sub('[-_]', '', subjectid) + return "sid%06d" % int(reg.groups()[0]) + + +def test_filter_files(): + assert(filter_files('/home/mvdoc/dbic/09-run_func_meh/0123432432.dcm')) + assert(not filter_files('/home/mvdoc/dbic/run_func_meh/012343143.dcm')) + + +def test_md5sum(): + assert md5sum('cryptonomicon') == '1cd52edfa41af887e14ae71d1db96ad1' + assert md5sum('mysecretmessage') == '07989808231a0c6f522f9d8e34695794' + + +def test_fix_canceled_runs(): + from collections import namedtuple + FakeSeqInfo = namedtuple('FakeSeqInfo', + ['accession_number', 'series_id', + 'protocol_name', 'series_description']) + + seqinfo = [] + runname = 'func_run+' + for i in range(1, 6): + seqinfo.append( + FakeSeqInfo('accession1', + '{0:02d}-'.format(i) + runname, + runname, runname) + ) + + fake_accession2run = { + 'accession1': ['^01-', '^03-'] + } + + seqinfo_ = fix_canceled_runs(seqinfo, fake_accession2run) + + for i, s in enumerate(seqinfo_, 1): + output = runname + if i == 1 or i == 3: + output = 'cancelme_' + output + for key in ['series_description', 'protocol_name']: + value = getattr(s, key) + assert(value == output) + # check we didn't touch series_id + assert(s.series_id == '{0:02d}-'.format(i) + runname) + + +def test_fix_dbic_protocol(): + from collections import namedtuple + FakeSeqInfo = namedtuple('FakeSeqInfo', + ['accession_number', 'study_description', + 'field1', 'field2']) + accession_number = 'A003' + seq1 = FakeSeqInfo(accession_number, + 'mystudy', + '02-anat-scout_run+_MPR_sag', + '11-func_run-life2_acq-2mm692') + seq2 = FakeSeqInfo(accession_number, + 'mystudy', + 'nochangeplease', + 'nochangeeither') + + + seqinfos = [seq1, seq2] + keys = ['field1'] + subsdict = { + md5sum('mystudy'): + [('scout_run\+', 'scout'), + ('run-life[0-9]', 'run+_task-life')], + } + + seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) + assert(seqinfos[1] == seqinfos_[1]) + # field2 shouldn't have changed since I didn't pass it + assert(seqinfos_[0] == FakeSeqInfo(accession_number, + 'mystudy', + '02-anat-scout_MPR_sag', + seq1.field2)) + + # change also field2 please + keys = ['field1', 'field2'] + seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) + assert(seqinfos[1] == seqinfos_[1]) + # 
now everything should have changed + assert(seqinfos_[0] == FakeSeqInfo(accession_number, + 'mystudy', + '02-anat-scout_MPR_sag', + '11-func_run+_task-life_acq-2mm692')) + + +def test_sanitize_str(): + assert sanitize_str('super@duper.faster') == 'superduperfaster' + assert sanitize_str('perfect') == 'perfect' + assert sanitize_str('never:use:colon:!') == 'neverusecolon' + + +def test_fixupsubjectid(): + assert fixup_subjectid("abra") == "abra" + assert fixup_subjectid("sub") == "sub" + assert fixup_subjectid("sid") == "sid" + assert fixup_subjectid("sid000030") == "sid000030" + assert fixup_subjectid("sid0000030") == "sid000030" + assert fixup_subjectid("sid00030") == "sid000030" + assert fixup_subjectid("sid30") == "sid000030" + assert fixup_subjectid("SID30") == "sid000030" + + +def test_parse_dbic_protocol_name(): + pdpn = parse_dbic_protocol_name + + assert pdpn("nondbic_func-bold") == {} + assert pdpn("cancelme_func-bold") == {} + + assert pdpn("bids_func-bold") == \ + pdpn("func-bold") == \ + {'seqtype': 'func', 'seqtype_label': 'bold'} + + # pdpn("bids_func_ses+_task-boo_run+") == \ + # order should not matter + assert pdpn("bids_func_ses+_run+_task-boo") == \ + { + 'seqtype': 'func', + # 'seqtype_label': 'bold', + 'session': '+', + 'run': '+', + 'task': 'boo', + } + # TODO: fix for that + assert pdpn("bids_func-pace_ses-1_task-boo_acq-bu_bids-please_run-2__therest") == \ + pdpn("bids_func-pace_ses-1_run-2_task-boo_acq-bu_bids-please__therest") == \ + pdpn("func-pace_ses-1_task-boo_acq-bu_bids-please_run-2") == \ + { + 'seqtype': 'func', 'seqtype_label': 'pace', + 'session': '1', + 'run': '2', + 'task': 'boo', + 'acq': 'bu', + 'bids': 'bids-please' + } + + assert pdpn("bids_anat-scout_ses+") == \ + { + 'seqtype': 'anat', + 'seqtype_label': 'scout', + 'session': '+', + } + + assert pdpn("anat_T1w_acq-MPRAGE_run+") == \ + { + 'seqtype': 'anat', + 'run': '+', + 'acq': 'MPRAGE', + 'seqtype_label': 'T1w' + } diff --git a/heuristics/dbic_bids_validator.cfg b/heuristics/dbic_bids_validator.cfg new file mode 100644 index 00000000..0da468b5 --- /dev/null +++ b/heuristics/dbic_bids_validator.cfg @@ -0,0 +1,14 @@ +{ + "ignore": [ + "TOTAL_READOUT_TIME_NOT_DEFINED" + ], + "warn": [], + "error": [], + "ignoredFiles": [ + "/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.heudiconv/*/*/*/*", + "/.git*", + "/.datalad/*", "/.datalad/.*", + "/sub*/ses*/*/*__dup*", "/sub*/*/*__dup*" + ] +} + diff --git a/heuristics/example.py b/heuristics/example.py index 324d875b..5c2bc5ca 100644 --- a/heuristics/example.py +++ b/heuristics/example.py @@ -86,4 +86,4 @@ def infotodict(seqinfo): info[fmrest].append(s[2]) else: pass - return info \ No newline at end of file + return info diff --git a/heuristics/test_dbic_bids.py b/heuristics/test_dbic_bids.py new file mode 100644 index 00000000..4d106914 --- /dev/null +++ b/heuristics/test_dbic_bids.py @@ -0,0 +1,25 @@ +# +# Tests for dbic_bids.py +# +from collections import OrderedDict +from dbic_bids import get_dups_marked + +def test_get_dups_marked(): + no_dups = {('some',): [1]} + assert get_dups_marked(no_dups) == no_dups + + assert get_dups_marked( + OrderedDict([ + (('bu', 'du'), [1, 2]), + (('smth',), [3]), + (('smth2',), ['a', 'b', 'c']) + ])) == \ + { + ('bu__dup-01', 'du'): [1], + ('bu', 'du'): [2], + ('smth',): [3], + ('smth2__dup-02',): ['a'], + ('smth2__dup-03',): ['b'], + ('smth2',): ['c'] + } + diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..df3eb518 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = 
--doctest-modules diff --git a/requirements.txt b/requirements.txt index 79d31ffe..4ce7ccd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -pydicom -dcmstack -rdflib -nipype +.[full] +# This is a dcmstack branch with changes for Python 3 +# sent PR to main repo, TODO: check if merged +# https://github.com/ghisvail/dcmstack/pull/1 +git+git://github.com/mvdoc/dcmstack@bf/importsys diff --git a/setup.py b/setup.py index 1acee7da..dca6f021 100755 --- a/setup.py +++ b/setup.py @@ -33,12 +33,20 @@ def findsome(subdir, extensions): requires = { 'core': [ 'nibabel', - # TODO: migrate more from requirements.txt + 'pydicom', + 'nipype', ], 'tests': [ 'six', - 'nose', + 'pytest', ], + 'monitor': [ + 'inotify', + 'tinydb' + ], + 'datalad': [ + 'datalad' + ] } requires['full'] = sum(list(requires.values()), []) diff --git a/tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm b/tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm new file mode 100644 index 00000000..bafdaa51 Binary files /dev/null and b/tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm differ diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py new file mode 100644 index 00000000..1bd70d51 --- /dev/null +++ b/tests/test_heuristics.py @@ -0,0 +1,102 @@ +from . import heudiconv + +import os +from mock import patch +from six.moves import StringIO + +from glob import glob +from os.path import join as pjoin, dirname +import csv +import re + +import pytest + +from datalad.api import Dataset + + +def test_smoke_converall(tmpdir): + heudiconv.main( + ("-f heuristics/convertall.py -c dcm2niix -o %s -b --datalad " + "-s fmap_acq-3mm -d tests/data/{subject}/*" % tmpdir).split(' ') + ) + + +@pytest.mark.parametrize('heuristic', [ 'dbic_bids', 'convertall' ]) +@pytest.mark.parametrize( + 'invocation', [ + "tests/data", # our new way with automated groupping + "-d tests/data/{subject}/* -s 01-fmap_acq-3mm" # "old" way specifying subject + # should produce the same results + ]) +def test_dbic_bids_largely_smoke(tmpdir, heuristic, invocation): + is_bids = True if heuristic == 'dbic_bids' else False + arg = "-f heuristics/%s.py -c dcm2niix -o %s" % (heuristic, tmpdir) + if is_bids: + arg += " -b" + arg += " --datalad " + args = ( + arg + invocation + ).split(' ') + if heuristic != 'dbic_bids' and invocation == 'tests/data': + # none other heuristic has mighty infotoids atm + with pytest.raises(NotImplementedError): + heudiconv.main(args) + return + heudiconv.main(args) + ds = Dataset(str(tmpdir)) + assert ds.is_installed() + assert not ds.repo.dirty + head = ds.repo.get_hexsha() + + # and if we rerun -- should fail + if heuristic != 'dbic_bids' and invocation != 'tests/data': + # those guys -- they just plow through it ATM without failing, i.e. 
+ # the logic is to reprocess + heudiconv.main(args) + else: + with pytest.raises(RuntimeError): + heudiconv.main(args) + # but there should be nothing new + assert not ds.repo.dirty + assert head == ds.repo.get_hexsha() + + # unless we pass 'overwrite' flag + heudiconv.main(args + ['--overwrite']) + # but result should be exactly the same, so it still should be clean + # and at the same commit + assert ds.is_installed() + assert not ds.repo.dirty + assert head == ds.repo.get_hexsha() + + +@pytest.mark.parametrize( + 'invocation', [ + "tests/data", # our new way with automated groupping + ]) +def test_scans_keys_dbic_bids(tmpdir, invocation): + args = "-f heuristics/dbic_bids.py -c dcm2niix -o %s -b " % tmpdir + args += invocation + heudiconv.main(args.split()) + # for now check it exists + scans_keys = glob(pjoin(tmpdir.strpath, '*/*/*/*/*.tsv')) + assert(len(scans_keys) == 1) + with open(scans_keys[0]) as f: + reader = csv.reader(f, delimiter='\t') + for i, row in enumerate(reader): + if i == 0: + assert(row == ['filename', 'acq_time', 'operator', 'randstr']) + assert(len(row) == 4) + if i != 0: + assert(os.path.exists(pjoin(dirname(scans_keys[0]), row[0]))) + assert(re.match( + '^[\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}$', + row[1])) + + +@patch('sys.stdout', new_callable=StringIO) +def test_ls(stdout): + args = "-f heuristics/dbic_bids.py --command ls tests/data".split(' ') + heudiconv.main(args) + out = stdout.getvalue() + assert 'StudySessionInfo(locator=' in out + assert 'Halchenko/Yarik/950_bids_test4' in out \ No newline at end of file diff --git a/tests/test_main.py b/tests/test_main.py index 1a496b2e..8d29d11e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,19 +1,159 @@ +import csv +import os +import pytest import sys from mock import patch +from os.path import join as opj from six.moves import StringIO -from nose.tools import assert_raises, assert_equal + from . 
import heudiconv @patch('sys.stdout', new_callable=StringIO) def test_main_help(stdout): - assert_raises(SystemExit, heudiconv.main, ['--help']) - assert(stdout.getvalue().startswith("usage: ")) + with pytest.raises(SystemExit): + heudiconv.main(['--help']) + assert stdout.getvalue().startswith("usage: ") @patch('sys.stderr' if sys.version_info[:2] <= (3, 3) else 'sys.stdout', new_callable=StringIO) def test_main_version(std): - assert_raises(SystemExit, heudiconv.main, ['--version']) - assert_equal(std.getvalue().rstrip(), heudiconv.__version__) + with pytest.raises(SystemExit): + heudiconv.main(['--version']) + assert std.getvalue().rstrip() == heudiconv.__version__ + + +def test_create_file_if_missing(tmpdir): + tf = tmpdir.join("README.txt") + assert not tf.exists() + heudiconv.create_file_if_missing(str(tf), "content") + assert tf.exists() + assert tf.read() == "content" + heudiconv.create_file_if_missing(str(tf), "content2") + # nothing gets changed + assert tf.read() == "content" + + +def test_populate_bids_templates(tmpdir): + heudiconv.populate_bids_templates( + str(tmpdir), + defaults={'Acknowledgements': 'something'}) + for f in "README", "dataset_description.json", "CHANGES": + # Just test that we have created them and they all have stuff TODO + assert "TODO" in tmpdir.join(f).read() + assert "something" in tmpdir.join('dataset_description.json').read() + + +def test_add_participant_record(tmpdir): + tf = tmpdir.join('participants.tsv') + assert not tf.exists() + heudiconv.add_participant_record(str(tmpdir), "sub01", "023Y", "M") + # should create the file and place corrected record + sub01 = tf.read() + assert sub01 == """\ +participant_id age sex group +sub-sub01 23 M control +""" + heudiconv.add_participant_record(str(tmpdir), "sub01", "023Y", "F") + assert tf.read() == sub01 # nothing was added even though differs in values + heudiconv.add_participant_record(str(tmpdir), "sub02", "2", "F") + assert tf.read() == """\ +participant_id age sex group +sub-sub01 23 M control +sub-sub02 2 F control +""" + + +def test_prepare_for_datalad(tmpdir): + pytest.importorskip("datalad") + studydir = tmpdir.join("PI").join("study") + studydir_ = str(studydir) + os.makedirs(studydir_) + heudiconv.populate_bids_templates(studydir_) + + heudiconv.add_to_datalad(str(tmpdir), studydir_) + + from datalad.api import Dataset + superds = Dataset(str(tmpdir)) + + assert superds.is_installed() + assert not superds.repo.dirty + subdss = superds.get_subdatasets(recursive=True) + for ds_path in sorted(subdss): + ds = Dataset(opj(superds.path, ds_path)) + assert ds.is_installed() + assert not ds.repo.dirty + + # the last one should have been the study + target_files = { + '.gitattributes', + '.datalad/config', '.datalad/.gitattributes', + 'dataset_description.json', + 'CHANGES', 'README'} + assert set(ds.repo.get_indexed_files()) == target_files + # and all are under git + for f in target_files: + assert not ds.repo.is_under_annex(f) + assert not ds.repo.is_under_annex('.gitattributes') + + +def test_json_dumps_pretty(): + pretty = heudiconv.json_dumps_pretty + assert pretty({}) == "{}" + assert pretty({"a": -1, "b": "123", "c": [1, 2, 3], "d": ["1.0", "2.0"]}) \ + == '{\n "a": -1,\n "b": "123",\n "c": [1, 2, 3],\n "d": ["1.0", "2.0"]\n}' + assert pretty({'a': ["0.3", "-1.9128906358217845e-12", "0.2"]}) \ + == '{\n "a": ["0.3", "-1.9128906358217845e-12", "0.2"]\n}' + + +def test_get_formatted_scans_key_row(): + item = [ + 
('tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm', + ('nii.gz', 'dicom'), + ['tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm']) + ] + outname_bids_file = '/a/path/Halchenko/Yarik/950_bids_test4/sub-phantom1sid1/fmap/sub-phantom1sid1_acq-3mm_phasediff.json' + + row = heudiconv.get_formatted_scans_key_row(item) + assert(len(row) == 3) + assert(row[0] == '2016-10-14T09:26:34') + assert(row[1] == '') + randstr1 = row[2] + row = heudiconv.get_formatted_scans_key_row(item) + randstr2 = row[2] + assert(randstr1 != randstr2) + + +# TODO: finish this +def test_add_rows_to_scans_keys_file(tmpdir): + fn = opj(tmpdir.strpath, 'file.tsv') + rows = { + 'my_file.nii.gz': ['2016adsfasd', '', 'fasadfasdf'], + 'another_file.nii.gz': ['2018xxxxx', '', 'fasadfasdf'] + } + heudiconv.add_rows_to_scans_keys_file(fn, rows) + + def _check_rows(fn, rows): + with open(fn, 'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + rows_loaded = [] + for row in reader: + rows_loaded.append(row) + for i, row_ in enumerate(rows_loaded): + if i == 0: + assert(row_ == ['filename', 'acq_time', 'operator', 'randstr']) + else: + assert(rows[row_[0]] == row_[1:]) + + _check_rows(fn, rows) + # add a new one + extra_rows = { + 'a_new_file.nii.gz': ['2016adsfasd23', '', 'fasadfasdf'], + 'my_file.nii.gz': ['2016adsfasd', '', 'fasadfasdf'], + 'another_file.nii.gz': ['2018xxxxx', '', 'fasadfasdf'] + } + heudiconv.add_rows_to_scans_keys_file(fn, extra_rows) + _check_rows(fn, extra_rows) + diff --git a/tests/test_tarballs.py b/tests/test_tarballs.py new file mode 100644 index 00000000..3a9a9edf --- /dev/null +++ b/tests/test_tarballs.py @@ -0,0 +1,34 @@ +import os +import pytest +import sys +import time + +from mock import patch +from os.path import join as opj +from os.path import dirname +from six.moves import StringIO +from glob import glob + +from . import heudiconv +from .utils import md5sum + +tests_datadir = opj(dirname(__file__), 'data') + + +def test_reproducibility(tmpdir): + #heudiconv.compress_dicoms(dicom_list, prefix, sourcedir) + prefix = str(tmpdir.join("precious")) + args = [glob(opj(tests_datadir, '01-fmap_acq-3mm', '*')), prefix] + tarball = heudiconv.compress_dicoms(*args) + md5 = md5sum(tarball) + assert tarball + # must not override by default + with pytest.raises(RuntimeError): + heudiconv.compress_dicoms(*args) + os.unlink(tarball) + + time.sleep(1.1) # need to guarantee change of time + tarball_ = heudiconv.compress_dicoms(*args) + md5_ = md5sum(tarball_) + assert tarball == tarball_ + assert md5 == md5_ diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..4de67fae --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,6 @@ +import hashlib + + +def md5sum(filename): + with open(filename, 'rb') as f: + return hashlib.md5(f.read()).hexdigest() diff --git a/tox.ini b/tox.ini index d483a432..5f8f5fcd 100644 --- a/tox.ini +++ b/tox.ini @@ -2,5 +2,5 @@ envlist = py27,py33,py34,py35 [testenv] -commands = nosetests -s -v {posargs} tests +commands = python -m pytest -s -v {posargs} tests deps = -r{toxinidir}/dev-requirements.txt
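For reference, a minimal sketch of how the pieces added in this diff are exercised together, mirroring the invocation in tests/test_heuristics.py; the import shim and the output directory are assumptions, while the flags, heuristic path, and tests/data layout come from the tests above:

    from tests import heudiconv   # assumed shim; the suite itself does `from . import heudiconv`

    outdir = '/tmp/heudiconv-out'  # hypothetical scratch location
    args = ("-f heuristics/dbic_bids.py -c dcm2niix -o %s -b --datalad "
            "tests/data" % outdir).split(' ')
    # infotoids deduces locator/subject/session, infotodict names the runs,
    # and dcm2niix performs the actual conversion into the BIDS layout.
    heudiconv.main(args)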