diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..32298e55 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +include = tests/* + bin/* + setup.py diff --git a/.gitignore b/.gitignore index 0d20b648..52671470 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ *.pyc +.cache/ +.coverage +*.egg-info/ diff --git a/.travis.yml b/.travis.yml index 648d768c..6b5fdf21 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,9 @@ language: python python: - 2.7 - - 3.3 - 3.4 - 3.5 + - 3.6 cache: - apt @@ -17,17 +17,25 @@ env: before_install: # The ultimate one-liner setup for NeuroDebian repository - # which might be needed later for tools - #- bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) - #- travis_retry sudo apt-get update -qq - # for now even remove requirements.txt since dependencies aren't avail - - echo '' > requirements.txt + - bash <(wget -q -O- http://neuro.debian.net/_files/neurodebian-travis.sh) + - travis_retry sudo apt-get update -qq + - travis_retry sudo apt-get install git-annex-standalone dcm2niix + # Install in our own virtualenv + - python -m pip install --upgrade pip + - pip install --upgrade virtualenv + - virtualenv --python=python venv + - source venv/bin/activate + - python --version # just to check - pip install -r dev-requirements.txt - - pip install codecov + - pip install datalad + - pip install codecov pytest +install: + - git config --global user.email "test@travis.land" + - git config --global user.name "Travis Almighty" script: - - nosetests -s -v --with-doctest --doctest-tests --with-cov --cover-package . --logging-level=INFO tests + - coverage run `which py.test` -s -v tests heuristics after_success: - codecov diff --git a/Dockerfile b/Dockerfile index b58d8487..b852c8a2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,10 +3,31 @@ FROM continuumio/miniconda MAINTAINER -RUN apt-get update && apt-get upgrade -y && apt-get install -y g++ && apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y -RUN cd /tmp && git clone https://github.com/neurolabusc/dcm2niix.git && cd dcm2niix/console/ && git checkout 60bab318ee738b644ebb1396bbb8cbe1b006218f && g++ -O3 -I. 
main_console.cpp nii_dicom.cpp jpg_0XC3.cpp ujpeg.cpp nifti1_io_core.cpp nii_ortho.cpp nii_dicom_batch.cpp nii_foreign.cpp -o dcm2niix -DmyDisableOpenJPEG -DmyDisableJasper && cp dcm2niix /usr/local/bin/ -RUN conda install -y -c conda-forge nipype && pip install https://github.com/moloney/dcmstack/archive/c12d27d2c802d75a33ad70110124500a83e851ee.zip && pip install https://github.com/nipy/nipype/archive/dd1ed4f0d5735c69c1743f29875acf09d23a62e0.zip -RUN curl -O https://raw.githubusercontent.com/nipy/heudiconv/master/bin/heudiconv && chmod +x heudiconv && cp heudiconv /usr/local/bin/ -RUN curl -O https://raw.githubusercontent.com/nipy/heudiconv/master/heuristics/convertall.py && chmod +x convertall.py +RUN apt-get update && apt-get upgrade -y && \ + apt-get install -y g++ pkg-config make && \ + apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y +RUN (wget -O- http://neuro.debian.net/lists/jessie.us-nh.full | tee /etc/apt/sources.list.d/neurodebian.sources.list) && \ + apt-key adv --recv-keys --keyserver hkp://pool.sks-keyservers.net:80 0xA5D32F012649A5A9 && \ + apt-get update -qq && apt-get install -y git-annex-standalone && \ + apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y +RUN conda install -y -c conda-forge nipype && \ + conda install cmake && \ + pip install https://github.com/moloney/dcmstack/archive/c12d27d2c802d75a33ad70110124500a83e851ee.zip && \ + pip install datalad && \ + conda clean -tipsy && rm -rf ~/.pip/ +RUN cd /tmp && git clone https://github.com/neurolabusc/dcm2niix.git && \ + cd dcm2niix && \ + git checkout 60bab318ee738b644ebb1396bbb8cbe1b006218f && \ + mkdir build && cd build && cmake -DBATCH_VERSION=ON .. && \ + make && make install && \ + cd / && rm -rf /tmp/dcm2niix + +COPY bin/heudiconv /usr/local/bin/heudiconv +RUN chmod +x /usr/local/bin/heudiconv +RUN mkdir /heuristics +COPY heuristics/convertall.py /heuristics +RUN chmod +x /heuristics/convertall.py +RUN git config --global user.email "test@docker.land" && \ + git config --global user.name "Docker Almighty" ENTRYPOINT ["/usr/local/bin/heudiconv"] diff --git a/README.md b/README.md index f35db48a..e85f53dc 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ function `infotodict`, which takes a single argument `seqinfo`. * total_files_till_now * example_dcm_file -* series_number +* series_id * dcm_dir_name * unspecified2 * unspecified3 diff --git a/bin/heudiconv b/bin/heudiconv index ff7c2a8c..43502da7 100755 --- a/bin/heudiconv +++ b/bin/heudiconv @@ -1,32 +1,55 @@ #!/usr/bin/env python -"""Convert DICOM TimTrio dirs based on heuristic info +"""Convert DICOM dirs based on heuristic info -This function uses DicomStack and Dcm2niix to convert Siemens -TrioTim/Prisma directories. It proceeds by extracting dicominfo from each -subject and writing a config file $subject_id/$subject_id.auto.txt in -the output directory. Users can create a copy of the file called -$subject_id.edit.txt and modify it to change the files that are -converted. This edited file will always overwrite the original file. If -there is a need to revert to original state, please delete this edit.txt -file and rerun the conversion +This script uses the dcmstack package and dcm2niix tool to convert DICOM +directories or tarballs into collections of NIfTI files following pre-defined +heuristic(s). +It has multiple modes of operation + +- If subject ID(s) is specified, it proceeds by extracting dicominfo from each + subject and writing a config file $subject_id/$subject_id.auto.txt in + the output directory. 
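  (Editor's illustration, not part of the patch: the .auto.txt/.edit.txt file
  holds a pretty-printed form of the dictionary returned by the heuristic's
  infotodict, roughly
      {('sub-{subject}/anat/sub-{subject}_T1w', ('nii.gz',), None): ['2-anat_mprage'], ...}
  so editing the lists on the right-hand side controls which series get
  converted.)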
Users can create a copy of the file called + $subject_id.edit.txt and modify it to change the files that are + converted. This edited file will always overwrite the original file. If + there is a need to revert to original state, please delete this edit.txt + file and rerun the conversion + +- If no subject specified, all files under specified directory are scanned, + DICOMs are sorted based on study UID, and layed out using specified heuristic """ -__version__ = '0.1' +__version__ = '0.3' import argparse from glob import glob +import csv +import dicom as dcm +import dcmstack as ds import inspect import json import os -import shutil import sys +import re +import shutil from tempfile import mkdtemp import tarfile +from copy import deepcopy from collections import namedtuple +from collections import defaultdict +from collections import OrderedDict as ordereddict +from datetime import datetime from os.path import isdir +from os.path import basename +from os.path import dirname +from os.path import lexists, exists +from os.path import join as pjoin + +from random import sample + +PY3 = sys.version_info[0] >= 3 import logging lgr = logging.getLogger('heudiconv') @@ -38,26 +61,122 @@ logging.basicConfig( ) lgr.debug("Starting the abomination") # just to "run-test" logging -seqinfo_fields = ['total_files_till_now', # 0 - 'example_dcm_file', # 1 - 'series_number', # 2 - 'dcm_dir_name', # 3 - 'unspecified2', # 4 - 'unspecified3', # 5 - 'dim1', 'dim2', 'dim3', 'dim4', # 6, 7, 8, 9 - 'TR', 'TE', # 10, 11 - 'protocol_name', # 12 - 'is_motion_corrected', # 13 - # Introduced with namedtuple - 'is_derived', # 14 - 'patient_id', # 15 - 'study_description', # 16 - 'referring_physician_name', # 17 - 'series_description', # 18 - 'image_type', # 19 - ] - -SeqInfo = namedtuple('SeqInfo', seqinfo_fields) +# bloody git screws things up +# https://github.com/gitpython-developers/GitPython/issues/600 +# +## for nipype, we would like to lower level unless we are at debug level +#try: +# import nipype +# for l in nipype.logging.loggers.values(): +# if lgr.getEffectiveLevel() > 10: +# l.setLevel(logging.WARN) +#except ImportError: +# pass +# +#try: +# # Set datalad's logger to our handler +# import datalad +# datalad_lgr = logging.getLogger('datalad') +# datalad_lgr.handlers = lgr.handlers +#except ImportError: +# pass + +global_options = { + 'overwrite': False # overwrite existing files +} + +SeqInfo = namedtuple( + 'SeqInfo', + ['total_files_till_now', # 0 + 'example_dcm_file', # 1 + 'series_id', # 2 + 'unspecified1', # 3 + 'unspecified2', # 4 + 'unspecified3', # 5 + 'dim1', 'dim2', 'dim3', 'dim4', # 6, 7, 8, 9 + 'TR', 'TE', # 10, 11 + 'protocol_name', # 12 + 'is_motion_corrected', # 13 + # Introduced with namedtuple + 'is_derived', + 'patient_id', + 'study_description', + 'referring_physician_name', + 'series_description', + 'image_type', + 'accession_number', + 'patient_age', + 'patient_sex', + 'date' + ] +) + +StudySessionInfo = namedtuple( + 'StudySessionInfo', + [ + 'locator', # possible prefix identifying the study, e.g. PI/dataset or just a dataset or empty (default) + # Note that ATM there should be no multiple DICOMs with the same + # StudyInstanceUID which would collide, i.e point to the same + # subject/session. 
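# ---------------------------------------------------------------------------
# Editor's note, not part of the patch: a minimal sketch of the heuristic
# module whose `infotodict` consumes the SeqInfo entries defined above.  The
# only interface visible in this patch is that infotodict(seqinfo) returns a
# dict keyed by (template, outtype, annotation_classes) tuples holding lists
# of series_id values; the create_key helper, the templates and the field
# tests below are illustrative assumptions.
def create_key(template, outtype=('nii.gz',), annotation_classes=None):
    if not template:
        raise ValueError('Template must be a valid format string')
    return (template, outtype, annotation_classes)

def infotodict(seqinfo):
    t1 = create_key('sub-{subject}/anat/sub-{subject}_T1w')
    rest = create_key('sub-{subject}/func/sub-{subject}_task-rest_run-{item:02d}_bold')
    info = {t1: [], rest: []}
    for s in seqinfo:
        if 'mprage' in s.protocol_name.lower():
            info[t1].append(s.series_id)
        elif s.dim4 > 100 and not s.is_motion_corrected:
            info[rest].append(s.series_id)
    return info
# ---------------------------------------------------------------------------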
+ # So 'locator' is pretty much an assignment from StudyInstanceUID + # into some place within hierarchy + 'session', # could be None + 'subject', # should be some ID defined either in cmdline or deduced + ] +) + + +# TODO: RF to avoid package-level global structure, and be more friendly in +# case of refactoring of heudiconv into a proper Python package/module +class TempDirs(object): + """A helper to centralize handling and cleanup of dirs""" + + def __init__(self): + self.dirs = [] + self.exists = os.path.exists + + def __call__(self, prefix=None): + tmpdir = mkdtemp(prefix=prefix) + self.dirs.append(tmpdir) + return tmpdir + + def __del__(self): + try: + self.cleanup() + except AttributeError: + # we are too late to the show + pass + + def cleanup(self): + lgr.debug("Removing %d temporary directories", len(self.dirs)) + for t in self.dirs[:]: + lgr.debug("Removing %s", t) + if self: + self.rmtree(t) + self.dirs = [] + + def rmtree(self, tmpdir): + if self.exists(tmpdir): + shutil.rmtree(tmpdir) + if tmpdir in self.dirs: + self.dirs.remove(tmpdir) + +tempdirs = TempDirs() + + +def _canonical_dumps(json_obj, **kwargs): + """ Dump `json_obj` to string, allowing for Python newline bug + + Runs ``json.dumps(json_obj, \*\*kwargs), then removes trailing whitespaces + added when doing indent in some Python versions. See + https://bugs.python.org/issue16333. Bug seems to be fixed in 3.4, for now + fixing manually not only for aestetics but also to guarantee the same + result across versions of Python. + """ + out = json.dumps(json_obj, **kwargs) + if 'indent' in kwargs: + out = out.replace(' \n', '\n') + return out def save_json(filename, data): @@ -72,7 +191,49 @@ def save_json(filename, data): """ with open(filename, 'w') as fp: - json.dump(data, fp, sort_keys=True, indent=4) + fp.write(_canonical_dumps(data, sort_keys=True, indent=4)) + + +def slim_down_info(j): + """Given an aggregated info structure, removes excessive details + + such as Csa fields, and SourceImageSequence which on Siemens files could be + huge and not providing any additional immediately usable information. + If needed, could be recovered from stored DICOMs + """ + j = deepcopy(j) # we will do in-place modification on a copy + dicts = [] + # poor man programming for now + if 'const' in j.get('global', {}): + dicts.append(j['global']['const']) + if 'samples' in j.get('time', {}): + dicts.append(j['time']['samples']) + for d in dicts: + for k in list(d.keys()): + if k.startswith('Csa') or k.lower() in {'sourceimagesequence'}: + del d[k] + return j + + +def json_dumps_pretty(j, indent=2, sort_keys=True): + """Given a json structure, pretty print it by colliding numeric arrays into a line + + If resultant structure differs from original -- throws exception + """ + js = _canonical_dumps(j, indent=indent, sort_keys=sort_keys) + # trim away \n and spaces between entries of numbers + js_ = re.sub( + '[\n ]+("?[-+.0-9e]+"?,?) 
*\n(?= *"?[-+.0-9e]+"?)', r' \1', + js, flags=re.MULTILINE) + # uniform no spaces before ] + js_ = re.sub(" *\]", "]", js_) + # uniform spacing before numbers + js_ = re.sub(' *("?[-+.0-9e]+"?)[ \n]*', r' \1', js_) + # no spaces after [ + js_ = re.sub('\[ ', '[', js_) + j_ = json.loads(js_) + assert(j == j_) + return js_ def load_json(filename): @@ -101,7 +262,6 @@ def load_json(filename): # find_files utility copied/borrowed from DataLad (Copyright 2016 DataLad developers, MIT license) # -import re from os.path import sep as dirsep from os.path import curdir from os.path import join as opj @@ -140,92 +300,134 @@ def find_files(regex, topdir=curdir, exclude=None, exclude_vcs=True, dirs=False) find_files.__doc__ %= (_VCS_REGEX,) -def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): - """Process list of dicoms and return seqinfo and file group +def group_dicoms_into_seqinfos( + files, file_filter=None, dcmfilter=None, grouping='studyUID' +): + """Process list of dicoms and return seqinfo and file group + + `seqinfo` contains per-sequence extract of fields from DICOMs which + will be later provided into heuristics to decide on filenames Parameters ---------- - fl : list of str + files : list of str List of files to consider - dcmsession : list of int, optional - List of session ids (?) for each file if `fl` - corresponds to multiple sessions - basedir : str, optional - Base directory relative to which filenames are provided in `fl` + file_filter : callable, optional + Applied to each item of filenames. Should return True if file needs to be + kept, False otherwise. dcmfilter : callable, optional - If called on dcm_data and returns True, it is used to set - series_id + If called on dcm_data and returns True, it is used to set series_id + grouping : {'studyUID', 'accession_number', None}, optional + what to group by: studyUID or accession_number Returns ------- - seqinfo, filegrp : list of list, dict - `seqinfo` is a list of info entries per each sequence (some entry + seqinfo : list of list + `seqinfo` is a list of info entries per each sequence (some entry there defines a key for `filegrp`) - `filegrp` is a dictionary with files groupped per each sequence. + filegrp : dict + `filegrp` is a dictionary with files groupped per each sequence """ + allowed_groupings = ['studyUID', 'accession_number', None] + if grouping not in allowed_groupings: + raise ValueError('I do not know how to group by {0}'.format(grouping)) + per_studyUID = grouping == 'studyUID' + per_accession_number = grouping == 'accession_number' + lgr.info("Analyzing %d dicoms", len(files)) import dcmstack as ds import dicom as dcm - if dcmsession is None: - multi_session = False - dcmsession = [0] * len(fl) - else: - multi_session = True - groups = [[], []] mwgroup = [] - for fidx, filename in enumerate(fl): - if not basedir is None: - filename = os.path.join(basedir, filename) + + studyUID = None + # for sanity check that all DICOMs came from the same + # "study". If not -- what is the use-case? (interrupted acquisition?) 
+ # and how would then we deal with series numbers + # which would differ already + if file_filter: + nfl_before = len(files) + files = list(filter(file_filter, files)) + nfl_after = len(files) + lgr.info('Filtering out {0} dicoms based on their filename'.format( + nfl_before-nfl_after)) + for fidx, filename in enumerate(files): + # TODO after getting a regression test check if the same behavior + # with stop_before_pixels=True mw = ds.wrapper_from_data(dcm.read_file(filename, force=True)) + + for f in ('iop', 'ICE_Dims', 'SequenceName'): + try: + del mw.series_signature[f] + except: + pass + try: - del mw.series_signature['iop'] - except: - pass - try: - del mw.series_signature['ICE_Dims'] - except: - pass - try: - del mw.series_signature['SequenceName'] - except: - pass + file_studyUID = mw.dcm_data.StudyInstanceUID + except AttributeError: + lgr.info("File %s is missing any StudyInstanceUID" % filename) + file_studyUID = None + #continue + try: - series_id = (dcmsession[fidx], - mw.dcm_data.SeriesNumber, + series_id = (int(mw.dcm_data.SeriesNumber), mw.dcm_data.ProtocolName) - except AttributeError: + file_studyUID = mw.dcm_data.StudyInstanceUID + + if not per_studyUID: + # verify that we are working with a single study + if studyUID is None: + studyUID = file_studyUID + elif not per_accession_number: + assert studyUID == file_studyUID + except AttributeError as exc: + lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', + filename, exc) # not a normal DICOM -> ignore - series_id = (dcmsession[fidx], -1, 'none') + series_id = (-1, 'none') + file_studyUID = None + if not series_id[0] < 0: if dcmfilter is not None and dcmfilter(mw.dcm_data): - series_id = (dcmsession[fidx], -1, mw.dcm_data.ProtocolName) + series_id = (-1, mw.dcm_data.ProtocolName) + if not groups: + raise RuntimeError("Yarik really thinks this is never ran!") + # if I was wrong -- then per_studyUID might need to go above + # yoh: I don't think this would ever be executed! 
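# ---------------------------------------------------------------------------
# Editor's note, not part of the patch: the file_filter / dcmfilter hooks used
# above are looked up on the heuristic module as `filter_files` and
# `filter_dicom` elsewhere in this script.  A sketch (the concrete tests are
# illustrative assumptions):
def filter_files(fname):
    # file_filter: return True to keep the file for grouping
    return not fname.endswith(('.txt', '.nii', '.nii.gz'))

def filter_dicom(dcm_data):
    # dcmfilter: return True to push the file into the "unwanted" series
    # (series number -1), which is skipped when seqinfo is assembled below
    return getattr(dcm_data, 'SeriesDescription', '').startswith('Phoenix')
# ---------------------------------------------------------------------------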
mwgroup.append(mw) groups[0].append(series_id) groups[1].append(len(mwgroup) - 1) continue - N = len(mwgroup) # filter out unwanted non-image-data DICOMs by assigning # a series number < 0 (see test below) - if not series_id[1] < 0 and mw.dcm_data[0x0008, 0x0016].repval in ( + if not series_id[0] < 0 and mw.dcm_data[0x0008, 0x0016].repval in ( 'Raw Data Storage', 'GrayscaleSoftcopyPresentationStateStorage'): - series_id = (dcmsession[fidx], -1, mw.dcm_data.ProtocolName) + series_id = (-1, mw.dcm_data.ProtocolName) + + if per_studyUID: + series_id = series_id + (file_studyUID,) + + #print fidx, N, filename ingrp = False - for idx in range(N): + for idx in range(len(mwgroup)): same = mw.is_same_series(mwgroup[idx]) #print idx, same, groups[idx][0] if same: + # the same series should have the same study uuid + assert mwgroup[idx].dcm_data.get('StudyInstanceUID', None) == file_studyUID ingrp = True - if series_id[1] >= 0: - series_id = (dcmsession[fidx], - mwgroup[idx].dcm_data.SeriesNumber, + if series_id[0] >= 0: + series_id = (mwgroup[idx].dcm_data.SeriesNumber, mwgroup[idx].dcm_data.ProtocolName) + if per_studyUID: + series_id = series_id + (file_studyUID,) groups[0].append(series_id) groups[1].append(idx) + if not ingrp: mwgroup.append(mw) groups[0].append(series_id) @@ -234,12 +436,12 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): group_map = dict(zip(groups[0], groups[1])) total = 0 - filegroup = {} - seqinfo = [] + seqinfo = ordereddict() + # for the next line to make any sense the series_id needs to # be sortable in a way that preserves the series order - for series, mwidx in sorted(group_map.items()): - if series[1] < 0: + for series_id, mwidx in sorted(group_map.items()): + if series_id[0] < 0: # skip our fake series with unwanted files continue mw = mwgroup[mwidx] @@ -248,15 +450,17 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): # nothing to see here, just move on continue dcminfo = mw.dcm_data - files = [fl[i] for i, s in enumerate(groups[0]) if s == series] + series_files = [files[i] for i, s in enumerate(groups[0]) if s == series_id] # turn the series_id into a human-readable string -- string is needed # for JSON storage later on - if multi_session: - series = '%i-%i-%s' % series - else: - series = '%i-%s' % series[1:] - filegroup[series] = files - size = list(mw.image_shape) + [len(files)] + if per_studyUID: + studyUID = series_id[2] + series_id = series_id[:2] + accession_number = dcminfo.get('AccessionNumber') + + series_id = '-'.join(map(str, series_id)) + + size = list(mw.image_shape) + [len(series_files)] total += size[-1] if len(size) < 4: size.append(1) @@ -269,48 +473,77 @@ def process_dicoms(fl, dcmsession=None, basedir=None, dcmfilter=None): except AttributeError: TE = -1 try: - refphys = dcminfo.ReferringPhysicianName + refphys = str(dcminfo.ReferringPhysicianName) except AttributeError: refphys = '-' + image_type = tuple(dcminfo.ImageType) + motion_corrected = 'MoCo' in dcminfo.SeriesDescription \ + or 'MOCO' in image_type info = SeqInfo( total, - os.path.split(files[0])[1], - series, - os.path.basename(os.path.dirname(files[0])), + os.path.split(series_files[0])[1], + series_id, + os.path.basename(os.path.dirname(series_files[0])), '-', '-', size[0], size[1], size[2], size[3], TR, TE, dcminfo.ProtocolName, - 'MoCo' in dcminfo.SeriesDescription, + motion_corrected, # New ones by us 'derived' in [x.lower() for x in dcminfo.get('ImageType', [])], - dcminfo.PatientID, - dcminfo.StudyDescription, + 
dcminfo.get('PatientID'), + dcminfo.get('StudyDescription'), refphys, - dcminfo.SeriesDescription, - dcminfo.ImageType, + dcminfo.get('SeriesDescription'), + image_type, + accession_number, + # For demographics to populate BIDS participants.tsv + dcminfo.get('PatientAge'), + dcminfo.get('PatientSex'), + dcminfo.get('AcquisitionDate'), ) # candidates # dcminfo.AccessionNumber # len(dcminfo.ReferencedImageSequence) # len(dcminfo.SourceImageSequence) # FOR demographics - # dcminfo.PatientsAge - # dcminfo.PatientsSex - lgr.debug("%30s %27s %5s nref=%-2d nsrc=%-2d %s" % ( - info.series_number, + if per_studyUID: + key = studyUID.split('.')[-1] + elif per_accession_number: + key = accession_number + else: + key = '' + lgr.debug("%30s %30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" % ( + key, + info.series_id, dcminfo.SeriesDescription, + dcminfo.ProtocolName, info.is_derived, len(dcminfo.get('ReferencedImageSequence', '')), len(dcminfo.get('SourceImageSequence', '')), info.image_type )) - #if dcminfo.SeriesDescription == 'ep2d_bold_moco_p2': - # import pdb; pdb.set_trace() - seqinfo.append(info) - lgr.info("Generated sequence info with %d entries", len(info)) - return seqinfo, filegroup + if per_studyUID: + if studyUID not in seqinfo: + seqinfo[studyUID] = ordereddict() + seqinfo[studyUID][info] = series_files + elif per_accession_number: + if accession_number not in seqinfo: + seqinfo[accession_number] = ordereddict() + seqinfo[accession_number][info] = series_files + else: + seqinfo[info] = series_files + + if per_studyUID: + lgr.info("Generated sequence info for %d studies with %d entries total", + len(seqinfo), sum(map(len, seqinfo.values()))) + elif per_accession_number: + lgr.info("Generated sequence info for %d accession numbers with %d entries total", + len(seqinfo), sum(map(len, seqinfo.values()))) + else: + lgr.info("Generated sequence info with %d entries", len(seqinfo)) + return seqinfo def write_config(outfile, info): @@ -325,13 +558,15 @@ def read_config(infile): return info -def conversion_info(subject, outdir, info, filegroup, basedir=None, ses=None): +def conversion_info(subject, outdir, info, filegroup, ses=None): convert_info = [] for key, items in info.items(): if not items: continue template = key[0] outtype = key[1] + # So no annotation_classes of any kind! so if not used -- what was the + # intension???? XXX outpath = outdir for idx, itemgroup in enumerate(items): if not isinstance(itemgroup, list): @@ -361,35 +596,58 @@ def conversion_info(subject, outdir, info, filegroup, basedir=None, ses=None): try: files = filegroup[item] except KeyError: - files = filegroup[unicode(item)] - - if basedir is not None: - files = [os.path.join(basedir, f) for f in files] - + files = filegroup[(str if PY3 else unicode)(item)] outprefix = template.format(**parameters) convert_info.append((os.path.join(outpath, outprefix), outtype, files)) - return convert_info def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False, min_meta=False): + """ + + If `niftifile` doesn't exist, it gets created out of the `dcmfiles` stack, + and json representation of its meta_ext is returned (bug since should return + both niftifile and infofile?) 
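# ---------------------------------------------------------------------------
# Editor's note, not part of the patch: the (seqinfo, filegroup) pair returned
# by the old process_dicoms is folded by group_dicoms_into_seqinfos above into
# a single ordered mapping, nested per study when grouping='studyUID'.  A
# consumption sketch, assuming `all_dicoms` is a list of DICOM paths:
seqinfos = group_dicoms_into_seqinfos(all_dicoms, grouping='studyUID')
for study_uid, series in seqinfos.items():
    for seq, dicom_files in series.items():   # seq is a SeqInfo namedtuple
        print(study_uid, seq.series_id, seq.protocol_name, len(dicom_files))
# With grouping=None the mapping is flat: {SeqInfo(...): [list of files]}.
# ---------------------------------------------------------------------------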
+ + if `niftifile` exists, its affine's orientation information is used while + establishing new `NiftiImage` out of dicom stack and together with `bids_info` + (if provided) is dumped into json `infofile` + + Parameters + ---------- + dcmfiles + niftifile + infofile + bids_info + force + min_meta + + Returns + ------- + niftifile, infofile + + """ import nibabel as nb import os import json import re + meta_info = {} if not min_meta: import dcmstack as ds stack = ds.parse_and_stack(dcmfiles, force=force).values() if len(stack) > 1: raise ValueError('Found multiple series') stack = stack[0] - # Create the nifti image using the data array + + #Create the nifti image using the data array if not os.path.exists(niftifile): nifti_image = stack.to_nifti(embed_meta=True) nifti_image.to_filename(niftifile) return ds.NiftiWrapper(nifti_image).meta_ext.to_json() + orig_nii = nb.load(niftifile) - ornt = nb.orientations.io_orientation(orig_nii.affine) + aff = orig_nii.get_affine() + ornt = nb.orientations.io_orientation(aff) axcodes = nb.orientations.ornt2axcodes(ornt) new_nii = stack.to_nifti(voxel_order=''.join(axcodes), embed_meta=True) meta = ds.NiftiWrapper(new_nii).meta_ext.to_json() @@ -408,47 +666,161 @@ def embed_nifti(dcmfiles, niftifile, infofile, bids_info=None, force=False, min_ json.dump(meta_info, fp, indent=3, sort_keys=True) return niftifile, infofile -def compress_dicoms(dicom_list, prefix, sourcedir): + +def compress_dicoms(dicom_list, out_prefix): + """Archives DICOMs into a tarball + + Also tries to do it reproducibly, so takes the date for files + and target tarball based on the series time (within the first file) + + Parameters + ---------- + dicom_list : list of str + list of dicom files + out_prefix : str + output path prefix, including the portion of the output file name + before .dicom.tgz suffix + + Returns + ------- + filename : str + Result tarball + """ tmpdir = mkdtemp(prefix='dicomtar') - outtar = os.path.join(sourcedir, prefix + '.dicom.tgz') - with tarfile.open(outtar, 'w:gz', dereference=True) as tar: - for filename in dicom_list: - outfile = os.path.join(tmpdir, os.path.basename(filename)) - if not os.path.islink(outfile): - os.symlink(filename, outfile) - tar.add(outfile, recursive=False) - tar.close() + outtar = out_prefix + '.dicom.tgz' + if os.path.exists(outtar) and not global_options['overwrite']: + raise RuntimeError("File %s already exists, will not override" + % outtar) + # tarfile encodes current time.time inside making those non-reproducible + # so we should choose which date to use. 
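# (Editor's note: two timestamps need pinning for a byte-identical archive --
#  the per-member TarInfo.mtime, set via the filter= callback below, and the
#  mtime GzipFile stamps into the .tgz header, which it takes from time.time();
#  hence the temporary monkey-patching of time.time further down.)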
+ # Solution from DataLad although ugly enough: + + dicom_list = sorted(dicom_list) + dcm_time = get_dicom_series_time(dicom_list) + + def _assign_dicom_time(ti): + # Reset the date to match the one of the last commit, not from the + # filesystem since git doesn't track those at all + ti.mtime = dcm_time + return ti + + # poor man mocking since can't rely on having mock + try: + import time + _old_time = time.time + time.time = lambda: dcm_time + if os.path.lexists(outtar): + os.unlink(outtar) + with tarfile.open(outtar, 'w:gz', dereference=True) as tar: + for filename in dicom_list: + outfile = os.path.join(tmpdir, os.path.basename(filename)) + if not os.path.islink(outfile): + os.symlink(os.path.realpath(filename), outfile) + # place into archive stripping any lead directories and + # adding the one corresponding to prefix + tar.add(outfile, + arcname=opj(basename(out_prefix), + os.path.basename(outfile)), + recursive=False, + filter=_assign_dicom_time) + finally: + time.time = _old_time + shutil.rmtree(tmpdir) + return outtar + + +def get_dicom_series_time(dicom_list): + """Get time in seconds since epoch from dicom series date and time + + Primarily to be used for reproducible time stamping + """ + import time + import calendar + import dicom as dcm + + dcm = dcm.read_file(dicom_list[0], stop_before_pixels=True) + dcm_date = dcm.SeriesDate # YYYYMMDD + dcm_time = dcm.SeriesTime # HHMMSS.MICROSEC + dicom_time_str = dcm_date + dcm_time.split('.', 1)[0] # YYYYMMDDHHMMSS + # convert to epoch + return calendar.timegm(time.strptime(dicom_time_str, '%Y%m%d%H%M%S')) + + +def safe_copyfile(src, dest): + """Copy file but blow if destination name already exists + """ + if os.path.isdir(dest): + dest = os.path.join(dest, os.path.basename(src)) + if os.path.lexists(dest): + if not global_options['overwrite']: + raise ValueError("was asked to copy %s but destination already exists: %s" + % (src, dest)) + else: + # to make sure we can write there ... still fail if it is entire directory ;) + os.unlink(dest) + shutil.copyfile(src, dest) def convert(items, symlink=True, converter=None, scaninfo_suffix='.json', custom_callable=None, with_prov=False, - is_bids=False, sourcedir=None, min_meta=False): + is_bids=False, sourcedir=None, outdir=None, min_meta=False): + """Perform actual conversion (calls to converter etc) given info from + heuristic's `infotodict` + + Parameters + ---------- + items + symlink + converter + scaninfo_suffix + custom_callable + with_prov + is_bids + sourcedir + outdir + min_meta + + Returns + ------- + None + """ prov_files = [] - tmpdir = mkdtemp(prefix='heudiconvtmp') - for item in items: - if isinstance(item[1], (list, tuple)): - outtypes = item[1] - else: - outtypes = [item[1]] - prefix = item[0] - print('Converting %s' % prefix) - dirname = os.path.dirname(prefix + '.ext') - print(dirname) - if not os.path.exists(dirname): - os.makedirs(dirname) + tmpdir = mkdtemp(prefix='heudiconvdcm') + for item_idx, item in enumerate(items): + prefix, outtypes, item_dicoms = item[:3] + if not isinstance(outtypes, (list, tuple)): + outtypes = [outtypes] + + prefix_dirname = os.path.dirname(prefix + '.ext') + prov_file = None + outname_bids = prefix + '.json' + outname_bids_files = [] # actual bids files since dcm2niix might generate multiple ATM + lgr.info('Converting %s (%d DICOMs) -> %s . ' + 'Converter: %s . 
Output types: %s', + prefix, len(item_dicoms), prefix_dirname, converter, outtypes) + if not os.path.exists(prefix_dirname): + os.makedirs(prefix_dirname) for outtype in outtypes: - item_dicoms = item[2] - lgr.info("Processing %d dicoms", len(item_dicoms)) - lgr.debug(" those dicoms are: %s", item_dicoms) + lgr.debug("Processing %d dicoms for output type %s", + len(item_dicoms), outtype) + lgr.log(1, " those dicoms are: %s", item_dicoms) + + seqtype = basename(dirname(prefix)) if is_bids else None + if outtype == 'dicom': if is_bids: - if not os.path.exists(sourcedir): - os.makedirs(sourcedir) - dicom_list = [] - for filename in item_dicoms: - dicom_list.append(filename) - compress_dicoms(dicom_list, os.path.basename(prefix), sourcedir) + # mimic the same hierarchy location as the prefix + # although it could all have been done probably + # within heuristic really + sourcedir_ = os.path.join( + sourcedir, + os.path.dirname( + os.path.relpath(prefix, outdir))) + if not os.path.exists(sourcedir_): + os.makedirs(sourcedir_) + compress_dicoms(item_dicoms, + opj(sourcedir_, os.path.basename(prefix))) else: dicomdir = prefix + '_dicom' if os.path.exists(dicomdir): @@ -470,319 +842,657 @@ def convert(items, symlink=True, converter=None, config.enable_provenance() from nipype import Function, Node from nipype.interfaces.base import isdefined - print(converter) - # deprecated - if converter == 'mri_convert': - lgr.warn('Deprecated - use dcm2niix for future conversions') - from nipype.interfaces.freesurfer.preprocess import MRIConvert - convertnode = Node(MRIConvert(), name='convert') - convertnode.base_dir = tmpdir - if outtype == 'nii.gz': - convertnode.inputs.out_type = 'niigz' - convertnode.inputs.in_file = item_dicoms[0] - convertnode.inputs.out_file = os.path.abspath(outname) - #cmd = 'mri_convert %s %s' % (item[2][0], outname) - #print(cmd) - #os.system(cmd) - res = convertnode.run() - elif converter in {'dcm2nii', 'dcm2niix'}: + if converter == 'dcm2niix': from nipype.interfaces.dcm2nii import Dcm2nii, Dcm2niix - convertnode = Node( - {'dcm2nii': Dcm2nii, - 'dcm2niix': Dcm2niix}[converter](), - name='convert') + convertnode = Node(Dcm2niix(), name='convert') convertnode.base_dir = tmpdir + # need to be abspaths! + item_dicoms = list(map(os.path.abspath, item_dicoms)) convertnode.inputs.source_names = item_dicoms if converter == 'dcm2nii': convertnode.inputs.gzip_output = outtype == 'nii.gz' else: if not is_bids: convertnode.inputs.bids_format = False - convertnode.inputs.out_filename = os.path.basename( - dirname +'_%e') + convertnode.inputs.out_filename = os.path.basename(prefix_dirname) convertnode.inputs.terminal_output = 'allatonce' res = convertnode.run() - if isinstance(res.outputs.converted_files, list): - lgr.warning( - "Following series likely has multiple orientations: ", - item_dicoms - ) - for fl in res.outputs.converted_files: - idx = fl.split('_')[-1][0:-7] # get echo number, remove .nii.gz - outname = prefix + '-' + str(idx) + '.' 
+ outtype - shutil.copyfile(fl, outname) - else: - shutil.copyfile(res.outputs.converted_files, outname) + if isdefined(res.outputs.bvecs): outname_bvecs = prefix + '.bvec' outname_bvals = prefix + '.bval' - shutil.copyfile(res.outputs.bvecs, outname_bvecs) - shutil.copyfile(res.outputs.bvals, outname_bvals) - - if converter == 'dcm2niix' \ - and isdefined(res.outputs.bids): - ### extract bids - if isinstance(res.outputs.bids, list): - lgr.warning("There was someone catching lists!") - for fl in res.outputs.bids: - idx = fl.split('_')[-1][0:-5] # get echo number, remove .json - scaninfo = prefix + '_' + str(idx) + '.json' - shutil.copyfile(fl, scaninfo) + safe_copyfile(res.outputs.bvecs, outname_bvecs) + safe_copyfile(res.outputs.bvals, outname_bvals) + + res_files = res.outputs.converted_files + if isinstance(res_files, list): + # TODO: move into a function + # by default just suffix them up + suffixes = None + # we should provide specific handling for fmap, + # dwi etc which might spit out multiple files + if is_bids: + if seqtype == 'fmap': + # expected! + suffixes = ["%d" % (i+1) for i in range(len(res_files))] + if not suffixes: + lgr.warning( + "Following series files likely have " + "multiple (%d) volumes (orientations?) " + "generated: %s ...", + len(res_files), item_dicoms[0] + ) + suffixes = ['-%d' % (i+1) for i in range(len(res_files))] + + # Also copy BIDS files although they might need to be merged/postprocessed later + if converter == 'dcm2niix' and isdefined(res.outputs.bids): + assert(len(res.outputs.bids) == len(res_files)) + bids_files = res.outputs.bids else: - shutil.copyfile(res.outputs.bids, scaninfo) + bids_files = [None] * len(res_files) + + for fl, suffix, bids_file in zip(res_files, suffixes, bids_files): + outname = "%s%s.%s" % (prefix, suffix, outtype) + safe_copyfile(fl, outname) + if bids_file: + outname_bids_file = "%s%s.json" % (prefix, suffix) + safe_copyfile(bids_file, outname_bids_file) + outname_bids_files.append(outname_bids_file) + + else: + safe_copyfile(res_files, outname) + if converter == 'dcm2niix' and isdefined(res.outputs.bids): + try: + safe_copyfile(res.outputs.bids, outname_bids) + outname_bids_files.append(outname_bids) + except TypeError as exc: ##catch lists + lgr.warning( + "There was someone catching lists!: %s", exc + ) + continue + + # save acquisition time information if it's BIDS + # at this point we still have acquisition date + if is_bids: + save_scans_key(items, outname_bids_files) + # Fix up and unify BIDS files + tuneup_bids_json_files(outname_bids_files) + # we should provide specific handling for fmap, + # dwi etc .json of which should get merged to satisfy + # BIDS. BUT wer might be somewhat not in time for a + # party here since we sorted into multiple seqinfo + # (e.g. 
magnitude, phase for fmap so we might want + # to sort them into a single one) if with_prov: prov_file = prefix + '_prov.ttl' - shutil.copyfile(os.path.join(convertnode.base_dir, + safe_copyfile(os.path.join(convertnode.base_dir, convertnode.name, 'provenance.ttl'), prov_file) prov_files.append(prov_file) - from nipype import Node, Function - embedfunc = Node(Function(input_names=['dcmfiles', - 'niftifile', - 'infofile', - 'bids_info', - 'force', - 'min_meta'], - output_names=['outfile', - 'meta'], - function=embed_nifti), - name='embedder') - embedfunc.inputs.dcmfiles = item_dicoms - embedfunc.inputs.niftifile = os.path.abspath(outname) - embedfunc.inputs.infofile = os.path.abspath(scaninfo) - embedfunc.inputs.min_meta = min_meta - if is_bids and (converter == 'dcm2niix'): - embedfunc.inputs.bids_info = load_json(os.path.abspath(scaninfo)) - embedfunc.inputs.force = True - embedfunc.base_dir = tmpdir - cwd = os.getcwd() - try: - res = embedfunc.run() - os.chmod(scaninfo, 0o0440) - if with_prov: - g = res.provenance.rdf() - g.parse(prov_file, - format='turtle') - g.serialize(prov_file, format='turtle') - os.chmod(prov_file, 0o0440) - except: - os.chdir(cwd) + if len(outname_bids_files) > 1: + lgr.warning( + "For now not embedding BIDS and info generated .nii.gz itself since sequence produced multiple files") + else: + #if not is_bids or converter != 'dcm2niix': ##uses dcm2niix's infofile + embed_metadata_from_dicoms(converter, is_bids, item_dicoms, + outname, outname_bids, prov_file, + scaninfo, tmpdir, with_prov, + min_meta) + if exists(scaninfo): + lgr.info("Post-treating %s file", scaninfo) + treat_infofile(scaninfo) os.chmod(outname, 0o0440) - if not custom_callable is None: + if custom_callable is not None: custom_callable(*item) shutil.rmtree(tmpdir) -def convert_dicoms(subjs, dicom_dir_template, outdir, heuristic_file, converter, - queue=None, anon_sid_cmd=None, anon_outdir=None, with_prov=False, - ses=None, is_bids=False, sbatch_args='-N1 -c2 --mem=20G -t 01:00:00', - min_meta=False): +def get_formatted_scans_key_row(item): + """ + Parameters + ---------- + item - if '%s' in dicom_dir_template: - raise ValueError("Formatting DICOM search pattern with '%s' is " - "deprecated. 
Please indicate participant ID with " - "'{subject}' and (optionally) timepoint with '{session}'.") + Returns + ------- + row: list + [ISO acquisition time, performing physician name, random string] - for sid in subjs: - tmpdir = None - if queue: - # TODO This needs to be updated to better scale with additional args - progname = os.path.abspath(inspect.getfile(inspect.currentframe())) - convertcmd = ' '.join(['python', progname, '-d', dicom_dir_template, - '-o', outdir, '-f', heuristic_file, '-s', sid, - '-c', converter]) - if ses: - convertcmd += " --ses {}".format(ses) - if with_prov: - convertcmd += " --with-prov" - if is_bids: - convertcmd += " --bids" - if min_meta: - convertcmd += " --minmeta" - if ses: - script_file = 'dicom-{}_ses-{}.sh'.format(sid, ses) - else: - script_file = 'dicom-{}.sh'.format(sid) - with open(script_file, 'wt') as fp: - fp.writelines(['#!/bin/bash\n', convertcmd]) - outcmd = 'sbatch -J dicom-%s -p %s %s %s' % (sid, queue, sbatch_args, script_file) - os.system(outcmd) - continue + """ + dcm_fn = item[-1][0] + mw = ds.wrapper_from_data(dcm.read_file(dcm_fn, stop_before_pixels=True)) + # we need to store filenames and acquisition times + # parse date and time and get it into isoformat + date = mw.dcm_data.AcquisitionDate + time = mw.dcm_data.AcquisitionTime.split('.')[0] + td = time + date + acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat() + # add random string + randstr = ''.join(map(chr, sample(k=8, population=range(33, 127)))) + row = [acq_time, mw.dcm_data.PerformingPhysicianName, randstr] + return row + + +def add_rows_to_scans_keys_file(fn, newrows): + """ + Add new rows to file fn for scans key filename - # TODO: RF into a function - # expand the input template - if sid: - sdir = dicom_dir_template.format(subject=sid, session=ses) - # and see what matches - fl = sorted(glob(sdir)) - else: - # we were given no subject so we consider dicom_dir_template to be - # and actual directory to process - if not isdir(dicom_dir_template): - raise ValueError( - "No subject id was provided, and dicom_dir_template=%s is not " - "an existing directory to traverse" % str(dicom_dir_template) - ) - fl = sorted(find_files('.*', topdir=dicom_dir_template)) - - lgr.info("Processing %d dicoms", len(fl)) - dcmsessions = None - # some people keep compressed tarballs around -- be nice! 
- if len(fl) and tarfile.is_tarfile(fl[0]): - # check if we got a list of tarfiles - if not len(fl) == sum([tarfile.is_tarfile(i) for i in fl]): - raise ValueError("some but not all input files are tar files") - # tarfiles already know what they contain, and often the filenames - # are unique, or at least in a unqiue subdir per session - # strategy: extract everything in a temp dir and assemble a list - # of all files in all tarballs - content = [] - tmpdir = mkdtemp(prefix='heudiconvtmp') - # needs sorting to keep the generated "session" label deterministic - for i, t in enumerate(sorted(fl)): - tf = tarfile.open(t) - # check content and sanitize permission bits - tmembers = tf.getmembers() - for tm in tmembers: - tm.mode = 0o700 - # get all files, assemble full path in tmp dir - tf_content = [m.name for m in tmembers if m.isfile()] - content += tf_content - if len(fl) > 1: - # more than one tarball, take care of "session" indicator - if dcmsessions is None: - dcmsessions = [] - dcmsessions += ([i] * len(tf_content)) - # extract into tmp dir - tf.extractall(path=tmpdir, members=tmembers) - fl = content - - #dcmfile = dcm.read_file(fl[0], force=True) - #print sid, 'Dicom: ', dcmfile.PatientName, sid == dcmfile.PatientName - - seqinfo = None # we might need it earlier than later - if not sid: - # figure out the sid out of available information - seqinfo, filegroup = process_dicoms( - fl, dcmsessions, basedir=tmpdir, - dcmfilter=getattr(mod, 'filter_dicom', None)) - # XXX session information handling is somewhat backwards since done above - # already. Moreover above logic with .edit.txt file -- seqinfo is - # available only on initial run - TODO - if not hasattr(mod, 'get_session_subject_id'): - raise ValueError( - "%s has no get_session_subject_id needed to figure out " - "subject/session from DICOMs" % mod - ) + Parameters + ---------- + fn: filename + newrows: extra rows to add + dict fn: [acquisition time, referring physician, random string] + """ + if lexists(fn): + with open(fn, 'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + existing_rows = [row for row in reader] + # skip header + fnames2info = {row[0]: row[1:] for row in existing_rows[1:]} + + newrows_key = newrows.keys() + newrows_toadd = list(set(newrows_key) - set(fnames2info.keys())) + for key_toadd in newrows_toadd: + fnames2info[key_toadd] = newrows[key_toadd] + # remove + os.unlink(fn) + else: + fnames2info = newrows - session_subject_ids = set(mod.get_session_subject_id(s) - for s in seqinfo) - assert len(session_subject_ids) == 1, \ - "atm we support processing only 1 subject/session at a time" - sess, sid = session_subject_ids.pop() + header = ['filename', 'acq_time', 'operator', 'randstr'] + # save + with open(fn, 'a') as csvfile: + writer = csv.writer(csvfile, delimiter='\t') + writer.writerow(header) + for key in sorted(fnames2info.keys()): + writer.writerow([key] + fnames2info[key]) - if is_bids and not sid.isalnum(): # alphanumeric only - old_sid = sid - cleaner = lambda y: ''.join([x for x in y if x.isalnum()]) - sid = cleaner(sid) #strip out - lgr.warn('{0} contained nonalphanumeric character(s), subject ' - 'ID was cleaned to be {1}'.format(old_sid, sid)) +def save_scans_key(items, outname_bids_files): + """ + Parameters + ---------- + items: + outname_bids_files: - # Figure out where to stick supplemental info files - if is_bids and not sid.startswith('sub-'): - idir = os.path.join(outdir, 'sub-' + sid) - else: - idir = os.path.join(outdir, sid) - if is_bids and ses: - idir = os.path.join(idir, 
'ses-%s' % str(ses)) - # all heudiconv stuff in info folder - idir = os.path.join(idir, 'info') - if not os.path.exists(idir): - os.makedirs(idir) + Returns + ------- + + """ + rows = dict() + for item, outname_bids_file in zip(items, outname_bids_files): + # get filenames + f_name = '/'.join(outname_bids_file.split('/')[-2:]) + f_name = f_name.replace('json', 'nii.gz') + rows[f_name] = get_formatted_scans_key_row(item) + # where should we store it? + output_dir = dirname(dirname(outname_bids_file)) + # get subject info + subj_pattern = re.compile('(sub-[a-zA-Z0-9]*)') + subj = subj_pattern.findall(f_name) + assert(len(subj) >= 1) + subj = subj[0] + + # save + add_rows_to_scans_keys_file( + pjoin(output_dir, '{0}_scans.tsv'.format(subj)), + rows) + + +def tuneup_bids_json_files(json_files): + """Given a list of BIDS .json files, e.g. """ + if not json_files: + return + + # Harmonize generic .json formatting + for jsonfile in json_files: + json_ = json.load(open(jsonfile)) + # sanitize! + for f in ['AcquisitionDateTime', 'AcquisitionDate']: + json_.pop(f, None) + # TODO: should actually be placed into series file which must + # go under annex (not under git) and marked as sensitive + if 'Date' in str(json_): + # Let's hope no word 'Date' comes within a study name or smth like + # that + raise ValueError("There must be no dates in .json sidecar") + json.dump(json_, open(jsonfile, 'w'), indent=2) + + # Load the beast + seqtype = basename(dirname(jsonfile)) + + if seqtype == 'fmap': + json_basename = '_'.join(jsonfile.split('_')[:-1]) + # if we got by now all needed .json files -- we can fix them up + # unfortunately order of "items" is not guaranteed atm + if len(glob(json_basename + '*.json')) == 3: + json_phasediffname = json_basename + '_phasediff.json' + json_ = json.load(open(json_phasediffname)) + # TODO: we might want to reorder them since ATM + # the one for shorter TE is the 2nd one! 
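# (Editor's sketch, illustrative names/values: for the usual fieldmap trio
#  sub-01_magnitude1.json / sub-01_magnitude2.json / sub-01_phasediff.json the
#  loop below ends up writing
#      "EchoTime1": 0.00492, "EchoTime2": 0.00738
#  into the phasediff sidecar, which is the form BIDS expects for a
#  phase-difference fieldmap.)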
+ # For now just save truthfully by loading magnitude files + lgr.debug("Placing EchoTime fields into phasediff file") + for i in 1, 2: + try: + json_['EchoTime%d' % i] = \ + json.load(open(json_basename + '_magnitude%d.json' % i))[ + 'EchoTime'] + except IOError as exc: + lgr.error("Failed to open magnitude file: %s", exc) + + # might have been made R/O already + os.chmod(json_phasediffname, 0o0664) + json.dump(json_, open(json_phasediffname, 'w'), indent=2) + os.chmod(json_phasediffname, 0o0444) + + # phasediff one should contain two PhaseDiff's + # -- one for original amplitude and the other already replicating what is there + # so let's load json files for magnitudes and + # place them into phasediff + + +def embed_metadata_from_dicoms(converter, is_bids, item_dicoms, outname, + outname_bids, prov_file, scaninfo, tmpdir, + with_prov, min_meta): + """ + Enhance sidecar information file with more information from DICOMs + + Parameters + ---------- + converter + is_bids + item_dicoms + outname + outname_bids + prov_file + scaninfo + tmpdir + with_prov + min_meta + + Returns + ------- + + """ + from nipype import Node, Function + embedfunc = Node(Function(input_names=['dcmfiles', + 'niftifile', + 'infofile', + 'bids_info', + 'force', + 'min_meta'], + output_names=['outfile', + 'meta'], + function=embed_nifti), + name='embedder') + embedfunc.inputs.dcmfiles = item_dicoms + embedfunc.inputs.niftifile = os.path.abspath(outname) + embedfunc.inputs.infofile = os.path.abspath(scaninfo) + embedfunc.inputs.min_meta = min_meta + if is_bids and (converter == 'dcm2niix'): + embedfunc.inputs.bids_info = load_json(os.path.abspath(outname_bids)) + else: + embedfunc.inputs.bids_info = None + embedfunc.inputs.force = True + embedfunc.base_dir = tmpdir + cwd = os.getcwd() + try: + """ + Ran into +INFO: Executing node embedder in dir: /tmp/heudiconvdcm2W3UQ7/embedder +ERROR: Embedding failed: [Errno 13] Permission denied: '/inbox/BIDS/tmp/test2-jessie/Wheatley/Beau/1007_personality/sub-sid000138/fmap/sub-sid000138_3mm_run-01_phasediff.json' +while +HEUDICONV_LOGLEVEL=WARNING time bin/heudiconv -f heuristics/dbic_bids.py -c dcm2niix -o /inbox/BIDS/tmp/test2-jessie --bids --datalad /inbox/DICOM/2017/01/28/A000203 + +so it seems that there is a filename collision so it tries to save into the same file name +and there was a screw up for that A + +/mnt/btrfs/dbic/inbox/DICOM/2017/01/28/A000203 + StudySessionInfo(locator='Wheatley/Beau/1007_personality', session=None, subject='sid000138') 16 sequences + StudySessionInfo(locator='Wheatley/Beau/1007_personality', session=None, subject='a000203') 2 sequences + + +in that one though + """ + if global_options['overwrite'] and os.path.lexists(scaninfo): + # TODO: handle annexed file case + if not os.path.islink(scaninfo): + os.chmod(scaninfo, 0o0660) + res = embedfunc.run() + os.chmod(scaninfo, 0o0444) + if with_prov: + g = res.provenance.rdf() + g.parse(prov_file, + format='turtle') + g.serialize(prov_file, format='turtle') + os.chmod(prov_file, 0o0440) + except Exception as exc: + lgr.error("Embedding failed: %s", str(exc)) + os.chdir(cwd) + + +def treat_infofile(filename): + """Tune up generated .json file (slim down, pretty-print for humans). 
+ + Was difficult to do within embed_nifti since it has no access to our functions + """ + with open(filename) as f: + j = json.load(f) + + j_slim = slim_down_info(j) + j_pretty = json_dumps_pretty(j_slim, indent=2, sort_keys=True) + + os.chmod(filename, 0o0664) + with open(filename, 'wt') as fp: + fp.write(j_pretty) + os.chmod(filename, 0o0444) + + +def convert_dicoms(sid, + dicoms, + outdir, + heuristic, + converter, + anon_sid=None, anon_outdir=None, + with_prov=False, + ses=None, + is_bids=False, + seqinfo=None, + min_meta=False): + if dicoms: + lgr.info("Processing %d dicoms", len(dicoms)) + elif seqinfo: + lgr.info("Processing %d pre-sorted seqinfo entries", len(seqinfo)) + else: + raise ValueError("neither dicoms nor seqinfo dict was provided") - # - # Anonymization - # + # in this reimplementation we can have only a single session assigned + # at this point + # dcmsessions = + + if is_bids and not sid.isalnum(): # alphanumeric only + old_sid = sid + cleaner = lambda y: ''.join([x for x in y if x.isalnum()]) + sid = cleaner(sid) #strip out + lgr.warning('{0} contained nonalphanumeric character(s), subject ' + 'ID was cleaned to be {1}'.format(old_sid, sid)) + + # + # Annonimization parameters + # + if anon_sid is None: anon_sid = sid - if anon_sid_cmd is not None: - from subprocess import check_output - anon_sid = check_output([anon_sid_cmd, sid]).strip() - lgr.info("Anonymized sid %s into %s", sid, anon_sid) - else: - anon_outdir = idir - - shutil.copy(heuristic_file, idir) - path, fname = os.path.split(heuristic_file) - sys.path.append(path) - mod = __import__(fname.split('.')[0]) - - infofile = os.path.join(idir, '%s.auto.txt' % sid) - editfile = os.path.join(idir, '%s.edit.txt' % sid) - if os.path.exists(editfile): # XXX may be condition on seqinfo is None - lgr.info("Reloading existing filegroup.json because %s exists", - editfile) - info = read_config(editfile) - filegroup = load_json(os.path.join(idir, 'filegroup.json')) - else: - if seqinfo is None: - seqinfo, filegroup = process_dicoms( - fl, dcmsessions, basedir=tmpdir, - dcmfilter=getattr(mod, 'filter_dicom', None)) + if anon_outdir is None: + anon_outdir = outdir + + # Figure out where to stick supplemental info dicoms + idir = os.path.join(outdir, '.heudiconv', sid) + # THAT IS WHERE WE MUST KNOW ABOUT SESSION ALREADY! + if is_bids and ses: + idir = os.path.join(idir, 'ses-%s' % str(ses)) + # yoh: in my case if idir exists, it means that that study/subject/session + # is already processed + if anon_outdir == outdir: + # if all goes into a single dir, have a dedicated 'info' subdir + idir = os.path.join(idir, 'info') + if not os.path.exists(idir): + os.makedirs(idir) + + shutil.copy(heuristic.filename, idir) + ses_suffix = "_ses-%s" % ses if ses is not None else "" + info_file = os.path.join(idir, '%s%s.auto.txt' % (sid, ses_suffix)) + edit_file = os.path.join(idir, '%s%s.edit.txt' % (sid, ses_suffix)) + filegroup_file = os.path.join(idir, 'filegroup%s.json' % ses_suffix) + + if os.path.exists(edit_file): # XXX may be condition on seqinfo is None + lgr.info("Reloading existing filegroup.json because %s exists", + edit_file) + info = read_config(edit_file) + filegroup = load_json(filegroup_file) + # XXX Yarik finally understood why basedir was dragged along! + # So we could reuse the same PATHs definitions possibly consistent + # across re-runs... BUT that wouldn't work anyways if e.g. 
+ # DICOMs dumped with SOP UUIDs thus differing across runs etc + # So either it would need to be brought back or reconsidered altogether + # (since no sample data to test on etc) + else: + # TODO -- might have been done outside already! + if dicoms: + seqinfo = group_dicoms_into_seqinfos( + dicoms, + file_filter=getattr(heuristic, 'filter_files', None), + dcmfilter=getattr(heuristic, 'filter_dicom', None), + grouping=None, # no groupping + ) + seqinfo_list = list(seqinfo.keys()) + + filegroup = {si.series_id: x for si, x in seqinfo.items()} + + save_json(filegroup_file, filegroup) + dicominfo_file = os.path.join(idir, 'dicominfo%s.tsv' % ses_suffix) + with open(dicominfo_file, 'wt') as fp: + for seq in seqinfo_list: + fp.write('\t'.join([str(val) for val in seq]) + '\n') + lgr.debug("Calling out to %s.infodict", heuristic) + info = heuristic.infotodict(seqinfo_list) + write_config(info_file, info) + write_config(edit_file, info) + + # + # Conversion + # + + sourcedir = None + if is_bids: + sourcedir = os.path.join(outdir, 'sourcedata') + # the other portion of the path would mimic BIDS layout + # so we don't need to worry here about sub, ses at all + tdir = anon_outdir + else: + tdir = os.path.join(anon_outdir, anon_sid) + + if converter != 'none': + lgr.info("Doing conversion using %s", converter) + cinfo = conversion_info(anon_sid, tdir, info, filegroup, + ses=ses) + convert(cinfo, + converter=converter, + scaninfo_suffix=getattr( + heuristic, 'scaninfo_suffix', '.json'), + custom_callable=getattr( + heuristic, 'custom_callable', None), + with_prov=with_prov, + is_bids=is_bids, + sourcedir=sourcedir, + outdir=tdir) + + if is_bids: + if seqinfo: + keys = list(seqinfo) + add_participant_record( + anon_outdir, + anon_sid, + keys[0].patient_age, + keys[0].patient_sex, + ) + populate_bids_templates( + anon_outdir, + getattr(heuristic, 'DEFAULT_FIELDS', {}) + ) + + +def get_annonimized_sid(sid, anon_sid_cmd): + anon_sid = sid + if anon_sid_cmd is not None: + from subprocess import check_output + anon_sid = check_output([anon_sid_cmd, sid]).strip() + lgr.info("Annonimized sid %s into %s", sid, anon_sid) + return anon_sid + + +def get_extracted_dicoms(fl): + """Given a list of files, possibly extract some from tarballs + + For 'classical' heudiconv, if multiple tarballs are provided, they correspond + to different sessions, so here we would group into sessions and return + pairs `sessionid`, `files` with `sessionid` being None if no "sessions" + detected for that file or there was just a single tarball in the list + """ + # TODO: bring check back? 
+ # if any(not tarfile.is_tarfile(i) for i in fl): + # raise ValueError("some but not all input files are tar files") + + # tarfiles already know what they contain, and often the filenames + # are unique, or at least in a unqiue subdir per session + # strategy: extract everything in a temp dir and assemble a list + # of all files in all tarballs + tmpdir = tempdirs(prefix='heudiconvdcm') + + sessions = defaultdict(list) + session = 0 + if not isinstance(fl, (list, tuple)): + fl = list(fl) + + # needs sorting to keep the generated "session" label deterministic + for i, t in enumerate(sorted(fl)): + # "classical" heudiconv has that heuristic to handle multiple + # tarballs as providing different sessions per each tarball + if not tarfile.is_tarfile(t): + sessions[None].append(t) + continue # the rest is tarball specific + + tf = tarfile.open(t) + # check content and sanitize permission bits + tmembers = tf.getmembers() + for tm in tmembers: + tm.mode = 0o700 + # get all files, assemble full path in tmp dir + tf_content = [m.name for m in tmembers if m.isfile()] + # store full paths to each file, so we don't need to drag along + # tmpdir as some basedir + sessions[session] = [opj(tmpdir, f) for f in tf_content] + session += 1 + # extract into tmp dir + tf.extractall(path=tmpdir, members=tmembers) + + if session == 1: + # we had only 1 session, so no really multiple sessions according + # to classical 'heudiconv' assumptions, thus just move them all into + # None + sessions[None] += sessions.pop(0) + + return sessions.items() + + +def load_heuristic(heuristic_file): + """Load heuristic from the file, return the module + """ + path, fname = os.path.split(heuristic_file) + sys.path.append(path) + mod = __import__(fname.split('.')[0]) + mod.filename = heuristic_file + return mod + + +def get_study_sessions(dicom_dir_template, files_opt, heuristic, outdir, + session, sids, grouping='studyUID'): + """Given options from cmdline sort files or dicom seqinfos into + study_sessions which put together files for a single session of a subject + in a study + + Two major possible workflows: + - if dicom_dir_template provided -- doesn't pre-load DICOMs and just + loads files pointed by each subject and possibly sessions as corresponding + to different tarballs + - if files_opt is provided, sorts all DICOMs it can find under those paths + """ + study_sessions = {} + if dicom_dir_template: + dicom_dir_template = os.path.abspath(dicom_dir_template) + assert not files_opt # see above TODO + assert sids + # expand the input template + if '{subject}' not in dicom_dir_template: + raise ValueError( + "dicom dir template must have {subject} as a placeholder for a " + "subject id. 
Got %r" % dicom_dir_template) + for sid in sids: + sdir = dicom_dir_template.format(subject=sid, session=session) + # and see what matches + files = sorted(glob(sdir)) + for session_, files_ in get_extracted_dicoms(files): + if session_ is not None and session: + lgr.warning( + "We had session specified (%s) but while analyzing " + "files got a new value %r (using it instead)" + % (session, session_)) + # in this setup we do not care about tracking "studies" so + # locator would be the same None + study_sessions[ + StudySessionInfo( + None, + session_ if session_ is not None else session, + sid, + )] = files_ + else: + # prep files + assert files_opt + assert not sids + files = [] + for f in files_opt: + if isdir(f): + files += sorted(find_files( + '.*', topdir=f, exclude_vcs=True, exclude="/\.datalad/")) else: - lgr.debug("DICOMS were already processed, reusing that info") - save_json(os.path.join(idir, 'filegroup.json'), filegroup) - with open(os.path.join(idir, 'dicominfo.txt'), 'wt') as fp: - fp.write('\t'.join([val for val in seqinfo_fields]) + '\n') - for seq in seqinfo: - fp.write('\t'.join([str(val) for val in seq]) + '\n') - lgr.debug("Calling out to %s.infodict", mod) - info = mod.infotodict(seqinfo) - write_config(infofile, info) - write_config(editfile, info) - - # - # Conversion - # - - sourcedir = None - if is_bids: - sourcedir = os.path.join(outdir, 'sourcedata', sid) - if ses: - sourcedir = os.path.join(sourcedir, 'ses-%s' % str(ses)) - - #tdir = os.path.join(anon_outdir, anon_sid) - if anon_sid_cmd: - tdir = os.path.join(anon_outdir, anon_sid) - else: - tdir = os.path.dirname(idir) - - if converter != 'none': - lgr.info("Doing conversion using %s", converter) - cinfo = conversion_info(anon_sid, tdir, info, filegroup, - basedir=tmpdir, - ses=ses) - convert(cinfo, - converter=converter, - scaninfo_suffix=getattr( - mod, 'scaninfo_suffix', '.json'), - custom_callable=getattr( - mod, 'custom_callable', None), - with_prov=with_prov, - is_bids=is_bids, - sourcedir=sourcedir, - min_meta=min_meta) - - # - # Cleanup - # - if tmpdir is not None: - # clean up tmp dir with extracted tarball - shutil.rmtree(tmpdir) + files.append(f) + + # in this scenario we don't care about sessions obtained this way + files_ = [] + for _, files_ex in get_extracted_dicoms(files): + files_ += files_ex + + # sort all DICOMS using heuristic + # TODO: this one is not groupping by StudyUID but may be we should! + seqinfo_dict = group_dicoms_into_seqinfos( + files_, + file_filter=getattr(heuristic, 'filter_files', None), + dcmfilter=getattr(heuristic, 'filter_dicom', None), + grouping=grouping) + + if not getattr(heuristic, 'infotoids', None): + raise NotImplementedError( + "For now, if no subj template is provided, requiring " + "heuristic to have infotoids") + + for studyUID, seqinfo in seqinfo_dict.items(): + # so we have a single study, we need to figure out its + # locator, session, subject + # TODO: Try except to ignore those we can't handle? + # actually probably there should be a dedicated exception for + # heuristics to throw if they detect that the study they are given + # is not the one they would be willing to work on + ids = heuristic.infotoids(seqinfo.keys(), outdir=outdir) + # TODO: probably infotoids is doomed to do more and possibly + # split into multiple sessions!!!! 
but then it should be provided + # full seqinfo with files which it would place into multiple groups + lgr.info("Study session for %s" % str(ids)) + study_session_info = StudySessionInfo( + ids.get('locator'), + ids.get('session', session) or session, + ids.get('subject', None)) + if study_session_info in study_sessions: + #raise ValueError( + lgr.warning( + "We already have a study session with the same value %s" + % repr(study_session_info)) + continue # skip for now + study_sessions[study_session_info] = seqinfo + + return study_sessions # # Additional handlers @@ -794,7 +1504,250 @@ def is_interactive(): return sys.stdin.isatty() and sys.stdout.isatty() and sys.stderr.isatty() -_sys_excepthook = sys.excepthook # Just in case we ever need original one +def create_file_if_missing(filename, content): + """Create file if missing, so we do not override any possibly introduced changes""" + if exists(filename): + return False + with open(filename, 'w') as f: + f.write(content) + return True + + +def populate_bids_templates(path, defaults={}): + # dataset descriptor + lgr.info("Populating template files under %s", path) + descriptor = opj(path, 'dataset_description.json') + if not exists(descriptor): + save_json(descriptor, + ordereddict([ + ('Name', "TODO: name of the dataset"), + ('BIDSVersion', "1.0.1"), + ('License', defaults.get('License', "TODO: choose a license, e.g. PDDL (http://opendatacommons.org/licenses/pddl/)")), + ('Authors', defaults.get('Authors', ["TODO:", "First1 Last1", "First2 Last2", "..."])), + ('Acknowledgements', defaults.get('Acknowledgements', 'TODO: whom you want to acknowledge')), + ('HowToAcknowledge', "TODO: describe how to acknowledge -- either cite a corresponding paper, or just in acknowledgement section"), + ('Funding', ["TODO", "GRANT #1", "GRANT #2"]), + ('ReferencesAndLinks', ["TODO", "List of papers or websites"]), + ('DatasetDOI', 'TODO: eventually a DOI for the dataset') + ])) + + sourcedata_README = opj(path, 'sourcedata', 'README') + if exists(dirname(sourcedata_README)): + create_file_if_missing( + sourcedata_README, + """\ +TODO: Provide description about source data, e.g. + +Directory below contains DICOMS compressed into tarballs per each sequence, +replicating directory hierarchy of the BIDS dataset itself. +""") + + create_file_if_missing( + opj(path, 'CHANGES'), + """\ +0.0.1 Initial data acquired + +TODOs: + - verify and possibly extend information in participants.tsv + (see for example http://datasets.datalad.org/?dir=/openfmri/ds000208) + - fill out dataset_description.json, README, sourcedata/README (if present) + - provide _events.tsv file for each _bold.nii.gz with onsets of events + (see "8.5 Task events" of BIDS specification) +""") + + create_file_if_missing( + opj(path, 'README'), + """\ +TODO: Provide description for the dataset -- basic details about the study, +possibly pointing to pre-registration (if public or embargoed) +""") + + # TODO: collect all task- .json files for func files to + tasks = {} + # way too many -- let's just collect all which are the same! 
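+    # Sketch of the intended reduction (hypothetical values, illustration
+    # only): if two task-rest runs expose
+    #   {'RepetitionTime': 2.0, 'EchoTime': 0.03} and
+    #   {'RepetitionTime': 2.0, 'EchoTime': 0.025}
+    # only RepetitionTime would survive into the shared task-rest_bold.json
+    # stub, since fields differing across runs are dropped below.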
+ # FIELDS_TO_TRACK = {'RepetitionTime', 'FlipAngle', 'EchoTime', 'Manufacturer', 'SliceTiming', ''} + for fpath in find_files('.*_task-.*\_bold\.json', topdir=path, + exclude_vcs=True, exclude="/\.(datalad|heudiconv)/"): + task = re.sub('.*_(task-[^_\.]*(_acq-[^_\.]*)?)_.*', r'\1', fpath) + j = load_json(fpath) + if task not in tasks: + tasks[task] = j + else: + rec = tasks[task] + # let's retain only those fields which have the same value + for field in sorted(rec): + if field not in j or j[field] != rec[field]: + del rec[field] + + # create a stub onsets file for each one of those + suf = '_bold.json' + assert fpath.endswith(suf) + events_file = fpath[:-len(suf)] + '_events.tsv' + lgr.debug("Generating %s", events_file) + with open(events_file, 'w') as f: + f.write("onset\tduration\ttrial_type\tTODO -- fill in rows and add more tab-separated columns if desired") + + # - extract tasks files stubs + for task_acq, fields in tasks.items(): + task_file = opj(path, task_acq + '_bold.json') + lgr.debug("Generating %s", task_file) + fields["TaskName"] = "TODO: full task name for %s" % task_acq.split('_')[0].split('-')[1] + fields["CogAtlasID"] = "TODO" + with open(task_file, 'w') as f: + f.write(json_dumps_pretty(fields, indent=2, sort_keys=True)) + + +def add_participant_record(studydir, subject, age, sex): + participants_tsv = opj(studydir, 'participants.tsv') + participant_id = 'sub-%s' % subject + + if not create_file_if_missing( + participants_tsv, + '\t'.join(['participant_id', 'age', 'sex', 'group']) + '\n' + ): + # check if may be subject record already exists + with open(participants_tsv) as f: + f.readline() + known_subjects = {l.split('\t')[0] for l in f.readlines()} + if participant_id in known_subjects: + # already there -- not adding + return + + # Add a new participant + with open(participants_tsv, 'a') as f: + f.write('\t'.join(map(str, [ + participant_id, + age.lstrip('0').rstrip('Y') if age else 'N/A', + sex, + 'control'])) + + '\n') + + +def mark_sensitive(ds, path_glob=None): + """ + + Parameters + ---------- + ds : Dataset to operate on + path_glob : str, optional + glob of the paths within dataset to work on + + Returns + ------- + None + """ + sens_kwargs = dict( + init=[('distribution-restrictions', 'sensitive')] + ) + if path_glob: + paths = glob(opj(ds.path, path_glob)) + if not paths: + return + sens_kwargs['path'] = paths + ds.metadata(**sens_kwargs) + + +def add_to_datalad(topdir, studydir, msg=None, bids=False): + """Do all necessary preparations (if were not done before) and save + """ + from datalad.api import create + from datalad.api import Dataset + from datalad.support.annexrepo import AnnexRepo + + from datalad.support.external_versions import external_versions + assert external_versions['datalad'] >= '0.5.1', "Need datalad >= 0.5.1" + + studyrelpath = os.path.relpath(studydir, topdir) + assert not studyrelpath.startswith(os.path.pardir) # so we are under + # now we need to test and initiate a DataLad dataset all along the path + curdir_ = topdir + superds = None + subdirs = [''] + studyrelpath.split(os.path.sep) + for isubdir, subdir in enumerate(subdirs): + curdir_ = opj(curdir_, subdir) + ds = Dataset(curdir_) + if not ds.is_installed(): + lgr.info("Initiating %s", ds) + # would require annex > 20161018 for correct operation on annex v6 + ds_ = create(curdir_, dataset=superds, + force=True, + no_annex=True, # need to add .gitattributes first anyways + shared_access='all', + annex_version=6) + assert ds == ds_ + assert ds.is_installed() + superds = ds + + 
create_file_if_missing( + opj(studydir, '.gitattributes'), + """\ +* annex.largefiles=(largerthan=100kb) +*.json annex.largefiles=nothing +*.txt annex.largefiles=nothing +*.tsv annex.largefiles=nothing +*.nii.gz annex.largefiles=anything +*.tgz annex.largefiles=anything +*_scans.tsv annex.largefiles=anything +""") + # so for mortals it just looks like a regular directory! + if not ds.config.get('annex.thin'): + ds.config.add('annex.thin', 'true', where='local') + # initialize annex there if not yet initialized + AnnexRepo(ds.path, init=True) + # ds might have memories of having ds.repo GitRepo + superds = None + del ds + ds = Dataset(studydir) + # Add doesn't have all the options of save such as msg and supers + ds.add('.gitattributes', to_git=True, save=False) + dsh = None + if os.path.lexists(os.path.join(ds.path, '.heudiconv')): + dsh = Dataset(opj(ds.path, '.heudiconv')) + if not dsh.is_installed(): + # we need to create it first + dsh = ds.create(path='.heudiconv', + force=True, + shared_access='all') + # Since .heudiconv could contain sensitive information + # we place all files under annex and then add + if create_file_if_missing( + opj(dsh.path, '.gitattributes'), + """* annex.largefiles=anything + """): + dsh.add('.gitattributes', message="Added gitattributes to place all content under annex") + ds.add('.', recursive=True, save=False, + # not in effect! ? + #annex_add_opts=['--include-dotfiles'] + ) + + # TODO: filter for only changed files? + # Provide metadata for sensitive information + mark_sensitive(ds, 'sourcedata') + mark_sensitive(ds, '*_scans.tsv') # top level + mark_sensitive(ds, '*/*_scans.tsv') # within subj + mark_sensitive(ds, '*/anat') # within subj + mark_sensitive(ds, '*/*/anat') # within subj/ses + if dsh: + mark_sensitive(dsh) # entire .heudiconv! + dsh.save(message=msg) + ds.save(message=msg, recursive=True, super_datasets=True) + + assert not ds.repo.dirty + # TODO: they are still appearing as native annex symlinked beasts + """ + TODOs: + it needs + - unlock (thin will be in effect) + - save/commit (does modechange 120000 => 100644 + + - could potentially somehow automate that all: + http://git-annex.branchable.com/tips/automatically_adding_metadata/ + - possibly even make separate sub-datasets for originaldata, derivatives ? + """ + + +_sys_excepthook = sys.excepthook # Just in case we ever need original one + def setup_exceptionhook(): """Overloads default sys.excepthook with our exceptionhook handler. 
@@ -805,54 +1758,240 @@ def setup_exceptionhook(): def _pdb_excepthook(type, value, tb): if is_interactive(): - import traceback, pdb + import traceback + import pdb traceback.print_exception(type, value, tb) print() pdb.post_mortem(tb) else: lgr.warn("We cannot setup exception hook since not in interactive mode") - # we are in interactive mode or we don't have a tty-like - # device, so we call the default hook - #sys.__excepthook__(type, value, tb) _sys_excepthook(type, value, tb) sys.excepthook = _pdb_excepthook -def main(args=None): +def _main(args): + """Given a structure of arguments from the parser perform computation""" + + # + # Deal with provided files or templates + # + + # + # pre-process provided list of files and possibly sort into groups/sessions + # + + # Group files per each study/sid/session + + dicom_dir_template = args.dicom_dir_template + files_opt = args.files + session = args.session + subjs = args.subjs + outdir = os.path.abspath(args.outdir) + grouping = args.grouping + + if args.command: + # custom mode of operation + if args.command == 'treat-json': + for f in files_opt: + treat_infofile(f) + elif args.command == 'ls': + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + heuristic_ls = getattr(heuristic, 'ls', None) + for f in files_opt: + study_sessions = get_study_sessions( + dicom_dir_template, [f], + heuristic, outdir, session, subjs, grouping=grouping) + print(f) + for study_session, sequences in study_sessions.items(): + suf = '' + if heuristic_ls: + suf += heuristic_ls(study_session, sequences) + print( + "\t%s %d sequences%s" + % (str(study_session), len(sequences), suf) + ) + elif args.command == 'populate-templates': + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + for f in files_opt: + populate_bids_templates( + f, + getattr(heuristic, 'DEFAULT_FIELDS', {}) + ) + elif args.command == 'sanitize-jsons': + tuneup_bids_json_files(files_opt) + else: + raise ValueError("Unknown command %s", args.command) + return + + # + # Load heuristic -- better do it asap to make sure it loads correctly + # + heuristic = load_heuristic(os.path.realpath(args.heuristic_file)) + # TODO: Move into a function! + study_sessions = get_study_sessions( + dicom_dir_template, files_opt, + heuristic, outdir, session, subjs, + grouping=grouping) + # extract tarballs, and replace their entries with expanded lists of files + # TODO: we might need to sort so sessions are ordered??? + lgr.info("Need to process %d study sessions", len(study_sessions)) + + # + # processed_studydirs = set() + + for (locator, session, sid), files_or_seqinfo in study_sessions.items(): + + if not len(files_or_seqinfo): + raise ValueError("nothing to process?") + # that is how life is ATM :-/ since we don't do sorting if subj + # template is provided + if isinstance(files_or_seqinfo, dict): + assert(isinstance(list(files_or_seqinfo.keys())[0], SeqInfo)) + dicoms = None + seqinfo = files_or_seqinfo + else: + dicoms = files_or_seqinfo + seqinfo = None + + if locator == 'unknown': + lgr.warning("Skipping unknown locator dataset") + continue + + if args.queue: + if seqinfo and not dicoms: + # flatten them all and provide into batching, which again + # would group them... 
heh + dicoms = sum(seqinfo.values(), []) + # so + raise NotImplementedError( + "we already groupped them so need to add a switch to avoid " + "any groupping, so no outdir prefix doubled etc" + ) + # TODO This needs to be updated to better scale with additional args + progname = os.path.abspath(inspect.getfile(inspect.currentframe())) + convertcmd = ' '.join(['python', progname, + '-o', study_outdir, + '-f', heuristic.filename, + '-s', sid, + '--anon-cmd', args.anon_cmd, + '-c', args.converter]) + if session: + convertcmd += " --ses '%s'" % session + if args.with_prov: + convertcmd += " --with-prov" + if args.bids: + convertcmd += " --bids" + convertcmd += ["'%s'" % f for f in dicoms] + + script_file = 'dicom-%s.sh' % sid + with open(script_file, 'wt') as fp: + fp.writelines(['#!/bin/bash\n', convertcmd]) + outcmd = 'sbatch -J dicom-%s -p %s -N1 -c2 --mem=20G %s' \ + % (sid, args.queue, script_file) + os.system(outcmd) + continue + + anon_sid = get_annonimized_sid(sid, args.anon_cmd) + + study_outdir = opj(outdir, locator or '') + + anon_outdir = args.conv_outdir or outdir + anon_study_outdir = opj(anon_outdir, locator or '') + + # TODO: --datalad cmdline option, which would take care about initiating + # the outdir -> study_outdir datasets if not yet there + if args.datalad: + datalad_msg_suf = ' %s' % anon_sid + if session: + datalad_msg_suf += ", session %s" % session + if seqinfo: + datalad_msg_suf += ", %d sequences" % len(seqinfo) + datalad_msg_suf += ", %d dicoms" % ( + len(sum(seqinfo.values(), [])) if seqinfo else len(dicoms) + ) + from datalad.api import Dataset + ds = Dataset(anon_study_outdir) + if not exists(anon_outdir) or not ds.is_installed(): + add_to_datalad( + anon_outdir, anon_study_outdir, + msg="Preparing for %s" % datalad_msg_suf, + bids=args.bids) + lgr.info("PROCESSING STARTS: {0}".format( + str(dict(subject=sid, outdir=study_outdir, session=session)))) + convert_dicoms( + sid, + dicoms, + study_outdir, + heuristic=heuristic, + converter=args.converter, + anon_sid=anon_sid, + anon_outdir=anon_study_outdir, + with_prov=args.with_prov, + ses=session, + is_bids=args.bids, + seqinfo=seqinfo, + min_meta=args.minmeta) + lgr.info("PROCESSING DONE: {0}".format( + str(dict(subject=sid, outdir=study_outdir, session=session)))) + + if args.datalad: + msg = "Converted subject %s" % datalad_msg_suf + # TODO: whenever propagate to supers work -- do just + # ds.save(msg=msg) + # also in batch mode might fail since we have no locking ATM + # and theoretically no need actually to save entire study + # we just need that + add_to_datalad(outdir, study_outdir, msg=msg, bids=args.bids) + + # if args.bids: + # # Let's populate BIDS templates for folks to take care about + # for study_outdir in processed_studydirs: + # populate_bids_templates(study_outdir) + # + # # TODO: record_collection of the sid/session although that information + # # is pretty much present in .heudiconv/SUBJECT/info so we could just poke there + + tempdirs.cleanup() + + +def get_parser(): docstr = '\n'.join((__doc__, -""" - Example: + """ + Example: - heudiconv -d rawdata/%s -o . -f heuristic.py -s s1 s2 -s3 -""")) + heudiconv -d rawdata/{subject} -o . 
-f + heuristic.py -s s1 s2 + s3 + """)) parser = argparse.ArgumentParser(description=docstr) parser.add_argument('--version', action='version', version=__version__) parser.add_argument('-d', '--dicom_dir_template', dest='dicom_dir_template', - required=True, + required=False, help='''location of dicomdir that can be indexed with subject id {subject} and session {session}. Tarballs (can be compressed) are supported in addition to directory. All matching tarballs for a subject are extracted and their content processed in a single pass''') - parser.add_argument('-s', '--subjects', dest='subjs', required=True, - type=str, nargs='+', help='list of subjects') + parser.add_argument('-s', '--subjects', dest='subjs', + type=str, nargs='*', + help='list of subjects. If not provided, DICOMS would ' + 'first be "sorted" and subject IDs deduced by the ' + 'heuristic') parser.add_argument('-c', '--converter', dest='converter', - required=True, - choices=('dcm2niix', - 'none'), + default='dcm2niix', + choices=('dcm2niix', 'none'), help='''tool to use for dicom conversion. Setting to "none" disables the actual conversion step -- useful for testing heuristics.''') - parser.add_argument('-o', '--outdir', dest='outputdir', + parser.add_argument('-o', '--outdir', dest='outdir', default=os.getcwd(), help='''output directory for conversion setup (for further customization and future reference. This directory will refer to non-anonymized subject IDs''') - parser.add_argument('-a', '--conv-outdir', dest='conv_outputdir', + parser.add_argument('-a', '--conv-outdir', dest='conv_outdir', default=None, help='''output directory for converted files. By default this is identical to --outdir. This option is @@ -863,39 +2002,77 @@ s3 DICOMs to anonymmized IDs. Such command must take a single argument and return a single anonymized ID. Also see --conv-outdir''') - parser.add_argument('-f', '--heuristic', dest='heuristic_file', required=True, + parser.add_argument('-f', '--heuristic', dest='heuristic_file', + required=True, help='python script containing heuristic') parser.add_argument('-q', '--queue', dest='queue', default=None, help='''select batch system to submit jobs to instead of running the conversion serially''') parser.add_argument('-p', '--with-prov', dest='with_prov', action='store_true', - help='''Store additional provenance information. Requires python-rdflib.''') + help='''Store additional provenance information. + Requires python-rdflib.''') parser.add_argument('-ss', '--ses', dest='session', default=None, - help='''session for longitudinal studies, default is none''') + help='''session for longitudinal study_sessions, default is none''') parser.add_argument('-b', '--bids', dest='bids', action='store_true', help='''flag for output into BIDS structure''') + parser.add_argument('--overwrite', dest='overwrite', action='store_true', + help='''flag to allow overwrite existing files''') + parser.add_argument('--datalad', dest='datalad', action='store_true', + help='''Store the entire collection as DataLad dataset(s). + Small files will be committed directly to git, while large to annex. + New version (6) of annex repositories will be used in a "thin" + mode so it would look to mortals as just any other regular directory + (i.e. no symlinks to under .git/annex). 
For now just for BIDS mode.''') parser.add_argument('--dbg', action='store_true', dest='debug', help="do not catch exceptions and show exception traceback") - parser.add_argument('--sbargs', dest='sbatch_args', default='-N1 -c2 --mem=20G -t 01:00:00', - help="Additional sbatch arguments if running with queue arg") + parser.add_argument('--command', dest='command', + choices=('treat-json', 'ls', 'populate-templates'), + help='''custom actions to be performed on provided files instead of + regular operation.''') + parser.add_argument('-g', '--grouping', + default='studyUID', + choices=('studyUID', 'accession_number'), + help='''How to group dicoms (default: by studyUID)''') + parser.add_argument( + 'files', + nargs='*', + help="files (tarballs, dicoms) or directories containing files to " + "process. Specify one of the --dicom_dir_template or files " + "not both") parser.add_argument('--minmeta', dest='minmeta', action='store_true', help="Exclude dcmstack's meta information in sidecar jsons") - args = parser.parse_args(args) + return parser + + +def main(argv=None): + """Given a list of command line arguments, parse them and pass into _main + """ + parser = get_parser() + args = parser.parse_args(argv) + + # TODO: assign distribution-restrictions=sensitive + + # TODO(privat): entirety of .heudiconv/ should get under annex and be marked + # as sensitive since date incorporated in the path might leak + + # TODO: deprecate dicom_dir_template in favor of --files-templated or + # smth like that which could take {subject} {session} ... and process + # files argument(s) correspondingly before passing into group_dicoms_into_seqinfos + + if args.files and args.dicom_dir_template: + raise ValueError("Specify files or dicom_dir_template, not both") if args.debug: setup_exceptionhook() - convert_dicoms(args.subjs, os.path.abspath(args.dicom_dir_template), - os.path.abspath(args.outputdir), - heuristic_file=os.path.realpath(args.heuristic_file), - converter=args.converter, - queue=args.queue, - anon_sid_cmd=args.anon_cmd, - anon_outdir=args.conv_outputdir, - with_prov=args.with_prov, - ses=args.session, - is_bids=args.bids, - sbatch_args=args.sbatch_args, - min_meta=args.minmeta) + + orig_global_options = global_options.copy() + try: + global_options['overwrite'] = args.overwrite + return _main(args) + finally: + # reset back + for k, v in orig_global_options.items(): + global_options[k] = v if __name__ == '__main__': diff --git a/bin/heudiconv_monitor b/bin/heudiconv_monitor new file mode 100755 index 00000000..2faec825 --- /dev/null +++ b/bin/heudiconv_monitor @@ -0,0 +1,127 @@ +#!/usr/bin/env python +import logging +import os +import re +import subprocess +import time +import json + +from collections import deque, OrderedDict +from datetime import date +import inotify.adapters +from inotify.constants import IN_MODIFY, IN_CREATE, IN_ISDIR +from py.path import local as localpath +from tinydb import TinyDB + +_DEFAULT_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' +_LOGGER = logging.getLogger(__name__) + +MASK = (IN_MODIFY | IN_CREATE) +MASK_NEWDIR = (IN_CREATE | IN_ISDIR) +WAIT_TIME = 86400 # in seconds + + +#def _configure_logging(): +_LOGGER.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +formatter = logging.Formatter(_DEFAULT_LOG_FORMAT) +ch.setFormatter(formatter) +_LOGGER.addHandler(ch) + + +def run_heudiconv(cmd): + info_dict = dict() + proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return_code = proc.wait() + if return_code == 
0: + _LOGGER.info("Done running {0}".format(cmd)) + info_dict['success'] = 1 + else: + _LOGGER.error("{0} failed".format(cmd)) + info_dict['success'] = 0 + # get info on what we run + stdout = proc.communicate()[0].decode('utf-8') + match = re.match('INFO: PROCESSING STARTS: (.*)', stdout) + info_dict_ = json.loads(match.group(1) if match else '{}') + info_dict.update(info_dict_) + return stdout, info_dict + + +def process(paths2process, db, wait=WAIT_TIME, logdir='log'): + cmd = 'ls -l {0}' + #if paths2process and time.time() - os.path.getmtime(paths2process[0]) > WAIT_TIME: + processed = [] + for path, mod_time in paths2process.items(): + if time.time() - mod_time > wait: + #process_me = paths2process.popleft().decode('utf-8') + process_me = path + cmd_ = cmd.format(process_me) + process_dict = {'input_path': process_me, 'accession_number': os.path.basename(process_me)} + print("Time to process {0}".format(process_me)) + stdout, run_dict = run_heudiconv(cmd_) + process_dict.update(run_dict) + db.insert(process_dict) + # save log + logdir = localpath(logdir) + log = logdir.join(process_dict['accession_number'] + '.log') + log.write(stdout) + # if we processed it, or it failed, we need to remove it to avoid running it again + processed.append(path) + for processed_path in processed: + del paths2process[processed_path] + + +def monitor(topdir='/tmp/new_dir', check_ptrn='/20../../..', db=None, wait=WAIT_TIME, logdir='log'): + # make logdir if not existant + try: + os.makedirs(parsed.logdir) + except OSError: + pass + #paths2process = deque() + paths2process = OrderedDict() + # watch only today's folder + path_re = re.compile("(%s%s)/?$" % (topdir, check_ptrn)) + i = inotify.adapters.InotifyTree(topdir.encode())#, mask=MASK) + for event in i.event_gen(): + if event is not None: + (header, type_names, watch_path, filename) = event + _LOGGER.info("WD=(%d) MASK=(%d) COOKIE=(%d) LEN=(%d) MASK->NAMES=%s " + "WATCH-PATH=[%s] FILENAME=[%s]", + header.wd, header.mask, header.cookie, header.len, type_names, + watch_path.decode('utf-8'), filename.decode('utf-8')) + if header.mask == MASK_NEWDIR and path_re.match(watch_path.decode('utf-8')): + # we got our directory, now let's do something on it + newpath2process = os.path.join(watch_path, filename).decode('utf-8') + #paths2process.append(newpath2process) + # update time + paths2process[newpath2process] = time.time() + print(newpath2process, time.time()) + # check if we need to update the time + for path in paths2process.keys(): + if path in watch_path.decode('utf-8'): + paths2process[path] = time.time() + print("Updating {0}: {1}".format(path, paths2process[path])) + + # check if there's anything to process + process(paths2process, db, wait=wait, logdir=logdir) + + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(prog='monitor.py', description='Small monitoring script to detect new directories and process them', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('path', help='Which directory to monitor') + parser.add_argument('--check_ptrn', '-p', help='regexp pattern for which subdirectories to check', default='/20../../..') + parser.add_argument('--database', '-d', help='database location', default='database.json') + parser.add_argument('--wait_time', '-w', help='After how long should we start processing datasets? 
(in seconds)', default=86400, type=float) + parser.add_argument('--logdir', '-l', help='Where should we save the logs?', default='log') + + + return parser.parse_args() + + +if __name__ == '__main__': + parsed = parse_args() + print('Got {0}'.format(parsed)) + # open database + db = TinyDB(parsed.database, default_table='heudiconv') + monitor(parsed.path, parsed.check_ptrn, db, wait=parsed.wait_time, logdir=parsed.logdir) diff --git a/bin/test_heudiconv_monitor.py b/bin/test_heudiconv_monitor.py new file mode 100644 index 00000000..78c0285d --- /dev/null +++ b/bin/test_heudiconv_monitor.py @@ -0,0 +1,98 @@ +from collections import namedtuple +import os +import pytest +from mock import patch +from monitor import monitor, process, run_heudiconv, MASK_NEWDIR +from os.path import exists +from tinydb import TinyDB, Query +from subprocess import CalledProcessError + + +class MockInotifyTree(object): + def __init__(self, events): + self.events = iter(events) + def event_gen(self): + for e in self.events: + yield e + def __call__(self, topdir): + return self + + +class MockTime(object): + def __init__(self, time): + self.time = time + def __call__(self): + return self.time + + +Header = namedtuple('header', ['wd', 'mask', 'cookie', 'len']) +header = Header(5, MASK_NEWDIR, 5, 5) +watch_path = b'WATCHME' +filename = b'FILE' +type_names = b'TYPE' + +path2 = watch_path + b'/' + filename + b'/subpath' + +my_events = [(header, type_names, watch_path, filename), + (header, type_names, path2, b'')] + +@patch('inotify.adapters.InotifyTree', MockInotifyTree(my_events)) +@patch('time.time', MockTime(42)) +def test_monitor(capsys): + monitor(watch_path.decode(), check_ptrn='') + out, err = capsys.readouterr() + desired_output = '{0}/{1} {2}\n'.format(watch_path.decode(), filename.decode(), 42) + desired_output += 'Updating {0}/{1}: {2}\n'.format(watch_path.decode(), filename.decode(), 42) + assert out == desired_output + + +@patch('time.time', MockTime(42)) +@pytest.mark.parametrize("side_effect,success", [ + (None, 1), + (CalledProcessError('mycmd', 1), 0) +]) +def test_process(tmpdir, capsys, side_effect, success): + db_fn = tmpdir.join('database.json') + log_dir = tmpdir.mkdir('log') + db = TinyDB(db_fn.strpath) + process_me = '/my/path/A12345' + accession_number = os.path.basename(process_me) + paths2process = {process_me: 42} + with patch('subprocess.Popen') as mocked_popen: + stdout = b"INFO: PROCESSING STARTS: {'just': 'a test'}" + mocked_popen_instance = mocked_popen.return_value + mocked_popen_instance.side_effect = side_effect + mocked_popen_instance.communicate.return_value = (stdout, ) + # set return value for wait + mocked_popen_instance.wait.return_value = 1 - success + # mock also communicate to get the supposed stdout + # mocked_popen.communicate = lambda: (b"INFO: PROCESSING STARTS: {'just': 'a test'}", ) + process(paths2process, db, wait=-30, logdir=log_dir.strpath) + out, err = capsys.readouterr() + log_fn = log_dir.join(accession_number + '.log') + + mocked_popen.assert_called_once() + assert log_fn.check() + assert log_fn.read() == stdout.decode('utf-8') + assert db_fn.check() + # dictionary should be empty + assert not paths2process + assert out == 'Time to process {0}\n'.format(process_me) + + # check what we have in the database + Path = Query() + query = db.get(Path.input_path == process_me) + assert len(db) == 1 + assert query + assert query['success'] == success + assert query['accession_number'] == os.path.basename(process_me) + assert query['just'] == 'a test' + + +def 
test_run_heudiconv(): + # echo should succeed always + mydict = {'key1': 'value1', 'key2': 'value2', 'success': 1} + cmd = "echo INFO: PROCESSING STARTS: {0}".format(str(mydict)) + stdout, info_dict = run_heudiconv(cmd) + assert info_dict == mydict + assert "echo " + stdout.strip() == cmd diff --git a/custom/dbic/README b/custom/dbic/README new file mode 100644 index 00000000..844e7446 --- /dev/null +++ b/custom/dbic/README @@ -0,0 +1,3 @@ +Scripts and configurations used alongside with heudiconv setup at DBIC +(Dartmouth Brain Imaging Center). Might migrate to an independent repository +eventually but for now placed here with hope they might come useful for some. diff --git a/custom/dbic/singularity-env.def b/custom/dbic/singularity-env.def new file mode 100644 index 00000000..a7722e4a --- /dev/null +++ b/custom/dbic/singularity-env.def @@ -0,0 +1,64 @@ +# Copyright (c) 2015-2016, Gregory M. Kurtzer. All rights reserved. +# +# Changes for NeuroDebian/DBIC setup are Copyright (c) 2017 Yaroslav Halchenko. +# +# The purpose of the environment is to provide a complete suite for running +# heudiconv on the INBOX server to provide conversion into BIDS layout. +# ATM it does not ship heudiconv itself which would be accessed directly +# from the main drive for now. +# +# "Singularity" Copyright (c) 2016, The Regents of the University of California, +# through Lawrence Berkeley National Laboratory (subject to receipt of any +# required approvals from the U.S. Dept. of Energy). All rights reserved. + +# +# Notes: +# - Due to https://github.com/singularityware/singularity/issues/471 +# bootstrapping leads to non-usable/non-removable-without-reboot +# image due to some rogue run away processes. +# This line could help to kill them but should be used with caution +# since could kill other unrelated processes +# +# grep -l loop /proc/*/mountinfo | sed -e 's,/proc/\(.*\)/.*,\1,g' | while read pid; do sudo kill $pid; done + +BootStrap: debootstrap +#OSVersion: stable +# needs nipype 0.12.1 but that one didn't build for stable since needs python-prov... 
+# so trying stretch +OSVersion: stretch +MirrorURL: http://ftp.us.debian.org/debian/ + +# so if image is executed we just enter the environment +%runscript + echo "Welcome to the DBIC BIDS environment" + /bin/bash + + +%post + echo "Configuring the environment" + apt-get update + apt-get -y install eatmydata + eatmydata apt-get -y install vim wget strace time ncdu gnupg curl procps + wget -q -O/tmp/nd-configurerepo https://raw.githubusercontent.com/neurodebian/neurodebian/4d26c8f30433145009aa3f74516da12f560a5a13/tools/nd-configurerepo + bash /tmp/nd-configurerepo + chmod a+r -R /etc/apt + eatmydata apt-get -y install datalad python-nipype virtualenv dcm2niix python-dcmstack python-configparser python-funcsigs python-pytest dcmtk + + # for bids-validator + curl -sL https://deb.nodesource.com/setup_6.x | bash - && \ + eatmydata apt-get install -y nodejs + npm install -g bids-validator@0.20.0 + chmod a+rX -R /usr/lib/node_modules/ + + chmod a+rX -R /etc/apt/sources.list.d + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + apt-get clean + + # and wipe out apt lists since not to be used RW for further tuning + # find /var/lib/apt/lists/ -type f -delete + # /usr/bin/find /var/lib/apt/lists/ -type f -name \*Packages\* -o -name \*Contents\* + # complicates later interrogation - thus disabled + + # Create some bind mount directories present on rolando + mkdir -p /afs /inbox + chmod a+rX /afs /inbox diff --git a/dev-requirements.txt b/dev-requirements.txt index 78e5af36..26b77f68 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,3 +1,2 @@ -r requirements.txt -six -nose +pytest diff --git a/heuristics/banda-bids.py b/heuristics/banda-bids.py index 92d8391a..946c33b7 100644 --- a/heuristics/banda-bids.py +++ b/heuristics/banda-bids.py @@ -38,9 +38,9 @@ def infotodict(seqinfo): for idx, s in enumerate(seqinfo): # T1 and T2 scans if (s.dim3 == 208) and (s.dim4 == 1) and ('T1w' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] if (s.dim3 == 208) and ('T2w' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] # diffusion scans if ('dMRI_dir9' in s.protocol_name): key = None @@ -49,7 +49,7 @@ def infotodict(seqinfo): elif (s.dim4 == 1) and ('SBRef' in s.series_description): key = dwi_sbref if key: - info[key].append({'item': s.series_number}) + info[key].append({'item': s.series_id}) # functional scans if ('fMRI' in s.protocol_name): tasktype = s.protocol_name.split('fMRI')[1].split('_')[1] @@ -65,10 +65,10 @@ def infotodict(seqinfo): if 'conflict' in tasktype: key = conflict_sbref if 'gambling' in tasktype: key = gamble_sbref if key: - info[key].append({'item': s.series_number}) + info[key].append({'item': s.series_id}) if (s.dim4 == 3) and ('SpinEchoFieldMap' in s.protocol_name): dirtype = s.protocol_name.split('_')[-1] - info[fmap].append({'item': s.series_number, 'dir': dirtype}) + info[fmap].append({'item': s.series_id, 'dir': dirtype}) # You can even put checks in place for your protocol msg = [] diff --git a/heuristics/bids_with_ses.py b/heuristics/bids_with_ses.py index 1972b51b..e8dc84d8 100644 --- a/heuristics/bids_with_ses.py +++ b/heuristics/bids_with_ses.py @@ -37,38 +37,38 @@ def infotodict(seqinfo): last_run = len(seqinfo) for s in seqinfo: if (s.dim3 == 176 or s.dim3 == 352) and (s.dim4 == 1) and ('MEMPRAGE' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] elif (s.dim4 == 1) and ('MEMPRAGE' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] elif (s.dim3 == 176 or 
s.dim3 == 352) and (s.dim4 == 1) and ('T2_SPACE' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] elif ('field_mapping_diffusion' in s.protocol_name): - info[fm_diff].append([s.series_number]) + info[fm_diff].append([s.series_id]) elif (s.dim4 >= 70) and ('DIFFUSION_HighRes_AP' in s.protocol_name): - info[dwi_ap].append([s.series_number]) + info[dwi_ap].append([s.series_id]) elif ('DIFFUSION_HighRes_PA' in s.protocol_name): - info[dwi_pa].append([s.series_number]) + info[dwi_pa].append([s.series_id]) elif ('field_mapping_resting' in s.protocol_name): - info[fm_rest].append([s.series_number]) + info[fm_rest].append([s.series_id]) elif (s.dim4 == 144) and ('resting' in s.protocol_name): if not s.is_motion_corrected: - info[rs].append([(s.series_number)]) + info[rs].append([(s.series_id)]) elif (s.dim4 == 183 or s.dim4 == 366) and ('localizer' in s.protocol_name): if not s.is_motion_corrected: - info[boldt1].append([s.series_number]) + info[boldt1].append([s.series_id]) elif (s.dim4 == 227 or s.dim4 == 454) and ('transfer1' in s.protocol_name): if not s.is_motion_corrected: - info[boldt2].append([s.series_number]) + info[boldt2].append([s.series_id]) elif (s.dim4 == 227 or s.dim4 == 454) and ('transfer2' in s.protocol_name): if not s.is_motion_corrected: - info[boldt3].append([s.series_number]) + info[boldt3].append([s.series_id]) elif (('run1' in s.protocol_name) or ('run6' in s.protocol_name)) and (s.dim4 == 159): if not s.is_motion_corrected: - info[nofb_task].append([s.series_number]) + info[nofb_task].append([s.series_id]) elif (('run2' in s.protocol_name) or ('run3' in s.protocol_name) or ('run4' in s.protocol_name) or ('run5' in s.protocol_name)) and (s.dim4 == 159): if not s.is_motion_corrected: - info[fb_task].append([s.series_number]) + info[fb_task].append([s.series_id]) else: pass return info diff --git a/heuristics/cmrr_heuristic.py b/heuristics/cmrr_heuristic.py index 79c370c1..449b5473 100644 --- a/heuristics/cmrr_heuristic.py +++ b/heuristics/cmrr_heuristic.py @@ -31,41 +31,41 @@ def infotodict(seqinfo): for idx, s in enumerate(seqinfo): if (s.dim3 == 208) and (s.dim4 == 1) and ('T1w' in s.protocol_name): - info[t1] = [s.series_number] + info[t1] = [s.series_id] if (s.dim3 == 208) and ('T2w' in s.protocol_name): - info[t2] = [s.series_number] + info[t2] = [s.series_id] if (s.dim4 >= 99) and (('dMRI_dir98_AP' in s.protocol_name) or ('dMRI_dir99_AP' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] + 'AP' - info[dwi].append({'item': s.series_number, 'acq': acq}) + info[dwi].append({'item': s.series_id, 'acq': acq}) if (s.dim4 >= 99) and (('dMRI_dir98_PA' in s.protocol_name) or ('dMRI_dir99_PA' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] + 'PA' - info[dwi].append({'item': s.series_number, 'acq': acq}) + info[dwi].append({'item': s.series_id, 'acq': acq}) if (s.dim4 == 1) and (('dMRI_dir98_AP' in s.protocol_name) or ('dMRI_dir99_AP' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] - info[fmap_dwi].append({'item': s.series_number, 'dir': 'AP', 'acq': acq}) + info[fmap_dwi].append({'item': s.series_id, 'dir': 'AP', 'acq': acq}) if (s.dim4 == 1) and (('dMRI_dir98_PA' in s.protocol_name) or ('dMRI_dir99_PA' in s.protocol_name)): acq = s.protocol_name.split('dMRI_')[1].split('_')[0] - info[fmap_dwi].append({'item': s.series_number, 'dir': 'PA', 'acq': acq}) + info[fmap_dwi].append({'item': s.series_id, 'dir': 'PA', 'acq': acq}) if (s.dim4 == 420) and ('rfMRI_REST_AP' in 
s.protocol_name): - info[rest].append({'item': s.series_number, 'acq': 'AP'}) + info[rest].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 420) and ('rfMRI_REST_PA' in s.protocol_name): - info[rest].append({'item': s.series_number, 'acq': 'PA'}) + info[rest].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 1) and ('rfMRI_REST_AP' in s.protocol_name): if seqinfo[idx + 1][9] != 420: continue - info[fmap_rest].append({'item': s.series_number, 'dir': 'AP', 'acq': ''}) + info[fmap_rest].append({'item': s.series_id, 'dir': 'AP', 'acq': ''}) if (s.dim4 == 1) and ('rfMRI_REST_PA' in s.protocol_name): - info[fmap_rest].append({'item': s.series_number, 'dir': 'PA', 'acq': ''}) + info[fmap_rest].append({'item': s.series_id, 'dir': 'PA', 'acq': ''}) if (s.dim4 == 346) and ('tfMRI_faceMatching_AP' in s.protocol_name): - info[face].append({'item': s.series_number, 'acq': 'AP'}) + info[face].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 346) and ('tfMRI_faceMatching_PA' in s.protocol_name): - info[face].append({'item': s.series_number, 'acq': 'PA'}) + info[face].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 288) and ('tfMRI_conflict_AP' in s.protocol_name): - info[conflict].append({'item': s.series_number, 'acq': 'AP'}) + info[conflict].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 288) and ('tfMRI_conflict_PA' in s.protocol_name): - info[conflict].append({'item': s.series_number, 'acq': 'PA'}) + info[conflict].append({'item': s.series_id, 'acq': 'PA'}) if (s.dim4 == 223) and ('tfMRI_gambling_AP' in (s.protocol_name)): - info[gamble].append({'item': s.series_number, 'acq': 'AP'}) + info[gamble].append({'item': s.series_id, 'acq': 'AP'}) if (s.dim4 == 223) and ('tfMRI_gambling_PA' in s.protocol_name): - info[gamble].append({'item': s.series_number, 'acq': 'PA'}) + info[gamble].append({'item': s.series_id, 'acq': 'PA'}) return info diff --git a/heuristics/convertall.py b/heuristics/convertall.py index 0b264a81..eb315e49 100644 --- a/heuristics/convertall.py +++ b/heuristics/convertall.py @@ -28,7 +28,7 @@ def infotodict(seqinfo): * total_files_till_now * example_dcm_file - * series_number + * series_id * dcm_dir_name * unspecified2 * unspecified3 @@ -48,5 +48,5 @@ def infotodict(seqinfo): * image_type """ - info[data].append(s.series_number) + info[data].append(s.series_id) return info diff --git a/heuristics/dbic_bids.py b/heuristics/dbic_bids.py index 2262041d..046f125b 100644 --- a/heuristics/dbic_bids.py +++ b/heuristics/dbic_bids.py @@ -1,21 +1,284 @@ import os - -from collections import defaultdict +import re +from collections import OrderedDict +import hashlib +from glob import glob import logging lgr = logging.getLogger('heudiconv') +# dictionary from accession-number to runs that need to be marked as bad +# NOTE: even if filename has number that is 0-padded, internally no padding +# is done +fix_accession2run = { + 'A000005': ['^1-'], + 'A000035': ['^8-', '^9-'], + 'A000067': ['^9-'], + 'A000072': ['^5-'], + 'A000081': ['^5-'], + 'A000082': ['^5-'], + 'A000088': ['^9-'], + 'A000090': ['^5-'], + 'A000127': ['^21-'], + 'A000130': ['^15-'], + 'A000137': ['^9-', '^11-'], + 'A000297': ['^12-'], + 'A000326': ['^15-'], + 'A000376': ['^15-'], + 'A000384': ['^8-', '^11-'], + 'A000467': ['^15-'], + 'A000490': ['^15-'], + 'A000511': ['^15-'], +} + +# dictionary containing fixes, keys are md5sum of study_description from +# dicoms, in the form of PI-Experimenter^protocolname +# values are list of tuples in the form (regex_pattern, substitution) +protocols2fix 
= { + # QA + '43b67d9139e8c7274578b7451ab21123': + [ + #('anat-scout.*', 'anat-scout_ses-{date}'), + # do not change it so we retain _ses-{date} + #('anat-scout.*', 'anat-scout'), + ('BOLD_p2_s4_3\.5mm', 'func_task-rest_acq-p2-s4-3.5mm'), + ('BOLD_p2_s4', 'func_task-rest_acq-p2-s4'), + ('BOLD_p2_noprescannormalize', 'func-bold_task-rest_acq-p2noprescannormalize'), + ('BOLD_p2', 'func-bold_task-rest_acq-p2'), + ('BOLD_', 'func_task-rest'), + ('DTI_30_p2_s4_3\.5mm', 'dwi_acq-DTI-30-p2-s4-3.5mm'), + ('DTI_30_p2_s4', 'dwi_acq-DTI-30-p2-s4'), + ('DTI_30_p2', 'dwi_acq-DTI-30-p2'), + ('_p2_s4_3\.5mm', '_acq-p2-s4-3.5mm'), + ('_p2_s4', '_acq-p2-s4'), + ('_p2', '_acq-p2'), + ], + '9d148e2a05f782273f6343507733309d': + [('anat_', 'anat-'), + ('run-life[0-9]', 'run+_task-life'), + ('scout_run\+', 'scout'), + ('T2w', 'T2w_run+'), + # substitutions for old protocol names + ('AAHead_Scout_32ch-head-coil', 'anat-scout'), + ('MPRAGE', 'anat-T1w_acq-MPRAGE_run+'), + ('gre_field_mapping_2mm', 'fmap_run+_acq-2mm'), + ('gre_field_mapping_3mm', 'fmap_run+_acq-3mm'), + ('epi_bold_sms_p2_s4_2mm_life1_748', + 'func_run+_task-life_acq-2mm748'), + ('epi_bold_sms_p2_s4_2mm_life2_692', + 'func_run+_task-life_acq-2mm692'), + ('epi_bold_sms_p2_s4_2mm_life3_754', + 'func_run+_task-life_acq-2mm754'), + ('epi_bold_sms_p2_s4_2mm_life4_824', + 'func_run+_task-life_acq-2mm824'), + ('epi_bold_p2_3mm_nofs_life1_374', + 'func_run+_task-life_acq-3mmnofs374'), + ('epi_bold_p2_3mm_nofs_life2_346', + 'func_run+_task-life_acq-3mmnofs346'), + ('epi_bold_p2_3mm_nofs_life3_377', + 'func_run+_task-life_acq-3mmnofs377'), + ('epi_bold_p2_3mm_nofs_life4_412', + 'func_run+_task-life_acq-3mmnofs412'), + ('t2_space_sag_p4_iso', 'anat-T2w_run+'), + ('gre_field_mapping_2.4mm', 'fmap_run+_acq-2.4mm'), + ('rest_p2_sms4_2.4mm_64sl_1000tr_32te_600dyn', + 'func_run+_task-rest_acq-2.4mm64sl1000tr32te600dyn'), + ('DTI_30', 'dwi_run+_acq-30'), + ('t1_space_sag_p2_iso', 'anat-T1w_acq-060mm_run+')], + '76b36c80231b0afaf509e2d52046e964': + [('fmap_run\+_2mm', 'fmap_run+_acq-2mm')], + 'c6d8fbccc72990bee61d28e73b2618a4': + [('run=', 'run+')], + 'a751cc977f1e354fcafcb0ea2de123bd': + [ + ('_unlabeled', '_task-unlabeled'), + ('_mSense', '_acq-mSense'), + ('_p1_sms4_2.5mm', '_acq-p1-sms4-2.5mm'), + ('_p1_sms4_3mm', '_acq-p1-sms4-3mm'), + ], + 'd160113cf5ea8c5d0cbbbe14ef625e76': + [ + ('_run0', '_run-0'), + ], + '1bd62e10672fe0b435a9aa8d75b45425': + [ + # need to add incrementing session -- study should have 2 + ('scout_run\+$', 'scout_run+_ses+'), + ], + 'da218a66de902adb3ad9407d514e3639': + [ + # those sequences renamed later to include DTI- in their acq- + # so fot consistency + ('hardi_64', 'dwi_acq-DTI-hardi64'), + ('acq-hardi', 'acq-DTI-hardi'), + ], +} +keys2replace = ['protocol_name', 'series_description'] + +# list containing StudyInstanceUID to skip -- hopefully doesn't happen too often +dicoms2skip = [ + '1.3.12.2.1107.5.2.43.66112.30000016110117002435700000001', + '1.3.12.2.1107.5.2.43.66112.30000016102813152550600000004', # double scout +] + +DEFAULT_FIELDS = { + # Let it just be in each json file extracted + #'Manufacturer': "Siemens", + #'ManufacturersModelName': "Prisma", + "Acknowledgements": + "We thank Terry Sacket and the rest of the DBIC (Dartmouth Brain Imaging " + "Center) personnel for assistance in data collection, and " + "Yaroslav Halchenko and Matteo Visconti for preparing BIDS dataset. 
" + "TODO: more", +} + + +def _delete_chars(from_str, deletechars): + """ Delete characters from string allowing for Python 2 / 3 difference + """ + try: + return from_str.translate(None, deletechars) + except TypeError: + return from_str.translate(str.maketrans('', '', deletechars)) + -def create_key(subdir, file_suffix, outtype=('nii.gz',), - annotation_classes=None): +def filter_dicom(dcmdata): + """Return True if a DICOM dataset should be filtered out, else False""" + return True if dcmdata.StudyInstanceUID in dicoms2skip else False + + +def filter_files(fn): + """Return True if a file should be kept, else False. + We're using it to filter out files that do not start with a number.""" + + # do not check for these accession numbers because they haven't been + # recopied with the initial number + donotfilter = ['A000012', 'A000013', 'A000020', 'A000041'] + + split = os.path.split(fn) + split2 = os.path.split(split[0]) + sequence_dir = split2[1] + split3 = os.path.split(split2[0]) + accession_number = split3[1] + return True + if accession_number == 'A000043': + # crazy one that got copied for some runs but not for others, + # so we are going to discard those that got copied and let heudiconv + # figure out the rest + return False if re.match('^[0-9]+-', sequence_dir) else True + elif accession_number == 'unknown': + # this one had some stuff without study description, filter stuff before + # collecting info, so it doesn't crash completely + return False if re.match('^[34][07-9]-sn', sequence_dir) else True + elif accession_number in donotfilter: + return True + elif accession_number.startswith('phantom-'): + # Accessions on phantoms, e.g. in dartmouth-phantoms/bids_test4-20161014 + return True + elif accession_number.startswith('heudiconvdcm'): + # we were given some tarball with dicoms which was extracted so we + # better obey + return True + else: + return True if re.match('^[0-9]+-', sequence_dir) else False + + +def create_key(subdir, file_suffix, outtype=('nii.gz', 'dicom'), + annotation_classes=None, prefix=''): if not subdir: raise ValueError('subdir must be a valid format string') # may be even add "performing physician" if defined?? 
- template = "{referring_physician_name}/{study_description}/{bids_subject_session_dir}/" \ - "%s/{bids_subject_session_prefix}_%s" % (subdir, file_suffix) + template = os.path.join( + prefix, + "{bids_subject_session_dir}", + subdir, + "{bids_subject_session_prefix}_%s" % file_suffix + ) return template, outtype, annotation_classes +def md5sum(string): + """Computes md5sum of as string""" + if not string: + return "" # not None so None was not compared to strings + m = hashlib.md5(string.encode()) + return m.hexdigest() + +def get_study_description(seqinfo): + # Centralized so we could fix/override + v = get_unique(seqinfo, 'study_description') + return v + +def get_study_hash(seqinfo): + # XXX: ad hoc hack + return md5sum(get_study_description(seqinfo)) + + +def fix_canceled_runs(seqinfo, accession2run=fix_accession2run): + """Function that adds cancelme_ to known bad runs which were forgotten + """ + accession_number = get_unique(seqinfo, 'accession_number') + if accession_number in accession2run: + lgr.info("Considering some runs possibly marked to be " + "canceled for accession %s", accession_number) + badruns = accession2run[accession_number] + badruns_pattern = '|'.join(badruns) + for i, s in enumerate(seqinfo): + if re.match(badruns_pattern, s.series_id): + lgr.info('Fixing bad run {0}'.format(s.series_id)) + fixedkwargs = dict() + for key in keys2replace: + fixedkwargs[key] = 'cancelme_' + getattr(s, key) + seqinfo[i] = s._replace(**fixedkwargs) + return seqinfo + + +def fix_dbic_protocol(seqinfo, keys=keys2replace, subsdict=protocols2fix): + """Ad-hoc fixup for existing protocols + """ + study_hash = get_study_hash(seqinfo) + + if study_hash not in subsdict: + raise ValueError("I don't know how to fix {0}".format(study_hash)) + + # need to replace both protocol_name series_description + substitutions = subsdict[study_hash] + for i, s in enumerate(seqinfo): + fixed_kwargs = dict() + for key in keys: + value = getattr(s, key) + # replace all I need to replace + for substring, replacement in substitutions: + value = re.sub(substring, replacement, value) + fixed_kwargs[key] = value + # namedtuples are immutable + seqinfo[i] = s._replace(**fixed_kwargs) + + return seqinfo + + +def fix_seqinfo(seqinfo): + """Just a helper on top of both fixers + """ + # add cancelme to known bad runs + seqinfo = fix_canceled_runs(seqinfo) + study_hash = get_study_hash(seqinfo) + if study_hash in protocols2fix: + lgr.info("Fixing up protocol for {0}".format(study_hash)) + seqinfo = fix_dbic_protocol(seqinfo) + return seqinfo + + +def ls(study_session, seqinfo): + """Additional ls output for a seqinfo""" + #assert len(sequences) <= 1 # expecting only a single study here + #seqinfo = sequences.keys()[0] + return ' study hash: %s' % get_study_hash(seqinfo) + + +# XXX we killed session indicator! what should we do now?!!! +# WE DON:T NEED IT -- it will be provided into conversion_info as `session` +# So we just need subdir and file_suffix! def infotodict(seqinfo): """Heuristic evaluator for determining which runs belong where @@ -27,130 +290,578 @@ def infotodict(seqinfo): subindex: sub index within group session: scan index for longitudinal acq """ + + seqinfo = fix_seqinfo(seqinfo) lgr.info("Processing %d seqinfo entries", len(seqinfo)) and_dicom = ('dicom', 'nii.gz') - # decide on global prefix (TODO -- manage to do it outside of here) - - # no checks for consistency for now -- assume that those fields we rely on - # ARE the same - # Actually all below is taken care of outside now! 
- # s = seqinfo[0] - # - # session_id, subject_id = get_session_subject_id(s) - # - # session_suffix = session_prefix = '' - # if session_id: - # session_suffix = "_ses-%s" % session_id - # session_prefix = "ses-%s/" % session_id - # if not subject_id: - # raise ValueError("Could not figure out the subject id") - # - # lgr.debug("Figured out subject %s. Session prefix: %s", subject_id, session_prefix) - # del subject_id # not to be used - - t1 = create_key('anat', 'T1w', outtype=and_dicom) - t2 = create_key('anat', 'T2w', outtype=and_dicom) - fm_diff = create_key('fmap', 'fieldmap-dwi') - dwi_ap = create_key('dwi', 'dir-AP_dwi', outtype=and_dicom) - dwi_pa = create_key('dwi', 'dir-PA_dwi', outtype=and_dicom) - fm_rest = create_key('fmap', 'fieldmap-rest') - rs = create_key('func', 'task-rest_run-{item:02d}_bold', outtype=and_dicom) - boldt1 = create_key('func', 'task-bird1back_run-{item:02d}_bold', outtype=and_dicom) - boldt2 = create_key('func', 'task-letter1back_run-{item:02d}_bold', outtype=and_dicom) - boldt3 = create_key('func', 'task-letter2back_run-{item:02d}_bold', outtype=and_dicom) - nofb_task = create_key('func', 'task-nofb_run-{item:02d}_bold', outtype=and_dicom) - fb_task = create_key('func', 'task-fb_run-{item:02d}_bold', outtype=and_dicom) - #info = {t1: [], t2: [], fm_diff:[], dwi_ap:[], dwi_pa:[], fm_rest:[], rs:[], - # boldt1:[], boldt2:[], boldt3:[], nofb_task:[], fb_task:[]} - info = defaultdict(list) - last_run = len(seqinfo) + info = OrderedDict() skipped, skipped_unknown = [], [] - + current_run = 0 + run_label = None # run- + image_data_type = None for s in seqinfo: + # XXX: skip derived sequences, we don't store them to avoid polluting + # the directory + if s.is_derived: + skipped.append(s.series_id) + lgr.debug("Ignoring derived data %s", s.series_id) + continue + + # possibly apply present formatting in the series_description or protocol name + for f in 'series_description', 'protocol_name': + s = s._replace(**{f: getattr(s, f).format(**s._asdict())}) + template = None suffix = '' seq = [] - # figure out type of image from s.image_info - image_dir = { - 'P': 'fmap', + # figure out type of image from s.image_info -- just for checking ATM + # since we primarily rely on encoded in the protocol name information + prev_image_data_type = image_data_type + image_data_type = s.image_type[2] + image_type_seqtype = { + 'P': 'fmap', # phase 'FMRI': 'func', 'MPR': 'anat', - 'M': 'anat', + # 'M': 'func', "magnitude" -- can be for scout, anat, bold, fmap 'DIFFUSION': 'dwi', - }.get(s.image_type[2], None) + 'MIP_SAG': 'anat', # angiography + 'MIP_COR': 'anat', # angiography + 'MIP_TRA': 'anat', # angiography + }.get(image_data_type, None) - if image_dir is None: - # must be exhaustive! 
- raise ValueError( - "Cannot figure out type of image with image_info %s" - % str(s.image_type) - ) + protocol_name_tuned = s.protocol_name + # Few common replacements + if protocol_name_tuned in {'AAHead_Scout'}: + protocol_name_tuned = 'anat-scout' - if s.is_derived: - # Let's for now stash those close to original images - image_dir += '/derivative' - # just keep it lower case and without special characters - seq.append(s.series_description.lower()) - - # analyze s.protocol_name (series_number is based on it) for full name mapping etc - if image_dir == 'func': - if '_pace_' in s.protocol_name: - suffix += '_pace' # or should it be part of seq- + regd = parse_dbic_protocol_name(protocol_name_tuned) + + if image_data_type.startswith('MIP'): + regd['acq'] = regd.get('acq', '') + sanitize_str(image_data_type) + + if not regd: + skipped_unknown.append(s.series_id) + continue + + seqtype = regd.pop('seqtype') + seqtype_label = regd.pop('seqtype_label', None) + + if image_type_seqtype and seqtype != image_type_seqtype: + lgr.warning( + "Deduced seqtype to be %s from DICOM, but got %s out of %s", + image_type_seqtype, seqtype, protocol_name_tuned) + + # if s.is_derived: + # # Let's for now stash those close to original images + # # TODO: we might want a separate tree for all of this!? + # # so more of a parameter to the create_key + # #seqtype += '/derivative' + # # just keep it lower case and without special characters + # # XXXX what for??? + # #seq.append(s.series_description.lower()) + # prefix = os.path.join('derivatives', 'scanner') + # else: + # prefix = '' + prefix = '' + + # analyze s.protocol_name (series_id is based on it) for full name mapping etc + if seqtype == 'func' and not seqtype_label: + if '_pace_' in protocol_name_tuned: + seqtype_label = 'pace' # or should it be part of seq- else: # assume bold by default - suffix += '_bold' - - # TODO run. might be needed for fieldmap - - # .series_description in case of - sdesc = s.study_description - # temporary aliases for those phantoms which we already collected - # so we rename them into this - #MAPPING - - # the idea ias to have sequence names in the format like - # bids__bidsrecord - # in bids record we could have _run[+=] - # which would say to either increment run number from already encountered - # or reuse the last one - if seq: - suffix += 'seq-%s' % ('+'.join(seq)) - - if template: - info[template].append(s.series_number) - else: - # some are ok to skip and not to whine - if "_Scout_" in s.series_description: - skipped.append(s.series_number) - lgr.debug("Ignoring %s", s.series_number) + seqtype_label = 'bold' + + if seqtype == 'fmap' and not seqtype_label: + seqtype_label = { + 'M': 'magnitude', # might want explicit {file_index} ? + 'P': 'phasediff' + }[image_data_type] + + # label for dwi as well + if seqtype == 'dwi' and not seqtype_label: + seqtype_label = 'dwi' + + run = regd.get('run') + if run is not None: + # so we have an indicator for a run + if run == '+': + # some sequences, e.g. fmap, would generate two (or more?) + # sequences -- e.g. one for magnitude(s) and other ones for + # phases. In those we must not increment run! 
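                # Illustrative walk-through of the run bookkeeping implemented
                # just below (protocol names are hypothetical; see
                # parse_dbic_protocol_name for how the run marker is extracted):
                #   func_task-a_run+   -> current_run 1 -> suffix gets 'run-01'
                #   func_task-a_run+   -> current_run 2 -> 'run-02'
                #   func_task-a_run=   -> reuse 2       -> 'run-02'
                #   func_task-a_run-05 -> explicit 5    -> 'run-05' (warns if it
                #                         goes backwards)
                # and a fieldmap's phase series ('P') that directly follows its
                # magnitude ('M') keeps the same run number instead of
                # incrementing it.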
+ if image_data_type == 'P': + if prev_image_data_type != 'M': + # XXX if we have a known earlier study, we need to always + # increase the run counter for phasediff because magnitudes + # were not acquired + if get_study_hash([s]) == '9d148e2a05f782273f6343507733309d': + current_run += 1 + else: + raise RuntimeError( + "Was expecting phase image to follow magnitude " + "image, but previous one was %r", prev_image_data_type) + # else we do nothing special + else: # and otherwise we go to the next run + current_run += 1 + elif run == '=': + if not current_run: + current_run = 1 + elif run.isdigit(): + current_run_ = int(run) + if current_run_ < current_run: + lgr.warning( + "Previous run (%s) was larger than explicitly specified %s", + current_run, current_run_) + current_run = current_run_ else: - skipped_unknown.append(s.series_number) + raise ValueError( + "Don't know how to deal with run specification %s" % repr(run)) + if isinstance(current_run, str) and current_run.isdigit(): + current_run = int(current_run) + run_label = "run-" + ("%02d" % current_run + if isinstance(current_run, int) + else current_run) + else: + # if there is no _run -- no run label addded + run_label = None + + if s.is_motion_corrected and 'rec-' in regd.get('bids', ''): + raise NotImplementedError("want to add _acq-moco but there is _acq- already") + + suffix_parts = [ + None if not regd.get('task') else "task-%s" % regd['task'], + None if not regd.get('acq') else "acq-%s" % regd['acq'], + # But we want to add an indicator in case it was motion corrected + # in the magnet. ref sample /2017/01/03/qa + None if not s.is_motion_corrected else 'rec-moco', + regd.get('bids'), + run_label, + seqtype_label, + ] + # filter tose which are None, and join with _ + suffix = '_'.join(filter(bool, suffix_parts)) + + # # .series_description in case of + # sdesc = s.study_description + # # temporary aliases for those phantoms which we already collected + # # so we rename them into this + # #MAPPING + # + # # the idea ias to have sequence names in the format like + # # bids__bidsrecord + # # in bids record we could have _run[+=] + # # which would say to either increment run number from already encountered + # # or reuse the last one + # if seq: + # suffix += 'seq-%s' % ('+'.join(seq)) + + # some are ok to skip and not to whine + + if "_Scout" in s.series_description or \ + (seqtype == 'anat' and seqtype_label and seqtype_label.startswith('scout')): + skipped.append(s.series_id) + lgr.debug("Ignoring %s", s.series_id) + else: + template = create_key(seqtype, suffix, prefix=prefix) + # we wanted ordered dict for consistent demarcation of dups + if template not in info: + info[template] = [] + info[template].append(s.series_id) - info = dict(info) # convert to dict since outside functionality depends on it being a basic dict if skipped: lgr.info("Skipped %d sequences: %s" % (len(skipped), skipped)) if skipped_unknown: lgr.warning("Could not figure out where to stick %d sequences: %s" % (len(skipped_unknown), skipped_unknown)) + + info = get_dups_marked(info) # mark duplicate ones with __dup-0x suffix + + info = dict(info) # convert to dict since outside functionality depends on it being a basic dict + return info + + +def get_dups_marked(info): + # analyze for "cancelled" runs, if run number was explicitly specified and + # thus we ended up with multiple entries which would mean that older ones + # were "cancelled" + info = info.copy() + dup_id = 0 + for template, series_ids in list(info.items()): + if len(series_ids) > 1: + 
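            # An illustrative example of the remapping done below (keys
            # abbreviated to 1-tuples as in heuristics/test_dbic_bids.py,
            # series ids hypothetical):
            #   {('func/sub-x_task-y_bold',): ['02-...', '07-...']}
            # becomes
            #   {('func/sub-x_task-y_bold__dup-01',): ['02-...'],
            #    ('func/sub-x_task-y_bold',): ['07-...']}
            # i.e. only the last (non-cancelled) series stays under the
            # original template.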
lgr.warning("Detected %d duplicated run(s) for template %s: %s", + len(series_ids) - 1, template[0], series_ids[:-1]) + # copy the duplicate ones into separate ones + for dup_series_id in series_ids[:-1]: + dup_id += 1 + dup_template = ( + '%s__dup-%02d' % (template[0], dup_id), + ) + template[1:] + # There must have not been such a beast before! + assert dup_template not in info + info[dup_template] = [dup_series_id] + info[template] = series_ids[-1:] + assert len(info[template]) == 1 return info -def get_session_subject_id(s): +def get_unique(seqinfos, attr): + """Given a list of seqinfos, which must have come from a single study + get specific attr, which must be unique across all of the entries + + If not -- fail! + + """ + values = set(getattr(si, attr) for si in seqinfos) + assert (len(values) == 1) + return values.pop() + + +# TODO: might need to do groupping per each session and return here multiple +# hits, or may be we could just somehow demarkate that it will be multisession +# one and so then later value parsed (again) in infotodict would be used??? +def infotoids(seqinfos, outdir): # decide on subjid and session based on patient_id - pid_split = s.patient_id.split('_') - if len(pid_split) == 1: - # there were no explicit session - # then it is not a multi-session study - sid = s.patient_id - session_id = None - elif len(pid_split) == 2: - sid, session_id = pid_split - elif len(pid_split) == 3: - _nonanon_sid, session_id, sid = pid_split + lgr.info("Processing sequence infos to deduce study/session") + study_description = get_study_description(seqinfos) + study_description_hash = md5sum(study_description) + subject = fixup_subjectid(get_unique(seqinfos, 'patient_id')) + # TODO: fix up subject id if missing some 0s + if study_description: + split = study_description.split('^', 1) + # split first one even more, since couldbe PI_Student or PI-Student + split = re.split('-|_', split[0], 1) + split[1:] + + # locator = study_description.replace('^', '/') + locator = '/'.join(split) else: - raise ValueError( - "No logic for more than 3 _-separated entries in patient_id. Got:" - " %s" % s.patient_id) - return session_id, sid + locator = 'unknown' + + # TODO: actually check if given study is study we would care about + # and if not -- we should throw some ???? exception + + # So -- use `outdir` and locator etc to see if for a given locator/subject + # and possible ses+ in the sequence names, so we would provide a sequence + # So might need to go through parse_dbic_protocol_name(s.protocol_name) + # to figure out presence of sessions. + ses_markers = [] + + # there might be fixups needed so we could deduce session etc + # this copy is not replacing original one, so the same fix_seqinfo + # might be called later + seqinfos = fix_seqinfo(seqinfos) + for s in seqinfos: + if s.is_derived: + continue + session_ = parse_dbic_protocol_name(s.protocol_name).get('session', None) + if session_ and '{' in session_: + # there was a marker for something we could provide from our seqinfo + # e.g. {date} + session_ = session_.format(**s._asdict()) + ses_markers.append(session_) + ses_markers = list(filter(bool, ses_markers)) # only present ones + session = None + if ses_markers: + # we have a session or possibly more than one even + # let's figure out which case we have + nonsign_vals = set(ses_markers).difference('+=') + # although we might want an explicit '=' to note the same session as + # mentioned before? 
+ if len(nonsign_vals) > 1: + lgr.warning( #raise NotImplementedError( + "Cannot deal with multiple sessions in the same study yet!" + " We will process until the end of the first session" + ) + if nonsign_vals: + # get only unique values + ses_markers = list(set(ses_markers)) + if set(ses_markers).intersection('+='): + raise NotImplementedError( + "Should not mix hardcoded session markers with incremental ones (+=)" + ) + assert len(ses_markers) == 1 + session = ses_markers[0] + else: + # TODO - I think we are doomed to go through the sequence and split + # ... actually the same as with nonsign_vals, we just would need to figure + # out initial one if sign ones, and should make use of knowing + # outdir + #raise NotImplementedError() + # we need to look at what sessions we already have + sessions_dir = os.path.join(outdir, locator, 'sub-' + subject) + prior_sessions = sorted(glob(os.path.join(sessions_dir, 'ses-*'))) + # TODO: more complicated logic + # For now just increment session if + and keep the same number if = + # and otherwise just give it 001 + # Note: this disables our safety blanket which would refuse to process + # what was already processed before since it would try to override, + # BUT there is no other way besides only if heudiconv was storing + # its info based on some UID + if ses_markers == ['+']: + session = '%03d' % (len(prior_sessions) + 1) + elif ses_markers == ['=']: + session = os.path.basename(prior_sessions[-1])[4:] if prior_sessions else '001' + else: + session = '001' + + + if study_description_hash == '9d148e2a05f782273f6343507733309d': + session = 'siemens1' + lgr.info('Imposing session {0}'.format(session)) + + return { + # TODO: request info on study from the JedCap + 'locator': locator, + # Sessions to be deduced yet from the names etc TODO + 'session': session, + 'subject': subject, + } + + +def sanitize_str(value): + """Remove illegal characters for BIDS from task/acq/etc..""" + return _delete_chars(value, '#!@$%^&.,:;_-') + + +def parse_dbic_protocol_name(protocol_name): + """Parse protocol name according to our convention with minimal set of fixups + """ + # Since Yarik didn't know better place to put it in, but could migrate outside + # at some point + protocol_name = protocol_name.replace("anat_T1w", "anat-T1w") + protocol_name = protocol_name.replace("hardi_64", "dwi_acq-hardi64") + + # Parse the name according to our convention + # https://docs.google.com/document/d/1R54cgOe481oygYVZxI7NHrifDyFUZAjOBwCTu7M7y48/edit?usp=sharing + # Remove possible suffix we don't care about after __ + protocol_name = protocol_name.split('__', 1)[0] + + bids = None # we don't know yet for sure + # We need to figure out if it is a valid bids + split = protocol_name.split('_') + prefix = split[0] + + # Fixups + if prefix == 'scout': + prefix = split[0] = 'anat-scout' + + if prefix != 'bids' and '-' in prefix: + prefix, _ = prefix.split('-', 1) + if prefix == 'bids': + bids = True # for sure + split = split[1:] + + def split2(s): + # split on - if present, if not -- 2nd one returned None + if '-' in s: + return s.split('-', 1) + return s, None + + # Let's analyze first element which should tell us sequence type + seqtype, seqtype_label = split2(split[0]) + if seqtype not in {'anat', 'func', 'dwi', 'behav', 'fmap'}: + # It is not something we don't consume + if bids: + lgr.warning("It was instructed to be BIDS sequence but unknown " + "type %s found", seqtype) + return {} + + regd = dict(seqtype=seqtype) + if seqtype_label: + regd['seqtype_label'] = seqtype_label + # 
now go through each to see if one which we care + bids_leftovers = [] + for s in split[1:]: + key, value = split2(s) + if value is None and key[-1] in "+=": + value = key[-1] + key = key[:-1] + + # sanitize values, which must not have _ and - is undesirable ATM as well + # TODO: BIDSv2.0 -- allows "-" so replace with it instead + value = str(value).replace('_', 'X').replace('-', 'X') + + if key in ['ses', 'run', 'task', 'acq']: + # those we care about explicitly + regd[{'ses': 'session'}.get(key, key)] = sanitize_str(value) + else: + bids_leftovers.append(s) + + if bids_leftovers: + regd['bids'] = '_'.join(bids_leftovers) + + # TODO: might want to check for all known "standard" BIDS suffixes here + # among bids_leftovers, thus serve some kind of BIDS validator + + # if not regd.get('seqtype_label', None): + # # might need to assign a default label for each seqtype if was not + # # given + # regd['seqtype_label'] = { + # 'func': 'bold' + # }.get(regd['seqtype'], None) + + return regd + + +def fixup_subjectid(subjectid): + """Just in case someone managed to miss a zero or added an extra one""" + # make it lowercase + subjectid = subjectid.lower() + reg = re.match("sid0*(\d+)$", subjectid) + if not reg: + # some completely other pattern + # just filter out possible _- in it + return re.sub('[-_]', '', subjectid) + return "sid%06d" % int(reg.groups()[0]) + + +def test_filter_files(): + assert(filter_files('/home/mvdoc/dbic/09-run_func_meh/0123432432.dcm')) + assert(not filter_files('/home/mvdoc/dbic/run_func_meh/012343143.dcm')) + + +def test_md5sum(): + assert md5sum('cryptonomicon') == '1cd52edfa41af887e14ae71d1db96ad1' + assert md5sum('mysecretmessage') == '07989808231a0c6f522f9d8e34695794' + + +def test_fix_canceled_runs(): + from collections import namedtuple + FakeSeqInfo = namedtuple('FakeSeqInfo', + ['accession_number', 'series_id', + 'protocol_name', 'series_description']) + + seqinfo = [] + runname = 'func_run+' + for i in range(1, 6): + seqinfo.append( + FakeSeqInfo('accession1', + '{0:02d}-'.format(i) + runname, + runname, runname) + ) + + fake_accession2run = { + 'accession1': ['^01-', '^03-'] + } + + seqinfo_ = fix_canceled_runs(seqinfo, fake_accession2run) + + for i, s in enumerate(seqinfo_, 1): + output = runname + if i == 1 or i == 3: + output = 'cancelme_' + output + for key in ['series_description', 'protocol_name']: + value = getattr(s, key) + assert(value == output) + # check we didn't touch series_id + assert(s.series_id == '{0:02d}-'.format(i) + runname) + + +def test_fix_dbic_protocol(): + from collections import namedtuple + FakeSeqInfo = namedtuple('FakeSeqInfo', + ['accession_number', 'study_description', + 'field1', 'field2']) + accession_number = 'A003' + seq1 = FakeSeqInfo(accession_number, + 'mystudy', + '02-anat-scout_run+_MPR_sag', + '11-func_run-life2_acq-2mm692') + seq2 = FakeSeqInfo(accession_number, + 'mystudy', + 'nochangeplease', + 'nochangeeither') + + + seqinfos = [seq1, seq2] + keys = ['field1'] + subsdict = { + md5sum('mystudy'): + [('scout_run\+', 'scout'), + ('run-life[0-9]', 'run+_task-life')], + } + + seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) + assert(seqinfos[1] == seqinfos_[1]) + # field2 shouldn't have changed since I didn't pass it + assert(seqinfos_[0] == FakeSeqInfo(accession_number, + 'mystudy', + '02-anat-scout_MPR_sag', + seq1.field2)) + + # change also field2 please + keys = ['field1', 'field2'] + seqinfos_ = fix_dbic_protocol(seqinfos, keys=keys, subsdict=subsdict) + assert(seqinfos[1] == seqinfos_[1]) + # 
now everything should have changed + assert(seqinfos_[0] == FakeSeqInfo(accession_number, + 'mystudy', + '02-anat-scout_MPR_sag', + '11-func_run+_task-life_acq-2mm692')) + + +def test_sanitize_str(): + assert sanitize_str('super@duper.faster') == 'superduperfaster' + assert sanitize_str('perfect') == 'perfect' + assert sanitize_str('never:use:colon:!') == 'neverusecolon' + + +def test_fixupsubjectid(): + assert fixup_subjectid("abra") == "abra" + assert fixup_subjectid("sub") == "sub" + assert fixup_subjectid("sid") == "sid" + assert fixup_subjectid("sid000030") == "sid000030" + assert fixup_subjectid("sid0000030") == "sid000030" + assert fixup_subjectid("sid00030") == "sid000030" + assert fixup_subjectid("sid30") == "sid000030" + assert fixup_subjectid("SID30") == "sid000030" + + +def test_parse_dbic_protocol_name(): + pdpn = parse_dbic_protocol_name + + assert pdpn("nondbic_func-bold") == {} + assert pdpn("cancelme_func-bold") == {} + + assert pdpn("bids_func-bold") == \ + pdpn("func-bold") == \ + {'seqtype': 'func', 'seqtype_label': 'bold'} + + # pdpn("bids_func_ses+_task-boo_run+") == \ + # order should not matter + assert pdpn("bids_func_ses+_run+_task-boo") == \ + { + 'seqtype': 'func', + # 'seqtype_label': 'bold', + 'session': '+', + 'run': '+', + 'task': 'boo', + } + # TODO: fix for that + assert pdpn("bids_func-pace_ses-1_task-boo_acq-bu_bids-please_run-2__therest") == \ + pdpn("bids_func-pace_ses-1_run-2_task-boo_acq-bu_bids-please__therest") == \ + pdpn("func-pace_ses-1_task-boo_acq-bu_bids-please_run-2") == \ + { + 'seqtype': 'func', 'seqtype_label': 'pace', + 'session': '1', + 'run': '2', + 'task': 'boo', + 'acq': 'bu', + 'bids': 'bids-please' + } + + assert pdpn("bids_anat-scout_ses+") == \ + { + 'seqtype': 'anat', + 'seqtype_label': 'scout', + 'session': '+', + } + + assert pdpn("anat_T1w_acq-MPRAGE_run+") == \ + { + 'seqtype': 'anat', + 'run': '+', + 'acq': 'MPRAGE', + 'seqtype_label': 'T1w' + } diff --git a/heuristics/dbic_bids_validator.cfg b/heuristics/dbic_bids_validator.cfg new file mode 100644 index 00000000..0da468b5 --- /dev/null +++ b/heuristics/dbic_bids_validator.cfg @@ -0,0 +1,14 @@ +{ + "ignore": [ + "TOTAL_READOUT_TIME_NOT_DEFINED" + ], + "warn": [], + "error": [], + "ignoredFiles": [ + "/.heudiconv/*", "/.heudiconv/*/*", "/.heudiconv/*/*/*", "/.heudiconv/*/*/*/*", + "/.git*", + "/.datalad/*", "/.datalad/.*", + "/sub*/ses*/*/*__dup*", "/sub*/*/*__dup*" + ] +} + diff --git a/heuristics/example.py b/heuristics/example.py index 324d875b..5c2bc5ca 100644 --- a/heuristics/example.py +++ b/heuristics/example.py @@ -86,4 +86,4 @@ def infotodict(seqinfo): info[fmrest].append(s[2]) else: pass - return info \ No newline at end of file + return info diff --git a/heuristics/test_dbic_bids.py b/heuristics/test_dbic_bids.py new file mode 100644 index 00000000..4d106914 --- /dev/null +++ b/heuristics/test_dbic_bids.py @@ -0,0 +1,25 @@ +# +# Tests for dbic_bids.py +# +from collections import OrderedDict +from dbic_bids import get_dups_marked + +def test_get_dups_marked(): + no_dups = {('some',): [1]} + assert get_dups_marked(no_dups) == no_dups + + assert get_dups_marked( + OrderedDict([ + (('bu', 'du'), [1, 2]), + (('smth',), [3]), + (('smth2',), ['a', 'b', 'c']) + ])) == \ + { + ('bu__dup-01', 'du'): [1], + ('bu', 'du'): [2], + ('smth',): [3], + ('smth2__dup-02',): ['a'], + ('smth2__dup-03',): ['b'], + ('smth2',): ['c'] + } + diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..df3eb518 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = 
--doctest-modules diff --git a/requirements.txt b/requirements.txt index 79d31ffe..4ce7ccd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -pydicom -dcmstack -rdflib -nipype +.[full] +# This is a dcmstack branch with changes for Python 3 +# sent PR to main repo, TODO: check if merged +# https://github.com/ghisvail/dcmstack/pull/1 +git+git://github.com/mvdoc/dcmstack@bf/importsys diff --git a/setup.py b/setup.py index 1acee7da..dca6f021 100755 --- a/setup.py +++ b/setup.py @@ -33,12 +33,20 @@ def findsome(subdir, extensions): requires = { 'core': [ 'nibabel', - # TODO: migrate more from requirements.txt + 'pydicom', + 'nipype', ], 'tests': [ 'six', - 'nose', + 'pytest', ], + 'monitor': [ + 'inotify', + 'tinydb' + ], + 'datalad': [ + 'datalad' + ] } requires['full'] = sum(list(requires.values()), []) diff --git a/tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm b/tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm new file mode 100644 index 00000000..bafdaa51 Binary files /dev/null and b/tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm differ diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py new file mode 100644 index 00000000..1bd70d51 --- /dev/null +++ b/tests/test_heuristics.py @@ -0,0 +1,102 @@ +from . import heudiconv + +import os +from mock import patch +from six.moves import StringIO + +from glob import glob +from os.path import join as pjoin, dirname +import csv +import re + +import pytest + +from datalad.api import Dataset + + +def test_smoke_converall(tmpdir): + heudiconv.main( + ("-f heuristics/convertall.py -c dcm2niix -o %s -b --datalad " + "-s fmap_acq-3mm -d tests/data/{subject}/*" % tmpdir).split(' ') + ) + + +@pytest.mark.parametrize('heuristic', [ 'dbic_bids', 'convertall' ]) +@pytest.mark.parametrize( + 'invocation', [ + "tests/data", # our new way with automated groupping + "-d tests/data/{subject}/* -s 01-fmap_acq-3mm" # "old" way specifying subject + # should produce the same results + ]) +def test_dbic_bids_largely_smoke(tmpdir, heuristic, invocation): + is_bids = True if heuristic == 'dbic_bids' else False + arg = "-f heuristics/%s.py -c dcm2niix -o %s" % (heuristic, tmpdir) + if is_bids: + arg += " -b" + arg += " --datalad " + args = ( + arg + invocation + ).split(' ') + if heuristic != 'dbic_bids' and invocation == 'tests/data': + # none other heuristic has mighty infotoids atm + with pytest.raises(NotImplementedError): + heudiconv.main(args) + return + heudiconv.main(args) + ds = Dataset(str(tmpdir)) + assert ds.is_installed() + assert not ds.repo.dirty + head = ds.repo.get_hexsha() + + # and if we rerun -- should fail + if heuristic != 'dbic_bids' and invocation != 'tests/data': + # those guys -- they just plow through it ATM without failing, i.e. 
+ # the logic is to reprocess + heudiconv.main(args) + else: + with pytest.raises(RuntimeError): + heudiconv.main(args) + # but there should be nothing new + assert not ds.repo.dirty + assert head == ds.repo.get_hexsha() + + # unless we pass 'overwrite' flag + heudiconv.main(args + ['--overwrite']) + # but result should be exactly the same, so it still should be clean + # and at the same commit + assert ds.is_installed() + assert not ds.repo.dirty + assert head == ds.repo.get_hexsha() + + +@pytest.mark.parametrize( + 'invocation', [ + "tests/data", # our new way with automated groupping + ]) +def test_scans_keys_dbic_bids(tmpdir, invocation): + args = "-f heuristics/dbic_bids.py -c dcm2niix -o %s -b " % tmpdir + args += invocation + heudiconv.main(args.split()) + # for now check it exists + scans_keys = glob(pjoin(tmpdir.strpath, '*/*/*/*/*.tsv')) + assert(len(scans_keys) == 1) + with open(scans_keys[0]) as f: + reader = csv.reader(f, delimiter='\t') + for i, row in enumerate(reader): + if i == 0: + assert(row == ['filename', 'acq_time', 'operator', 'randstr']) + assert(len(row) == 4) + if i != 0: + assert(os.path.exists(pjoin(dirname(scans_keys[0]), row[0]))) + assert(re.match( + '^[\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}$', + row[1])) + + +@patch('sys.stdout', new_callable=StringIO) +def test_ls(stdout): + args = "-f heuristics/dbic_bids.py --command ls tests/data".split(' ') + heudiconv.main(args) + out = stdout.getvalue() + assert 'StudySessionInfo(locator=' in out + assert 'Halchenko/Yarik/950_bids_test4' in out \ No newline at end of file diff --git a/tests/test_main.py b/tests/test_main.py index 1a496b2e..8d29d11e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,19 +1,159 @@ +import csv +import os +import pytest import sys from mock import patch +from os.path import join as opj from six.moves import StringIO -from nose.tools import assert_raises, assert_equal + from . 
import heudiconv @patch('sys.stdout', new_callable=StringIO) def test_main_help(stdout): - assert_raises(SystemExit, heudiconv.main, ['--help']) - assert(stdout.getvalue().startswith("usage: ")) + with pytest.raises(SystemExit): + heudiconv.main(['--help']) + assert stdout.getvalue().startswith("usage: ") @patch('sys.stderr' if sys.version_info[:2] <= (3, 3) else 'sys.stdout', new_callable=StringIO) def test_main_version(std): - assert_raises(SystemExit, heudiconv.main, ['--version']) - assert_equal(std.getvalue().rstrip(), heudiconv.__version__) + with pytest.raises(SystemExit): + heudiconv.main(['--version']) + assert std.getvalue().rstrip() == heudiconv.__version__ + + +def test_create_file_if_missing(tmpdir): + tf = tmpdir.join("README.txt") + assert not tf.exists() + heudiconv.create_file_if_missing(str(tf), "content") + assert tf.exists() + assert tf.read() == "content" + heudiconv.create_file_if_missing(str(tf), "content2") + # nothing gets changed + assert tf.read() == "content" + + +def test_populate_bids_templates(tmpdir): + heudiconv.populate_bids_templates( + str(tmpdir), + defaults={'Acknowledgements': 'something'}) + for f in "README", "dataset_description.json", "CHANGES": + # Just test that we have created them and they all have stuff TODO + assert "TODO" in tmpdir.join(f).read() + assert "something" in tmpdir.join('dataset_description.json').read() + + +def test_add_participant_record(tmpdir): + tf = tmpdir.join('participants.tsv') + assert not tf.exists() + heudiconv.add_participant_record(str(tmpdir), "sub01", "023Y", "M") + # should create the file and place corrected record + sub01 = tf.read() + assert sub01 == """\ +participant_id age sex group +sub-sub01 23 M control +""" + heudiconv.add_participant_record(str(tmpdir), "sub01", "023Y", "F") + assert tf.read() == sub01 # nothing was added even though differs in values + heudiconv.add_participant_record(str(tmpdir), "sub02", "2", "F") + assert tf.read() == """\ +participant_id age sex group +sub-sub01 23 M control +sub-sub02 2 F control +""" + + +def test_prepare_for_datalad(tmpdir): + pytest.importorskip("datalad") + studydir = tmpdir.join("PI").join("study") + studydir_ = str(studydir) + os.makedirs(studydir_) + heudiconv.populate_bids_templates(studydir_) + + heudiconv.add_to_datalad(str(tmpdir), studydir_) + + from datalad.api import Dataset + superds = Dataset(str(tmpdir)) + + assert superds.is_installed() + assert not superds.repo.dirty + subdss = superds.get_subdatasets(recursive=True) + for ds_path in sorted(subdss): + ds = Dataset(opj(superds.path, ds_path)) + assert ds.is_installed() + assert not ds.repo.dirty + + # the last one should have been the study + target_files = { + '.gitattributes', + '.datalad/config', '.datalad/.gitattributes', + 'dataset_description.json', + 'CHANGES', 'README'} + assert set(ds.repo.get_indexed_files()) == target_files + # and all are under git + for f in target_files: + assert not ds.repo.is_under_annex(f) + assert not ds.repo.is_under_annex('.gitattributes') + + +def test_json_dumps_pretty(): + pretty = heudiconv.json_dumps_pretty + assert pretty({}) == "{}" + assert pretty({"a": -1, "b": "123", "c": [1, 2, 3], "d": ["1.0", "2.0"]}) \ + == '{\n "a": -1,\n "b": "123",\n "c": [1, 2, 3],\n "d": ["1.0", "2.0"]\n}' + assert pretty({'a': ["0.3", "-1.9128906358217845e-12", "0.2"]}) \ + == '{\n "a": ["0.3", "-1.9128906358217845e-12", "0.2"]\n}' + + +def test_get_formatted_scans_key_row(): + item = [ + 
('tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm', + ('nii.gz', 'dicom'), + ['tests/data/01-fmap_acq-3mm/1.3.12.2.1107.5.2.43.66112.2016101409263663466202201.dcm']) + ] + outname_bids_file = '/a/path/Halchenko/Yarik/950_bids_test4/sub-phantom1sid1/fmap/sub-phantom1sid1_acq-3mm_phasediff.json' + + row = heudiconv.get_formatted_scans_key_row(item) + assert(len(row) == 3) + assert(row[0] == '2016-10-14T09:26:34') + assert(row[1] == '') + randstr1 = row[2] + row = heudiconv.get_formatted_scans_key_row(item) + randstr2 = row[2] + assert(randstr1 != randstr2) + + +# TODO: finish this +def test_add_rows_to_scans_keys_file(tmpdir): + fn = opj(tmpdir.strpath, 'file.tsv') + rows = { + 'my_file.nii.gz': ['2016adsfasd', '', 'fasadfasdf'], + 'another_file.nii.gz': ['2018xxxxx', '', 'fasadfasdf'] + } + heudiconv.add_rows_to_scans_keys_file(fn, rows) + + def _check_rows(fn, rows): + with open(fn, 'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + rows_loaded = [] + for row in reader: + rows_loaded.append(row) + for i, row_ in enumerate(rows_loaded): + if i == 0: + assert(row_ == ['filename', 'acq_time', 'operator', 'randstr']) + else: + assert(rows[row_[0]] == row_[1:]) + + _check_rows(fn, rows) + # add a new one + extra_rows = { + 'a_new_file.nii.gz': ['2016adsfasd23', '', 'fasadfasdf'], + 'my_file.nii.gz': ['2016adsfasd', '', 'fasadfasdf'], + 'another_file.nii.gz': ['2018xxxxx', '', 'fasadfasdf'] + } + heudiconv.add_rows_to_scans_keys_file(fn, extra_rows) + _check_rows(fn, extra_rows) + diff --git a/tests/test_tarballs.py b/tests/test_tarballs.py new file mode 100644 index 00000000..3a9a9edf --- /dev/null +++ b/tests/test_tarballs.py @@ -0,0 +1,34 @@ +import os +import pytest +import sys +import time + +from mock import patch +from os.path import join as opj +from os.path import dirname +from six.moves import StringIO +from glob import glob + +from . import heudiconv +from .utils import md5sum + +tests_datadir = opj(dirname(__file__), 'data') + + +def test_reproducibility(tmpdir): + #heudiconv.compress_dicoms(dicom_list, prefix, sourcedir) + prefix = str(tmpdir.join("precious")) + args = [glob(opj(tests_datadir, '01-fmap_acq-3mm', '*')), prefix] + tarball = heudiconv.compress_dicoms(*args) + md5 = md5sum(tarball) + assert tarball + # must not override by default + with pytest.raises(RuntimeError): + heudiconv.compress_dicoms(*args) + os.unlink(tarball) + + time.sleep(1.1) # need to guarantee change of time + tarball_ = heudiconv.compress_dicoms(*args) + md5_ = md5sum(tarball_) + assert tarball == tarball_ + assert md5 == md5_ diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..4de67fae --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,6 @@ +import hashlib + + +def md5sum(filename): + with open(filename, 'rb') as f: + return hashlib.md5(f.read()).hexdigest() diff --git a/tox.ini b/tox.ini index d483a432..5f8f5fcd 100644 --- a/tox.ini +++ b/tox.ini @@ -2,5 +2,5 @@ envlist = py27,py33,py34,py35 [testenv] -commands = nosetests -s -v {posargs} tests +commands = python -m pytest -s -v {posargs} tests deps = -r{toxinidir}/dev-requirements.txt
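For reference, a minimal sketch of how the pieces added in this diff are exercised together, mirroring the invocation in tests/test_heuristics.py; the import shim and the output directory are assumptions, while the flags, heuristic path, and tests/data layout come from the tests above:

    from tests import heudiconv   # assumed shim; the suite itself does `from . import heudiconv`

    outdir = '/tmp/heudiconv-out'  # hypothetical scratch location
    args = ("-f heuristics/dbic_bids.py -c dcm2niix -o %s -b --datalad "
            "tests/data" % outdir).split(' ')
    # infotoids deduces locator/subject/session, infotodict names the runs,
    # and dcm2niix performs the actual conversion into the BIDS layout.
    heudiconv.main(args)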