Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ created.

### Fixed

- ([#482](https://github.com/microsoft/InnerEye-DeepLearning/pull/482)) Check bool parameter is either true or false.
- ([#475](https://github.com/microsoft/InnerEye-DeepLearning/pull/475)) Bug in AML SDK meant that we could not train
any large models anymore because data loaders ran out of memory.
- ([#472](https://github.com/microsoft/InnerEye-DeepLearning/pull/472)) Correct model path for moving ensemble models.
Expand Down
50 changes: 48 additions & 2 deletions InnerEye/Common/generic_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Any, Callable, Dict, List, Optional, Set, Type, Union

import param
from param.parameterized import Parameter

from InnerEye.Common.common_util import is_private_field_name
from InnerEye.Common.type_annotations import T
Expand Down Expand Up @@ -114,6 +115,21 @@ def add_args(cls: Type[GenericConfig],
:param parser: Parser to add properties to.
"""

def parse_bool(x: str) -> bool:
"""
Parse a string as a bool. Supported values are case insensitive and one of:
'on', 't', 'true', 'y', 'yes', '1' for True
'off', 'f', 'false', 'n', 'no', '0' for False.
:param x: string to test.
:return: Bool value if string valid, otherwise a ValueError is raised.
"""
sx = str(x).lower()
if sx in ('on', 't', 'true', 'y', 'yes', '1'):
return True
if sx in ('off', 'f', 'false', 'n', 'no', '0'):
return False
raise ValueError(f"Invalid value {x}, please supply one of True, true, false or False.")

def _get_basic_type(_p: param.Parameter) -> Union[type, Callable]:
"""
Given a parameter, get its basic Python type, e.g.: param.Boolean -> bool.
Expand All @@ -122,7 +138,7 @@ def _get_basic_type(_p: param.Parameter) -> Union[type, Callable]:
:return: Type
"""
if isinstance(_p, param.Boolean):
p_type: Union[type, Callable] = lambda x: (str(x).lower() == 'true')
p_type: Callable = parse_bool
elif isinstance(_p, param.Integer):
p_type = lambda x: _p.default if x == "" else int(x)
elif isinstance(_p, param.Number):
Expand Down Expand Up @@ -156,8 +172,38 @@ def list_or_dict(x: str) -> Union[Dict, List]:

return p_type

def add_boolean_argument(parser: argparse.ArgumentParser, k: str, p: Parameter) -> None:
"""
Add a boolean argument.
If the parameter default is False then allow --flag (to set it True) and --flag=Bool as usual.
If the parameter default is True then allow --no-flag (to set it to False) and --flag=Bool as usual.
:param parser: parser to add a boolean argument to.
:param k: argument name.
:param p: boolean parameter.
"""
if not p.default:
# If the parameter default is False then use nargs="?" (argparse.OPTIONAL).
# This means that the argument is optional.
# If it is not supplied, i.e. in the --flag mode, use the "const" value, i.e. True.
# Otherwise, i.e. in the --flag=value mode, try to parse the argument as a bool.
parser.add_argument("--" + k, help=p.doc, type=parse_bool, default=False,
nargs=argparse.OPTIONAL, const=True)
else:
# If the parameter default is True then create an exclusive group of arguments.
# Either --flag=value as usual
# Or --no-flag to store False in the parameter k.
group = parser.add_mutually_exclusive_group(required=False)
group.add_argument("--" + k, help=p.doc, type=parse_bool)
group.add_argument('--no-' + k, dest=k, action='store_false')
parser.set_defaults(**{k: p.default})

for k, p in cls.get_overridable_parameters().items():
parser.add_argument("--" + k, help=p.doc, type=_get_basic_type(p), default=p.default)
# param.Booleans need to be handled separately, they are more complicated because they have
# an optional argument.
if isinstance(p, param.Boolean):
add_boolean_argument(parser, k, p)
else:
parser.add_argument("--" + k, help=p.doc, type=_get_basic_type(p), default=p.default)

return parser

Expand Down
4 changes: 2 additions & 2 deletions InnerEye/ML/configs/other/fastmri_varnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def __init__(self) -> None:
super().__init__()
self.azure_dataset_id = "knee_multicoil"
# If the Azure nodes run out of disk space when downloading the dataset, re-submit with the
# --use_dataset_mount=True flag. The dataset will be mounted to the fixed path given here.
# --use_dataset_mount flag. The dataset will be mounted to the fixed path given here.
self.dataset_mountpoint = "/tmp/knee_multicoil"

def get_data_module(self) -> LightningDataModule:
Expand All @@ -144,7 +144,7 @@ def __init__(self) -> None:
super().__init__()
self.azure_dataset_id = "brain_multicoil"
# If the Azure nodes run out of disk space when downloading the dataset, re-submit with the
# --use_dataset_mount=True flag. The dataset will be mounted to the fixed path given here.
# --use_dataset_mount flag. The dataset will be mounted to the fixed path given here.
self.dataset_mountpoint = "/tmp/brain_multicoil"

def get_data_module(self) -> LightningDataModule:
Expand Down
41 changes: 36 additions & 5 deletions Tests/Common/test_generic_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class ParamEnum(Enum):
class ParamClass(GenericConfig):
name: str = param.String(None, doc="Name")
seed: int = param.Integer(42, doc="Seed")
flag: int = param.Boolean(False, doc="Flag")
flag: bool = param.Boolean(False, doc="Flag")
not_flag: bool = param.Boolean(True, doc="Not Flag")
number: float = param.Number(3.14)
integers: List[int] = param.List(None, class_=int)
optional_int: Optional[int] = param.Integer(None, doc="Optional int")
Expand All @@ -40,6 +41,7 @@ def test_overridable_parameter() -> None:
param_dict = ParamClass.get_overridable_parameters()
assert "name" in param_dict
assert "flag" in param_dict
assert "not_flag" in param_dict
assert "seed" in param_dict
assert "number" in param_dict
assert "integers" in param_dict
Expand All @@ -62,6 +64,12 @@ def check(arg: List[str], expected_key: str, expected_value: Any) -> None:
parsed = ParamClass.parse_args(arg)
assert getattr(parsed, expected_key) == expected_value

def check_fails(arg: List[str]) -> None:
with pytest.raises(SystemExit) as e:
ParamClass.parse_args(arg)
assert e.type == SystemExit
assert e.value.code == 2

check(["--name=foo"], "name", "foo")
check(["--seed", "42"], "seed", 42)
check(["--seed", ""], "seed", 42)
Expand All @@ -77,17 +85,40 @@ def check(arg: List[str], expected_key: str, expected_value: Any) -> None:
check(["--enum=2"], "enum", ParamEnum.EnumValue2)
check(["--floats=1,2,3.14"], "floats", [1., 2., 3.14])
check(["--integers=1,2,3"], "integers", [1, 2, 3])
check(["--flag=true"], "flag", True)
check(["--flag=True"], "flag", True)
check(["--flag=false"], "flag", False)
check(["--flag=False"], "flag", False)
# Check all the ways of passing in True, with and without the first letter capitialized
for flag in ('on', 't', 'true', 'y', 'yes', '1'):
check([f"--flag={flag}"], "flag", True)
check([f"--flag={flag.capitalize()}"], "flag", True)
check([f"--not_flag={flag}"], "not_flag", True)
check([f"--not_flag={flag.capitalize()}"], "not_flag", True)
# Check all the ways of passing in False, with and without the first letter capitialized
for flag in ('off', 'f', 'false', 'n', 'no', '0'):
check([f"--flag={flag}"], "flag", False)
check([f"--flag={flag.capitalize()}"], "flag", False)
check([f"--not_flag={flag}"], "not_flag", False)
check([f"--not_flag={flag.capitalize()}"], "not_flag", False)
# Check that passing no value to flag sets it to True (the opposite of its default)
check(["--flag"], "flag", True)
# Check that no-flag is not an option
check_fails(["--no-flag"])
# Check that passing no value to not_flag fails
check_fails(["--not_flag"])
# Check that --no-not_flag is an option and sets it to False (the opposite of its default)
check(["--no-not_flag"], "not_flag", False)
# Check that both not_flag and no-not_flag cannot be passed at the same time
check_fails(["--not_flag=false", "--no-not_flag"])
# Check that invalid bools are caught
check_fails(["--flag=Falsf"])
check_fails(["--flag=Truf"])
# Check that default values are created as expected, and that the non-overridable parameters
# are omitted.
defaults = vars(ParamClass.create_argparser().parse_args([]))
assert defaults["seed"] == 42
assert defaults["tuple1"] == (1, 2.3)
assert defaults["int_tuple"] == (1, 1, 1)
assert defaults["enum"] == ParamEnum.EnumValue1
assert not defaults["flag"]
assert defaults["not_flag"]
assert "readonly" not in defaults
assert "constant" not in defaults
assert "_non_override" not in defaults
Expand Down
2 changes: 1 addition & 1 deletion docs/bring_your_own_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ This can be used by
be used for training and testing.
- Adding essential trainer parameters like number of epochs to that container.
- Invoking the InnerEye runner and providing the name of the container class, like this:
`python InnerEye/ML/runner.py --model=MyContainer`. To train in AzureML, just add a `--azureml=True` flag.
`python InnerEye/ML/runner.py --model=MyContainer`. To train in AzureML, just add a `--azureml` flag.

There is a fully working example [HelloContainer](../InnerEye/ML/configs/other/HelloContainer.py), that implements
a simple 1-dimensional regression model from data stored in a CSV file. You can run that
Expand Down
24 changes: 19 additions & 5 deletions docs/building_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,20 +75,34 @@ The InnerEye runner will check if there is a dataset in the AzureML workspace al

* Train a new model, for example `Prostate`:
```shell script
python InnerEyeLocal/ML/runner.py --azureml=True --model=Prostate --train=True
python InnerEyeLocal/ML/runner.py --azureml --model=Prostate
```

Alternatively, you can train the model on your current machine if it is powerful enough. In
this case, you would simply omit the `azureml` flag, and instead of specifying
`azure_dataset_id` in the class constructor, you can instead use `local_dataset="my/data/folder"`,
where the folder `my/data/folder` contains a `dataset.csv` file and all the files that are referenced therein.

### Boolean Options

Note that for command line options that take a boolean argument, and that are `False` by default, there are multiple ways of setting the option. For example alternatives to `--azureml` include:
* `--azureml=True`, or `--azureml=true`, or `--azureml=T`, or `--azureml=t`
* `--azureml=Yes`, or `--azureml=yes`, or `--azureml=Y`, or `--azureml=y`
* `--azureml=On`, or `--azureml=on`
* `--azureml=1`

Conversely, for command line options that take a boolean argument, and that are `True` by default, there are multiple ways of un-setting the option. For example alternatives to `--no-train` include:
* `--train=False`, or `--train=false`, or `--train=F`, or `--train=f`
* `--train=No`, or `--train=no`, or `--train=N`, or `--train=n`
* `--train=Off`, or `--train=off`
* `--train=0`


### Training using multiple machines
To speed up training in AzureML, you can use multiple machines, by specifying the additional
`--num_nodes` argument. For example, to use 2 machines to train, specify:
```shell script
python InnerEyeLocal/ML/runner.py --azureml=True --model=Prostate --num_nodes=2
python InnerEyeLocal/ML/runner.py --azureml --model=Prostate --num_nodes=2
```
On each of the 2 machines, all available GPUs will be used. Model inference will always use only one machine.

Expand Down Expand Up @@ -149,18 +163,18 @@ run recovery ID without the final underscore and digit.
To evaluate an existing model on a test set, you can use models from previous runs in AzureML or from local checkpoints.

#### From a previus run in AzureML:
This is similar to continuing training using a run_recovery object, but you will need to set `--train` to `False`.
This is similar to continuing training using a run_recovery object, but you will need to set `--no-train`.
Thus your command should look like this:

```shell script
python Inner/ML/runner.py --azureml=True --model=Prostate --train=False --cluster=my_cluster_name \
python Inner/ML/runner.py --azureml --model=Prostate --no-train --cluster=my_cluster_name \
--run_recovery_id=foo_bar:foo_bar_12345_abcd --start_epoch=120
```
#### From a local checkpoint:
To evaluate a model using a local checkpoint, use the local_weights_path to specify the path to the model checkpoint
and set train to `False`.
```shell script
python Inner/ML/runner.py --model=Prostate --train=False --local_weights_path=path_to_your_checkpoint
python Inner/ML/runner.py --model=Prostate --no-train --local_weights_path=path_to_your_checkpoint
```

Alternatively, to submit an AzureML run to apply a model to a single image on your local disc,
Expand Down
2 changes: 1 addition & 1 deletion docs/debugging_and_monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ To quickly access this script from PyCharm, there is a template PyCharm run conf
`Template: Tensorboard monitoring` in the repository. Create a copy of that, and modify the commandline
arguments with your jobs to monitor.

* **New jobs**: when queuing a new AzureML job, pass `--tensorboard=True`, which will automatically start a new TensorBoard
* **New jobs**: when queuing a new AzureML job, pass `--tensorboard`, which will automatically start a new TensorBoard
session, monitoring the newly queued job.

### Resource Monitor
Expand Down
14 changes: 7 additions & 7 deletions docs/fastmri.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ There are 2 example models already coded up in the InnerEye toolbox, defined in
`BrainMulticoil`. As with all InnerEye models, you can start a training run by specifying the name of the class
that defines the model, like this:
```shell script
python InnerEye/ML/runner.py --model KneeMulticoil --azureml=True --num_nodes=4
python InnerEye/ML/runner.py --model KneeMulticoil --azureml --num_nodes=4
```
This will start an AzureML job with 4 nodes training at the same time. Depending on how you set up your compute
cluster, this will use a different number of GPUs: For example, if your cluster uses ND24 virtual machines, where
Expand All @@ -110,7 +110,7 @@ Note that the download times depend on the type of Azure storage account that yo
using Premium storage accounts for optimal performance.

You can avoid the time to download the dataset, by specifying that the data is always read on-the-fly from the network.
For that, just add the `--use_dataset_mount=True` flag to the commandline. This may impact training throughput if
For that, just add the `--use_dataset_mount` flag to the commandline. This may impact training throughput if
the storage account cannot provide the data quick enough - however, we have not observed a drop in GPU utilization even
when training on 8 nodes in parallel. For more details around dataset mounting please refer to the next section.

Expand All @@ -122,9 +122,9 @@ Downloading the dataset can - depending on the types of nodes - already make the

The InnerEye toolbox has a way of working around that problem, by reading the dataset on-the-fly from the network,
rather than downloading it at the start of the job. You can trigger this behaviour by supplying an additional
commandline argument `--use_dataset_mount=True`, for example:
commandline argument `--use_dataset_mount`, for example:
```shell script
python InnerEye/ML/runner.py --model BrainMulticoil --azureml=True --num_nodes=4 --use_dataset_mount=True
python InnerEye/ML/runner.py --model BrainMulticoil --azureml --num_nodes=4 --use_dataset_mount
```
With this flag, the InnerEye training script will start immediately, without downloading data beforehand.
However, the fastMRI data module generates a cache file before training, and to build that, it needs to traverse the
Expand All @@ -133,7 +133,7 @@ creating this cache file. This can be avoided by copying the cache file from a p
More specifically, you need to follow these steps:
* Start a training job, training for only 1 epoch, like
```shell script
python InnerEye/ML/runner.py --model BrainMulticoil --azureml=True --use_dataset_mount=True --num_epochs=1
python InnerEye/ML/runner.py --model BrainMulticoil --azureml --use_dataset_mount --num_epochs=1
```
* Wait until the job starts has finished creating the cache file - the job will print out a message
"Saving dataset cache to dataset_cache.pkl", visible in the log file `azureml-logs/70_driver_log.txt`, about 1-2 hours
Expand All @@ -147,7 +147,7 @@ folder that was previously created by the Azure Data Factory. You can do that vi
`brain_multicoil`. Once in that folder, press the "Upload" button at the top and select the `dataset_cache.pkl` file.
* Start the training job again, this time you can start multi-node training right away, like this:
```shell script
python InnerEye/ML/runner.py --model BrainMulticoil --azureml=True --use_dataset_mount=True --num_nodes=8. This new
python InnerEye/ML/runner.py --model BrainMulticoil --azureml --use_dataset_mount --num_nodes=8. This new
```
This job should pick up the existing cache file, and output a message like "Copying a pre-computed dataset cache
file ..."
Expand Down Expand Up @@ -177,7 +177,7 @@ azcopy --source-key <storage_account_key> --source https://<your_storage_acount>
Replace `brain_multicoil` with any of the other datasets names if needed.

If you follow these suggested folder structures, there is no further change necessary to the models. You can then
run, for example, the `BrainMulticoil` model by dropping the `--azureml=True` flag like this:
run, for example, the `BrainMulticoil` model by dropping the `--azureml` flag like this:
```shell script
python InnerEye/ML/runner.py --model BrainMulticoil
```
Expand Down
2 changes: 1 addition & 1 deletion docs/hello_world_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ We have created this file to demonstrate how to:
* Upload to datasets storage account for your AzureML workspace: `Tests/ML/test_data/dataset.csv` and
`Test/ML/test_data/train_and_test_data` and name the folder "hello_world"
* If you have set up AzureML then parameter search can be performed for this model by running:
`python InnerEye/ML/runner.py --model=HelloWorld --azureml=True --hyperdrive=True`
`python InnerEye/ML/runner.py --model=HelloWorld --azureml --hyperdrive`
2 changes: 1 addition & 1 deletion docs/innereye_as_submodule.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,6 @@ is found by the runner. Set `extra_code_directory` to `InnerEyeLocal`.
#### Start Training
Run the following to start a job on AzureML:
```
python myrunner.py --azureml=True --model=MyGlaucomaModel
python myrunner.py --azureml --model=MyGlaucomaModel
```
See [Model Training](building_models.md) for details on training outputs, resuming training, testing models and model ensembles.
5 changes: 3 additions & 2 deletions docs/sample_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ description below).

### Creating the model configuration and starting training


Next, you need to create a configuration file `InnerEye/ML/configs/MyGlaucoma.py`
which extends the GlaucomaPublic class like this:
```python
Expand All @@ -40,7 +41,7 @@ The value for `self.azure_dataset_id` should match the dataset upload location,

Once that config is in place, you can start training in AzureML via
```
python InnerEye/ML/runner.py --model=MyGlaucomaModel --azureml=True
python InnerEye/ML/runner.py --model=MyGlaucomaModel --azureml
```

As an alternative to working with a fork of the repository, you can use InnerEye-DeepLearning via a submodule.
Expand Down Expand Up @@ -91,7 +92,7 @@ as described for the Glaucoma model [here](innereye_as_submodule.md).

You can now run the following command to start a job on AzureML:
```
python InnerEye/ML/runner.py --azureml=True --model=MyLungModel
python InnerEye/ML/runner.py --azureml --model=MyLungModel
```
See [Model Training](building_models.md) for details on training outputs, resuming training, testing models and model ensembles.

Expand Down