diff --git a/CHANGELOG.md b/CHANGELOG.md index 31e625db6..723f5d135 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ created. ### Fixed +- ([#482](https://github.com/microsoft/InnerEye-DeepLearning/pull/482)) Check bool parameter is either true or false. - ([#475](https://github.com/microsoft/InnerEye-DeepLearning/pull/475)) Bug in AML SDK meant that we could not train any large models anymore because data loaders ran out of memory. - ([#472](https://github.com/microsoft/InnerEye-DeepLearning/pull/472)) Correct model path for moving ensemble models. diff --git a/InnerEye/Common/generic_parsing.py b/InnerEye/Common/generic_parsing.py index 6eae3f20a..38f8eed7f 100644 --- a/InnerEye/Common/generic_parsing.py +++ b/InnerEye/Common/generic_parsing.py @@ -9,6 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Set, Type, Union import param +from param.parameterized import Parameter from InnerEye.Common.common_util import is_private_field_name from InnerEye.Common.type_annotations import T @@ -114,6 +115,21 @@ def add_args(cls: Type[GenericConfig], :param parser: Parser to add properties to. """ + def parse_bool(x: str) -> bool: + """ + Parse a string as a bool. Supported values are case insensitive and one of: + 'on', 't', 'true', 'y', 'yes', '1' for True + 'off', 'f', 'false', 'n', 'no', '0' for False. + :param x: string to test. + :return: Bool value if string valid, otherwise a ValueError is raised. + """ + sx = str(x).lower() + if sx in ('on', 't', 'true', 'y', 'yes', '1'): + return True + if sx in ('off', 'f', 'false', 'n', 'no', '0'): + return False + raise ValueError(f"Invalid value {x}, please supply one of True, true, false or False.") + def _get_basic_type(_p: param.Parameter) -> Union[type, Callable]: """ Given a parameter, get its basic Python type, e.g.: param.Boolean -> bool. @@ -122,7 +138,7 @@ def _get_basic_type(_p: param.Parameter) -> Union[type, Callable]: :return: Type """ if isinstance(_p, param.Boolean): - p_type: Union[type, Callable] = lambda x: (str(x).lower() == 'true') + p_type: Callable = parse_bool elif isinstance(_p, param.Integer): p_type = lambda x: _p.default if x == "" else int(x) elif isinstance(_p, param.Number): @@ -156,8 +172,38 @@ def list_or_dict(x: str) -> Union[Dict, List]: return p_type + def add_boolean_argument(parser: argparse.ArgumentParser, k: str, p: Parameter) -> None: + """ + Add a boolean argument. + If the parameter default is False then allow --flag (to set it True) and --flag=Bool as usual. + If the parameter default is True then allow --no-flag (to set it to False) and --flag=Bool as usual. + :param parser: parser to add a boolean argument to. + :param k: argument name. + :param p: boolean parameter. + """ + if not p.default: + # If the parameter default is False then use nargs="?" (argparse.OPTIONAL). + # This means that the argument is optional. + # If it is not supplied, i.e. in the --flag mode, use the "const" value, i.e. True. + # Otherwise, i.e. in the --flag=value mode, try to parse the argument as a bool. + parser.add_argument("--" + k, help=p.doc, type=parse_bool, default=False, + nargs=argparse.OPTIONAL, const=True) + else: + # If the parameter default is True then create an exclusive group of arguments. + # Either --flag=value as usual + # Or --no-flag to store False in the parameter k. + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument("--" + k, help=p.doc, type=parse_bool) + group.add_argument('--no-' + k, dest=k, action='store_false') + parser.set_defaults(**{k: p.default}) + for k, p in cls.get_overridable_parameters().items(): - parser.add_argument("--" + k, help=p.doc, type=_get_basic_type(p), default=p.default) + # param.Booleans need to be handled separately, they are more complicated because they have + # an optional argument. + if isinstance(p, param.Boolean): + add_boolean_argument(parser, k, p) + else: + parser.add_argument("--" + k, help=p.doc, type=_get_basic_type(p), default=p.default) return parser diff --git a/InnerEye/ML/configs/other/fastmri_varnet.py b/InnerEye/ML/configs/other/fastmri_varnet.py index f6ae74c09..9a77bbe40 100644 --- a/InnerEye/ML/configs/other/fastmri_varnet.py +++ b/InnerEye/ML/configs/other/fastmri_varnet.py @@ -123,7 +123,7 @@ def __init__(self) -> None: super().__init__() self.azure_dataset_id = "knee_multicoil" # If the Azure nodes run out of disk space when downloading the dataset, re-submit with the - # --use_dataset_mount=True flag. The dataset will be mounted to the fixed path given here. + # --use_dataset_mount flag. The dataset will be mounted to the fixed path given here. self.dataset_mountpoint = "/tmp/knee_multicoil" def get_data_module(self) -> LightningDataModule: @@ -144,7 +144,7 @@ def __init__(self) -> None: super().__init__() self.azure_dataset_id = "brain_multicoil" # If the Azure nodes run out of disk space when downloading the dataset, re-submit with the - # --use_dataset_mount=True flag. The dataset will be mounted to the fixed path given here. + # --use_dataset_mount flag. The dataset will be mounted to the fixed path given here. self.dataset_mountpoint = "/tmp/brain_multicoil" def get_data_module(self) -> LightningDataModule: diff --git a/Tests/Common/test_generic_parsing.py b/Tests/Common/test_generic_parsing.py index c9eb691d2..92f55ebe8 100644 --- a/Tests/Common/test_generic_parsing.py +++ b/Tests/Common/test_generic_parsing.py @@ -19,7 +19,8 @@ class ParamEnum(Enum): class ParamClass(GenericConfig): name: str = param.String(None, doc="Name") seed: int = param.Integer(42, doc="Seed") - flag: int = param.Boolean(False, doc="Flag") + flag: bool = param.Boolean(False, doc="Flag") + not_flag: bool = param.Boolean(True, doc="Not Flag") number: float = param.Number(3.14) integers: List[int] = param.List(None, class_=int) optional_int: Optional[int] = param.Integer(None, doc="Optional int") @@ -40,6 +41,7 @@ def test_overridable_parameter() -> None: param_dict = ParamClass.get_overridable_parameters() assert "name" in param_dict assert "flag" in param_dict + assert "not_flag" in param_dict assert "seed" in param_dict assert "number" in param_dict assert "integers" in param_dict @@ -62,6 +64,12 @@ def check(arg: List[str], expected_key: str, expected_value: Any) -> None: parsed = ParamClass.parse_args(arg) assert getattr(parsed, expected_key) == expected_value + def check_fails(arg: List[str]) -> None: + with pytest.raises(SystemExit) as e: + ParamClass.parse_args(arg) + assert e.type == SystemExit + assert e.value.code == 2 + check(["--name=foo"], "name", "foo") check(["--seed", "42"], "seed", 42) check(["--seed", ""], "seed", 42) @@ -77,10 +85,31 @@ def check(arg: List[str], expected_key: str, expected_value: Any) -> None: check(["--enum=2"], "enum", ParamEnum.EnumValue2) check(["--floats=1,2,3.14"], "floats", [1., 2., 3.14]) check(["--integers=1,2,3"], "integers", [1, 2, 3]) - check(["--flag=true"], "flag", True) - check(["--flag=True"], "flag", True) - check(["--flag=false"], "flag", False) - check(["--flag=False"], "flag", False) + # Check all the ways of passing in True, with and without the first letter capitialized + for flag in ('on', 't', 'true', 'y', 'yes', '1'): + check([f"--flag={flag}"], "flag", True) + check([f"--flag={flag.capitalize()}"], "flag", True) + check([f"--not_flag={flag}"], "not_flag", True) + check([f"--not_flag={flag.capitalize()}"], "not_flag", True) + # Check all the ways of passing in False, with and without the first letter capitialized + for flag in ('off', 'f', 'false', 'n', 'no', '0'): + check([f"--flag={flag}"], "flag", False) + check([f"--flag={flag.capitalize()}"], "flag", False) + check([f"--not_flag={flag}"], "not_flag", False) + check([f"--not_flag={flag.capitalize()}"], "not_flag", False) + # Check that passing no value to flag sets it to True (the opposite of its default) + check(["--flag"], "flag", True) + # Check that no-flag is not an option + check_fails(["--no-flag"]) + # Check that passing no value to not_flag fails + check_fails(["--not_flag"]) + # Check that --no-not_flag is an option and sets it to False (the opposite of its default) + check(["--no-not_flag"], "not_flag", False) + # Check that both not_flag and no-not_flag cannot be passed at the same time + check_fails(["--not_flag=false", "--no-not_flag"]) + # Check that invalid bools are caught + check_fails(["--flag=Falsf"]) + check_fails(["--flag=Truf"]) # Check that default values are created as expected, and that the non-overridable parameters # are omitted. defaults = vars(ParamClass.create_argparser().parse_args([])) @@ -88,6 +117,8 @@ def check(arg: List[str], expected_key: str, expected_value: Any) -> None: assert defaults["tuple1"] == (1, 2.3) assert defaults["int_tuple"] == (1, 1, 1) assert defaults["enum"] == ParamEnum.EnumValue1 + assert not defaults["flag"] + assert defaults["not_flag"] assert "readonly" not in defaults assert "constant" not in defaults assert "_non_override" not in defaults diff --git a/docs/bring_your_own_model.md b/docs/bring_your_own_model.md index 3b857690e..a3c9455f6 100644 --- a/docs/bring_your_own_model.md +++ b/docs/bring_your_own_model.md @@ -13,7 +13,7 @@ This can be used by be used for training and testing. - Adding essential trainer parameters like number of epochs to that container. - Invoking the InnerEye runner and providing the name of the container class, like this: -`python InnerEye/ML/runner.py --model=MyContainer`. To train in AzureML, just add a `--azureml=True` flag. +`python InnerEye/ML/runner.py --model=MyContainer`. To train in AzureML, just add a `--azureml` flag. There is a fully working example [HelloContainer](../InnerEye/ML/configs/other/HelloContainer.py), that implements a simple 1-dimensional regression model from data stored in a CSV file. You can run that diff --git a/docs/building_models.md b/docs/building_models.md index f3ebbd283..3e2a60796 100755 --- a/docs/building_models.md +++ b/docs/building_models.md @@ -75,7 +75,7 @@ The InnerEye runner will check if there is a dataset in the AzureML workspace al * Train a new model, for example `Prostate`: ```shell script -python InnerEyeLocal/ML/runner.py --azureml=True --model=Prostate --train=True +python InnerEyeLocal/ML/runner.py --azureml --model=Prostate ``` Alternatively, you can train the model on your current machine if it is powerful enough. In @@ -83,12 +83,26 @@ this case, you would simply omit the `azureml` flag, and instead of specifying `azure_dataset_id` in the class constructor, you can instead use `local_dataset="my/data/folder"`, where the folder `my/data/folder` contains a `dataset.csv` file and all the files that are referenced therein. +### Boolean Options + +Note that for command line options that take a boolean argument, and that are `False` by default, there are multiple ways of setting the option. For example alternatives to `--azureml` include: +* `--azureml=True`, or `--azureml=true`, or `--azureml=T`, or `--azureml=t` +* `--azureml=Yes`, or `--azureml=yes`, or `--azureml=Y`, or `--azureml=y` +* `--azureml=On`, or `--azureml=on` +* `--azureml=1` + +Conversely, for command line options that take a boolean argument, and that are `True` by default, there are multiple ways of un-setting the option. For example alternatives to `--no-train` include: +* `--train=False`, or `--train=false`, or `--train=F`, or `--train=f` +* `--train=No`, or `--train=no`, or `--train=N`, or `--train=n` +* `--train=Off`, or `--train=off` +* `--train=0` + ### Training using multiple machines To speed up training in AzureML, you can use multiple machines, by specifying the additional `--num_nodes` argument. For example, to use 2 machines to train, specify: ```shell script -python InnerEyeLocal/ML/runner.py --azureml=True --model=Prostate --num_nodes=2 +python InnerEyeLocal/ML/runner.py --azureml --model=Prostate --num_nodes=2 ``` On each of the 2 machines, all available GPUs will be used. Model inference will always use only one machine. @@ -149,18 +163,18 @@ run recovery ID without the final underscore and digit. To evaluate an existing model on a test set, you can use models from previous runs in AzureML or from local checkpoints. #### From a previus run in AzureML: -This is similar to continuing training using a run_recovery object, but you will need to set `--train` to `False`. +This is similar to continuing training using a run_recovery object, but you will need to set `--no-train`. Thus your command should look like this: ```shell script -python Inner/ML/runner.py --azureml=True --model=Prostate --train=False --cluster=my_cluster_name \ +python Inner/ML/runner.py --azureml --model=Prostate --no-train --cluster=my_cluster_name \ --run_recovery_id=foo_bar:foo_bar_12345_abcd --start_epoch=120 ``` #### From a local checkpoint: To evaluate a model using a local checkpoint, use the local_weights_path to specify the path to the model checkpoint and set train to `False`. ```shell script -python Inner/ML/runner.py --model=Prostate --train=False --local_weights_path=path_to_your_checkpoint +python Inner/ML/runner.py --model=Prostate --no-train --local_weights_path=path_to_your_checkpoint ``` Alternatively, to submit an AzureML run to apply a model to a single image on your local disc, diff --git a/docs/debugging_and_monitoring.md b/docs/debugging_and_monitoring.md index 427b33211..aa8bef338 100644 --- a/docs/debugging_and_monitoring.md +++ b/docs/debugging_and_monitoring.md @@ -12,7 +12,7 @@ To quickly access this script from PyCharm, there is a template PyCharm run conf `Template: Tensorboard monitoring` in the repository. Create a copy of that, and modify the commandline arguments with your jobs to monitor. -* **New jobs**: when queuing a new AzureML job, pass `--tensorboard=True`, which will automatically start a new TensorBoard +* **New jobs**: when queuing a new AzureML job, pass `--tensorboard`, which will automatically start a new TensorBoard session, monitoring the newly queued job. ### Resource Monitor diff --git a/docs/fastmri.md b/docs/fastmri.md index 5b4a83d81..375a02d7f 100644 --- a/docs/fastmri.md +++ b/docs/fastmri.md @@ -88,7 +88,7 @@ There are 2 example models already coded up in the InnerEye toolbox, defined in `BrainMulticoil`. As with all InnerEye models, you can start a training run by specifying the name of the class that defines the model, like this: ```shell script -python InnerEye/ML/runner.py --model KneeMulticoil --azureml=True --num_nodes=4 +python InnerEye/ML/runner.py --model KneeMulticoil --azureml --num_nodes=4 ``` This will start an AzureML job with 4 nodes training at the same time. Depending on how you set up your compute cluster, this will use a different number of GPUs: For example, if your cluster uses ND24 virtual machines, where @@ -110,7 +110,7 @@ Note that the download times depend on the type of Azure storage account that yo using Premium storage accounts for optimal performance. You can avoid the time to download the dataset, by specifying that the data is always read on-the-fly from the network. -For that, just add the `--use_dataset_mount=True` flag to the commandline. This may impact training throughput if +For that, just add the `--use_dataset_mount` flag to the commandline. This may impact training throughput if the storage account cannot provide the data quick enough - however, we have not observed a drop in GPU utilization even when training on 8 nodes in parallel. For more details around dataset mounting please refer to the next section. @@ -122,9 +122,9 @@ Downloading the dataset can - depending on the types of nodes - already make the The InnerEye toolbox has a way of working around that problem, by reading the dataset on-the-fly from the network, rather than downloading it at the start of the job. You can trigger this behaviour by supplying an additional -commandline argument `--use_dataset_mount=True`, for example: +commandline argument `--use_dataset_mount`, for example: ```shell script -python InnerEye/ML/runner.py --model BrainMulticoil --azureml=True --num_nodes=4 --use_dataset_mount=True +python InnerEye/ML/runner.py --model BrainMulticoil --azureml --num_nodes=4 --use_dataset_mount ``` With this flag, the InnerEye training script will start immediately, without downloading data beforehand. However, the fastMRI data module generates a cache file before training, and to build that, it needs to traverse the @@ -133,7 +133,7 @@ creating this cache file. This can be avoided by copying the cache file from a p More specifically, you need to follow these steps: * Start a training job, training for only 1 epoch, like ```shell script -python InnerEye/ML/runner.py --model BrainMulticoil --azureml=True --use_dataset_mount=True --num_epochs=1 +python InnerEye/ML/runner.py --model BrainMulticoil --azureml --use_dataset_mount --num_epochs=1 ``` * Wait until the job starts has finished creating the cache file - the job will print out a message "Saving dataset cache to dataset_cache.pkl", visible in the log file `azureml-logs/70_driver_log.txt`, about 1-2 hours @@ -147,7 +147,7 @@ folder that was previously created by the Azure Data Factory. You can do that vi `brain_multicoil`. Once in that folder, press the "Upload" button at the top and select the `dataset_cache.pkl` file. * Start the training job again, this time you can start multi-node training right away, like this: ```shell script -python InnerEye/ML/runner.py --model BrainMulticoil --azureml=True --use_dataset_mount=True --num_nodes=8. This new +python InnerEye/ML/runner.py --model BrainMulticoil --azureml --use_dataset_mount --num_nodes=8. This new ``` This job should pick up the existing cache file, and output a message like "Copying a pre-computed dataset cache file ..." @@ -177,7 +177,7 @@ azcopy --source-key --source https:// Replace `brain_multicoil` with any of the other datasets names if needed. If you follow these suggested folder structures, there is no further change necessary to the models. You can then -run, for example, the `BrainMulticoil` model by dropping the `--azureml=True` flag like this: +run, for example, the `BrainMulticoil` model by dropping the `--azureml` flag like this: ```shell script python InnerEye/ML/runner.py --model BrainMulticoil ``` diff --git a/docs/hello_world_model.md b/docs/hello_world_model.md index 69ac0d561..6b5c873cc 100644 --- a/docs/hello_world_model.md +++ b/docs/hello_world_model.md @@ -12,4 +12,4 @@ We have created this file to demonstrate how to: * Upload to datasets storage account for your AzureML workspace: `Tests/ML/test_data/dataset.csv` and `Test/ML/test_data/train_and_test_data` and name the folder "hello_world" * If you have set up AzureML then parameter search can be performed for this model by running: - `python InnerEye/ML/runner.py --model=HelloWorld --azureml=True --hyperdrive=True` + `python InnerEye/ML/runner.py --model=HelloWorld --azureml --hyperdrive` diff --git a/docs/innereye_as_submodule.md b/docs/innereye_as_submodule.md index 9b5c247a5..b1a7469a8 100644 --- a/docs/innereye_as_submodule.md +++ b/docs/innereye_as_submodule.md @@ -90,6 +90,6 @@ is found by the runner. Set `extra_code_directory` to `InnerEyeLocal`. #### Start Training Run the following to start a job on AzureML: ``` -python myrunner.py --azureml=True --model=MyGlaucomaModel +python myrunner.py --azureml --model=MyGlaucomaModel ``` See [Model Training](building_models.md) for details on training outputs, resuming training, testing models and model ensembles. diff --git a/docs/sample_tasks.md b/docs/sample_tasks.md index 57b7c69b2..665403086 100644 --- a/docs/sample_tasks.md +++ b/docs/sample_tasks.md @@ -26,6 +26,7 @@ description below). ### Creating the model configuration and starting training + Next, you need to create a configuration file `InnerEye/ML/configs/MyGlaucoma.py` which extends the GlaucomaPublic class like this: ```python @@ -40,7 +41,7 @@ The value for `self.azure_dataset_id` should match the dataset upload location, Once that config is in place, you can start training in AzureML via ``` -python InnerEye/ML/runner.py --model=MyGlaucomaModel --azureml=True +python InnerEye/ML/runner.py --model=MyGlaucomaModel --azureml ``` As an alternative to working with a fork of the repository, you can use InnerEye-DeepLearning via a submodule. @@ -91,7 +92,7 @@ as described for the Glaucoma model [here](innereye_as_submodule.md). You can now run the following command to start a job on AzureML: ``` -python InnerEye/ML/runner.py --azureml=True --model=MyLungModel +python InnerEye/ML/runner.py --azureml --model=MyLungModel ``` See [Model Training](building_models.md) for details on training outputs, resuming training, testing models and model ensembles.