This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
3 changes: 1 addition & 2 deletions .idea/vcs.xml

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -72,6 +72,8 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
- ([#615](https://github.com/microsoft/InnerEye-DeepLearning/pull/615)) Minor changes to checkpoint download from AzureML.
- ([#605](https://github.com/microsoft/InnerEye-DeepLearning/pull/605)) Make build jobs deterministic for regression testing.
- ([#633](https://github.com/microsoft/InnerEye-DeepLearning/pull/633)) Model training now writes a single recovery checkpoint, rather than multiple ones. The save frequency is controlled by `autosave_every_n_val_epochs`.
- ([#632](https://github.com/microsoft/InnerEye-DeepLearning/pull/632)) Nifti test data is no longer stored in Git LFS

### Fixed
@@ -125,6 +127,9 @@ in inference-only runs when using lightning containers.

### Deprecated

- ([#633](https://github.com/microsoft/InnerEye-DeepLearning/pull/633)) Model fields `recovery_checkpoint_save_interval` and `recovery_checkpoints_save_last_k` have been retired.
Recovery checkpoint handling is now controlled by `autosave_every_n_val_epochs`.
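  For configs that previously set the retired fields, migration is a one-line change. A minimal sketch (the field values are illustrative only, not defaults taken from this PR):

  ```python
  from InnerEye.ML.deep_learning_config import TrainerParams

  # Before (retired): recovery_checkpoint_save_interval=10, recovery_checkpoints_save_last_k=-1
  # After: a single "autosave" checkpoint, refreshed every N validation epochs
  params = TrainerParams(num_epochs=100, autosave_every_n_val_epochs=10)
  ```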


## 0.3 (2021-06-01)

49 changes: 8 additions & 41 deletions InnerEye/Azure/azure_util.py
@@ -8,12 +8,12 @@
from pathlib import Path
from typing import Generator, List, Optional, Tuple

from azureml.core import Experiment, Run, Workspace, get_run
from azureml.core import Experiment, Run, Workspace
from azureml.exceptions import UserErrorException

from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME
from health_azure.utils import create_run_recovery_id
from health_azure.utils import create_run_recovery_id, get_aml_run_from_run_id

DEFAULT_CROSS_VALIDATION_SPLIT_INDEX = -1
EXPERIMENT_RUN_SEPARATOR = ":"
@@ -79,40 +79,7 @@ def fetch_run(workspace: Workspace, run_recovery_id: str) -> Run:
or just the run_id
:return: The AzureML run.
"""
experiment, run = split_recovery_id(run_recovery_id)
try:
experiment_to_recover = Experiment(workspace, experiment)
except Exception as ex:
raise Exception(
f"Unable to retrieve run {run} in experiment {experiment}: {str(ex)}"
)
run_to_recover = fetch_run_for_experiment(experiment_to_recover, run)
logging.info(
"Fetched run #{} {} from experiment {}.".format(
run, run_to_recover.number, experiment
)
)
return run_to_recover


def fetch_run_for_experiment(experiment_to_recover: Experiment, run_id: str) -> Run:
"""
:param experiment_to_recover: an experiment
:param run_id: a string representing the Run ID of one of the runs of the experiment
:return: the run matching run_id_or_number; raises an exception if not found
"""
try:
return get_run(experiment=experiment_to_recover, run_id=run_id, rehydrate=True)
except Exception:
available_runs = experiment_to_recover.get_runs()
available_ids = ", ".join([run.id for run in available_runs])
raise (
Exception(
"Run {} not found for experiment: {}. Available runs are: {}".format(
run_id, experiment_to_recover.name, available_ids
)
)
)
return get_aml_run_from_run_id(aml_workspace=workspace, run_id=run_recovery_id)


def fetch_runs(experiment: Experiment, filters: List[str]) -> List[Run]:
@@ -133,9 +100,9 @@ def fetch_runs(experiment: Experiment, filters: List[str]) -> List[Run]:


def fetch_child_runs(
run: Run,
status: Optional[str] = None,
expected_number_cross_validation_splits: int = 0,
run: Run,
status: Optional[str] = None,
expected_number_cross_validation_splits: int = 0,
) -> List[Run]:
"""
Fetch child runs for the provided runs that have the provided AML status (or fetch all by default)
Expand Down Expand Up @@ -312,7 +279,7 @@ def download_run_output_file(blob_path: Path, destination: Path, run: Run) -> Pa


def download_run_outputs_by_prefix(
blobs_prefix: Path, destination: Path, run: Run
blobs_prefix: Path, destination: Path, run: Run
) -> None:
"""
Download all the blobs from the run's default output directory: DEFAULT_AML_UPLOAD_DIR ("outputs") that
@@ -354,7 +321,7 @@ def is_running_on_azure_agent() -> bool:


def get_comparison_baseline_paths(
outputs_folder: Path, blob_path: Path, run: Run, dataset_csv_file_name: str
outputs_folder: Path, blob_path: Path, run: Run, dataset_csv_file_name: str
) -> Tuple[Optional[Path], Optional[Path]]:
run_rec_id = run.id
# We usually find dataset.csv in the same directory as metrics.csv, but we sometimes
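With the delegation to hi-ml in place, callers of `fetch_run` are unchanged: the function accepts either the `experiment_name:run_id` recovery format or a bare run ID. A minimal usage sketch (the workspace setup and the recovery ID value are illustrative):

```python
from azureml.core import Workspace

from InnerEye.Azure.azure_util import fetch_run

# Assumes an AzureML config.json is present in the working directory
workspace = Workspace.from_config()
run = fetch_run(workspace, "my_experiment:my_experiment_1612345678_abcd1234")
print(run.id, run.status)
```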
14 changes: 9 additions & 5 deletions InnerEye/ML/common.py
@@ -13,11 +13,15 @@
DATASET_CSV_FILE_NAME = "dataset.csv"
CHECKPOINT_SUFFIX = ".ckpt"

RECOVERY_CHECKPOINT_FILE_NAME = "recovery"
RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX = RECOVERY_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX
# The file name for the legacy "recovery" checkpoint behaviour, which stored the most recent N checkpoints
LEGACY_RECOVERY_CHECKPOINT_FILE_NAME = "recovery"

BEST_CHECKPOINT_FILE_NAME = "best_checkpoint"
BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = BEST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX
# The file names for the new recovery checkpoint behaviour: a single fixed checkpoint that is written every N epochs.
# Lightning does not overwrite checkpoint files in place, and hence alternates between writing "autosave.ckpt"
# and "autosave-v1.ckpt".
AUTOSAVE_CHECKPOINT_FILE_NAME = "autosave"
AUTOSAVE_CHECKPOINT_CANDIDATES = [AUTOSAVE_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX,
AUTOSAVE_CHECKPOINT_FILE_NAME + "-v1" + CHECKPOINT_SUFFIX]

# This is a constant that must match a filename defined in pytorch_lightning.ModelCheckpoint, but we don't want
# to import that here.
@@ -84,4 +88,4 @@ def get_best_checkpoint_path(path: Path) -> Path:
Given the path to a checkpoint folder, returns the full path of the best checkpoint file, based on the fixed checkpoint file name.
:param path: The path to the checkpoint folder.
"""
return path / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
return path / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
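Because Lightning alternates between the two autosave file names, code that resumes training has to probe both candidates and pick the newer one. A minimal sketch of such a lookup (the helper name is illustrative, not part of this PR):

```python
from pathlib import Path
from typing import Optional

from InnerEye.ML.common import AUTOSAVE_CHECKPOINT_CANDIDATES

def find_recent_autosave_checkpoint(checkpoint_folder: Path) -> Optional[Path]:
    """Returns the most recently written autosave checkpoint in the folder, or None if there is none."""
    existing = [checkpoint_folder / f for f in AUTOSAVE_CHECKPOINT_CANDIDATES
                if (checkpoint_folder / f).is_file()]
    if not existing:
        return None
    # Lightning alternates between "autosave.ckpt" and "autosave-v1.ckpt", hence
    # the file with the latest modification time is the one to resume from.
    return max(existing, key=lambda p: p.stat().st_mtime)
```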
@@ -56,8 +56,6 @@ def __init__(self, **kwargs: Any) -> None:
# To mount the dataset instead of downloading in AML, pass --use_dataset_mount in the CLI
# declared in TrainerParams:
num_epochs=16,
recovery_checkpoint_save_interval=16,
recovery_checkpoints_save_last_k=-1,
# declared in WorkflowParams:
number_of_cross_validation_splits=5,
cross_validation_split_index=0,
@@ -41,8 +41,6 @@ def __init__(self, **kwargs: Any) -> None:
# To mount the dataset instead of downloading in AML, pass --use_dataset_mount in the CLI
# declared in TrainerParams:
num_epochs=200,
recovery_checkpoint_save_interval=10,
recovery_checkpoints_save_last_k=-1,
# use_mixed_precision = True,
# declared in WorkflowParams:
number_of_cross_validation_splits=5,
5 changes: 2 additions & 3 deletions InnerEye/ML/configs/segmentation/BasicModel2Epochs.py
@@ -7,7 +7,7 @@
import pandas as pd

from InnerEye.ML.config import PhotometricNormalizationMethod, SegmentationModelBase, equally_weighted_classes
from InnerEye.ML.configs.segmentation.Lung import AZURE_DATASET_ID
from InnerEye.ML.configs.segmentation.Lung import LUNG_AZURE_DATASET_ID
from InnerEye.ML.deep_learning_config import LRSchedulerType
from InnerEye.ML.utils.split_dataset import DatasetSplits

@@ -40,9 +40,8 @@ def __init__(self, **kwargs: Any) -> None:
num_dataload_workers=1,
train_batch_size=8,
num_epochs=2,
recovery_checkpoint_save_interval=1,
use_mixed_precision=True,
azure_dataset_id=AZURE_DATASET_ID,
azure_dataset_id=LUNG_AZURE_DATASET_ID,
comparison_blob_storage_paths=comparison_blob_storage_paths,
inference_on_val_set=True,
inference_on_test_set=True,
1 change: 0 additions & 1 deletion InnerEye/ML/configs/segmentation/GbmBase.py
@@ -50,7 +50,6 @@ def __init__(self, **kwargs: Any) -> None:
adam_betas=(0.9, 0.999),
momentum=0.9,
weight_decay=1e-4,
recovery_checkpoint_save_interval=10,
use_mixed_precision=True,
use_model_parallel=True,
)
1 change: 0 additions & 1 deletion InnerEye/ML/configs/segmentation/HeadAndNeckBase.py
@@ -86,7 +86,6 @@ def __init__(self,
super().__init__(
should_validate=False, # we'll validate after kwargs are added
num_epochs=num_epochs,
recovery_checkpoint_save_interval=10,
architecture="UNet3D",
kernel_size=3,
train_batch_size=1,
1 change: 0 additions & 1 deletion InnerEye/ML/configs/segmentation/HelloWorld.py
@@ -59,7 +59,6 @@ def __init__(self, **kwargs: Any) -> None:
num_dataload_workers=0,
train_batch_size=2,
num_epochs=2,
recovery_checkpoint_save_interval=1,
use_mixed_precision=True,

# Pre-processing - in this section we define how to normalize our inputs, in this case we are doing
5 changes: 2 additions & 3 deletions InnerEye/ML/configs/segmentation/Lung.py
@@ -13,7 +13,7 @@
from InnerEye.ML.utils.split_dataset import DatasetSplits

# Change this string to the name of your dataset on Azure blob storage.
AZURE_DATASET_ID = "2339eba2-8ec5-4ccb-86ff-c170470ac6e2_geonorm_with_train_test_split_2020_05_26"
LUNG_AZURE_DATASET_ID = "2339eba2-8ec5-4ccb-86ff-c170470ac6e2_geonorm_with_train_test_split_2020_05_26"


class Lung(SegmentationModelBase):
@@ -29,7 +29,7 @@ def __init__(self, **kwargs: Any) -> None:
architecture="UNet3D",
feature_channels=[32],
kernel_size=3,
azure_dataset_id=AZURE_DATASET_ID,
azure_dataset_id=LUNG_AZURE_DATASET_ID,
crop_size=(64, 224, 224),
test_crop_size=(128, 512, 512),
image_channels=["ct"],
@@ -56,7 +56,6 @@ def __init__(self, **kwargs: Any) -> None:
adam_betas=(0.9, 0.999),
momentum=0.9,
weight_decay=1e-4,
recovery_checkpoint_save_interval=10,
use_mixed_precision=True,
use_model_parallel=True,
monitoring_interval_seconds=0,
3 changes: 0 additions & 3 deletions InnerEye/ML/configs/ssl/CIFAR_SSL_configs.py
@@ -20,7 +20,6 @@ def __init__(self) -> None:
ssl_encoder=EncoderName.resnet50,
ssl_training_type=SSLTrainingType.SimCLR,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=2500,
num_workers=6)

@@ -38,7 +37,6 @@ def __init__(self) -> None:
ssl_encoder=EncoderName.resnet50,
ssl_training_type=SSLTrainingType.BYOL,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=2500,
num_workers=6)

@@ -55,6 +53,5 @@ def __init__(self) -> None:
ssl_encoder=EncoderName.resnet50,
ssl_training_type=SSLTrainingType.BYOL,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=2500,
num_workers=6)
1 change: 0 additions & 1 deletion InnerEye/ML/configs/ssl/CIFAR_classifier_configs.py
@@ -11,7 +11,6 @@ def __init__(self) -> None:
super().__init__(
linear_head_dataset_name=SSLDatasetName.CIFAR10,
random_seed=1,
recovery_checkpoint_save_interval=5,
num_epochs=100,
l_rate=1e-4,
num_workers=6)
3 changes: 0 additions & 3 deletions InnerEye/ML/configs/ssl/CXR_SSL_configs.py
@@ -27,7 +27,6 @@ def __init__(self) -> None:
linear_head_dataset_name=SSLDatasetName.RSNAKaggleCXR,
azure_dataset_id=NIH_AZURE_DATASET_ID,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=1000,
# We usually train this model with 16 GPUs, giving an effective batch size of 1200
ssl_training_batch_size=75,
@@ -44,7 +43,6 @@ def __init__(self) -> None:
linear_head_dataset_name=SSLDatasetName.RSNAKaggleCXR,
azure_dataset_id=NIH_AZURE_DATASET_ID,
random_seed=1,
recovery_checkpoint_save_interval=200,
num_epochs=1000,
# We usually train this model with 16 GPUs, giving an effective batch size of 1200
ssl_training_batch_size=75,
@@ -60,7 +58,6 @@ class CXRImageClassifier(SSLClassifierContainer):
def __init__(self) -> None:
super().__init__(linear_head_dataset_name=SSLDatasetName.RSNAKaggleCXR,
random_seed=1,
recovery_checkpoint_save_interval=10,
num_epochs=200,
use_balanced_binary_loss_for_linear_head=True,
azure_dataset_id=RSNA_AZURE_DATASET_ID,
2 changes: 0 additions & 2 deletions InnerEye/ML/configs/ssl/CovidContainers.py
@@ -20,8 +20,6 @@ def __init__(self,
super().__init__(ssl_training_dataset_name=SSLDatasetName.NIHCXR,
linear_head_dataset_name=SSLDatasetName.Covid,
random_seed=1,
recovery_checkpoint_save_interval=50,
recovery_checkpoints_save_last_k=3,
num_epochs=500,
ssl_training_batch_size=75, # This runs with 16 gpus (4 nodes)
num_workers=12,
4 changes: 2 additions & 2 deletions InnerEye/ML/configs/unit_testing/passthrough_model.py
@@ -12,7 +12,7 @@

from InnerEye.Common.type_annotations import TupleInt3
from InnerEye.ML.config import ModelArchitectureConfig, SegmentationModelBase, equally_weighted_classes
from InnerEye.ML.configs.segmentation.Lung import AZURE_DATASET_ID
from InnerEye.ML.configs.segmentation.Lung import LUNG_AZURE_DATASET_ID
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel
from InnerEye.ML.models.parallel.model_parallel import get_device_from_parameters, move_to_device
from InnerEye.ML.utils.model_metadata_util import generate_random_colours_list
@@ -34,7 +34,7 @@ def __init__(self, **kwargs: Any) -> None:
should_validate=False,
# Set as UNet3D only because this does not shrink patches in the forward pass.
architecture=ModelArchitectureConfig.UNet3D,
azure_dataset_id=AZURE_DATASET_ID,
azure_dataset_id=LUNG_AZURE_DATASET_ID,
crop_size=(64, 224, 224),
num_dataload_workers=1,
# Disable monitoring so that we can use VS Code remote debugging
15 changes: 5 additions & 10 deletions InnerEye/ML/deep_learning_config.py
@@ -475,7 +475,6 @@ def logs_folder(self) -> Path:
@property
def checkpoint_folder(self) -> Path:
"""Gets the full path in which the model checkpoints should be stored during training."""
print(f"Expected Checkpoint path {self.outputs_folder / CHECKPOINT_FOLDER}")
return self.outputs_folder / CHECKPOINT_FOLDER

@property
@@ -567,15 +566,11 @@ def min_l_rate(self, value: float) -> None:

class TrainerParams(param.Parameterized):
num_epochs: int = param.Integer(100, bounds=(1, None), doc="Number of epochs to train.")
recovery_checkpoint_save_interval: int = param.Integer(10, bounds=(0, None),
doc="Save epoch checkpoints when epoch number is a multiple "
"of recovery_checkpoint_save_interval. The intended use "
"is to allow restore training from failed runs.")
recovery_checkpoints_save_last_k: int = param.Integer(default=1, bounds=(-1, None),
doc="Number of recovery checkpoints to keep. Recovery "
"checkpoints will be stored as recovery_epoch:{"
"epoch}.ckpt. If set to -1 keep all recovery "
"checkpoints.")
autosave_every_n_val_epochs: int = param.Integer(1, bounds=(0, None),
doc="Save epoch checkpoints every N validation epochs. "
"If pl_check_val_every_n_epoch > 1, this means that "
"checkpoints are saved every N * pl_check_val_every_n_epoch "
"training epochs.")
detect_anomaly: bool = param.Boolean(False, doc="If true, test gradients for anomalies (NaN or Inf) during "
"training.")
use_mixed_precision: bool = param.Boolean(False, doc="If true, mixed precision training is activated during "
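As the doc string notes, the effective save frequency in training epochs is the product of the two settings. A worked sketch of the arithmetic (the `pl_check_val_every_n_epoch` value is assumed for illustration):

```python
from InnerEye.ML.deep_learning_config import TrainerParams

params = TrainerParams(num_epochs=100, autosave_every_n_val_epochs=3)

# With validation running every 2 training epochs (pl_check_val_every_n_epoch=2),
# the autosave checkpoint is refreshed every 3 validation epochs,
# i.e. every 2 * 3 = 6 training epochs.
pl_check_val_every_n_epoch = 2  # assumed value, for illustration only
print(pl_check_val_every_n_epoch * params.autosave_every_n_val_epochs)  # -> 6
```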