diff --git a/.dockerignore b/.dockerignore index 75b93558..1ec7bacc 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,3 +6,5 @@ **/.terraform **/node_modules **/.terraform +**/docs/_build +**/htmlcov \ No newline at end of file diff --git a/datasets/nasa-nex-gddp-cmip6-netcdf/Dockerfile b/datasets/nasa-nex-gddp-cmip6-netcdf/Dockerfile new file mode 100644 index 00000000..da67d9c3 --- /dev/null +++ b/datasets/nasa-nex-gddp-cmip6-netcdf/Dockerfile @@ -0,0 +1,74 @@ +FROM ubuntu:20.04 + +# Setup timezone info +ENV TZ=UTC + +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +RUN apt-get update && apt-get install -y software-properties-common + +RUN add-apt-repository ppa:ubuntugis/ppa && \ + apt-get update && \ + apt-get install -y build-essential python3-dev python3-pip \ + jq unzip ca-certificates wget curl git && \ + apt-get autoremove && apt-get autoclean && apt-get clean + +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10 + +# See https://github.com/mapbox/rasterio/issues/1289 +ENV CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + +# Install Python 3.10 +RUN curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" \ + && bash "Mambaforge-$(uname)-$(uname -m).sh" -b -p /opt/conda \ + && rm -rf "Mambaforge-$(uname)-$(uname -m).sh" + +ENV PATH /opt/conda/bin:$PATH +ENV LD_LIBRARY_PATH /opt/conda/lib/:$LD_LIBRARY_PATH + +RUN mamba install -y -c conda-forge python=3.10 gdal=3.3.3 pip setuptools cython numpy==1.21.5 + +RUN python -m pip install --upgrade pip + +# Install common packages +COPY requirements-task-base.txt /tmp/requirements.txt +RUN python -m pip install --no-build-isolation -r /tmp/requirements.txt + +# +# Copy and install packages +# + +COPY pctasks/core /opt/src/pctasks/core +RUN cd /opt/src/pctasks/core && \ + pip install . 
+ +COPY pctasks/cli /opt/src/pctasks/cli +RUN cd /opt/src/pctasks/cli && \ + pip install . + +COPY pctasks/task /opt/src/pctasks/task +RUN cd /opt/src/pctasks/task && \ + pip install . + +COPY pctasks/client /opt/src/pctasks/client +RUN cd /opt/src/pctasks/client && \ + pip install . + +COPY pctasks/ingest /opt/src/pctasks/ingest +RUN cd /opt/src/pctasks/ingest && \ + pip install . + +COPY pctasks/dataset /opt/src/pctasks/dataset +RUN cd /opt/src/pctasks/dataset && \ + pip install . + +COPY ./datasets/nasa-nex-gddp-cmip6-netcdf/requirements.txt /opt/src/datasets/nasa-nex-gddp-cmip6-netcdf/requirements.txt +RUN python3 -m pip install -r /opt/src/datasets/nasa-nex-gddp-cmip6-netcdf/requirements.txt + +# Setup Python Path to allow import of test modules +ENV PYTHONPATH=/opt/src:$PYTHONPATH + +WORKDIR /opt/src diff --git a/datasets/nasa-nex-gddp-cmip6-netcdf/README.md b/datasets/nasa-nex-gddp-cmip6-netcdf/README.md new file mode 100644 index 00000000..7bddc126 --- /dev/null +++ b/datasets/nasa-nex-gddp-cmip6-netcdf/README.md @@ -0,0 +1,44 @@ +# planetary-computer-tasks dataset: nasa-nex-gddp-cmip6-netcdf + +NASA NEX GDDP CMIP6 Dataset + +## Building the Docker image + +To build and push a custom docker image to our container registry: + +```shell +az acr build -r {the registry} --subscription {the subscription} -t pctasks-nasa-nex-gddp-cmip6-netcdf:latest -t pctasks-nasa-nex-gddp-cmip6-netcdf:{date}.{count} -f datasets/nasa-nex-gddp-cmip6-netcdf/Dockerfile . +``` + +## Version Information + +The upstream provider will occasionally update certain assets in the dataset +(e.g. the `pr` variable will be updated for some models). We want to host just +the latest version of each asset. + +The code in `nasa_nex_gddp_cmip6.py` will list files under a prefix and discover +the latest version of each asset. These files are read and passed into the STAC +item creation method. + +## Static update + +This collection is not regularly updated. 
+ +```console +$ pctasks dataset process-items \ + -d datasets/nasa-nex-gddp-cmip6-netcdf/dataset.yaml \ + nasa-nex-gddp-cmip-test \ + --arg registry pccomponents.azurecr.io \ + --upsert --submit +``` + +## Kerchunk Index Files + +We have "experimental" Kerchunk index files. We include a +[kerchunk-workflow](./kerchunk-workflow.yaml) for generating these files. + + +**Notes:** + +- Currently uses a chunk size of one, because the item creation was timing out with a chunk size of 100. However, we haven't investigated a middle ground. +- Runs in about 10 hours. \ No newline at end of file diff --git a/datasets/nasa-nex-gddp-cmip6-netcdf/collection/collection.json b/datasets/nasa-nex-gddp-cmip6-netcdf/collection/collection.json new file mode 100644 index 00000000..b8f4fc34 --- /dev/null +++ b/datasets/nasa-nex-gddp-cmip6-netcdf/collection/collection.json @@ -0,0 +1,771 @@ +{ + "type": "Collection", + "id": "nasa-nex-gddp-cmip6", + "links": [ + { + "rel": "items", + "type": "application/geo+json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/nasa-nex-gddp-cmip6/items" + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/" + }, + { + "rel": "root", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/" + }, + { + "rel": "self", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/nasa-nex-gddp-cmip6" + }, + { + "rel": "license", + "href": "https://pcmdi.llnl.gov/CMIP6/TermsOfUse/TermsOfUse6-1.html", + "type": "text/html", + "title": "CMIP6 Terms of Use" + }, + { + "rel": "documentation", + "href": "https://www.nccs.nasa.gov/sites/default/files/NEX-GDDP-CMIP6-Tech_Note.pdf", + "type": "application/pdf" + }, + { + "rel": "describedby", + "href": "https://planetarycomputer.microsoft.com/dataset/nasa-nex-gddp-cmip6", + "title": "Human readable dataset overview and reference", + "type": "text/html" +
} + ], + "stac_extensions": [ + "https://stac-extensions.github.io/datacube/v2.0.0/schema.json", + "https://stac-extensions.github.io/scientific/v1.0.0/schema.json", + "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json" + ], + "msft:storage_account": "nasagddp", + "msft:container": "nex-gddp-cmip6", + "msft:short_description": [ + "Global downscaled climate scenarios derived from the General Circulation Model conducted under CMIP6." + ], + "cube:dimensions": { + "time": { + "extent": [ + "1950-01-01T12:00:00Z", + "2100-12-31T00:00:00Z" + ], + "description": "time", + "step": "P1DT0H0M0S", + "type": "temporal" + }, + "lon": { + "axis": "x", + "extent": [ + 0.125, + 359.875 + ], + "step": 0.25, + "description": "longitude", + "reference_system": 4326, + "type": "spatial" + }, + "lat": { + "axis": "y", + "extent": [ + -59.875, + 89.875 + ], + "step": 0.25, + "description": "latitude", + "reference_system": 4326, + "type": "spatial" + } + }, + "cube:variables": { + "hurs": { + "type": "data", + "description": "Near-Surface Relative Humidity", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "%", + "attrs": { + "standard_name": "relative_humidity", + "long_name": "Near-Surface Relative Humidity", + "units": "%", + "comment": "The relative humidity with respect to liquid water for T> 0 C, and with respect to ice for T<0 C.", + "cell_methods": "area: time: mean", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "huss": { + "type": "data", + "description": "Near-Surface Specific Humidity", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "1", + "attrs": { + "standard_name": "specific_humidity", + "long_name": "Near-Surface Specific Humidity", + "units": "1", + "comment": "Near-surface (usually, 2 meter) specific humidity.", + "cell_methods": "area: time: mean", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "pr": { + "type": "data", + "description": 
"Precipitation", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "kg m-2 s-1", + "attrs": { + "standard_name": "precipitation_flux", + "long_name": "Precipitation", + "units": "kg m-2 s-1", + "comment": "includes both liquid and solid phases", + "cell_methods": "area: time: mean", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "rlds": { + "type": "data", + "description": "Surface Downwelling Longwave Radiation", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "W m-2", + "attrs": { + "standard_name": "surface_downwelling_longwave_flux_in_air", + "long_name": "Surface Downwelling Longwave Radiation", + "units": "W m-2", + "comment": "The surface called 'surface' means the lower boundary of the atmosphere. 'longwave' means longwave radiation. Downwelling radiation is radiation from above. It does not mean 'net downward'. When thought of as being incident on a surface, a radiative flux is sometimes called 'irradiance'. In addition, it is identical with the quantity measured by a cosine-collector light-meter and sometimes called 'vector irradiance'. 
In accordance with common usage in geophysical disciplines, 'flux' implies per unit area, called 'flux density' in physics.", + "cell_methods": "area: time: mean", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "rsds": { + "type": "data", + "description": "Surface Downwelling Shortwave Radiation", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "W m-2", + "attrs": { + "standard_name": "surface_downwelling_shortwave_flux_in_air", + "long_name": "Surface Downwelling Shortwave Radiation", + "units": "W m-2", + "comment": "Surface solar irradiance for UV calculations.", + "cell_methods": "area: time: mean", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "sfcWind": { + "type": "data", + "description": "Daily-Mean Near-Surface Wind Speed", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "m s-1", + "attrs": { + "standard_name": "wind_speed", + "long_name": "Daily-Mean Near-Surface Wind Speed", + "units": "m s-1", + "comment": "near-surface (usually, 10 meters) wind speed.", + "cell_methods": "area: time: mean", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "tas": { + "type": "data", + "description": "Daily Near-Surface Air Temperature", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "K", + "attrs": { + "cell_measures": "area: areacella", + "cell_methods": "area: mean time: maximum", + "comment": "near-surface (usually, 2 meter) air temperature; derived from downscaled tasmax & tasmin", + "units": "K", + "long_name": "Daily Near-Surface Air Temperature", + "standard_name": "air_temperature" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "tasmax": { + "type": "data", + "description": "Daily Maximum Near-Surface Air Temperature", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "K", + "attrs": { + "standard_name": "air_temperature", + "long_name": "Daily Maximum Near-Surface Air Temperature", + "units": 
"K", + "comment": "maximum near-surface (usually, 2 meter) air temperature (add cell_method attribute 'time: max')", + "cell_methods": "area: mean time: maximum", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + }, + "tasmin": { + "type": "data", + "description": "Daily Minimum Near-Surface Air Temperature", + "dimensions": [ + "time", + "lat", + "lon" + ], + "unit": "K", + "attrs": { + "standard_name": "air_temperature", + "long_name": "Daily Minimum Near-Surface Air Temperature", + "units": "K", + "comment": "minimum near-surface (usually, 2 meter) air temperature (add cell_method attribute 'time: min')", + "cell_methods": "area: mean time: minimum", + "cell_measures": "area: areacella" + }, + "shape": [ + 365, + 600, + 1440 + ] + } + }, + "sci:citation": "Climate scenarios used were from the NEX-GDDP-CMIP6 dataset, prepared by the Climate Analytics Group and NASA Ames Research Center using the NASA Earth Exchange, and distributed by the NASA Center for Climate Simulation (NCCS).", + "item_assets": { + "hurs": { + "description": "Near-Surface Relative Humidity", + "title": "Near-Surface Relative Humidity", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "huss": { + "description": "Near-Surface Specific Humidity", + "title": "Near-Surface Specific Humidity", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "pr": { + "description": "Precipitation", + "title": "Precipitation", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "rlds": { + "description": "Surface Downwelling Longwave Radiation", + "title": "Surface Downwelling Longwave Radiation", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "rsds": { + "description": "Surface Downwelling Shortwave Radiation", + "title": "Surface Downwelling Shortwave Radiation", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "sfcWind": { + "description": "Daily-Mean Near-Surface Wind Speed", + "title": "Daily-Mean 
Near-Surface Wind Speed", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "tas": { + "description": "Daily Near-Surface Air Temperature", + "title": "Daily Near-Surface Air Temperature", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "tasmax": { + "description": "Daily Maximum Near-Surface Air Temperature", + "title": "Daily Maximum Near-Surface Air Temperature", + "type": "application/netcdf", + "roles": [ + "data" + ] + }, + "tasmin": { + "description": "Daily Minimum Near-Surface Air Temperature", + "title": "Daily Minimum Near-Surface Air Temperature", + "type": "application/netcdf", + "roles": [ + "data" + ] + } + }, + "title": "Earth Exchange Global Daily Downscaled Projections (NEX-GDDP-CMIP6)", + "extent": { + "spatial": { + "bbox": [ + [ + -180, + -90, + 180, + 90 + ] + ] + }, + "temporal": { + "interval": [ + [ + "1950-01-01T00:00:00Z", + "2100-12-31T00:00:00Z" + ] + ] + } + }, + "license": "proprietary", + "keywords": [ + "CMIP6", + "NASA", + "Climate", + "Humidity", + "Precipitation", + "Temperature" + ], + "providers": [ + { + "name": "NASA NEX", + "roles": [ + "producer" + ], + "url": "https://www.nasa.gov/nex" + }, + { + "name": "Microsoft", + "roles": [ + "host", + "processor" + ], + "url": "https://planetarycomputer.microsoft.com/" + } + ], + "summaries": { + "cmip6:model": [ + "ACCESS-CM2", + "ACCESS-ESM1-5", + "BCC-CSM2-MR", + "CESM2", + "CESM2-WACCM", + "CMCC-CM2-SR5", + "CMCC-ESM2", + "CNRM-CM6-1", + "CNRM-ESM2-1", + "CanESM5", + "EC-Earth3", + "EC-Earth3-Veg-LR", + "FGOALS-g3", + "GFDL-CM4", + "GFDL-CM4_gr2", + "GFDL-ESM4", + "GISS-E2-1-G", + "HadGEM3-GC31-LL", + "HadGEM3-GC31-MM", + "IITM-ESM", + "INM-CM4-8", + "INM-CM5-0", + "IPSL-CM6A-LR", + "KACE-1-0-G", + "KIOST-ESM", + "MIROC-ES2L", + "MIROC6", + "MPI-ESM1-2-HR", + "MPI-ESM1-2-LR", + "MRI-ESM2-0", + "NESM3", + "NorESM2-LM", + "NorESM2-MM", + "TaiESM1", + "UKESM1-0-LL" + ], + "cmip6:variable": [ + "hurs", + "huss", + "pr", + "rlds", + "rsds", + 
"sfcWind", + "tas", + "tasmax", + "tasmin" + ], + "cmip6:scenario": [ + "historical", + "ssp245", + "ssp585" + ] + }, + "assets": { + "thumbnail": { + "href": "https://ai4edatasetspublicassets.blob.core.windows.net/assets/pc_thumbnails/nasa-nex-gddp-thumbnail.png", + "type": "image/png", + "title": "thumbnail", + "roles": [ + "thumbnail" + ] + }, + "ACCESS-CM2.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/ACCESS-CM2_historical.json", + "type": "application/json", + "title": "ACCESS-CM2-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "ACCESS-CM2", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "CESM2-WACCM.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/CESM2-WACCM_historical.json", + "type": "application/json", + "title": "CESM2-WACCM-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "CESM2-WACCM", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "CESM2.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/CESM2_historical.json", + "type": "application/json", + "title": "CESM2-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "CESM2", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "CMCC-CM2-SR5.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/CMCC-CM2-SR5_historical.json", + "type": "application/json", + "title": "CMCC-CM2-SR5-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + 
"cmip6:model": "CMCC-CM2-SR5", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "CMCC-ESM2.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/CMCC-ESM2_historical.json", + "type": "application/json", + "title": "CMCC-ESM2-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "CMCC-ESM2", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "IPSL-CM6A-LR.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/IPSL-CM6A-LR_historical.json", + "type": "application/json", + "title": "IPSL-CM6A-LR-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "IPSL-CM6A-LR", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "KACE-1-0-G.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/KACE-1-0-G_historical.json", + "type": "application/json", + "title": "KACE-1-0-G-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "KACE-1-0-G", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "KIOST-ESM.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/KIOST-ESM_historical.json", + "type": "application/json", + "title": "KIOST-ESM-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "KIOST-ESM", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "MIROC-ES2L.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/MIROC-ES2L_historical.json", + "type": 
"application/json", + "title": "MIROC-ES2L-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "MIROC-ES2L", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "MIROC6.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/MIROC6_historical.json", + "type": "application/json", + "title": "MIROC6-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "MIROC6", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "MPI-ESM1-2-HR.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/MPI-ESM1-2-HR_historical.json", + "type": "application/json", + "title": "MPI-ESM1-2-HR-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "MPI-ESM1-2-HR", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "MPI-ESM1-2-LR.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/MPI-ESM1-2-LR_historical.json", + "type": "application/json", + "title": "MPI-ESM1-2-LR-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "MPI-ESM1-2-LR", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "MRI-ESM2-0.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/MRI-ESM2-0_historical.json", + "type": "application/json", + "title": "MRI-ESM2-0-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "MRI-ESM2-0", + 
"cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "NorESM2-LM.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/NorESM2-LM_historical.json", + "type": "application/json", + "title": "NorESM2-LM-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "NorESM2-LM", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "NorESM2-MM.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/NorESM2-MM_historical.json", + "type": "application/json", + "title": "NorESM2-MM-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "NorESM2-MM", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "TaiESM1.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/TaiESM1_historical.json", + "type": "application/json", + "title": "TaiESM1-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "TaiESM1", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + }, + "UKESM1-0-LL.historical": { + "href": "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6-references/UKESM1-0-LL_historical.json", + "type": "application/json", + "title": "UKESM1-0-LL-historical references", + "xarray:open_dataset_kwargs": { + "engine": "zarr", + "backend_kwargs": { + "consolidated": false, + "chunks": {} + } + }, + "cmip6:model": "UKESM1-0-LL", + "cmip6:scenario": "historical", + "roles": [ + "references" + ] + } + } +} \ No newline at end of file diff --git a/datasets/nasa-nex-gddp-cmip6-netcdf/dataset.yaml b/datasets/nasa-nex-gddp-cmip6-netcdf/dataset.yaml new file mode 100644 index 
00000000..c84bc143 --- /dev/null +++ b/datasets/nasa-nex-gddp-cmip6-netcdf/dataset.yaml @@ -0,0 +1,41 @@ +id: nasa_nex_gddp_cmip6 +image: ${{ args.registry }}/pctasks-nasa-nex-gddp-cmip6-netcdf:2024.4.11.0 + +args: +- registry + +code: + src: ${{ local.path(./nasa_nex_gddp_cmip6.py) }} + +environment: + AZURE_TENANT_ID: ${{ secrets.task-tenant-id }} + AZURE_CLIENT_ID: ${{ secrets.task-client-id }} + AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }} + APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.task-application-insights-connection-string }} + +collections: + - id: nasa-nex-gddp-cmip6 + template: ${{ local.path(./collection/) }} + class: nasa_nex_gddp_cmip6:NASANEXGDDPCMIP6Collection + asset_storage: + # The blob storage pattern is NEX/GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/hurs/hurs_day_ACCESS-CM2_historical_r1i1p1f1_gn_2014.nc + # + # | NEX/GDDP-CMIP6/ + # | ACCESS-CM2/ # Model: ~35 directories + # | historical/ # Scenario: 3 directories + # | r1.../ # Something: 1 directory + # | hurs/ # Variable: ~9 directories + # + # We want to list up to the {model}/{scenario}/ section.
+ + - uri: blob://nasagddp/nex-gddp-cmip6/NEX/GDDP-CMIP6/ + chunks: + options: + chunk_length: 1 + min_depth: 2 + max_depth: 2 + list_folders: true + + chunk_storage: + uri: blob://nasagddp/nex-gddp-cmip6-etl-data/pctasks/ + diff --git a/datasets/nasa-nex-gddp-cmip6-netcdf/kerchunk-workflow.yaml b/datasets/nasa-nex-gddp-cmip6-netcdf/kerchunk-workflow.yaml new file mode 100644 index 00000000..15014fe4 --- /dev/null +++ b/datasets/nasa-nex-gddp-cmip6-netcdf/kerchunk-workflow.yaml @@ -0,0 +1,40 @@ +name: "NASA NEX GDDP CMIP6 Kerchunk" +dataset: "nasa-nex-gddp-cmip6" +id: "nasa-nex-gddp-cmip6-kerchunk" +args: + - prefix + - registry + - references_prefix + +jobs: + list: + tasks: + - id: list + image: ${{ args.registry }}/pctasks-nasa-nex-gddp-cmip6-netcdf:2024.4.16.1 + code: + src: ${{ local.path(nasa_nex_gddp_cmip6.py) }} + environment: + APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.task-application-insights-connection-string }} + AZURE_TENANT_ID: ${{ secrets.task-tenant-id }} + AZURE_CLIENT_ID: ${{ secrets.task-client-id }} + AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }} + task: nasa_nex_gddp_cmip6:ListInputsTask + args: + prefix: ${{ args.prefix }} + kerchunk: + foreach: + items: ${{ jobs.list.tasks.list.output.asset_uris }} + tasks: + - id: kerchunk + image: ${{ args.registry }}/pctasks-nasa-nex-gddp-cmip6-netcdf:2024.4.16.1 + code: + src: ${{ local.path(nasa_nex_gddp_cmip6.py) }} + environment: + APPLICATIONINSIGHTS_CONNECTION_STRING: ${{ secrets.task-application-insights-connection-string }} + AZURE_TENANT_ID: ${{ secrets.task-tenant-id }} + AZURE_CLIENT_ID: ${{ secrets.task-client-id }} + AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }} + task: nasa_nex_gddp_cmip6:KerchunkTask + args: + asset_uri: ${{ item }} + references_prefix: ${{ args.references_prefix }} diff --git a/datasets/nasa-nex-gddp-cmip6-netcdf/nasa_nex_gddp_cmip6.py b/datasets/nasa-nex-gddp-cmip6-netcdf/nasa_nex_gddp_cmip6.py new file mode 100644 index 00000000..00f70213 --- 
/dev/null
+++ b/datasets/nasa-nex-gddp-cmip6-netcdf/nasa_nex_gddp_cmip6.py
@@ -0,0 +1,370 @@
+# Basic plan: reuse as much of Collection as we can.
+#
+# Thanks to the layout in Blob Storage, the asset_uri will be something like
+# 'NEX/GDDP-CMIP6/ACCESS-CM2/historical/'. There are 104 of those {model,
+# scenario} pairs. Each prefix will result in ~65 - 86 items, depending
+# on which variables are produced for that combination and the number
+# of years covered by that scenario.
+# Each of these items will take a while to create (minutes...)
+# so having a chunksize of 1 is fine.
+
+import gc
+import sys
+import time
+import logging
+
+import pystac
+
+from pctasks.core.storage import StorageFactory
+from pctasks.dataset.collection import Collection
+from pctasks.core.models.base import PCBaseModel
+
+import itertools
+
+import adlfs
+import xarray as xr
+import azure.storage.blob
+import azure.identity.aio
+from stactools.nasa_nex_gddp_cmip6 import stac
+import json
+import string
+from typing import Generator
+
+import fsspec
+import azure.storage.blob
+import azure.identity
+import kerchunk.hdf
+import kerchunk.combine
+import planetary_computer
+
+from pctasks.task.task import Task
+from pctasks.task.context import TaskContext
+
+
+logger = logging.getLogger(__name__)
+
+
+class ItemSpec(PCBaseModel):
+    """
+    Representation of the assets associated with an Item.
+
+    `item_id` is the unique identifier for the item. Every other
+    field is a path to the asset in Blob Storage (the name).
+
+    This is needed because
+
+    1. The actual files are dynamic (depending on which model/scenario
+       has updated v1.1 or v1.2 assets)
+    2. The prefix we're listing under has many items.
+    """
+
+    item_id: str
+    pr: str
+    tas: str
+    rlds: str
+    rsds: str
+    sfcWind: str
+    hurs: str | None = None
+    huss: str | None = None
+    tasmax: str | None = None
+    tasmin: str | None = None
+
+
+def item_key(x: tuple[str, stac.Parts]) -> str:
+    """Sort/group key: the id of the item a (blob name, parts) pair belongs to."""
+    _, parts = x
+    return parts.item_id
+
+
+def asset_key(x: tuple[str, stac.Parts]) -> str:
+    """Sort/group key: the leading `_`-separated token of the blob's file name."""
+    return x[0].split("/")[-1].split("_")[0]
+
+
+def version_key(x: tuple[str, stac.Parts]) -> str:
+    """Sort key: the version recorded in the pair's parsed parts."""
+    return x[1].version
+
+
+def list_item_specs(
+    prefix: str, container_client: azure.storage.blob.ContainerClient
+) -> list[ItemSpec]:
+    """List the blobs under `prefix` and build one ItemSpec per item found."""
+    # prefix is like NEX/GDDP-CMIP6/ACCESS-CM2/historical/
+    blobs = list(container_client.list_blobs(prefix))
+    item_specs = build_item_spec(blobs)
+    return item_specs
+
+
+def build_item_spec(blobs: list[azure.storage.blob.BlobProperties]) -> list[ItemSpec]:
+    """
+    Group blobs by item and keep the latest version of each asset.
+
+    Zero-byte blobs are ignored. Within an item, blobs are grouped with
+    `asset_key` and, for each asset, the blob whose `version_key` is
+    greatest is kept.
+    """
+    blobs = [x for x in blobs if x.size > 0]
+    names = [x.name for x in blobs]
+    parts = [stac.Parts.from_path(name) for name in names]
+
+    pairs = list(zip(names, parts))
+    by_item = {
+        k: list(v) for k, v in itertools.groupby(sorted(pairs, key=item_key), item_key)
+    }
+    item_specs = []
+
+    for item_id, item_pairs in by_item.items():
+        latest = {
+            asset: max(versions, key=version_key)[0]
+            for asset, versions in itertools.groupby(
+                sorted(item_pairs, key=asset_key), key=asset_key
+            )
+        }
+        item_specs.append(ItemSpec(item_id=item_id, **latest))
+    return item_specs
+
+
+def read_dataset(
+    item_spec: ItemSpec, fs: adlfs.AzureBlobFileSystem, container_name: str
+) -> xr.Dataset:
+    """
+    Open every asset of `item_spec` and merge them into a single Dataset.
+
+    Variables whose path is None are skipped. Each variable's
+    ``encoding["source"]`` is set to the URL of the blob it was read from.
+    """
+    data_vars = item_spec.dict()
+    data_vars.pop("item_id")
+
+    datasets = []
+    for var, name in data_vars.items():
+        if name is None:
+            # Some models are missing some variables
+            continue
+
+        logger.info("Reading variable. item_id=%s, variable=%s", item_spec.item_id, var)
+        # See https://github.com/h5py/h5py/issues/2019 for the gc stuff.
+        # Re-enable the collector even if the read fails so that one bad
+        # file does not leave gc switched off for the rest of the process.
+        gc.disable()
+        try:
+            ds = xr.open_dataset(
+                fs.open(f"{container_name}/{name}"), engine="h5netcdf"
+            )
+        finally:
+            gc.enable()
+        source = "/".join([fs.account_url, container_name, name])
+        ds[var].encoding["source"] = source
+        datasets.append(ds)
+
+    ds = xr.merge(datasets, join="exact")
+    return ds
+
+
+class NASANEXGDDPCMIP6Collection(Collection):
+    """Collection that creates STAC items for every item under a blob prefix."""
+
+    # Optional early-stop threshold for `create_item`; None disables it.
+    # The test module sets this to 1.
+    limit: int | None = None
+
+    @classmethod
+    def create_item(
+        cls, asset_uri: str, storage_factory: StorageFactory
+    ) -> list[pystac.Item]:
+        """
+        Create one STAC item per item spec found under `asset_uri`.
+
+        `asset_uri` should be a prefix like:
+        blob://nasagddp/nex-gddp-cmip6/NEX/GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1
+
+        Item specs whose first "."-separated id component is "GISS-E2-1-G"
+        are skipped (logged as having misaligned coordinates).
+        `storage_factory` is not used here; clients are built from the URI.
+        """
+        *_, account_name, container_name, path = asset_uri.split("/", 4)
+        container_client = azure.storage.blob.ContainerClient(
+            f"https://{account_name}.blob.core.windows.net",
+            container_name,
+            azure.identity.DefaultAzureCredential(),
+        )
+        fs = adlfs.AzureBlobFileSystem(
+            account_name, credential=azure.identity.aio.DefaultAzureCredential()
+        )
+        item_specs = list_item_specs(path, container_client)
+        items = []
+        N = len(item_specs)
+
+        for i, item_spec in enumerate(item_specs, 1):
+            if item_spec.item_id.split(".")[0] in {"GISS-E2-1-G"}:
+                logger.info(
+                    "Item has misaligned coordinates. Skipping. id=%s",
+                    item_spec.item_id,
+                )
+                continue
+
+            t0 = time.monotonic()
+            logger.info("Creating item. id=%s", item_spec.item_id)
+            ds = read_dataset(item_spec, fs, container_name)
+            item = stac.create_item_from_dataset(ds)
+            t1 = time.monotonic()
+            logger.info(
+                "Created item. id=%s. time=%0.2f, [%d/%d]",
+                item_spec.item_id,
+                t1 - t0,
+                i,
+                N,
+            )
+            items.append(item)
+            if cls.limit is not None and i >= cls.limit:
+                break
+
+        return items
+
+
+# --------------------------------------------------------------------------------------
+# Kerchunk Stuff
+# Throwing in here to simplify packaging.
+ + +def make_templates(all_urls: list[str]) -> Generator[str, None, None]: + N = len(string.ascii_letters) + i = 0 + + while i < len(all_urls): + n = (i // N) + 1 + for tup in itertools.combinations_with_replacement(string.ascii_letters, n): + id_ = "".join(tup) + yield "{{" + id_ + "}}" + i += 1 + + +def make_timeseries(urls: list[str]) -> dict: + gen = make_templates(urls) + urls_to_templates = {url: template for url, template in zip(urls, gen)} + templates_to_urls = {template: url for url, template in urls_to_templates.items()} + templates = {k[2:-2]: v.split("?")[0] for k, v in templates_to_urls.items()} + + translated = [] + for url in urls: + # Some (all?) versions of h5netcdf / h5py / fsspec can deadlocks + # when reading files over the network. + # See https://github.com/h5py/h5py/issues/2019 for background. + # https://github.com/TomAugspurger/httpfile for an alternative. + print("Reading", url.split("?")[0], file=sys.stderr) + signed_url = planetary_computer.sign(url) + + with fsspec.open(signed_url) as inf: + h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, signed_url) + tr = h5chunks.translate() + translated.append(tr) + + mzz = kerchunk.combine.MultiZarrToZarr( + translated, + remote_protocol="https", + concat_dims=["time"], + ) + d = mzz.translate() + + # We use templates to reduce the size of the JSON and make + # planetary_computer.sign work + d["templates"] = templates + for _, ref in d["refs"].items(): + if ( + isinstance(ref, list) + and len(ref) == 3 + and ref[0].split("?")[0] in urls_to_templates + ): + template = urls_to_templates[ref[0].split("?")[0]] + ref[0] = template + + return d + + +class ListInputsInput(PCBaseModel): + prefix: str + + +class ListInputsOutput(PCBaseModel): + asset_uris: list[str] + + +class ListInputsTask(Task[ListInputsInput, ListInputsOutput]): + _input_model = ListInputsInput + _output_model = ListInputsOutput + + def run(self, input: ListInputsInput, context: TaskContext) -> ListInputsOutput: + storage, path = 
class KerchunkTask(Task[KerchunkInput, KerchunkOutput]):
    """Task that builds Kerchunk references for one asset prefix and stores them as JSON."""

    _input_model = KerchunkInput
    _output_model = KerchunkOutput

    def run(self, input: KerchunkInput, context: TaskContext) -> KerchunkOutput:
        """Generate the references for ``input.asset_uri`` and write them out.

        The output file name is derived from the input: for an asset URI like
        ``blob://nasagddp/nex-gddp-cmip6/NEX/GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1``
        the references are written to
        ``<references_prefix>/ACCESS-CM2_historical.json``.

        Args:
            input: The asset prefix to process and the prefix to write under.
            context: Task context; its storage factory resolves the output
                location.

        Returns:
            A ``KerchunkOutput`` whose ``references_uri`` holds the path
            returned by ``get_storage_for_file`` for the written file.
        """
        result = run_kerchunk(input.asset_uri)

        # The last three URI components are <model>/<scenario>/<variant>.
        # Strip a trailing slash first so it cannot shift the unpacking.
        *_, model, scenario, _ = input.asset_uri.rstrip("/").split("/")

        # No extra characters may precede the prefix: it already carries its
        # own scheme (e.g. "blob://...").
        output = f"{input.references_prefix.rstrip('/')}/{model}_{scenario}.json"

        dest_storage, dest_path = context.storage_factory.get_storage_for_file(output)
        dest_storage.write_text(dest_path, json.dumps(result))
        # NOTE(review): this returns the storage-relative path, not a full
        # URI, despite the field name -- confirm what consumers expect.
        return KerchunkOutput(references_uri=dest_path)


def run_kerchunk(asset_uri: str) -> dict:
    """Build the combined Kerchunk reference dict for everything under ``asset_uri``.

    Args:
        asset_uri: A prefix of the form ``blob://<account>/<container>/<path>``;
            the account, container and path are taken from its
            slash-separated parts.

    Returns:
        The reference dictionary produced by ``make_timeseries`` for all
        asset URLs found under the prefix.
    """
    # Split at most four times so the blob path keeps its own slashes.
    *_, account_name, container_name, path = asset_uri.split("/", 4)
    account_url = f"https://{account_name}.blob.core.windows.net"

    container_client = azure.storage.blob.ContainerClient(
        account_url,
        container_name,
        azure.identity.DefaultAzureCredential(),
    )
    item_specs = list_item_specs(path, container_client)

    # Every field of an item spec except ``item_id`` is used as a blob name
    # inside the container (presumably one per variable -- verify against
    # ``list_item_specs``).
    all_urls = [
        f"{account_url}/{container_name}/{v}"
        for item_spec in item_specs
        for k, v in item_spec.dict().items()
        if k != "item_id"
    ]

    return make_timeseries(all_urls)
+ "hurs", + "huss", + "pr", + "rlds", + "rsds", + "sfcWind", + "tas", + "tasmax", + "tasmin", + } + assert ( + item.assets["hurs"].href + == "https://nasagddp.blob.core.windows.net/nex-gddp-cmip6/NEX/GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/hurs/hurs_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950_v1.1.nc" + ) # noqa: E501