From 8cd3c655814116b6ec2a4c992063589f193aead9 Mon Sep 17 00:00:00 2001 From: SarahAlidoost Date: Tue, 7 May 2024 12:01:18 +0200 Subject: [PATCH 1/8] add preprocessing to example nb --- surf_research_day24/Example.ipynb | 6034 +++++++++++++++++++++++++++++ 1 file changed, 6034 insertions(+) create mode 100644 surf_research_day24/Example.ipynb diff --git a/surf_research_day24/Example.ipynb b/surf_research_day24/Example.ipynb new file mode 100644 index 0000000..6db1554 --- /dev/null +++ b/surf_research_day24/Example.ipynb @@ -0,0 +1,6034 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3a72d8d0-e4fd-4b5e-ac80-992ae2e98b75", + "metadata": {}, + "source": [ + "## Preparing data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "236cfe17-19dc-4c09-a33d-385637259cd8", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import geopandas as gpd\n", + "from dask.distributed import Client, LocalCluster\n", + "from datetime import datetime, timedelta\n", + "from functools import partial" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1cdea174-0312-40e6-b678-1666b8101439", + "metadata": {}, + "outputs": [], + "source": [ + "start_time = \"2014-01-01\"\n", + "end_time = \"2014-01-15\"\n", + "year = 2014\n", + "\n", + "parent_in_path = f\"/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/{year}global\"\n", + "data_paths = {\"era5land\": f\"{parent_in_path}/era5land/*.nc\",\n", + " \"lai\": f\"{parent_in_path}/lai_v2/*.nc\",\n", + " \"ssm\": f\"{parent_in_path}/ssm/GlobalGSSM11km2014_20240214.tif\",\n", + " \"co2\": f\"{parent_in_path}/co2/CAMS_CO2_2003-2020.nc\",\n", + " \"landcover\": f\"{parent_in_path}/igbp/landcover10km_global.nc\",\n", + " \"vcmax\": f\"{parent_in_path}/vcmax/TROPOMI_Vmax_Tg_mean10km_global.nc\",\n", + " \"canopyheight\": f\"{parent_in_path}/canopy_height/canopy_height_11kmEurope20230921_10km.nc\",\n", + " }\n", + "\n", + "parent_out_path = \"/scratch-shared/falidoost\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4b8250e9-407a-4866-8d27-1c02f9d61cc4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-31.28903052, 34.93055094, 68.93136141, 81.85192337])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# read shape file\n", + "eu_shape_file = \"/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/EuropeBoundary.shp\"\n", + "gdf = gpd.read_file(eu_shape_file)\n", + "bbox = gdf.total_bounds\n", + "bbox" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "140366de-8dad-4e02-b03d-c5a73f62a99b", + "metadata": {}, + "outputs": [], + "source": [ + "def era5_preprocess(ds): \n", + " # Convert the longitude coordinates from [0, 360] to [-180, 180]\n", + " ds = ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180))\n", + " return ds\n", + "\n", + "def co2_preprocess(ds, start_time, end_time): \n", + " ds = ds.sel(time=slice(start_time, end_time))\n", + " return ds\n", + "\n", + "co2_partial_func = partial(co2_preprocess, start_time=start_time, end_time=end_time)\n", + "\n", + "def fix_coords(ds):\n", + " if 'band' in ds.dims:\n", + " ds = ds.rename_dims({'band': 'time'})\n", + " ds = ds.rename_vars({'band': 'time'})\n", + "\n", + " if 'x' in ds.dims and 'y' in ds.dims:\n", + " ds = ds.rename_dims({'x': 'longitude', 'y': 'latitude'})\n", + " ds = ds.rename_vars({'x': 'longitude', 'y': 'latitude'})\n", + " \n", + " elif 'lon' in ds.dims and 'lat' in ds.dims:\n", + " ds = ds.rename_dims({'lon': 'longitude', 'lat': 'latitude'})\n", + " ds = ds.rename_vars({'lon': 'longitude', 'lat': 'latitude'})\n", + " return ds\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ead07977-2f85-4551-bfb9-cb734f423516", + "metadata": {}, + "outputs": [], + "source": [ + "cluster = LocalCluster(n_workers=25, threads_per_worker=1)\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "fd0b2392-ce97-48f8-a523-9fbf5b5c4a0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5 µs, sys: 2 µs, total: 7 µs\n", + "Wall time: 15.7 µs\n", + "/scratch-shared/falidoost/era5land_2014.zarr is saved\n", + "=======================================\n", + "/scratch-shared/falidoost/lai_2014.zarr is saved\n", + "=======================================\n", + "/scratch-shared/falidoost/ssm_2014.zarr is saved\n", + "=======================================\n", + "/scratch-shared/falidoost/co2_2014.zarr is saved\n", + "=======================================\n", + "/scratch-shared/falidoost/landcover_2014.zarr is saved\n", + "=======================================\n", + "/scratch-shared/falidoost/vcmax_2014.zarr is saved\n", + "=======================================\n", + "/scratch-shared/falidoost/canopyheight_2014.zarr is saved\n", + "=======================================\n" + ] + } + ], + "source": [ + "%time\n", + "chunks = 'auto'\n", + "\n", + "for data_path in data_paths:\n", + " \n", + " if data_path == \"era5land\":\n", + " ds = xr.open_mfdataset(data_paths[data_path], preprocess=era5_preprocess, chunks=chunks)\n", + " \n", + " if data_path == \"co2\":\n", + " ds = xr.open_mfdataset(data_paths[data_path], preprocess=co2_partial_func, chunks=chunks)\n", + " ds = ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180)) \n", + "\n", + " else:\n", + " ds = xr.open_mfdataset(data_paths[data_path], preprocess=fix_coords, chunks=chunks)\n", + " \n", + " # TODO improve this block\n", + " if ds.time.size == 1:\n", + " ds['time'] = [datetime(year, 1, 1)]\n", + " elif ds.time.dtype == 'int64':\n", + " # Convert day of year to datetime\n", + " ds['time'] = [datetime(year, 1, 1) + timedelta(int(day) - 1) for day in ds.time.values]\n", + " \n", + " ds = ds.sortby(['longitude', 'latitude'])\n", + " masked_ds = ds.sel(longitude=slice(bbox[0], bbox[2]), latitude=slice(bbox[1], bbox[3]), time=slice(start_time, end_time))\n", + " masked_ds = masked_ds.chunk(chunks='auto')\n", + " \n", + " # svae to zarr\n", + " out_path = f\"{parent_out_path}/{data_path}_{year}.zarr\"\n", + " masked_ds.to_zarr(out_path, mode='w')\n", + " print(f\"{out_path} is saved\")\n", + " print(\"=======================================\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1443ca07-759f-4f93-a5e8-3588f9f3d630", + "metadata": {}, + "outputs": [], + "source": [ + "client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "id": "0e93078d-7ed8-4c23-8372-8958d4fb2483", + "metadata": {}, + "source": [ + "## Variable derivation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "06141206-18c5-4f1c-ae2c-935595b742d5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sarah/miniconda3/envs/emulator/lib/python3.9/site-packages/xarray/backends/cfgrib_.py:29: UserWarning: Failed to load cfgrib - most likely there is a problem accessing the ecCodes library. Try `import cfgrib` to get the full error message\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import xarray as xr\n", + "import numpy as np\n", + "import pandas as pd\n", + "from dask.distributed import Client, LocalCluster\n", + "from datetime import datetime, timedelta\n", + "from PyStemmusScope import variable_conversion as vc\n", + "from dask_ml.preprocessing import OneHotEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e4c425a8-2eff-4636-8a36-69f6a62696d5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "start_time = \"2014-01-01\"\n", + "end_time = \"2014-01-15\"\n", + "year = 2014\n", + "\n", + "parent_in_path = \"/scratch-shared/falidoost\"\n", + "parent_in_path = \"../data\"\n", + "data_paths = {\"era5land\": f\"{parent_in_path}/era5land_{year}.zarr\",\n", + " \"lai\": f\"{parent_in_path}/lai_{year}.zarr\",\n", + " \"ssm\": f\"{parent_in_path}/ssm_{year}.zarr\",\n", + " \"co2\": f\"{parent_in_path}/co2_{year}.zarr\",\n", + " \"landcover\": f\"{parent_in_path}/landcover_{year}.zarr\",\n", + " \"vcmax\": f\"{parent_in_path}/vcmax_{year}.zarr\",\n", + " \"canopyheight\": f\"{parent_in_path}/canopyheight_{year}.zarr\",\n", + " \"igbp_table\": f\"{parent_in_path}/lccs_to_igbp_table.csv\",\n", + " \"igbp_class\": f\"{parent_in_path}/IGBP11unique.csv\",\n", + " }\n", + "\n", + "parent_out_path = \"/scratch-shared/falidoost\"" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "6c1b74e1-9e57-48fc-ab0a-500794f17afa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# era5_land variables\n", + "era5land = xr.open_zarr(data_paths[\"era5land\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "dd57b78e-e0e7-4a19-b160-b4a76d97bf83", + "metadata": {}, + "outputs": [], + "source": [ + "def era5land_accumulated_vars(ds, input_name, output_name, scale_factor):\n", + " input_da = ds[input_name] / scale_factor\n", + " output_da = input_da.diff(\"time\")\n", + " output_da[0::24] = input_da[1::24] # accumulation starts at t01 instead of t00\n", + "\n", + " t00 = xr.DataArray(np.nan, coords=input_da.isel(time=0).coords) # assign first t00 to none\n", + " output_da = xr.concat([output_da, t00], dim='time')\n", + " ds[output_name] = output_da\n", + " return ds" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0de6983d-e078-45cf-8fd1-1910ecae17ed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cluster = LocalCluster(n_workers=5, threads_per_worker=1)\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "381377a7-527f-495f-b92c-35e1f8ccaf2d", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:     (latitude: 469, longitude: 690, time: 360)\n",
+       "Coordinates:\n",
+       "  * latitude    (latitude) float32 35.0 35.1 35.2 35.3 ... 81.5 81.6 81.7 81.8\n",
+       "  * longitude   (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n",
+       "  * time        (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n",
+       "Data variables: (12/17)\n",
+       "    d2m         (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    sp          (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ssr         (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ssrd        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    str         (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    strd        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ...          ...\n",
+       "    Rli         (time, latitude, longitude) float64 dask.array<chunksize=(1, 341, 690), meta=np.ndarray>\n",
+       "    Precip_msr  (time, latitude, longitude) float64 dask.array<chunksize=(1, 341, 690), meta=np.ndarray>\n",
+       "    p           (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    Ta          (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ea          (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    u           (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  CF-1.6\n",
+       "    history:      2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...
" + ], + "text/plain": [ + "\n", + "Dimensions: (latitude: 469, longitude: 690, time: 360)\n", + "Coordinates:\n", + " * latitude (latitude) float32 35.0 35.1 35.2 35.3 ... 81.5 81.6 81.7 81.8\n", + " * longitude (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n", + " * time (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n", + "Data variables: (12/17)\n", + " d2m (time, latitude, longitude) float32 dask.array\n", + " sp (time, latitude, longitude) float32 dask.array\n", + " ssr (time, latitude, longitude) float32 dask.array\n", + " ssrd (time, latitude, longitude) float32 dask.array\n", + " str (time, latitude, longitude) float32 dask.array\n", + " strd (time, latitude, longitude) float32 dask.array\n", + " ... ...\n", + " Rli (time, latitude, longitude) float64 dask.array\n", + " Precip_msr (time, latitude, longitude) float64 dask.array\n", + " p (time, latitude, longitude) float32 dask.array\n", + " Ta (time, latitude, longitude) float32 dask.array\n", + " ea (time, latitude, longitude) float32 dask.array\n", + " u (time, latitude, longitude) float32 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6\n", + " history: 2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw..." + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "era5land = era5land_accumulated_vars(era5land, \"ssrd\", \"Rin\", 3600)\n", + "era5land = era5land_accumulated_vars(era5land, \"strd\", \"Rli\", 3600)\n", + "era5land = era5land_accumulated_vars(era5land, \"tp\", \"Precip_msr\", 0.001) # to mm\n", + "era5land[\"p\"] = era5land[\"sp\"] / 100 # Pa -> hPa\n", + "era5land[\"Ta\"] = era5land[\"t2m\"] - 273.15 # K -> degC\n", + "era5land[\"ea\"] = vc.calculate_es(era5land[\"d2m\"] - 273.15)*10 # *10 is for kPa -> hPa\n", + "era5land[\"u\"] = (era5land[\"u10\"] ** 2 + era5land[\"v10\"] ** 2) ** 0.5\n", + "era5land" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "9b90549f-ca92-4249-b2f8-75f5c90ac390", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:    (time: 360, latitude: 469, longitude: 690)\n",
+       "Coordinates:\n",
+       "  * latitude   (latitude) float32 35.0 35.1 35.2 35.3 ... 81.5 81.6 81.7 81.8\n",
+       "  * longitude  (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n",
+       "  * time       (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n",
+       "Data variables:\n",
+       "    d2m        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    sp         (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ssr        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ssrd       (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    str        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    strd       (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    t2m        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    tp         (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    u10        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    v10        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    landcover  (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  CF-1.6\n",
+       "    history:      2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...
" + ], + "text/plain": [ + "\n", + "Dimensions: (time: 360, latitude: 469, longitude: 690)\n", + "Coordinates:\n", + " * latitude (latitude) float32 35.0 35.1 35.2 35.3 ... 81.5 81.6 81.7 81.8\n", + " * longitude (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n", + " * time (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n", + "Data variables:\n", + " d2m (time, latitude, longitude) float32 dask.array\n", + " sp (time, latitude, longitude) float32 dask.array\n", + " ssr (time, latitude, longitude) float32 dask.array\n", + " ssrd (time, latitude, longitude) float32 dask.array\n", + " str (time, latitude, longitude) float32 dask.array\n", + " strd (time, latitude, longitude) float32 dask.array\n", + " t2m (time, latitude, longitude) float32 dask.array\n", + " tp (time, latitude, longitude) float32 dask.array\n", + " u10 (time, latitude, longitude) float32 dask.array\n", + " v10 (time, latitude, longitude) float32 dask.array\n", + " landcover (time, latitude, longitude) float32 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6\n", + " history: 2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw..." + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Interpolations\n", + "def interpolation(ds, other):\n", + " # in time\n", + " ds_interpolated = ds.interp(coords={\"time\": other.time}, method='nearest')\n", + " # in space\n", + " ds_interpolated = ds_interpolated.interp(coords={\"longitude\": other.longitude, \"latitude\": other.latitude}, method='linear')\n", + " return ds_interpolated\n", + "\n", + "variable_names = {\"lai\": \"LAI\",\n", + " \"ssm\": \"band_data\",\n", + " \"co2\": \"co2\",\n", + " \"canopyheight\": \"__xarray_dataarray_variable__\",\n", + " \"vcmax\": \"__xarray_dataarray_variable__\",\n", + " \"landcover\": \"lccs_class\"} \n", + "\n", + "variable_names = {\"landcover\": \"lccs_class\"} \n", + "for name in variable_names:\n", + " ds = xr.open_zarr(data_paths[name])\n", + " ds_interpolated = interpolation(ds, era5land)\n", + " era5land[name] = ds_interpolated[variable_names[name]].chunk(era5land.chunks)\n", + "\n", + "era5land" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "68792d75-d088-4560-a258-7cb073fde638", + "metadata": {}, + "outputs": [], + "source": [ + "# fix ssm \n", + "era5land[\"ssm\"] = era5land[\"ssm\"] / 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "0f18acfa-c243-45e6-8632-7c02a5b3ddde", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Other variables\n", + "igbp_table = pd.read_csv(data_paths[\"igbp_table\"])\n", + "igbp_class = pd.read_csv(data_paths[\"igbp_class\"])['0'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "08a99f69-8391-49dc-8677-b3c81e65bcc9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# we need to convert landcover to IGBP\n", + "\n", + "# define one hot encoding for IGBP\n", + "encoder = OneHotEncoder(\n", + " sparse_output=False,\n", + ")\n", + "encoder = encoder.fit(np.sort(igbp_class).reshape(-1,1)) # Unsorted categories are not yet supported by dask-ml\n", + "\n", + "lookup_table = igbp_table.set_index(\"lccs_class\").T.to_dict('records')[0]\n", + "\n", + "def map_landcover_to_igbp(landcover_block):\n", + " return np.vectorize(lookup_table.get)(landcover_block)\n", + "\n", + "def landcover_to_igbp(ds, landcover_var_name, encoder):\n", + " landcover = ds[landcover_var_name]\n", + " \n", + " # Replace NaN values with zeros\n", + " landcover = landcover.fillna(0)\n", + " \n", + " igbp = xr.apply_ufunc(map_landcover_to_igbp, landcover, dask=\"parallelized\", output_dtypes=[landcover.dtype])\n", + " igbp_reshaped = igbp.data.reshape(-1, 1)\n", + " transformed = encoder.transform(igbp_reshaped)\n", + "\n", + " # Add each column of the transformed array as a new variable in the dataset\n", + " for i in range(transformed.shape[1]):\n", + " ds[f\"IGBP_veg_long{i+1}\"] = ((\"time\", \"latitude\", \"longitude\"), transformed[:, i].reshape(igbp.shape))\n", + "\n", + " return ds\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "669dafbd-6dfe-4025-a631-227a2bec2d1f", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sarah/miniconda3/envs/emulator/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3490: PerformanceWarning: Reshaping is producing a large chunk. To accept the large\n", + "chunk and silence this warning, set the option\n", + " >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n", + " ... array.reshape(shape)\n", + "\n", + "To avoid creating the large chunks, set the option\n", + " >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", + " ... array.reshape(shape)Explictly passing ``limit`` to ``reshape`` will also silence this warning\n", + " >>> array.reshape(shape, limit='128 MiB')\n", + " if await self.run_code(code, result, async_=asy):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:          (time: 360, latitude: 469, longitude: 690)\n",
+       "Coordinates:\n",
+       "  * latitude         (latitude) float32 35.0 35.1 35.2 35.3 ... 81.6 81.7 81.8\n",
+       "  * longitude        (longitude) float32 0.0 0.1 0.2 0.3 ... 68.6 68.7 68.8 68.9\n",
+       "  * time             (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n",
+       "Data variables: (12/22)\n",
+       "    d2m              (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    sp               (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ssr              (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ssrd             (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    str              (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    strd             (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
+       "    ...               ...\n",
+       "    IGBP_veg_long6   (time, latitude, longitude) float64 dask.array<chunksize=(71, 469, 690), meta=np.ndarray>\n",
+       "    IGBP_veg_long7   (time, latitude, longitude) float64 dask.array<chunksize=(71, 469, 690), meta=np.ndarray>\n",
+       "    IGBP_veg_long8   (time, latitude, longitude) float64 dask.array<chunksize=(71, 469, 690), meta=np.ndarray>\n",
+       "    IGBP_veg_long9   (time, latitude, longitude) float64 dask.array<chunksize=(71, 469, 690), meta=np.ndarray>\n",
+       "    IGBP_veg_long10  (time, latitude, longitude) float64 dask.array<chunksize=(71, 469, 690), meta=np.ndarray>\n",
+       "    IGBP_veg_long11  (time, latitude, longitude) float64 dask.array<chunksize=(71, 469, 690), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  CF-1.6\n",
+       "    history:      2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...
" + ], + "text/plain": [ + "\n", + "Dimensions: (time: 360, latitude: 469, longitude: 690)\n", + "Coordinates:\n", + " * latitude (latitude) float32 35.0 35.1 35.2 35.3 ... 81.6 81.7 81.8\n", + " * longitude (longitude) float32 0.0 0.1 0.2 0.3 ... 68.6 68.7 68.8 68.9\n", + " * time (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n", + "Data variables: (12/22)\n", + " d2m (time, latitude, longitude) float32 dask.array\n", + " sp (time, latitude, longitude) float32 dask.array\n", + " ssr (time, latitude, longitude) float32 dask.array\n", + " ssrd (time, latitude, longitude) float32 dask.array\n", + " str (time, latitude, longitude) float32 dask.array\n", + " strd (time, latitude, longitude) float32 dask.array\n", + " ... ...\n", + " IGBP_veg_long6 (time, latitude, longitude) float64 dask.array\n", + " IGBP_veg_long7 (time, latitude, longitude) float64 dask.array\n", + " IGBP_veg_long8 (time, latitude, longitude) float64 dask.array\n", + " IGBP_veg_long9 (time, latitude, longitude) float64 dask.array\n", + " IGBP_veg_long10 (time, latitude, longitude) float64 dask.array\n", + " IGBP_veg_long11 (time, latitude, longitude) float64 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6\n", + " history: 2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw..." + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = landcover_to_igbp(era5land, \"landcover\", encoder)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "6dc3edbe-a7b0-4d6a-9afa-14564716af1c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8163083b-3f7b-4986-84f6-5d2182e363e7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 71266f2db87199c29e4f665884027c63633b9077 Mon Sep 17 00:00:00 2001 From: SarahAlidoost Date: Tue, 7 May 2024 14:04:28 +0200 Subject: [PATCH 2/8] add model prediction to nb --- surf_research_day24/Example.ipynb | 1761 +++++------------------------ 1 file changed, 255 insertions(+), 1506 deletions(-) diff --git a/surf_research_day24/Example.ipynb b/surf_research_day24/Example.ipynb index 6db1554..4726870 100644 --- a/surf_research_day24/Example.ipynb +++ b/surf_research_day24/Example.ipynb @@ -29,8 +29,8 @@ "metadata": {}, "outputs": [], "source": [ - "start_time = \"2014-01-01\"\n", - "end_time = \"2014-01-15\"\n", + "start_time = \"2014-1-31\"\n", + "end_time = \"2014-02-10\"\n", "year = 2014\n", "\n", "parent_in_path = f\"/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/{year}global\"\n", @@ -148,6 +148,8 @@ "%time\n", "chunks = 'auto'\n", "\n", + "out_data = {}\n", + "\n", "for data_path in data_paths:\n", " \n", " if data_path == \"era5land\":\n", @@ -170,9 +172,10 @@ " ds = ds.sortby(['longitude', 'latitude'])\n", " masked_ds = ds.sel(longitude=slice(bbox[0], bbox[2]), latitude=slice(bbox[1], bbox[3]), time=slice(start_time, end_time))\n", " masked_ds = masked_ds.chunk(chunks='auto')\n", + " out_data[data_path] = masked_ds\n", " \n", " # svae to zarr\n", - " out_path = f\"{parent_out_path}/{data_path}_{year}.zarr\"\n", + " out_path = f\"{parent_out_path}/{data_path}_{start_time}_{end_time}.zarr\"\n", " masked_ds.to_zarr(out_path, mode='w')\n", " print(f\"{out_path} is saved\")\n", " print(\"=======================================\")" @@ -181,7 +184,41 @@ { "cell_type": "code", "execution_count": 14, - "id": "1443ca07-759f-4f93-a5e8-3588f9f3d630", + "id": "cfd29cff-2f7c-4a9e-b5b1-e5a9dab1324e", + "metadata": {}, + "outputs": [], + "source": [ + "# Interpolations\n", + "def interpolation(ds, other):\n", + " # in time\n", + " ds_interpolated = ds.interp(coords={\"time\": other.time}, method='nearest')\n", + " # in space\n", + " ds_interpolated = ds_interpolated.interp(coords={\"longitude\": other.longitude, \"latitude\": other.latitude}, method='linear')\n", + " return ds_interpolated\n", + "\n", + "variable_names = {\"lai\": \"LAI\",\n", + " \"ssm\": \"band_data\",\n", + " \"co2\": \"co2\",\n", + " \"canopyheight\": \"__xarray_dataarray_variable__\",\n", + " \"vcmax\": \"__xarray_dataarray_variable__\",\n", + " \"landcover\": \"lccs_class\"} \n", + "\n", + "all_data = out_data[\"era5land\"].copy()\n", + "for name in variable_names:\n", + " ds = out_data[name]\n", + " ds_interpolated = interpolation(ds, all_data)\n", + " all_data[name] = ds_interpolated[variable_names[name]].chunk(all_data.chunks)\n", + "\n", + " # svae to zarr\n", + "out_path = f\"{parent_out_path}/all_data_{start_time}_{end_time}.zarr\"\n", + "all_data.to_zarr(out_path, mode='w')\n", + "print(f\"{out_path} is saved\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "eac32989-4421-4dfa-8907-03c56d4d25eb", "metadata": {}, "outputs": [], "source": [ @@ -198,25 +235,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 54, "id": "06141206-18c5-4f1c-ae2c-935595b742d5", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/sarah/miniconda3/envs/emulator/lib/python3.9/site-packages/xarray/backends/cfgrib_.py:29: UserWarning: Failed to load cfgrib - most likely there is a problem accessing the ecCodes library. Try `import cfgrib` to get the full error message\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "import xarray as xr\n", "import numpy as np\n", "import pandas as pd\n", + "import dask.array as da\n", "from dask.distributed import Client, LocalCluster\n", "from datetime import datetime, timedelta\n", "from PyStemmusScope import variable_conversion as vc\n", @@ -237,7 +266,6 @@ "year = 2014\n", "\n", "parent_in_path = \"/scratch-shared/falidoost\"\n", - "parent_in_path = \"../data\"\n", "data_paths = {\"era5land\": f\"{parent_in_path}/era5land_{year}.zarr\",\n", " \"lai\": f\"{parent_in_path}/lai_{year}.zarr\",\n", " \"ssm\": f\"{parent_in_path}/ssm_{year}.zarr\",\n", @@ -2214,13 +2242,176 @@ "era5land[\"Ta\"] = era5land[\"t2m\"] - 273.15 # K -> degC\n", "era5land[\"ea\"] = vc.calculate_es(era5land[\"d2m\"] - 273.15)*10 # *10 is for kPa -> hPa\n", "era5land[\"u\"] = (era5land[\"u10\"] ** 2 + era5land[\"v10\"] ** 2) ** 0.5\n", + "era5land[\"ssm\"] = era5land[\"ssm\"] / 1000\n", "era5land" ] }, { "cell_type": "code", - "execution_count": 48, - "id": "9b90549f-ca92-4249-b2f8-75f5c90ac390", + "execution_count": 55, + "id": "08a99f69-8391-49dc-8677-b3c81e65bcc9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# we need to convert landcover to IGBP\n", + "# lookup tables\n", + "igbp_table = pd.read_csv(data_paths[\"igbp_table\"])\n", + "igbp_class = pd.read_csv(data_paths[\"igbp_class\"])['0'].unique()\n", + "\n", + "# define one hot encoding for IGBP\n", + "encoder = OneHotEncoder(\n", + " sparse_output=False,\n", + ")\n", + "encoder = encoder.fit(np.sort(igbp_class).reshape(-1,1)) # Unsorted categories are not yet supported by dask-ml\n", + "\n", + "lookup_table = igbp_table.set_index(\"lccs_class\").T.to_dict('records')[0]\n", + "\n", + "def map_landcover_to_igbp(landcover_block):\n", + " return np.vectorize(lookup_table.get)(landcover_block)\n", + "\n", + "def landcover_to_igbp(ds, landcover_var_name, encoder):\n", + " landcover = ds[landcover_var_name]\n", + " \n", + " # Replace NaN values with zeros\n", + " landcover = landcover.fillna(0)\n", + " \n", + " igbp = xr.apply_ufunc(map_landcover_to_igbp, landcover, dask=\"parallelized\", output_dtypes=[landcover.dtype])\n", + " igbp_reshaped = igbp.data.reshape(-1, 1)\n", + " transformed = encoder.transform(igbp_reshaped)\n", + " \n", + " # Replace zeros with np.nan\n", + " transformed = da.where(transformed == 0, np.nan, transformed)\n", + "\n", + " # Add each column of the transformed array as a new variable in the dataset\n", + " for i in range(transformed.shape[1]):\n", + " ds[f\"IGBP_veg_long{i+1}\"] = ((\"time\", \"latitude\", \"longitude\"), transformed[:, i].reshape(igbp.shape))\n", + "\n", + " return ds" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "d71905aa-16f7-4150-8062-6b3c6127b364", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ds = landcover_to_igbp(era5land, \"landcover\", encoder)\n", + "ds = ds.unify_chunks()" + ] + }, + { + "cell_type": "markdown", + "id": "25aa8223-1b6f-40cd-b916-e73706929378", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "source": [ + "## Model prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "9387e7b6-86e2-4795-aaa6-31430a5a3dae", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# rename some variables\n", + "rename_vars = {\"co2\": \"CO2\", \"lai\": \"LAI\", \"canopyheight\": \"hc\", \"ssm\": \"SSM\"}\n", + "ds = ds.rename(rename_vars)\n", + "\n", + "input_vars = [\n", + " 'Rin', 'Rli', 'p', 'Ta', 'ea', 'u', 'CO2','LAI','Vcmo','hc', 'Precip_msr', \n", + " 'SSM', *[f'IGBP_veg_long{i}' for i in range(1, 12)]\n", + "]\n", + "\n", + "# select input data \n", + "input_ds = ds[input_vars]\n", + "\n", + "# define output template\n", + "output_vars = ['LEtot','Htot','Rntot','Gtot', 'Actot','SIF685', 'SIF740']\n", + "output_temp = xr.Dataset()\n", + "ds_shape = (input_ds.dims['time'], input_ds.dims['latitude'], input_ds.dims['longitude'])\n", + "\n", + "for var in output_vars:\n", + " output_temp[var] = xr.DataArray(\n", + " name = var,\n", + " data=da.zeros(ds_shape, chunks=\"auto\"),\n", + " dims=input_ds.dims,\n", + " coords=input_ds.coords,\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "b99a209d-aa6c-43f8-a8b2-8246ca125eb5", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# load model\n", + "path_model = \"\"\n", + "model = load_model(path_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "ff08854b-6c67-49b0-8317-abfa1d7a05b6", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def predictFlux(input_ds, model, output_vars):\n", + "\n", + " df_features = input_ds.to_dataframe()\n", + " df_features = df_features.reset_index()\n", + " invalid_index = df_features.isnull().any(axis=1)\n", + " \n", + " # Convert the nan value as 0 for the calculation\n", + " df_features[invalid_index] = 0\n", + " \n", + " LEH = model.predict(df_features)\n", + " LEH[invalid_index] = np.nan # convert the original nan values to nan back\n", + " \n", + " output_ds = input_ds.copy()\n", + " ds_shape = (output_ds.dims['time'], output_ds.dims['lon'], output_ds.dims['lat'])\n", + " \n", + " for i, name in enumerate(output_vars):\n", + " output_ds[name] = ((\"time\", \"longitude\", \"latitude\"), LEH[:, i].reshape(ds_shape))\n", + " \n", + " return output_ds" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "32086adc-07cf-4a87-a7fe-e87424ffcb14", "metadata": { "collapsed": true, "jupyter": { @@ -2585,1457 +2776,6 @@ " fill: currentColor;\n", "}\n", "
<xarray.Dataset>\n",
-       "Dimensions:    (time: 360, latitude: 469, longitude: 690)\n",
-       "Coordinates:\n",
-       "  * latitude   (latitude) float32 35.0 35.1 35.2 35.3 ... 81.5 81.6 81.7 81.8\n",
-       "  * longitude  (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n",
-       "  * time       (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n",
-       "Data variables:\n",
-       "    d2m        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    sp         (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    ssr        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    ssrd       (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    str        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    strd       (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    t2m        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    tp         (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    u10        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    v10        (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "    landcover  (time, latitude, longitude) float32 dask.array<chunksize=(142, 341, 690), meta=np.ndarray>\n",
-       "Attributes:\n",
-       "    Conventions:  CF-1.6\n",
-       "    history:      2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...
" - ], - "text/plain": [ - "\n", - "Dimensions: (time: 360, latitude: 469, longitude: 690)\n", - "Coordinates:\n", - " * latitude (latitude) float32 35.0 35.1 35.2 35.3 ... 81.5 81.6 81.7 81.8\n", - " * longitude (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n", - " * time (time) datetime64[ns] 2014-01-01 ... 2014-01-15T23:00:00\n", - "Data variables:\n", - " d2m (time, latitude, longitude) float32 dask.array\n", - " sp (time, latitude, longitude) float32 dask.array\n", - " ssr (time, latitude, longitude) float32 dask.array\n", - " ssrd (time, latitude, longitude) float32 dask.array\n", - " str (time, latitude, longitude) float32 dask.array\n", - " strd (time, latitude, longitude) float32 dask.array\n", - " t2m (time, latitude, longitude) float32 dask.array\n", - " tp (time, latitude, longitude) float32 dask.array\n", - " u10 (time, latitude, longitude) float32 dask.array\n", - " v10 (time, latitude, longitude) float32 dask.array\n", - " landcover (time, latitude, longitude) float32 dask.array\n", - "Attributes:\n", - " Conventions: CF-1.6\n", - " history: 2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw..." - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Interpolations\n", - "def interpolation(ds, other):\n", - " # in time\n", - " ds_interpolated = ds.interp(coords={\"time\": other.time}, method='nearest')\n", - " # in space\n", - " ds_interpolated = ds_interpolated.interp(coords={\"longitude\": other.longitude, \"latitude\": other.latitude}, method='linear')\n", - " return ds_interpolated\n", - "\n", - "variable_names = {\"lai\": \"LAI\",\n", - " \"ssm\": \"band_data\",\n", - " \"co2\": \"co2\",\n", - " \"canopyheight\": \"__xarray_dataarray_variable__\",\n", - " \"vcmax\": \"__xarray_dataarray_variable__\",\n", - " \"landcover\": \"lccs_class\"} \n", - "\n", - "variable_names = {\"landcover\": \"lccs_class\"} \n", - "for name in variable_names:\n", - " ds = xr.open_zarr(data_paths[name])\n", - " ds_interpolated = interpolation(ds, era5land)\n", - " era5land[name] = ds_interpolated[variable_names[name]].chunk(era5land.chunks)\n", - "\n", - "era5land" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "id": "68792d75-d088-4560-a258-7cb073fde638", - "metadata": {}, - "outputs": [], - "source": [ - "# fix ssm \n", - "era5land[\"ssm\"] = era5land[\"ssm\"] / 1000" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "0f18acfa-c243-45e6-8632-7c02a5b3ddde", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Other variables\n", - "igbp_table = pd.read_csv(data_paths[\"igbp_table\"])\n", - "igbp_class = pd.read_csv(data_paths[\"igbp_class\"])['0'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "08a99f69-8391-49dc-8677-b3c81e65bcc9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# we need to convert landcover to IGBP\n", - "\n", - "# define one hot encoding for IGBP\n", - "encoder = OneHotEncoder(\n", - " sparse_output=False,\n", - ")\n", - "encoder = encoder.fit(np.sort(igbp_class).reshape(-1,1)) # Unsorted categories are not yet supported by dask-ml\n", - "\n", - "lookup_table = igbp_table.set_index(\"lccs_class\").T.to_dict('records')[0]\n", - "\n", - "def map_landcover_to_igbp(landcover_block):\n", - " return np.vectorize(lookup_table.get)(landcover_block)\n", - "\n", - "def landcover_to_igbp(ds, landcover_var_name, encoder):\n", - " landcover = ds[landcover_var_name]\n", - " \n", - " # Replace NaN values with zeros\n", - " landcover = landcover.fillna(0)\n", - " \n", - " igbp = xr.apply_ufunc(map_landcover_to_igbp, landcover, dask=\"parallelized\", output_dtypes=[landcover.dtype])\n", - " igbp_reshaped = igbp.data.reshape(-1, 1)\n", - " transformed = encoder.transform(igbp_reshaped)\n", - "\n", - " # Add each column of the transformed array as a new variable in the dataset\n", - " for i in range(transformed.shape[1]):\n", - " ds[f\"IGBP_veg_long{i+1}\"] = ((\"time\", \"latitude\", \"longitude\"), transformed[:, i].reshape(igbp.shape))\n", - "\n", - " return ds\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "669dafbd-6dfe-4025-a631-227a2bec2d1f", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/sarah/miniconda3/envs/emulator/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3490: PerformanceWarning: Reshaping is producing a large chunk. To accept the large\n", - "chunk and silence this warning, set the option\n", - " >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n", - " ... array.reshape(shape)\n", - "\n", - "To avoid creating the large chunks, set the option\n", - " >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n", - " ... array.reshape(shape)Explictly passing ``limit`` to ``reshape`` will also silence this warning\n", - " >>> array.reshape(shape, limit='128 MiB')\n", - " if await self.run_code(code, result, async_=asy):\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset>\n",
        "Dimensions:          (time: 360, latitude: 469, longitude: 690)\n",
        "Coordinates:\n",
        "  * latitude         (latitude) float32 35.0 35.1 35.2 35.3 ... 81.6 81.7 81.8\n",
@@ -4057,10 +2797,10 @@
        "    IGBP_veg_long11  (time, latitude, longitude) float64 dask.array<chunksize=(71, 469, 690), meta=np.ndarray>\n",
        "Attributes:\n",
        "    Conventions:  CF-1.6\n",
-       "    history:      2023-06-19 03:39:36 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...