Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 43 additions & 11 deletions pyglider/ncprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,12 @@ def make_gridfiles(
"""
Turn a timeseries netCDF file into a vertically gridded netCDF.

Timeseries variables can be excluded from the gridded netCDF file
by including ``grid_exclude: 'true'`` in the deployment yaml.
'distance_over_ground' will always be excluded.
'profile_direction', 'profile_time_start', and 'profile_time_end'
will always be included, but will only have one dimension ('profile').

Parameters
----------
inname : str or Path
Expand All @@ -219,13 +225,17 @@ def make_gridfiles(
dz : float, default = 1
Vertical grid spacing in meters. Ignored if ``depth_bins`` is not None

starttime : str, default = '1970-01-01'
The minimum time of data that will be gridded. All data before this
time will be dropped

Returns
-------
outname : str
Name of gridded netCDF file. The gridded netCDF file has dimensions of
'depth' and 'profile', so each variable is gridded in depth bins and by
profile number. Each profile has a time, latitude, and longitude.
The depth values are the bin centers
The depth values are the bin centers.
"""
try:
os.mkdir(outdir)
Expand Down Expand Up @@ -274,6 +284,7 @@ def make_gridfiles(
dsout = xr.Dataset(
coords={'depth': ('depth', depths), 'profile': (xdimname, profiles)}
)
dsout['profile'].attrs = ds.profile_index.attrs
dsout['depth'].attrs = {
'units': 'm',
'long_name': 'Depth',
Expand All @@ -285,26 +296,35 @@ def make_gridfiles(
}

# Bin by profile index, for the mean time, lat, and lon values for each profile
ds['time_1970'] = ds.temperature.copy()
ds['time_1970'] = ds.longitude.copy()
ds['time_1970'].values = ds.time.values.astype(np.float64)
for td in ('time_1970', 'longitude', 'latitude'):
td_lookup = {
'time_1970': 'mean',
'longitude': 'mean',
'latitude': 'mean',
'profile_direction': lambda x: stats.mode(x, keepdims=True)[0][0],
}
for td, bin_stat in td_lookup.items():
good = np.where(~np.isnan(ds[td]) & (ds['profile_index'] % 1 == 0))[0]
dat, xedges, binnumber = stats.binned_statistic(
ds['profile_index'].values[good],
ds[td].values[good],
statistic='mean',
statistic=bin_stat,
bins=[profile_bins],
)
if td == 'time_1970':
td = 'time'
dat = dat.astype('timedelta64[ns]') + np.datetime64('1970-01-01T00:00:00')
_log.info(f'{td} {len(dat)}')
dsout[td] = (('time'), dat, ds[td].attrs)
dsout[td] = (xdimname, dat, ds[td].attrs)

# Bin by profile index, for the profile start (min) and end (max) times
profile_lookup = {'profile_time_start': "min", 'profile_time_end': "max"}
profile_time_lookup = {
'profile_time_start': "min",
'profile_time_end': "max"
}
good = np.where(~np.isnan(ds['time']) & (ds['profile_index'] % 1 == 0))[0]
for td, bin_stat in profile_lookup.items():
for td, bin_stat in profile_time_lookup.items():
_log.debug(f'td, bin_stat {td}, {bin_stat}')
dat, xedges, binnumber = stats.binned_statistic(
ds['profile_index'].values[good],
Expand All @@ -319,9 +339,19 @@ def make_gridfiles(
ds = ds.drop('time_1970')
_log.info(f'Done times!')

for k in ds.keys():
if k in ['time', 'profile', 'longitude', 'latitude', 'depth'] or 'time' in k:
grid_exclude_vars = (
list(dsout.keys())
+ ["depth", "profile_index", "distance_over_ground"]
)
for k in ds.keys():
if (k in grid_exclude_vars) or ('time' in k):
_log.debug('Not gridding %s', k)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks to be gridding longitude and latitude by default now? Or are they in the default exclude_vars?

I wonder if, for non-coordinate data (e.g. temperature), we should really be setting whether to grid it or not in the yaml instead of specifying a list here. E.g., in the yaml, say `grid_exclude: True` if we don't want a variable to be gridded but we do want it in the time series?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks to be gridding longitude and latitude by default now? Or are they in the default exclude_vars?

longitude and latitude (along with time, profile_direction, profile_time_start, and profile_time_end) are all guaranteed to be part of dsout. Thus, they're always added to exclude_vars by the statement `exclude_vars += list(dsout.keys()) + ["depth"]`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if, for non-coordinate data (e.g. temperature), we should really be setting whether to grid it or not in the yaml instead of specifying a list here. E.g., in the yaml, say `grid_exclude: True` if we don't want a variable to be gridded but we do want it in the time series?

A grid_exclude boolean key feels totally reasonable to me. It's also more consistent with the function checking for the 'average_method' key. I'll update the PR now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, this seems good. I was just wondering though if we wanted to allow the user to exclude a timeseries variable from being gridded at this stage by adding a property to the yaml. But maybe that can be a follow up?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@smwoodman I was waiting to understand this...

continue
if 'grid_exclude' in ds[k].attrs:
if ds[k].attrs['grid_exclude'] == 'true':
_log.debug('Not gridding %s due to grid_exclude flag', k)
continue

_log.info('Gridding %s', k)
good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0]
if len(good) <= 0:
Expand Down Expand Up @@ -372,10 +402,10 @@ def make_gridfiles(
dsout.attrs['time_coverage_end'] = dsout.attrs['time_coverage_end'][:19]
# fix standard_name so they don't overlap!
try:
dsout['waypoint_latitude'].attrs.pop('standard_name')
dsout['waypoint_longitude'].attrs.pop('standard_name')
dsout['profile_time_start'].attrs.pop('standard_name')
dsout['profile_time_end'].attrs.pop('standard_name')
dsout['waypoint_latitude'].attrs.pop('standard_name')
dsout['waypoint_longitude'].attrs.pop('standard_name')
except:
pass
# remove, so they can be encoded later:
Expand All @@ -398,6 +428,8 @@ def make_gridfiles(
for k in dsout:
if k in ['profile', 'depth', 'latitude', 'longitude', 'time', 'mission_number']:
dsout[k].attrs['coverage_content_type'] = 'coordinate'
elif k in grid_exclude_vars:
dsout[k].attrs['coverage_content_type'] = 'auxiliaryInformation'
else:
dsout[k].attrs['coverage_content_type'] = 'physicalMeasurement'

Expand Down