Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions nchelpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import hashlib
import re
import collections
import uuid

from cached_property import cached_property
import numpy as np
Expand Down Expand Up @@ -1751,16 +1752,10 @@ def cmor_filename(self):
extension='.nc', **self._cmor_type_filename_components()
)

@property
@cached_property
def unique_id(self):
"""A unique id for this file, based on its CMOR filename"""
unique_id = cmor_type_filename(**self._cmor_type_filename_components())

dim_axes = set(self.dim_axes_from_names().values())
if not (dim_axes <= {'X', 'Y', 'Z', 'T'}):
unique_id += "_dim" + ''.join(sorted(dim_axes))

return unique_id.replace('+', '-') # In original code, but why?
"""A unique id for this file"""
return str(uuid.uuid4())
Copy link
Copy Markdown
Collaborator

@rod-glover rod-glover Aug 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This generates a different random UUID every time it is executed, which means that every time a program that uses nchelpers is run, a different UUID will be generated for any given dataset (file).

But I think the way we use unique_id relies on it being stable across time.

Possible problematic use case: Indexing. When a file is indexed in any given run, its unique_id will be generated (only once during that indexing run, but as a random value). Then when the same file is re-indexed in a later indexing run (it has moved location, say), its unique_id is generated again, but it is by definition different from the first one. If the indexer (or anything else) tries to look up this file according to its unique_id, it will get incorrect results. Essentially, if anything (e.g., a modelmeta database) has a memory of a past unique_id for a file, it will be wrong the next time.

Copy link
Copy Markdown
Collaborator

@rod-glover rod-glover Aug 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, however, this argument also can be used against using metadata to generate the unique_id, since reindexing could in principle include reindexing a data file with slightly different metadata. But I think that case is implicitly excluded; the idea was that the metadata uniquely identified the file. We have an identity crisis here (pun, but serious too). What is it that uniquely characterizes a file?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I was concerned about that too, so thank you for pointing this out.

Admittedly, I filed this PR more as provocation than as a finished product.

Maybe it would be better to change this from an (assumed to be static) property to a method (e.g. create_unique_id()), so users of it have to be intentional about when they want a new one (for example, during indexing) and when they want the existing one.

Copy link
Copy Markdown
Collaborator

@rod-glover rod-glover Aug 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds more robust.

The question is how to generate the unique id and, if it is not a deterministic computation based on the file contents, how to persist it. One solution is to make the method create_unique_id() write its value to the metadata (attribute unique_id?) in the file, and for the property unique_id to read it from there. If we want to emphasize "don't screw with this", then maybe name the attribute __unique_id__ or something similarly repellent.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's a great idea.


###########################################################################
# Climatology-specific methods
Expand Down
21 changes: 0 additions & 21 deletions tests/test_CFDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('gcm', 'ensemble_member', 'r1i1p1'),
('gcm', 'cmor_filename',
'tasmax_day_BNU-ESM_historical_r1i1p1_19650101-19750101.nc'),
('gcm', 'unique_id',
'tasmax_day_BNU-ESM_historical_r1i1p1_19650101-19750101'),

('downscaled', 'first_MiB_md5sum', '6ebca934615ad7e6bd328bcc6fa9058b'),
('downscaled', 'md5', '6ebca934615ad7e6bd328bcc6fa9058b'),
Expand All @@ -126,8 +124,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('downscaled', 'ensemble_member', 'r1i1p1'),
('downscaled', 'cmor_filename',
'tasmax_day_BCCAQ2_ACCESS1-0_historical+rcp45_r1i1p1_19600101-19911231.nc'),
('downscaled', 'unique_id',
'tasmax_day_BCCAQ2_ACCESS1-0_historical-rcp45_r1i1p1_19600101-19911231'),

('hydromodel_gcm', 'first_MiB_md5sum', '6544f8a39ba722e2085677525269c883'),
('hydromodel_gcm', 'md5', '36af1a6d4665fecf0d1a727a7cbdc6ef'),
Expand All @@ -154,9 +150,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('hydromodel_gcm', 'cmor_filename',
'BASEFLOW+EVAP+GLAC_AREA_BAND+GLAC_MBAL_BAND+RUNOFF+SWE_BAND_day_VICGL+'
'RGM+HydroCon_ACCESS1-0_historical+rcp45_r1i1p1_19840101-19951231.nc'),
('hydromodel_gcm', 'unique_id',
'BASEFLOW-EVAP-GLAC_AREA_BAND-GLAC_MBAL_BAND-RUNOFF-SWE_BAND_day_VICGL-'
'RGM-HydroCon_ACCESS1-0_historical-rcp45_r1i1p1_19840101-19951231'),

# Note: The following properties are not meaningful for a climatological
# output file and so are not tested:
Expand Down Expand Up @@ -184,8 +177,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('mClim_gcm', 'model_type', 'GCM'),
('mClim_gcm', 'cmor_filename',
'tasmax_mClim_BNU-ESM_historical_r1i1p1_19650101-19701231.nc'),
('mClim_gcm', 'unique_id',
'tasmax_mClim_BNU-ESM_historical_r1i1p1_19650101-19701231'),

('sClim_gcm', 'first_MiB_md5sum', 'ecd2a0a28ffc12cc795d4e6b623543b6'),
('sClim_gcm', 'md5', 'ecd2a0a28ffc12cc795d4e6b623543b6'),
Expand All @@ -209,8 +200,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('sClim_gcm', 'model_type', 'GCM'),
('sClim_gcm', 'cmor_filename',
'tasmax_sClim_BNU-ESM_historical_r1i1p1_19650101-19701231.nc'),
('sClim_gcm', 'unique_id',
'tasmax_sClim_BNU-ESM_historical_r1i1p1_19650101-19701231'),

('aClim_gcm', 'first_MiB_md5sum', 'b002ec3839db4daffdad335ad0d31563'),
('aClim_gcm', 'md5', 'b002ec3839db4daffdad335ad0d31563'),
Expand All @@ -234,8 +223,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('aClim_gcm', 'model_type', 'GCM'),
('aClim_gcm', 'cmor_filename',
'tasmax_aClim_BNU-ESM_historical_r1i1p1_19650101-19701231.nc'),
('aClim_gcm', 'unique_id',
'tasmax_aClim_BNU-ESM_historical_r1i1p1_19650101-19701231'),

('climdex_ds_gcm', 'first_MiB_md5sum', '5cbe8412f19599f893ba28062e0d7a9b'),
('climdex_ds_gcm', 'md5', '5cbe8412f19599f893ba28062e0d7a9b'),
Expand All @@ -262,8 +249,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('climdex_ds_gcm', 'cmor_filename',
'altcddETCCDI_yr_BCCAQ_ACCESS1-0_historical+rcp85_'
'r1i1p1_19500702-21000702.nc'),
('climdex_ds_gcm', 'unique_id',
'altcddETCCDI_yr_BCCAQ_ACCESS1-0_historical-rcp85_r1i1p1_19500702-21000702'),

('gridded_obs', 'first_MiB_md5sum', '6e4b0f8968a18ffa917e34b68a3e5636'),
('gridded_obs', 'md5', '6e4b0f8968a18ffa917e34b68a3e5636'),
Expand All @@ -287,8 +272,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('gridded_obs', 'is_gridded_obs', True),
('gridded_obs', 'cmor_filename',
'pr_day_SYMAP_BC_v1_historical_19500101-19500104.nc'),
('gridded_obs', 'unique_id',
'pr_day_SYMAP_BC_v1_historical_19500101-19500104'),

('gridded_mClimSD_obs', 'first_MiB_md5sum', '7eb975dfd17845621123400dbb6d0e5b'),
('gridded_mClimSD_obs', 'md5', '7eb975dfd17845621123400dbb6d0e5b'),
Expand All @@ -312,8 +295,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('gridded_mClimSD_obs', 'is_gridded_obs', True),
('gridded_mClimSD_obs', 'cmor_filename',
'pr_mClimSD_anusplin_historical_19710201-20000531.nc'),
('gridded_mClimSD_obs', 'unique_id',
'pr_mClimSD_anusplin_historical_19710201-20000531'),

('streamflow', 'first_MiB_md5sum', 'e399c143415d13b7eab6809daa9cfc2f'),
('streamflow', 'md5', 'e399c143415d13b7eab6809daa9cfc2f'),
Expand All @@ -339,8 +320,6 @@ def test_filepath(cwd, raw_dataset, converter, expected):
('streamflow', 'ensemble_member', 'r1i2p3'),
('streamflow', 'cmor_filename',
'streamflow_day_model_exp_r1i2p3_19450102-19450117.nc'),
('streamflow', 'unique_id',
'streamflow_day_model_exp_r1i2p3_19450102-19450117'),

], indirect=['tiny_dataset'])
def test_simple_property(tiny_dataset, prop, expected):
Expand Down