Skip to content

Commit 7a9ad53

Browse files
author
Bryan Lawrence
committed
Fixing all but one test
1 parent db4c65b commit 7a9ad53

File tree

9 files changed

+291
-246
lines changed

9 files changed

+291
-246
lines changed

activestorage/backends.py

Lines changed: 52 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,25 +44,47 @@ def __init__(self, active):
4444
def reduce_chunk(self, request: ChunkRequest) -> ChunkResult:
4545
simulate_cbor = bool((self._active.storage_options or {}).get("local_simulate_cbor"))
4646
method = self._active._methods.get(request.method) if request.method else None
47-
data, count = reduce_chunk(
48-
request.uri,
49-
request.offset,
50-
request.size,
51-
request.compressor,
52-
request.filters,
53-
(
54-
request.missing.fill_value,
55-
request.missing.missing_value,
56-
request.missing.valid_min,
57-
request.missing.valid_max,
58-
),
59-
request.dtype,
60-
request.chunks,
61-
request.order,
62-
request.chunk_selection,
63-
method=method,
64-
axis=request.axis,
47+
missing = (
48+
request.missing.fill_value,
49+
request.missing.missing_value,
50+
request.missing.valid_min,
51+
request.missing.valid_max,
6552
)
53+
54+
# Remote datasets loaded through fsspec/pyfive need fh-based reads,
55+
# not open(uri, "rb") on the URL string.
56+
parsed = urllib.parse.urlparse(str(request.uri))
57+
if parsed.scheme in ("http", "https", "s3"):
58+
fh = self._active._format.file_handle
59+
data, count = reduce_opens3_chunk(
60+
fh,
61+
request.offset,
62+
request.size,
63+
request.compressor,
64+
request.filters,
65+
missing,
66+
request.dtype,
67+
request.chunks,
68+
request.order,
69+
request.chunk_selection,
70+
method=method,
71+
axis=request.axis,
72+
)
73+
else:
74+
data, count = reduce_chunk(
75+
request.uri,
76+
request.offset,
77+
request.size,
78+
request.compressor,
79+
request.filters,
80+
missing,
81+
request.dtype,
82+
request.chunks,
83+
request.order,
84+
request.chunk_selection,
85+
method=method,
86+
axis=request.axis,
87+
)
6688
if simulate_cbor:
6789
payload = reductionist.encode_result(data, count)
6890
data, count = reductionist.decode_result_buffer(payload)
@@ -118,12 +140,20 @@ def reduce_chunk(self, request: ChunkRequest) -> ChunkResult:
118140

119141
bucket, obj = self._resolve_bucket_object(request.uri, self._active.storage_options)
120142
if self._active.storage_options is None:
121-
source = S3_URL
143+
endpoint = S3_URL
122144
server = S3_ACTIVE_STORAGE_URL
123145
else:
124-
source = get_endpoint_url(self._active.storage_options, request.uri)
146+
endpoint = get_endpoint_url(self._active.storage_options, request.uri)
125147
server = self._active.active_storage_url or S3_ACTIVE_STORAGE_URL
126148

149+
endpoint = str(endpoint).rstrip("/")
150+
bucket = str(bucket).strip("/")
151+
obj = str(obj).lstrip("/")
152+
if bucket:
153+
source = f"{endpoint}/{bucket}/{obj}"
154+
else:
155+
source = f"{endpoint}/{obj}"
156+
127157
session = self.get_session()
128158
data, count = reductionist.reduce_chunk(
129159
session,
@@ -146,6 +176,7 @@ def reduce_chunk(self, request: ChunkRequest) -> ChunkResult:
146176
axis=request.axis,
147177
operation=request.method,
148178
interface_type='s3',
179+
option_disable_chunk_cache=self._active._option_disable_chunk_cache,
149180
)
150181
self.close_session(session)
151182
return ChunkResult(data=data, count=count, out_selection=())
@@ -184,6 +215,7 @@ def reduce_chunk(self, request: ChunkRequest) -> ChunkResult:
184215
axis=request.axis,
185216
operation=request.method,
186217
interface_type='https',
218+
option_disable_chunk_cache=self._active._option_disable_chunk_cache,
187219
)
188220
self.close_session(session)
189221
return ChunkResult(data=data, count=count, out_selection=())

activestorage/core.py

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def _select_backend(interface_type, version):
144144
("s3", 1): S3Backend,
145145
("s3", 2): S3Backend,
146146
("https", 0): HttpsBackend,
147+
("https", 1): LocalBackend,
147148
("https", 2): HttpsBackend,
148149
("p5rem", 0): P5RemBackend,
149150
("p5rem", 1): P5RemBackend,
@@ -183,28 +184,58 @@ def __new__(cls, *args, **kwargs):
183184
def __init__(
184185
self,
185186
uri,
186-
ncvar,
187+
ncvar=None,
187188
storage_type=None,
189+
interface_type=None,
188190
max_threads=100,
189191
storage_options=None,
190192
active_storage_url=None,
191193
axis=None,
194+
option_disable_chunk_cache=False,
192195
):
193196
self.uri = uri
194197
if self.uri is None:
195198
raise ValueError(f"Must use a valid file for uri. Got {uri}")
196199

197-
self.storage_type = storage_type or return_interface_type(uri)
198-
self.storage_options = storage_options
200+
# Keep source URI when a dataset/variable object is provided.
201+
is_pathlike = isinstance(uri, (str, bytes, os.PathLike))
202+
source_uri = uri
203+
if not is_pathlike:
204+
file_obj = getattr(uri, "file", None)
205+
fh = getattr(file_obj, "_fh", None)
206+
source_uri = (
207+
getattr(fh, "path", None)
208+
or getattr(fh, "url", None)
209+
or str(uri)
210+
)
211+
212+
# interface_type is an alias for storage_type
213+
if interface_type is not None:
214+
storage_type = interface_type
215+
216+
self.storage_type = storage_type or return_interface_type(source_uri)
217+
self.storage_options = storage_options or {}
218+
if self.storage_type is None:
219+
# Backward-compatible inference for bare S3 object paths like
220+
# "bucket/key.nc" when only storage_options indicate S3 access.
221+
# Only infer when the URI is not an existing local file.
222+
s3_hints = {"key", "secret", "anon", "client_kwargs", "endpoint_url"}
223+
if any(k in self.storage_options for k in s3_hints):
224+
if not (is_pathlike and os.path.isfile(str(uri))):
225+
self.storage_type = "s3"
226+
self._option_disable_chunk_cache = bool(option_disable_chunk_cache)
199227
self.active_storage_url = active_storage_url
200228

201-
if not os.path.isfile(self.uri) and not self.storage_type:
229+
# Allow passing dataset/variable objects directly (ncvar optional).
230+
is_file_object = not is_pathlike
231+
if is_pathlike and not os.path.isfile(self.uri) and not self.storage_type:
202232
raise ValueError(f"Must use existing file for uri. {self.uri} not found")
203233

204-
self.ncvar = ncvar
205-
if self.ncvar is None:
234+
# When uri is a dataset object, ncvar can be None (user will select variable via indexing)
235+
if ncvar is None and not is_file_object:
206236
raise ValueError("Must set a netCDF variable name to slice")
207237

238+
self._ncvar = ncvar
208239
self._version = 1
209240
self._components = False
210241
self._method = None
@@ -213,9 +244,19 @@ def __init__(
213244
self.metric_data = {}
214245
self.data_read = 0
215246

216-
self._format = _select_format(uri)()
217-
self._format.open(uri, storage_options)
218-
self.ds = self._format.get_variable(ncvar)
247+
self._format = _select_format(source_uri)()
248+
self._format._storage_type = self.storage_type
249+
if is_file_object:
250+
# uri is already a pyfive.Group or similar
251+
self._format._dataset = uri
252+
self._format._uri = str(source_uri)
253+
self.ds = uri
254+
else:
255+
self._format.open(uri, self.storage_options)
256+
if ncvar is not None:
257+
self.ds = self._format.get_variable(ncvar)
258+
else:
259+
self.ds = None
219260
self.missing = None
220261

221262
self._refresh_backend()
@@ -228,6 +269,10 @@ def ncvar(self):
228269
def ncvar(self, value):
229270
self._ncvar = value
230271

272+
@property
273+
def interface_type(self):
274+
return self.storage_type
275+
231276
def _refresh_backend(self):
232277
backend_cls = _select_backend(self.storage_type, self._version)
233278
self._backend = backend_cls(self)

activestorage/formats.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,15 @@ def open(self, uri, storage_options):
1919
scheme = ""
2020
if "://" in self._uri:
2121
scheme = self._uri.split("://", 1)[0]
22-
if scheme == "s3":
22+
storage_type = getattr(self, '_storage_type', None) or scheme
23+
if storage_type == "s3":
2324
from activestorage import active as active_module
2425

2526
self._dataset_file = active_module.load_from_s3(self._uri, storage_options)
27+
elif storage_type in ("http", "https"):
28+
from activestorage import active as active_module
29+
30+
self._dataset_file = active_module.load_from_https(self._uri, storage_options)
2631
else:
2732
self._dataset_file = pyfive.File(self._uri)
2833

activestorage/helpers.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,26 @@ def load_from_https(uri, storage_options=None):
1515
Load a pyfive.high_level.Dataset from a netCDF4 file on an https server.
1616
Works for both http and https endpoints.
1717
"""
18-
if storage_options is None:
19-
client_kwargs = {'auth': None}
20-
fs = fsspec.filesystem('http', **client_kwargs)
21-
http_file = fs.open(uri, 'rb')
22-
else:
23-
username = storage_options.get("username", None)
24-
password = storage_options.get("password", None)
25-
client_kwargs = {
26-
'auth': aiohttp.BasicAuth(username, password) if username and password else None
27-
}
28-
fs = fsspec.filesystem('http', **client_kwargs)
29-
http_file = fs.open(uri, 'rb')
18+
try:
19+
if storage_options is None:
20+
client_kwargs = {'auth': None}
21+
fs = fsspec.filesystem('http', **client_kwargs)
22+
http_file = fs.open(uri, 'rb')
23+
else:
24+
username = storage_options.get("username", None)
25+
password = storage_options.get("password", None)
26+
client_kwargs = {
27+
'auth': aiohttp.BasicAuth(username, password) if username and password else None
28+
}
29+
fs = fsspec.filesystem('http', **client_kwargs)
30+
http_file = fs.open(uri, 'rb')
31+
except FileNotFoundError as exc:
32+
# fsspec wraps all failures as FileNotFoundError.
33+
# Distinguish by cause: connection-level errors (bad hostname, refused)
34+
# have an OSError cause; HTTP-level errors (404) do not.
35+
if isinstance(exc.__cause__, OSError):
36+
raise ValueError(f"Failed to access HTTPS dataset: {uri}") from exc
37+
raise
3038

3139
ds = pyfive.File(http_file)
3240
print(f"Dataset loaded from https with Pyfive: {uri}")

pyproject.toml

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,79 @@
11
[build-system]
22
requires = ["setuptools >= 40.6.0", "wheel", "setuptools_scm>=6.2"]
33
build-backend = "setuptools.build_meta"
4+
5+
[project]
6+
name = "ActiveStorage"
7+
dynamic = ["version"]
8+
description = ""
9+
readme = {file = "README.md", content-type = "text/markdown"}
10+
classifiers = [
11+
"Development Status :: 0 - Prototype",
12+
"Environment :: Console",
13+
"Intended Audience :: Developers",
14+
"Intended Audience :: Science/Research",
15+
"Natural Language :: English",
16+
"Operating System :: POSIX :: Linux",
17+
"Programming Language :: Python :: 3",
18+
"Programming Language :: Python :: 3.10",
19+
"Programming Language :: Python :: 3.11",
20+
"Programming Language :: Python :: 3.12",
21+
"Programming Language :: Python :: 3.13",
22+
"Topic :: Scientific/Engineering",
23+
"Topic :: Scientific/Engineering :: Atmospheric Science",
24+
"Topic :: Scientific/Engineering :: GIS",
25+
"Topic :: Scientific/Engineering :: Hydrology",
26+
"Topic :: Scientific/Engineering :: Physics",
27+
]
28+
dependencies = [
29+
"dask!=2024.8.0", # github.com/dask/dask/issues/11296
30+
"fsspec",
31+
"h5netcdf",
32+
"h5py",
33+
"kerchunk>=0.2.4",
34+
"netcdf4",
35+
"numcodecs>=0.12", # github/issues/162
36+
"numpy!=1.24.3", # severe masking bug
37+
"requests",
38+
"s3fs>=2024.2.0",
39+
"zarr>=2.13.3", # github.com/zarr-developers/zarr-python/issues/1362
40+
]
41+
42+
[project.optional-dependencies]
43+
test = [
44+
"moto",
45+
"pytest",
46+
"pytest-cov>=2.10.1",
47+
"pytest-html!=2.1.0",
48+
"pytest-metadata>=1.5.1",
49+
"pytest-xdist",
50+
]
51+
52+
[tool.setuptools]
53+
packages = ["activestorage"]
54+
include-package-data = true
55+
56+
[tool.setuptools_scm]
57+
58+
[tool.pytest.ini_options]
59+
addopts = """
60+
-m 'not slow'
61+
--ignore=old_code/
62+
--ignore=tests/s3_exploratory
63+
--ignore=bnl
64+
--cov=activestorage
65+
--cov-report=xml:test-reports/coverage.xml
66+
--cov-report=html:test-reports/coverage_html
67+
--html=test-reports/report.html
68+
"""
69+
markers = [
70+
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
71+
]
72+
73+
[tool.coverage.run]
74+
parallel = true
75+
76+
[tool.coverage.report]
77+
exclude_lines = [
78+
"if __name__ == .__main__.:",
79+
]

setup.cfg

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)