
Commit dc7dd7d

feat(s3,utils): use niquests on_upload hook and optimize stream chunking (#49)
1 parent 8463e30 commit dc7dd7d

5 files changed

Lines changed: 179 additions & 41 deletions


tests/s3/test_niquests.py

Lines changed: 7 additions & 6 deletions

```diff
@@ -256,11 +256,12 @@ async def data_stream():
         for i in range(0, len(test_data), chunk_size):
             yield test_data[i : i + chunk_size]
 
-    received_size = 0
+    upload_progress_called = False
 
-    def on_chunk(chunk: bytes):
-        nonlocal received_size
-        received_size += len(chunk)
+    def on_upload(req: niquests.PreparedRequest):
+        nonlocal upload_progress_called
+        if req.upload_progress is not None:
+            upload_progress_called = True
 
     await s3_file_upload(
         s3,
@@ -269,11 +270,11 @@ def on_chunk(chunk: bytes):
         key,
         data_stream(),
         min_part_size=5 * 1024 * 1024,
-        on_chunk_received=on_chunk,
+        on_upload=on_upload,
         content_length=content_length,
     )
 
-    assert received_size == data_size
+    assert upload_progress_called
     result = await s3_get_object(s3, client, s3_bucket, key)
     assert result == test_data
```
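
Beyond the boolean check above, the hook can drive real progress reporting. A minimal sketch of such a callback; the `total` and `content_length` attributes on `upload_progress` are assumptions based on niquests' `TransferProgress`, since only `upload_progress` itself appears in this diff:

```python
import niquests


def make_progress_hook(label: str):
    """Build an on_upload hook that logs byte-level transfer progress."""

    def on_upload(req: niquests.PreparedRequest) -> None:
        progress = req.upload_progress
        if progress is None:
            return
        # Assumed TransferProgress attributes: `total` is bytes sent so far,
        # `content_length` is the expected body size when known.
        if progress.content_length:
            pct = 100 * progress.total / progress.content_length
            print(f"{label}: {progress.total}/{progress.content_length} bytes ({pct:.1f}%)")
        else:
            print(f"{label}: {progress.total} bytes sent")

    return on_upload
```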

tests/test_utils.py

Lines changed: 46 additions & 0 deletions

```diff
@@ -176,3 +176,49 @@ async def async_data():
         assert extra_check(chunks)
 
     asyncio.run(_test())
+
+
+@pytest.mark.parametrize(
+    ("input_chunks", "min_size", "expected_total", "extra_check"),
+    [
+        pytest.param(
+            ["hello", "world", "12345"],
+            5,
+            15,
+            lambda chunks: all(len(c) >= 5 for c in chunks),
+            id="exact_chunks",
+        ),
+        pytest.param(
+            ["small"],
+            100,
+            5,
+            lambda chunks: chunks == ["small"],
+            id="single_small_chunk",
+        ),
+        pytest.param(
+            ["hello", "", "world"],
+            5,
+            10,
+            None,
+            id="empty_chunks_ignored",
+        ),
+    ],
+)
+def test_get_stream_chunk_str(input_chunks, min_size, expected_total, extra_check):
+    from tracktolib.utils import get_stream_chunk_str
+
+    async def _test():
+        async def async_data():
+            for chunk in input_chunks:
+                yield chunk
+
+        chunks = []
+        async for chunk in get_stream_chunk_str(async_data(), min_size=min_size):
+            chunks.append(chunk)
+
+        total_size = sum(len(c) for c in chunks)
+        assert total_size == expected_total
+        if extra_check:
+            assert extra_check(chunks)
+
+    asyncio.run(_test())
```
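
For a concrete read on what these cases assert, the same function can be run standalone; this trace follows the implementation added in tracktolib/utils.py below (chunks of at least `min_size` are emitted once twice that much is buffered, and the tail is flushed at the end):

```python
import asyncio

from tracktolib.utils import get_stream_chunk_str


async def main() -> None:
    async def words():
        for part in ("hel", "lo", "wor", "ld", "!"):
            yield part

    # With min_size=4, nothing is emitted until 8 chars are buffered;
    # then exactly 4 chars are yielded, and the remainder at the end.
    chunks = [c async for c in get_stream_chunk_str(words(), min_size=4)]
    assert chunks == ["hell", "oworld!"]


asyncio.run(main())
```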

tracktolib/s3/niquests.py

Lines changed: 35 additions & 24 deletions

```diff
@@ -31,6 +31,7 @@
 from ..utils import get_stream_chunk
 
 __all__ = (
+    "OnUpload",
     "S3MultipartUpload",
     "S3Object",
     "S3ObjectParams",
@@ -200,10 +201,10 @@ class S3Session:
     ...
     """
 
-    endpoint_url: str
-    access_key: str
-    secret_key: str
-    region: str
+    endpoint_url: str | None = None
+    access_key: str | None = None
+    secret_key: str | None = None
+    region: str | None = None
     s3_config: Config | None = None
     s3_client: botocore.client.BaseClient | None = None
     http_client: niquests.AsyncSession = field(default_factory=niquests.AsyncSession)
@@ -212,7 +213,8 @@ class S3Session:
     def __post_init__(self):
         if self.s3_client is None:
             self._botocore_session = botocore.session.Session()
-            self._botocore_session.set_credentials(self.access_key, self.secret_key)
+            if self.access_key is not None and self.secret_key is not None:
+                self._botocore_session.set_credentials(self.access_key, self.secret_key)
             self.s3_client = self._botocore_session.create_client(
                 "s3",
                 endpoint_url=self.endpoint_url,
@@ -307,7 +309,7 @@ async def file_upload(
         data: AsyncIterator[bytes],
         *,
         min_part_size: int = 5 * 1024 * 1024,
-        on_chunk_received: Callable[[bytes], None] | None = None,
+        on_upload: OnUpload | None = None,
         content_length: int | None = None,
         **kwargs: Unpack[S3ObjectParams],
     ) -> None:
@@ -319,7 +321,7 @@ async def file_upload(
             key,
             data,
             min_part_size=min_part_size,
-            on_chunk_received=on_chunk_received,
+            on_upload=on_upload,
             content_length=content_length,
             **kwargs,
         )
@@ -392,7 +394,7 @@ class UploadPart(TypedDict):
 class S3MultipartUpload(NamedTuple):
     fetch_create: Callable[[], Awaitable[str]]
     fetch_complete: Callable[[], Awaitable[niquests.Response]]
-    upload_part: Callable[[bytes], Awaitable[UploadPart]]
+    upload_part: Callable[[bytes | bytearray], Awaitable[UploadPart]]
     generate_presigned_url: Callable[..., str]
     fetch_abort: Callable[[], Awaitable[niquests.Response]]
 
@@ -508,12 +510,21 @@ async def s3_list_files(
         break
 
 
+type OnUpload = Callable[[niquests.PreparedRequest], None]
+
+
+def _upload_hooks(on_upload: OnUpload | None) -> dict | None:
+    return {"on_upload": [on_upload]} if on_upload else None
+
+
 async def s3_put_object(
     s3: botocore.client.BaseClient,
     client: niquests.AsyncSession,
     bucket: str,
     key: str,
-    data: bytes,
+    data: bytes | bytearray,
+    *,
+    on_upload: OnUpload | None = None,
     **kwargs: Unpack[S3ObjectParams],
 ) -> niquests.Response:
     """
@@ -529,7 +540,9 @@ async def s3_put_object(
         ClientMethod="put_object",
         Params=presigned_params,
     )
-    resp = (await client.put(url, data=data, headers=headers if headers else None)).raise_for_status()
+    resp = (
+        await client.put(url, data=data, headers=headers if headers else None, hooks=_upload_hooks(on_upload))
+    ).raise_for_status()
     return resp
 
 
@@ -638,6 +651,7 @@ async def s3_multipart_upload(
     key: str,
     *,
     expires_in: int = 3600,
+    on_upload: OnUpload | None = None,
     **kwargs: Unpack[S3ObjectParams],
 ) -> AsyncIterator[S3MultipartUpload]:
     """Async context manager for S3 multipart upload with automatic cleanup."""
@@ -670,12 +684,12 @@ async def fetch_abort():
         _has_been_aborted = True
         return abort_resp
 
-    async def upload_part(data: bytes) -> UploadPart:
+    async def upload_part(data: bytes | bytearray) -> UploadPart:
         nonlocal _part_number, _parts
         if upload_id is None:
             raise ValueError("Upload ID is not set")
         presigned_url = _generate_presigned_url("upload_part", UploadId=upload_id, PartNumber=_part_number)
-        upload_resp = (await client.put(presigned_url, data=data)).raise_for_status()
+        upload_resp = (await client.put(presigned_url, data=data, hooks=_upload_hooks(on_upload))).raise_for_status()
         _etag = upload_resp.headers.get("ETag")
         etag: str | None = _etag.decode() if isinstance(_etag, bytes) else _etag
         _part: UploadPart = {"PartNumber": _part_number, "ETag": etag}
@@ -723,38 +737,35 @@ async def s3_file_upload(
     *,
     # 5MB minimum for S3 parts
     min_part_size: int = 5 * 1024 * 1024,
-    on_chunk_received: Callable[[bytes], None] | None = None,
+    on_upload: OnUpload | None = None,
     content_length: int | None = None,
     **kwargs: Unpack[S3ObjectParams],
 ) -> None:
     """
     Upload a file to S3 from an async byte stream.
 
     Uses multipart upload for large files. If `content_length` is provided and smaller
-    than `min_part_size`, uses a single PUT instead. Use `on_chunk_received` callback
-    to track upload progress.
+    than `min_part_size`, uses a single PUT instead. The optional `on_upload` callback
+    receives a `niquests.PreparedRequest` with an `upload_progress` attribute for
+    fine-grained byte-level progress tracking.
     """
     if content_length is not None and content_length < min_part_size:
         # Small file - use single PUT operation
-        _data = b""
+        _data = bytearray()
         async for chunk in data:
-            _data += chunk
-            if on_chunk_received:
-                on_chunk_received(chunk)
-        await s3_put_object(s3, client, bucket=bucket, key=key, data=_data, **kwargs)
+            _data.extend(chunk)
+        await s3_put_object(s3, client, bucket=bucket, key=key, data=bytes(_data), on_upload=on_upload, **kwargs)
         return
 
-    async with s3_multipart_upload(s3, client, bucket=bucket, key=key, **kwargs) as mpart:
+    async with s3_multipart_upload(s3, client, bucket=bucket, key=key, on_upload=on_upload, **kwargs) as mpart:
         await mpart.fetch_create()
         has_uploaded_parts = False
         async for chunk in get_stream_chunk(data, min_size=min_part_size):
-            if on_chunk_received:
-                on_chunk_received(chunk)
             if len(chunk) < min_part_size:
                 if not has_uploaded_parts:
                     # No parts uploaded yet, abort multipart and use single PUT
                     await mpart.fetch_abort()
-                    await s3_put_object(s3, client, bucket=bucket, key=key, data=chunk, **kwargs)
+                    await s3_put_object(s3, client, bucket=bucket, key=key, data=chunk, on_upload=on_upload, **kwargs)
                 else:
                     # Parts already uploaded, upload final chunk as last part (S3 allows last part to be smaller)
                     await mpart.upload_part(chunk)
```
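
End to end, `on_upload` threads from `s3_file_upload` through `_upload_hooks` into every `client.put`. A hedged usage sketch: the bucket, key, region, and client construction here are illustrative assumptions; only the call signature comes from this diff:

```python
import asyncio
from collections.abc import AsyncIterator

import botocore.session
import niquests

from tracktolib.s3.niquests import s3_file_upload


async def main() -> None:
    # Assumed setup, mirroring S3Session.__post_init__ above: botocore
    # presigns the requests, niquests performs the actual transfers.
    s3 = botocore.session.Session().create_client("s3", region_name="us-east-1")
    client = niquests.AsyncSession()

    def on_upload(req: niquests.PreparedRequest) -> None:
        if req.upload_progress is not None:
            print("upload progress reported")

    async def data() -> AsyncIterator[bytes]:
        yield b"x" * (6 * 1024 * 1024)  # above min_part_size: multipart path

    await s3_file_upload(
        s3,
        client,
        "my-bucket",  # hypothetical bucket
        "big-file.bin",  # hypothetical key
        data(),
        min_part_size=5 * 1024 * 1024,
        on_upload=on_upload,
        content_length=6 * 1024 * 1024,
    )


asyncio.run(main())
```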

tracktolib/utils.py

Lines changed: 90 additions & 10 deletions

```diff
@@ -1,6 +1,8 @@
 import asyncio
+import collections
 import datetime as dt
 import importlib.util
+import io
 import itertools
 import mmap
 import os
@@ -112,28 +114,106 @@ def get_chunks[T](it: Iterable[T], size: int, *, as_list: bool = True) -> Iterat
     yield d if not as_list else list(d)
 
 
-async def get_stream_chunk[S: (bytes, str)](data_stream: AsyncIterable[S], min_size: int) -> AsyncIterator[S]:
-    """Buffers an async stream and yields chunks of at least `min_size`."""
-    buffer: S | None = None
+async def get_stream_chunk_str(
+    data_stream: AsyncIterable[str],
+    min_size: int,
+) -> AsyncIterator[str]:
+    """Buffers an async string stream and yields chunks of at least `min_size`."""
+    buffer = ""
     buffer_size = 0
-
     async for chunk in data_stream:
         if not chunk:
             continue
-        buffer = chunk if buffer is None else buffer + chunk  # type: ignore[operator]
+        buffer += chunk
         buffer_size += len(chunk)
-
-        # Yield chunks of min_size while we have enough data for at least 2 chunks
         while buffer_size >= min_size * 2:
             yield buffer[:min_size]
             buffer = buffer[min_size:]
             buffer_size -= min_size
-
-    # Handle the final chunk(s)
-    if buffer is not None and buffer_size > 0:
+    if buffer_size > 0:
         yield buffer
 
 
+class BytesBuffer:
+    """Memory-efficient bytes buffer using a deque of chunks.
+
+    Appends are O(1) (no copy). Reads only copy at chunk boundaries via memoryview.
+    Adapted from urllib3's BytesQueueBuffer.
+    """
+
+    __slots__ = ("buffer", "_size")
+
+    def __init__(self) -> None:
+        self.buffer: collections.deque[bytes | memoryview[bytes]] = collections.deque()
+        self._size: int = 0
+
+    def __len__(self) -> int:
+        return self._size
+
+    def put(self, data: bytes) -> None:
+        self.buffer.append(data)
+        self._size += len(data)
+
+    def get(self, n: int) -> bytes:
+        if not self.buffer:
+            raise RuntimeError("buffer is empty")
+
+        # Fast path: first chunk is exactly the right size
+        if len(self.buffer[0]) == n and isinstance(self.buffer[0], bytes):
+            self._size -= n
+            return self.buffer.popleft()  # type: ignore[return-value]
+
+        fetched = 0
+        ret = io.BytesIO()
+        while fetched < n:
+            remaining = n - fetched
+            chunk = self.buffer.popleft()
+            chunk_length = len(chunk)
+            if remaining < chunk_length:
+                mv = memoryview(chunk)
+                ret.write(mv[:remaining])
+                self.buffer.appendleft(mv[remaining:])  # type: ignore[arg-type]
+                self._size -= remaining
+                break
+            ret.write(chunk)
+            self._size -= chunk_length
+            fetched += chunk_length
+            if not self.buffer:
+                break
+        return ret.getvalue()
+
+    def get_all(self) -> bytes:
+        buffer = self.buffer
+        if not buffer:
+            return b""
+        if len(buffer) == 1:
+            result = buffer.pop()
+            if isinstance(result, memoryview):
+                result = result.tobytes()
+        else:
+            ret = io.BytesIO()
+            ret.writelines(buffer.popleft() for _ in range(len(buffer)))
+            result = ret.getvalue()
+        self._size = 0
+        return result  # type: ignore[return-value]
+
+
+async def get_stream_chunk(
+    data_stream: AsyncIterable[bytes],
+    min_size: int,
+) -> AsyncIterator[bytes]:
+    """Buffers an async byte stream and yields chunks of at least `min_size`."""
+    buffer = BytesBuffer()
+    async for chunk in data_stream:
+        if not chunk:
+            continue
+        buffer.put(chunk)
+        while len(buffer) >= min_size * 2:
+            yield buffer.get(min_size)
+    if len(buffer) > 0:
+        yield buffer.get_all()
+
+
 def json_serial(obj):
     """JSON serializer for objects not serializable by default json code"""
     if isinstance(obj, (dt.datetime, dt.date)):
```
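
A quick standalone check of the buffer semantics (all names come from the diff above; the assertions follow directly from `get`'s fast path, the memoryview split, and the `min_size * 2` buffering rule):

```python
import asyncio

from tracktolib.utils import BytesBuffer, get_stream_chunk

buf = BytesBuffer()
buf.put(b"hello")  # appends are O(1): no copy on put()
buf.put(b"world")
assert len(buf) == 10

# Fast path: the first chunk is exactly n bytes, popped without copying.
assert buf.get(5) == b"hello"

# Split path: 3 bytes are copied out of b"world"; the b"ld" tail stays
# in the deque as a memoryview, so the remainder is not copied again.
assert buf.get(3) == b"wor"
assert buf.get_all() == b"ld"
assert len(buf) == 0


async def main() -> None:
    async def stream():
        for _ in range(4):
            yield b"x" * 3  # 12 bytes arriving in small pieces

    # Chunks of exactly min_size are emitted while at least 2 * min_size
    # bytes are buffered; the remainder is flushed via get_all() at the end.
    chunks = [c async for c in get_stream_chunk(stream(), min_size=4)]
    assert [len(c) for c in chunks] == [4, 4, 4]


asyncio.run(main())
```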

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
