From 5dcb9ac8b127f27780914f0c266133dc419c1b48 Mon Sep 17 00:00:00 2001 From: Dsssyc Date: Wed, 4 Mar 2026 02:45:05 +0000 Subject: [PATCH] feat: Enhance MemoryStream to handle large data sizes and update version to 0.1.11 --- README.md | 93 +++++++++++++++- fastcarto/fastdb/include/fastdb.h | 2 +- fastcarto/fastdb/src/FastVectorDbBuild.cpp | 2 +- pyproject.toml | 2 +- python/fastdb4py/orm/__init__.py | 7 +- tests/reproduce_overflow.py | 120 +++++++++++++++++++++ 6 files changed, 221 insertions(+), 5 deletions(-) create mode 100644 tests/reproduce_overflow.py diff --git a/README.md b/README.md index 8e60a7e..5a2e530 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ A C++ local database library with cross language bindings. Aiming to be a fast, lightweight, and easy-to-use data communication solution for RPC and coupled modeling in scientific computing. ## What's new +- **2026-03-04 (Memory Overflow Improvement)**: Enhanced the `MemoryStream` implementation to handle large data sizes exceeding 4GB without causing size overflow in `chunk_data_t.size` (u32). This improvement allows for more robust handling of large datasets in memory. (PR #22) - **2026-02-28 (Release Improvement)**: Fix bugs related to build process in Windows. (PR #20) - **2025-12-31(Bug Fix)**: Fixed an issue where shared memory segments were not being properly unregistered from the resource tracker upon closing, which could lead to resource leaks. (PR #17) - **2025-12-15 (Release Improvement)**: Enabled distribution of pre-compiled binary wheels for macOS (Intel/Apple Silicon) and Linux (x86_64/aarch64), eliminating the need for local compilation tools during installation. (PR #15) @@ -22,7 +23,97 @@ You can install the Python package of fastdb via pip: pip install fastdb4py ``` -**Note:** Pre-compiled binary wheels are provided for major platforms (macOS, Linux). For other systems (including Windows), the package will build from source, requiring a C++ compiler and CMake. 
+**Note:** Pre-compiled binary wheels are provided for major platforms (macOS-Intel/macOS-Apple Silicon, Linux-Ubuntu, Windows-AMD64). For other systems, the package will build from source, requiring a C++ compiler and CMake.
+
+## Usage
+
+### 1. Define a Feature (Schema)
+
+To use `fastdb`, you first need to define your data schema by subclassing `fastdb4py.Feature`.
+Use type hints to define the fields of your feature.
+
+```python
+import fastdb4py
+
+class Point(fastdb4py.Feature):
+    x: fastdb4py.F64
+    y: fastdb4py.F64
+```
+
+### 2. Create and Initialize a Database
+
+You can create a new database or truncate an existing one using `fastdb4py.ORM.truncate`.
+This function takes a list of `TableDefn` objects, specifying the feature class and the initial capacity (number of rows).
+
+```python
+from pathlib import Path
+
+# specify the path for the database
+DB_PATH = "my_fastdb_data"
+
+# Create a new database with a table for 'Point' features, capacity 1000
+# The name parameter is optional; if not provided, a default name will be generated based on the feature class name.
+# In this example, we explicitly set the table name to 'points'.
+db = fastdb4py.ORM.truncate([
+    fastdb4py.TableDefn(Point, 1000, name='points'),
+])
+```
+
+### 3. Write Data
+
+You can access the table using the feature class as a key.
+Features can be accessed by index or iterated over.
+
+```python
+# Access the table 'points' with schema defined by the Point feature class
+points_table = db[Point]['points']
+# If you did not specify the table name when creating the database, you can access it using the default name:
+# points_table = db[Point][Point]
+# or
+# points_table = db[Point]['Point']
+
+# Ensure we are in write mode (if loaded from file later)
+# For a newly created DB in memory, we are already good to go. 
+
+for i in range(10):
+    # Access the feature at index i
+    p = points_table[i]
+
+    # Set field values
+    p.x = i * 1.5
+    p.y = i * 2.5
+    # (only the schema-declared fields, x and y, exist on Point)
+
+# Save the database to disk
+db.save(DB_PATH)
+```
+
+### 4. Read and Modify Data (Columnar Access)
+
+`fastdb` supports high-performance columnar access using NumPy arrays.
+This allows for vectorized operations on your data.
+
+```python
+# Load the database from disk
+db = fastdb4py.ORM.load(DB_PATH, from_file=True)
+points_table = db[Point]['points']
+
+# The length of the table (number of rows) can be obtained using len()
+print(f"Number of points: {len(points_table)}")
+
+# Access fields as numpy arrays via the `.column` property
+xs = points_table.column.x
+ys = points_table.column.y
+
+print(f"First 5 X values: {xs[:5]}")
+
+# Modify data in bulk using numpy operations
+# This modifies the data in memory directly!
+xs += 10.0
+
+# Verify the change via object access
+print(f"Point 0 x: {points_table[0].x}") # Should be 0 * 1.5 + 10.0 = 10.0
+```
diff --git a/fastcarto/fastdb/include/fastdb.h b/fastcarto/fastdb/include/fastdb.h index e162b2e..10145ff 100644 --- a/fastcarto/fastdb/include/fastdb.h +++ b/fastcarto/fastdb/include/fastdb.h @@ -56,7 +56,7 @@ namespace wx struct chunk_data_t { - u32 size; + size_t size; u8* pdata; }; diff --git a/fastcarto/fastdb/src/FastVectorDbBuild.cpp b/fastcarto/fastdb/src/FastVectorDbBuild.cpp index 19a7e5c..2f7b4f7 100644 --- a/fastcarto/fastdb/src/FastVectorDbBuild.cpp +++ b/fastcarto/fastdb/src/FastVectorDbBuild.cpp @@ -43,7 +43,7 @@ namespace wx chunk_data_t MemoryStream::Impl::data() { - return chunk_data_t{ (u32)m_buffer.size(), m_buffer.data() }; + return chunk_data_t{ m_buffer.size(), m_buffer.data() }; } void MemoryStream::Impl::reset() diff --git a/pyproject.toml b/pyproject.toml index 66e7cf0..fddba41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "fastdb4py" -version = "0.1.10" +version = "0.1.11" description = "FastCarto database bindings" readme = "README.md" requires-python = ">=3.10" diff --git a/python/fastdb4py/orm/__init__.py b/python/fastdb4py/orm/__init__.py index 8f5d8ba..6a50c98 100644 --- a/python/fastdb4py/orm/__init__.py +++ b/python/fastdb4py/orm/__init__.py @@ -322,7 +322,12 @@ def close(self): Make sure to unlink the shared memory if you want to completely remove it through the unlink() method by other processes. """ if self._shm: - resource_tracker.unregister(self._shm._name, 'shared_memory') + # Not manually unregistering shared memory + # However, this may cause some warnings in multiprocessing resource tracker + # when the process that shares the memory transmits the ownership to other processes and exits without unlinking the shared memory. + # But it is generally safe to ignore these warnings as long as you ensure proper unlinking of shared memory when it is no longer needed. + # May be optimized in the future if necessary. 
+ # resource_tracker.unregister(self._shm._name, 'shared_memory') self._shm.close() self._shm = None self._origin = None diff --git a/tests/reproduce_overflow.py b/tests/reproduce_overflow.py new file mode 100644 index 0000000..07cd2f5 --- /dev/null +++ b/tests/reproduce_overflow.py @@ -0,0 +1,120 @@ +################################################################# +# Test Script: reproduce_overflow.py +# Purpose: To verify if writing > 4GB to MemoryStream causes size overflow in chunk_data_t.size (u32). +# Usage: python reproduce_overflow.py [size_in_gb] +# Note: This test can consume a lot of RAM due to C++ std::vector resizing strategy. Use with caution. +################################################################# +import sys +import os +import time + +try: + from fastdb4py import core +except ImportError: + # Try local import if package is installed in weird way or running from source + # adjust path as needed + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../python'))) + try: + from fastdb4py import core + except ImportError: + print("Error: Could not import fastdb4py.core") + sys.exit(1) + +def run_test(target_size_gb=4.05): + """ + Tries to write > 4GB data to MemoryStream to verify if size overflows 32-bit integer. + target_size_gb: The size to write in GB. Should be > 4.0 to trigger overflow. 
+ """ + target_bytes = int(target_size_gb * 1024 * 1024 * 1024) + + # Use 64MB chunks to avoid one giant python allocation + chunk_size = 64 * 1024 * 1024 + chunk = b'\x00' * chunk_size + + print(f"[-] Allocating ~{target_size_gb:.2f} GB in MemoryStream...") + print("[-] WARNING: This test can consume > 8GB of RAM due to C++ std::vector resizing strategy.") + print("[-] If your machine has < 8GB RAM, this script might be killed by OOM killer.") + + ms = core.WxMemoryStream() + total_written = 0 + start_time = time.time() + + try: + while total_written < target_bytes: + remaining = target_bytes - total_written + + # Determine how much to write in this iteration + current_write_size = min(remaining, chunk_size) + + # Slice buffer if needed (avoid copy if possible, memoryview might be better but bytes slice works for now) + bytes_to_write = chunk if current_write_size == chunk_size else chunk[:current_write_size] + + # Depending on SWIG typemap, write might take 1 or 2 args. + # Based on inspection, it likely takes 1 argument (buffer) due to %typemap(in) (void* pdata, size_t size) + try: + ms.write(bytes_to_write) + except TypeError: + # Fallback if typemap doesn't match single arg + ms.write(bytes_to_write, len(bytes_to_write)) + + total_written += current_write_size + + # Print progress + if total_written % (512 * 1024 * 1024) < chunk_size: + elapsed = time.time() - start_time + speed = (total_written / (1024**3)) / elapsed if elapsed > 0 else 0 + print(f" Written: {total_written / (1024**3):.2f} GB ({speed:.2f} GB/s)") + + except MemoryError: + print("\n[ERROR] Out of Memory! Python failed to allocate memory.") + sys.exit(1) + except Exception as e: + print(f"\n[ERROR] Unexpected error during write: {e}") + # Even if we crash, we might want to check what we wrote so far + pass + + # Verify result + print(f"\n[-] Finished writing. 
Total intended: {total_written} bytes")
+
+    try:
+        result = ms.data()
+        reported_size = result.size # chunk_data_t.size (size_t after this patch; previously a u32 field)
+
+        print(f"[-] Reported Size (from chunk_data_t.size): {reported_size} bytes")
+
+        expected_modulo = total_written % (2**32)
+
+        if reported_size != total_written:
+            print(f"[FAIL] Size mismatch! {reported_size} != {total_written}")
+            if reported_size == expected_modulo:
+                diff = total_written - reported_size
+                print(f"[FAIL] Bug Reproduced: Size overflowed exactly by {diff} bytes (approx {diff/1024/1024/1024:.0f}GB).")
+                print(f"       This confirms that chunk_data_t.size is truncated to 32-bit unsigned integer.")
+
+                # Check for Magic String issue
+                # If we wrote > 4GB, the reported size is small (modulo).
+                # If the modulo is very small (e.g. < 16 bytes), the magic string (first 16 bytes) is effectively lost
+                # because the system thinks the file is only N bytes long.
+                if reported_size < 16:
+                    print("[CRITICAL] Magic string compromised! File size is reported as < 16 bytes.")
+                else:
+                    print("[WARN] Magic string might be present in first 16 bytes, but file is truncated.")
+            else:
+                print(f"[FAIL] Size mismatch but not exact modulo match. Got {reported_size}")
+        else:
+            print("[PASS] Size matches correctly (No overflow observed).")
+            print("       (Did you write enough data? You typically need > 4GB to trigger this.)")
+
+    except Exception as e:
+        print(f"[ERROR] Failed to inspect result: {e}")
+
+if __name__ == "__main__":
+    size_gb = 4.05 # Default to slightly over 4GB
+    if len(sys.argv) > 1:
+        try:
+            size_gb = float(sys.argv[1])
+        except ValueError:
+            print("Usage: python reproduce_overflow.py [size_in_gb]")
+            sys.exit(1)
+
+    run_test(size_gb)