Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ dist
*.pdf
*.png
_*
.claude
52 changes: 52 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,58 @@ Bld['BldAeroNodes'] = ADProp
Bld.write('AeroDyn_Blade_Modified.dat')
```

## Streaming Mode for Large Files (New!)

For large output files (GB-sized), you can use streaming mode to inspect file headers and metadata without loading all data into memory:

**Header-only inspection:**
```python
import weio

# Read only metadata without loading data (memory efficient!)
with weio.read('large_output.outb', streaming=True) as f:
print(f['attribute_names']) # Channel names
print(f['attribute_units']) # Units
print(f.description) # File description
# f.data is None - no data loaded yet
# File automatically closes when exiting 'with' block
```

**Load data after inspecting headers:**
```python
import weio

# Inspect headers first, then decide whether to load
with weio.read('large_output.out', streaming=True) as f:
print(f"File has {len(f['attribute_names'])} channels")

# Only load data if needed
f.readAll()
df = f.toDataFrame()
print(df.shape)
```

**Process CSV files in chunks:**
```python
import weio

# Process very large CSV files incrementally
with weio.read('huge_dataset.csv', streaming=True) as f:
print(f"Columns: {f.colNames}")

# Read and process data in chunks
while True:
chunk = f.readChunk(nlines=10000)
if chunk is None:
break
# Process this chunk...
print(f"Processing {len(chunk)} rows")
```

**Supported formats:** OpenFAST output files (`.out`, `.outb`), CSV files (`.csv`), HAWC2 files (`.dat`, `.sel`)

**Memory savings:** Up to 10,000x reduction for header-only inspection of large files!

## Requirements
The library is compatible python 3, and has limited requirements.
If you have pip installed on your system, you can install them by typing in a terminal:
Expand Down
89 changes: 89 additions & 0 deletions STREAMING_QUICKREF.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Streaming Mode - Quick Reference

## For Users

### Header-only inspection (memory efficient!)
```python
import weio

with weio.read('large_file.outb', streaming=True) as f:
print(f['attribute_names']) # Column names
print(f['attribute_units']) # Units
# f.data is None - no data loaded
```

### Load data after inspection
```python
import weio

with weio.read('file.out', streaming=True) as f:
print(f"Channels: {len(f['attribute_names'])}")
f.readAll() # Load data now
df = f.toDataFrame()
```

### Process CSV in chunks
```python
import weio

with weio.read('huge.csv', streaming=True) as f:
while True:
chunk = f.readChunk(nlines=10000)
if chunk is None:
break
# Process chunk...
```

## For Developers

### Adding streaming to a new file format

1. **Add `streaming` parameter to `_read()`:**
```python
def _read(self, streaming=False, **kwargs):
if streaming:
# Read headers only, keep file open
self._fid = open(self.filename, 'r')
self['header'] = self._fid.readline()
self.data = None
else:
# Normal mode: read everything
with open(self.filename, 'r') as f:
self.data = f.read()
```

2. **Implement `_readAll()`:**
```python
def _readAll(self):
if self._fid is None:
raise RuntimeError("No open file handle")
# Read remaining data
self.data = self._fid.read()
```

3. **Optional: Implement `_readChunk()`:**
```python
def _readChunk(self, nlines=None, **kwargs):
if self._fid is None:
raise RuntimeError("No open file handle")
if nlines is None:
nlines = 1000
# Read chunk...
return chunk # or None if EOF
```

That's it! The base File class handles context managers and enforcement.

## Supported Formats (so far)

- ✅ **OpenFAST:** `.out` (ASCII), `.outb` (binary)
- ✅ **CSV:** `.csv`, `.txt` (with chunk reading)
- ✅ **HAWC2:** `.dat`, `.sel` (ASCII and binary)

## Key Design Points

- **Single `streaming` parameter** (not both `headerOnly` and `streaming`)
- **Context manager required** for streaming mode (`with` statement)
- **Header-only:** Exit `with` block without calling `readAll()`
- **Full streaming:** Call `readAll()` or `readChunk()` inside `with` block
- **Backward compatible:** Default `streaming=False` means existing code works unchanged
77 changes: 74 additions & 3 deletions weio/_NEWFILE_TEMPLATE.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,86 @@ def write(self, filename=None):
# Calling (children) function to write
self._write()

def _read(self):
""" Reads self.filename and stores data into self. Self is (or behaves like) a dictionary"""
# --- Example:
def _read(self, streaming=False, **kwargs):
"""
Reads self.filename and stores data into self. Self is (or behaves like) a dictionary

Parameters
----------
streaming : bool
If True, read only headers and keep file open for later reading.
Requires context manager. Default: False (read entire file)
"""
# --- Example (normal mode - read everything):
#self['data']=[]
#with open(self.filename, 'r', errors="surrogateescape") as f:
# for i, line in enumerate(f):
# self['data'].append(line)

# --- Example (with streaming support):
# if streaming:
# # Read headers only, keep file open
# self._fid = open(self.filename, 'r', errors="surrogateescape")
# # Read header lines
# self['header_line'] = self._fid.readline()
# # Parse header info
# self['attribute_names'] = self['header_line'].split()
# # File is now positioned at start of data
# else:
# # Normal mode: read entire file
# self['data']=[]
# with open(self.filename, 'r', errors="surrogateescape") as f:
# f.readline() # skip header
# for i, line in enumerate(f):
# self['data'].append(line)
raise NotImplementedError()

def _readAll(self):
"""
Read all remaining data after header in streaming mode.
Only called when streaming=True and readAll() is invoked.
"""
# --- Example:
# if self._fid is None:
# raise RuntimeError("No open file handle")
#
# # Read all remaining lines from current position
# self['data'] = []
# for line in self._fid:
# self['data'].append(line.strip())
raise NotImplementedError(f"{self.__class__.__name__} does not support readAll()")

def _readChunk(self, **kwargs):
"""
Read a chunk of data from current position in streaming mode.
Only called when streaming=True and readChunk() is invoked.

Parameters
----------
**kwargs : dict
Format-specific parameters (e.g., nlines=1000, nrows=100, nbytes=1024)
Different file formats can implement different chunking strategies.

Returns
-------
chunk : data
The chunk of data read, or None if end of file
"""
# --- Example:
# if self._fid is None:
# raise RuntimeError("No open file handle")
#
# nlines = kwargs.get('nlines', 1000) # default chunk size
#
# chunk = []
# for i in range(nlines):
# line = self._fid.readline()
# if not line:
# return None if not chunk else chunk
# chunk.append(line.strip())
# return chunk
raise NotImplementedError(f"{self.__class__.__name__} does not support readChunk()")

def _write(self):
""" Writes to self.filename"""
# --- Example:
Expand Down
104 changes: 92 additions & 12 deletions weio/csv_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def formatName():
return 'CSV file'

def __init__(self, filename=None, sep=None, colNames=None, commentChar=None, commentLines=None,\
colNamesLine=None, detectColumnNames=True, header=None, doRead=True, **kwargs):
colNamesLine=None, detectColumnNames=True, header=None, doRead=True, streaming=False, **kwargs):
# Initialize CSV-specific attributes
colNames = [] if colNames is None else colNames
commentLines = [] if commentLines is None else commentLines
self.sep = sep
Expand All @@ -45,7 +46,6 @@ def __init__(self, filename=None, sep=None, colNames=None, commentChar=None, com
self.commentLines = commentLines
self.colNamesLine = colNamesLine
self.detectColumnNames = detectColumnNames
self.data=[]
if header is None:
self.header=[]
else:
Expand All @@ -58,20 +58,47 @@ def __init__(self, filename=None, sep=None, colNames=None, commentChar=None, com
raise Exception('Provide either `commentChar` or `commentLines` for CSV file types')
if (len(self.colNames)>0) and (self.colNamesLine is not None):
raise Exception('Provide either `colNames` or `colNamesLine` for CSV file types')

# Call parent __init__ - handles streaming, filename, etc.
File.__init__(self, filename=None, streaming=streaming)

# Handle filename after parent init
if filename:
self.read(filename, doRead=doRead, **kwargs)
else:
self.filename = None
if streaming:
# Don't read immediately in streaming mode - wait for context manager
self.filename = filename
else:
self.read(filename, doRead=doRead, **kwargs)

def __enter__(self):
Comment thread
ebranlard marked this conversation as resolved.
"""Context manager entry - CSV needs special handling for detect()."""
self._in_context = True
if self.filename:
# In streaming mode: detect headers, open file but don't read data
# In normal mode: read everything
if self.streaming:
self.read(doRead=False) # Run detect() only
self._read() # Open file handle and skip to data
else:
self.read(doRead=True)
return self

# Inherit __exit__ and _enforce_context_if_needed from parent File class

def read(self, filename=None, doRead=True, **kwargs):
def read(self, filename=None, doRead=True, streaming=None, **kwargs):
if filename:
self.filename = filename
if streaming is not None:
self.streaming = streaming
if not self.filename:
raise Exception('No filename provided')
if not os.path.isfile(self.filename):
raise OSError(2,'File not found:',self.filename)
if os.stat(self.filename).st_size == 0:
raise EmptyFileError('File is empty:',self.filename)

self._enforce_context_if_needed()

# Calling children function
self.detect()
if doRead:
Expand Down Expand Up @@ -259,16 +286,69 @@ def strIsFloat(s):
#print(skiprows)

def _read(self):
"""Read CSV data. In streaming mode, file handle is kept open."""
try:
with open(self.filename,'r',encoding=self.encoding) as f:
self.data = pd.read_csv(f,sep=self.sep,skiprows=self.skiprows,header=None,comment=self.commentChar)
if self.streaming:
# Streaming mode: keep file open
self._fid = open(self.filename, 'r', encoding=self.encoding)
# Skip to data start
for _ in range(max(self.skiprows) + 1 if self.skiprows else 0):
self._fid.readline()
# Data is not loaded yet
self.data = None
else:
# Normal mode: read entire file
with open(self.filename,'r',encoding=self.encoding) as f:
self.data = pd.read_csv(f,sep=self.sep,skiprows=self.skiprows,header=None,comment=self.commentChar)

if (len(self.colNames)==0) or (len(self.colNames)!=len(self.data.columns)):
self.colNames=['C{}'.format(i) for i in range(len(self.data.columns))]
self.data.columns = self.colNames
self.data.rename(columns=lambda x: x.strip(),inplace=True)
except pd.errors.ParserError as e:
raise WrongFormatError('CSV File {}: '.format(self.filename)+e.args[0])

if (len(self.colNames)==0) or (len(self.colNames)!=len(self.data.columns)):
self.colNames=['C{}'.format(i) for i in range(len(self.data.columns))]
self.data.columns = self.colNames;
self.data.rename(columns=lambda x: x.strip(),inplace=True)
def _readAll(self):
"""Read all remaining CSV data in streaming mode."""
if self._fid is None:
raise RuntimeError("No open file handle")

try:
# Read remaining data from current position
self.data = pd.read_csv(self._fid, sep=self.sep, header=None, comment=self.commentChar)

if (len(self.colNames)==0) or (len(self.colNames)!=len(self.data.columns)):
self.colNames=['C{}'.format(i) for i in range(len(self.data.columns))]
self.data.columns = self.colNames
self.data.rename(columns=lambda x: x.strip(), inplace=True)
except pd.errors.ParserError as e:
raise WrongFormatError('CSV File {}: '.format(self.filename)+e.args[0])

def _readChunk(self, nlines=None, **kwargs):
"""Read a chunk of CSV data (nlines rows)."""
if self._fid is None:
raise RuntimeError("No open file handle")

if nlines is None:
nlines = 1000 # Default chunk size

try:
chunk = pd.read_csv(self._fid, sep=self.sep, header=None, comment=self.commentChar, nrows=nlines)

if len(chunk) == 0:
return None # End of file

if (len(self.colNames)==0) or (len(self.colNames)!=len(chunk.columns)):
self.colNames=['C{}'.format(i) for i in range(len(chunk.columns))]
chunk.columns = self.colNames
chunk.rename(columns=lambda x: x.strip(), inplace=True)

return chunk
except pd.errors.EmptyDataError:
# End of file reached
return None
except pd.errors.ParserError as e:
raise WrongFormatError('CSV File {}: '.format(self.filename)+e.args[0])


def read_slow_stop_at_first_empty_lines(self, skiprows=None, sep=None, numeric_only=True, colNames=None):
Expand Down
Loading