From a039ba9fef27cf0fa710d27ac3262406e1d2ffcc Mon Sep 17 00:00:00 2001 From: nubby Date: Tue, 24 Feb 2026 23:40:45 +0000 Subject: [PATCH 01/17] Adds a useful data viewer script for numpy files. --- 01_dsp/dspml_pipeline/scripts/view_data.py | 69 ++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 01_dsp/dspml_pipeline/scripts/view_data.py diff --git a/01_dsp/dspml_pipeline/scripts/view_data.py b/01_dsp/dspml_pipeline/scripts/view_data.py new file mode 100644 index 00000000..5b975d5e --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/view_data.py @@ -0,0 +1,69 @@ +""" +File: + view_data + +Description: + View the contents of a saved numpy file. + +Author: + jLab + nubby + Perplexity.AI + +Date: + 24 Feb 2026 + +Version: + 1.0.0 +""" +import argparse +import numpy as np +import os + +from typing import Union + + +def _load_npy_file(path: str) -> Union[np.array, None]: + """ + _load_npy_file(path) + + Load the contents of a saved .npy file if proper format; + otherwise return None. + + Args: + path (str) Path to file. + + Returns: + data (np.array, None) + """ + try: + assert(os.path.isfile(path) and path.split(".")[-1] == "npy") + data = np.load(path) + except AssertionError: + data = None + + return data + + +def view_data(path: str): + # Load the file in question. + data = _load_npy_file(path=path) + + try: + print(f"Contents: {data}") + print(f"Shape: {data.shape}") + except AttributeError: + print(f"ERROR: File {path} invalid; check the path!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="View the contents of an input .npy file.") + parser.add_argument( + "--path", + "-p", + required=True, + type=str, + help="Path to the desired file." + ) + args = parser.parse_args() + view_data(path=args.path) From 14cd859682c1ce2ebc9ba44379a7e2874f59c322 Mon Sep 17 00:00:00 2001 From: nubby Date: Wed, 25 Feb 2026 01:33:13 +0000 Subject: [PATCH 02/17] Updates transformer library for modern TF. 
--- .../end_to_end_estimation/transformer.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py index 84b05fc7..8920bafa 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py @@ -14,7 +14,11 @@ from PIL import Image from ..parameters import RANDOM_SEED, KFOLD_SPLITS, num2label -from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification +try: + from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification +except ImportError: + from transformers import MobileViTImageProcessor, MobileViTForImageClassification + # Set seeds for reproducibility torch.manual_seed(RANDOM_SEED) @@ -129,7 +133,11 @@ def __init__(self, X, y, epochs=10, batch_size=4, verbose=False): # Move to device self.mobilevit.to(self.device) - self.feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small") + try: + self.feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small") + except: + self.feature_extractor = MobileViTImageProcessor.from_pretrained("apple/mobilevit-small") + # Freeze the backbone, only train the classifier for name, param in self.mobilevit.named_parameters(): @@ -413,4 +421,4 @@ def evaluate(model, dataloader, loss_fn, device): total_loss += loss.item() preds.extend(outputs.cpu().numpy()) trues.extend(targets.cpu().numpy()) - return total_loss / len(dataloader), preds, trues \ No newline at end of file + return total_loss / len(dataloader), preds, trues From e2010f5943f535494a047ed77151297979793eda Mon Sep 17 00:00:00 2001 From: nubby Date: Wed, 25 Feb 2026 02:11:05 +0000 Subject: [PATCH 03/17] Updates name of Pie Ranch dataset label. 
--- 01_dsp/dspml_pipeline/scripts/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/config.yaml index e64657ca..d8ef9c27 100644 --- a/01_dsp/dspml_pipeline/scripts/config.yaml +++ b/01_dsp/dspml_pipeline/scripts/config.yaml @@ -16,7 +16,7 @@ data: validation: # Raw validation datasets to combine dataset_dirs: - - "../data/field-pie-ranch-dataset" + - "../data/pie-ranch-dataset" # Target combined validation dataset directory target_dir: "../data/pie-ranch-dataset" @@ -80,4 +80,4 @@ end-to-end: verbose: false advanced: - verbose: true # Set to false to reduce logging output \ No newline at end of file + verbose: true # Set to false to reduce logging output From 6220698e9501adc79e0a78205cb83cd5640a1a3b Mon Sep 17 00:00:00 2001 From: nubby Date: Wed, 25 Feb 2026 02:11:41 +0000 Subject: [PATCH 04/17] Allows for use of previously-processed datasets (without raw radar frame data. --- .../dspml_pipeline/data/frame_loader.py | 80 ++++++++++++++++--- 01_dsp/dspml_pipeline/scripts/main.py | 30 ++++++- 2 files changed, 94 insertions(+), 16 deletions(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py index 6bb6d86f..2d86351d 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py @@ -1,3 +1,21 @@ +""" +File: + frame_loader.py + +Description: + Tools for loading datasets from the GOPHERS pipeline. 
+ +Authors: + jLab + Eric Vetha + nubby + +Date: + 24 Feb 2026 + +Version: + 1.0.9 +""" import logging logger = logging.getLogger(__name__) @@ -5,12 +23,15 @@ from pathlib import Path from ..setup_logging import setup_logging import json +import os import pandas as pd import sys from scipy import signal + THRESHOLD = 50 # For anomoly removal + class FrameLoader: """ FrameLoader class for processing radar data into standardized input (X) and output (y) matrices for regression tasks. @@ -38,10 +59,10 @@ class FrameLoader: y (np.ndarray): Corresponding labels (targets). """ - def __init__(self, dataset_dirs:list, target_dir:str, - data_log:str = "data-log.csv", - folder_name:str = "Sample #", label_name:str = "Bulk Density (g/cm^3)", - verbose:bool = False): + def __init__(self, dataset_dirs: list, target_dir: str, + data_log: str = "data-log.csv", + folder_name: str = "Sample #", label_name: str = "Bulk Density (g/cm^3)", + verbose: bool = False): """ Initializes the FrameLoader instance based on the provided directories. @@ -66,10 +87,36 @@ def __init__(self, dataset_dirs:list, target_dir:str, for i in self.dataset_dirs: if not Path(i).exists(): logger.error(f"Dataset {i} does not exist.") + if not os.path.isdir(i): + logger.error(f"Path {i} does not point to a dataset directory.") data_log_i = Path(i) / data_log if not data_log_i.exists(): - logger.error(f"Data log file {data_log_i} does not exist.") - sys.exit(1) + logger.warning(f"Data log file {data_log_i} does not exist; " + f"checking for preprocessed dataset...") + if not self._is_dataset_preprocessed(i): + logger.warning(f"Dataset {i} is invalid.") + sys.exit(1) + else: + logger.info(f"Dataset {i} initialized.") + + def _is_dataset_preprocessed(self, path: str): + """ + is_dataset_preprocessed(path) + + Check for the existence of X.npy, y.npy, features.csv, and results.csv files in the path provided. + + Args: + path (str) + + Returns: + preprocessed? 
(bool) + """ + required_files = ["X.npy", "y.npy", "features.csv", "results.csv"] + current_files = os.listdir(path) + if not set(required_files) == set(current_files): + return False + # TODO: Extract data here? + return True def extract_data(self): """ @@ -185,13 +232,14 @@ def save_dataset(self): logger.info(f"Raw dataset saved as X.npy and y.npy") logger.info(f"Saved shapes: X={self.X.shape}, y={self.y.shape}") -def load_dataset(dataset_dir:str): +def load_dataset(dataset_dir: str, fl: FrameLoader): """ Loads data that has already been processed. Assumes the features are named X.npy and the labels are named y.npy. Args: dataset_dir: Directory containing the capture file. + fl: FrameLoader object for given dataset. Returns: X (np.ndarray): Processed radar data (features). @@ -202,11 +250,17 @@ def load_dataset(dataset_dir:str): y_path = Path(dataset_dir) / "y.npy" if not X_path.exists() or not y_path.exists(): - logger.error("X.npy and/or y.npy not found in the dataset directory") - sys.exit(1) - - X = np.load(X_path) - y = np.load(y_path) + logger.warning("X.npy and/or y.npy not found in the dataset directory; generating...") + X, y = fl.extract_data() + fl.save_dataset() + # If the dataset still does not exists, exit. + if not X_path.exists() or not y_path.exists(): + logger.error("X.npy and/or y.npy could not be generated.") + sys.exit(1) + else: + # Load dataset if it has already been processed into .npy files. 
+ X = np.load(X_path) + y = np.load(y_path) logger.info(f"Loaded from existing dataset: X={X.shape}, y={y.shape}") @@ -480,4 +534,4 @@ def novelda_digital_downconvert(raw_frame:np.ndarray): # Baseband signal using convolution (provides downcoverted, filtered analytic signal) baseband_signal = signal.convolve(mixed, window, mode='same') - return baseband_signal \ No newline at end of file + return baseband_signal diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index 44a6f3a0..42305ec8 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -1,3 +1,21 @@ +""" +File: + main.py + +Description: + Launch file for WADAR dspml_pipeline. + +Authors: + jLab + Eric Vetha + nubby + +Date: + 24 Feb 2026 + +Version: + 1.0.9 +""" import logging logger = logging.getLogger(__name__) @@ -53,8 +71,14 @@ def main(): X_val, y_val = validationFrameLoader.extract_data() validationFrameLoader.save_dataset() else: - X_train, y_train = load_dataset(dataset_dir=params['data']['training']['target_dir']) - X_val, y_val = load_dataset(dataset_dir=params['data']['validation']['target_dir']) + X_train, y_train = load_dataset( + dataset_dir=params['data']['training']['target_dir'], + fl=trainingFrameLoader + ) + X_val, y_val = load_dataset( + dataset_dir=params['data']['validation']['target_dir'], + fl=validationFrameLoader + ) # ======== Handcrafted Features ======== if params['handcrafted']['enabled']: @@ -523,4 +547,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From f91b9ee59349d753bb3336fbb0a3bd31f02c4ae8 Mon Sep 17 00:00:00 2001 From: nubby Date: Wed, 25 Feb 2026 19:28:58 +0000 Subject: [PATCH 05/17] Checkpoint; still borken, but refactoring has begun frame_loader; also updates eval_tools for updates to libs. 
--- .../dspml_pipeline/data/frame_loader.py | 53 ++++++++++++-- .../feature_estimation/eval_tools.py | 20 +++++- 01_dsp/dspml_pipeline/scripts/main.py | 71 ++++++++++++++++--- 3 files changed, 128 insertions(+), 16 deletions(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py index 2d86351d..35369e58 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py @@ -101,8 +101,6 @@ def __init__(self, dataset_dirs: list, target_dir: str, def _is_dataset_preprocessed(self, path: str): """ - is_dataset_preprocessed(path) - Check for the existence of X.npy, y.npy, features.csv, and results.csv files in the path provided. Args: @@ -118,7 +116,45 @@ def _is_dataset_preprocessed(self, path: str): # TODO: Extract data here? return True - def extract_data(self): + def load(self, new: bool) -> tuple: + """ + Loads and combines the specified datasets based on both existence of raw data and user specs. + + Args: + new (bool) Load raw radar frames? If False, load .npy files if they exist. + + Returns: + X, y (tuple[np.array, np.array]) + """ + for dataset_path in self.dataset_dirs: + if new: + # Try to load preprocessed dataset if raw scans unavailable. + if not self._is_new_dataset_valid(): + if not self._is_preprocessed_dataset_valid(): + logger.error(f"Neither existing radar scans nor valid preprocessed " + f"dataset were found for the following dataset:\r\n" + f"\t+ Target:\t\t{self.target_dir}\r\n" + f"\t+ Dataset dirs:\t{self.dataset_dirs}") + sys.exit(1) + # Load preprocessed dataset if it exists and raw scans do not here. + X, y = self.load_preprocessed_dataset() + # Load raw radar scans into new dataset here. 
+ else: + X, y = self.load_new_dataset() + else: + X, y = self.load_preprocessed_dataset() + + + def extract_single_dataset(self, dataset_path: Path) -> tuple[np.ndarray, np.ndarray]: + """ + Extracts the features (X) and labels (y) from a given directory. + + Returns: + X (np.ndarray): Processed radar data (features). + y (np.ndarray): Corresponding labels (targets). + """ + + def extract_data(self) -> tuple: """ Extracts the features (X) and labels (y) from the provided directries. @@ -134,6 +170,7 @@ def extract_data(self): # Iterate through each dataset dir for i in self.dataset_dirs: + logging.info(f"Extracting data from {i}.") dataset_dir = Path(i) subdirs = [d for d in dataset_dir.iterdir() if d.is_dir() and not d.name.startswith('.')] @@ -146,8 +183,9 @@ def extract_data(self): df[self.label_name] = df[self.label_name].astype(float) logger.info(f"Loaded data log with {len(df)} samples") except Exception as e: - logger.error(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'") - sys.exit(1) + logger.warning(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'; " + f"attempting to load preprocessed {self.folder_name} dataset...") + return [], [] # In each subdirectory for i, folder in enumerate(subdirs): @@ -252,6 +290,9 @@ def load_dataset(dataset_dir: str, fl: FrameLoader): if not X_path.exists() or not y_path.exists(): logger.warning("X.npy and/or y.npy not found in the dataset directory; generating...") X, y = fl.extract_data() + if len(X) == 0 or len(y) == 0: + logger.error("X.npy and/or y.npy could not be generated.") + sys.exit(1) fl.save_dataset() # If the dataset still does not exists, exit. if not X_path.exists() or not y_path.exists(): @@ -259,6 +300,8 @@ def load_dataset(dataset_dir: str, fl: FrameLoader): sys.exit(1) else: # Load dataset if it has already been processed into .npy files. 
+ print(X_path) + print(y_path) X = np.load(X_path) y = np.load(y_path) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py index 8d2ebafc..f44ed0a2 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py @@ -1,3 +1,21 @@ +""" +File: + eval_tools.py + +Description: + ??? + +Authors: + jLab + Eric Vetha + nubby + +Date: + 24 Feb 2026 + +Version: + 1.0.9 +""" from dspml_pipeline.feature_estimation.ridge_regression import RidgeRegression from dspml_pipeline.feature_estimation.random_forest import RandomForest from dspml_pipeline.feature_estimation.xgboost_tree import XGBoostTree @@ -263,4 +281,4 @@ def show_results_summary(feature_type: str, training_dir: str, validation_dir: s print(f"Validation Results for {feature_type}".center(40)) print("="*40) results_df_amp = load_results(validation_dir) - display_feature_results(feature_type, results_df_amp) \ No newline at end of file + display_feature_results(feature_type, results_df_amp) diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index 42305ec8..d58c94fc 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -19,6 +19,7 @@ import logging logger = logging.getLogger(__name__) +import argparse import os import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) @@ -42,19 +43,31 @@ import matplotlib.pyplot as plt import yaml -def main(): - if len(sys.argv) < 2: - raise RuntimeError("Usage: python main.py ") - config_file = sys.argv[1] +def load_config(path: str) -> dict: + """ + load_config(path) - # Load configuration - with open(config_file, "r") as f: + Load a configuration file into a return dictionary. 
+ + Args: + path (str) + + Returns: + params (dict) + """ + with open(path, "r") as f: params = yaml.safe_load(f) + return params +def main(config_path: str): + # Load training parameters from config file. + params = load_config(path=config_path) + + # Configure logging. setup_logging(verbose=params['advanced']['verbose']) - # Load data from training and validation datasets + # Load data from training and validation datasets. trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], target_dir=params['data']['training']['target_dir'], data_log="data-log.csv", @@ -63,13 +76,41 @@ def main(): target_dir=params['data']['validation']['target_dir'], data_log="data-log.csv", label_name=params['data']['label_name']) + X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset']) + X_val, y_val = validationFrameLoader.load(params['data']['new_dataset']) + """ # If new dataset, extract data. Otherwise, load from saved file. if params['data']['new_dataset']: X_train, y_train = trainingFrameLoader.extract_data() - trainingFrameLoader.save_dataset() + # Try to load previously-processed data if none found in raw form. + if len(X_train) > 0 and len(y_train) > 0: + trainingFrameLoader.save_dataset() + else: + print(f'Loading dataset from {params["data"]["training"]["target_dir"]}.') + X_train, y_train = load_dataset( + dataset_dir=params['data']['training']['target_dir'], + fl=trainingFrameLoader + ) + # Exit if we still cannot find training data. + if len(X_train) == 0 or len(y_train) == 0: + logger.error(f'Cannot load training data for {params["data"]["training"]["target_dir"]}! Exiting.') + sys.exit() + + # Try to load previously-processed data if none found in raw form. 
X_val, y_val = validationFrameLoader.extract_data() - validationFrameLoader.save_dataset() + if len(X_val) > 0 and len(y_val) > 0: + validationFrameLoader.save_dataset() + else: + print(f'Loading dataset from {params["data"]["validation"]["target_dir"]}.') + X_val, y_val = load_dataset( + dataset_dir=params['data']['validation']['target_dir'], + fl=validationFrameLoader + ) + # Exit if we still cannot find validation data. + if len(X_val) == 0 or len(y_val) == 0: + logger.error(f'Cannot load training data for {params["data"]["validation"]["target_dir"]}! Exiting.') + sys.exit() else: X_train, y_train = load_dataset( dataset_dir=params['data']['training']['target_dir'], @@ -79,6 +120,7 @@ def main(): dataset_dir=params['data']['validation']['target_dir'], fl=validationFrameLoader ) + """ # ======== Handcrafted Features ======== if params['handcrafted']['enabled']: @@ -547,4 +589,13 @@ def main(): if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description="Launch training/evaluation of GOPHERS datasets.") + parser.add_argument( + "--config", + "-c", + required=True, + type=str, + help="Path to desired config path." + ) + args = parser.parse_args() + main(config_path=args.config) From 8c0d1593a3307bfc50f885657f0c30b8e8248c88 Mon Sep 17 00:00:00 2001 From: nubby Date: Sat, 28 Feb 2026 07:09:36 +0000 Subject: [PATCH 06/17] Now selects whether to load individual dataset based on existence of valid files, but still needs loader for preprocessed data. 
--- .../dspml_pipeline/data/frame_loader.py | 190 +++++++++++++++--- 01_dsp/dspml_pipeline/scripts/config.yaml | 21 +- 01_dsp/dspml_pipeline/scripts/main.py | 4 + 3 files changed, 181 insertions(+), 34 deletions(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py index 35369e58..bea15f77 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py @@ -116,43 +116,185 @@ def _is_dataset_preprocessed(self, path: str): # TODO: Extract data here? return True - def load(self, new: bool) -> tuple: + def _is_new_dataset_valid(self, dataset_dir: str) -> bool: """ - Loads and combines the specified datasets based on both existence of raw data and user specs. + Confirm whether a dataset contains the required raw radar frames for processing. Args: - new (bool) Load raw radar frames? If False, load .npy files if they exist. + dataset_dir (str): Relative path to dataset in question. Returns: - X, y (tuple[np.array, np.array]) + valid (bool): Is the dataset valid for extracting radar frame data? """ - for dataset_path in self.dataset_dirs: - if new: - # Try to load preprocessed dataset if raw scans unavailable. - if not self._is_new_dataset_valid(): - if not self._is_preprocessed_dataset_valid(): - logger.error(f"Neither existing radar scans nor valid preprocessed " - f"dataset were found for the following dataset:\r\n" - f"\t+ Target:\t\t{self.target_dir}\r\n" - f"\t+ Dataset dirs:\t{self.dataset_dirs}") - sys.exit(1) - # Load preprocessed dataset if it exists and raw scans do not here. - X, y = self.load_preprocessed_dataset() - # Load raw radar scans into new dataset here. - else: - X, y = self.load_new_dataset() - else: - X, y = self.load_preprocessed_dataset() + required_file = "data-log.csv" + capture_files = [] # Keep track of the number of available radar captures. 
+ + # First check that all required files are in the base directory. + current_files = set(os.listdir(dataset_dir)) + if not required_file in current_files: + return False + + # Next look for a number of raw radar scans greater than zero. + dataset_path = Path(dataset_dir) + subdirs = [d for d in dataset_path.iterdir() + if d.is_dir() and not d.name.startswith('.')] + for i, folder in enumerate(subdirs): + capture_files.append(sorted(folder.glob("*.frames"))) + if len(capture_files) == 0: + return False + return True - def extract_single_dataset(self, dataset_path: Path) -> tuple[np.ndarray, np.ndarray]: + def _is_preprocessed_dataset_valid(self, dataset_dir: str) -> bool: + """ + Confirm whether a dataset contains the required preprocessed radar scans. + + Args: + dataset_dir (str): Relative path to dataset in question. + + Returns: + valid (bool): Is the dataset valid for use of preprocessed radar data? + + Todo: + * Check the contents of the numpy files for validity. + """ + required_files = ["X.npy", "y.npy"] + current_files = set(os.listdir(dataset_dir)) + if not set(required_files).issubset(current_files): + return False + return True + + def extract_single_dataset(self, dataset_dir: Path) -> tuple[np.ndarray, np.ndarray]: """ Extracts the features (X) and labels (y) from a given directory. + Args: + dataset_dir (str): String of relative path to dataset source directory. + Returns: - X (np.ndarray): Processed radar data (features). - y (np.ndarray): Corresponding labels (targets). + frame_data (np.ndarray): Processed radar data (features) from one dataset source. + labels (np.ndarray): Corresponding labels (targets) from one dataset source. + """ + new_frame_data = [] + new_labels = [] + + logging.info(f"Extracting data from {dataset_dir}.") + + dataset_dir = Path(dataset_dir) + subdirs = [d for d in dataset_dir.iterdir() + if d.is_dir() and not d.name.startswith('.')] + data_log = dataset_dir / self.data_log + + # Get the labels from the data log. 
+ try: + df = pd.read_csv(data_log) + df[self.folder_name] = df[self.folder_name].astype(str) + df[self.label_name] = df[self.label_name].astype(float) + logger.info(f"Loaded data log with {len(df)} samples") + except Exception as e: + logger.warning(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'; " + f"attempting to load preprocessed {self.folder_name} dataset...") + return [], [] + + # In each subdirectory. + for i, folder in enumerate(subdirs): + + capture_files = sorted(folder.glob("*.frames")) + + logger.info(f"Processing {len(capture_files)} files in {folder.name}") + + if not capture_files: + logger.warning(f"No .frames files found in {folder.name}") + continue + + # Find the row in df corresponding to this folder name + sample_row = df[df['Sample #'] == folder.name] + if sample_row.empty: + logger.error(f"No matching sample for folder {folder.name} in data log") + sys.exit(1) + else: + bulk_density = sample_row.iloc[0][self.label_name] + + # Process each capture file + params = None + for capture_file in capture_files: + try: + frame_data, params = process_frames(folder, capture_file.name) + + if frame_data is None: + logger.warning(f"Failed to process: {capture_file.name}") + continue + + # Anomoly removal. Replaces values that deviate from the median by more + # than a threshold with the median. This has been done since the beginning + # of the project because of odd spikes in the raw DAC output that causes + # large deviations in the data. 
+ median = np.median(frame_data, axis=1, keepdims=True) + mask = np.abs(frame_data - median) > THRESHOLD + frame_data_clean = frame_data.copy() + frame_data_clean[mask] = np.broadcast_to(median, frame_data.shape)[mask] + + # DDC + ddc_frame_data = np.zeros_like(frame_data_clean, dtype=np.complex64) + for i in range(frame_data_clean.shape[1]): + ddc_frame_data[:, i] = novelda_digital_downconvert(frame_data_clean[:, i]) + + try: + new_frame_data.append(ddc_frame_data) + new_labels.append(bulk_density) + except: + logger.error(f"Failed to stack radar data from {capture_file.name}") + sys.exit(1) + + # Outputs warning when problem occurs while processing, but continues processing other radar data. + except Exception as e: + logger.warning(f"Error processing {capture_file.name}: {e}") + + # Save radar parameters + if params and len(capture_files) > 0: + params_file = folder / "radar_params.json" + with open(params_file, 'w') as f: + json.dump(params, f) + logger.info(f"Saved parameters: {params_file.name}") + + return new_frame_data, new_labels + + def load(self, new: bool) -> tuple: """ + Loads and combines the specified datasets based on both existence of raw data and user specs. + + Args: + new (bool) Load raw radar frames? If False, load .npy files if they exist. + + Returns: + X, y (tuple[np.ndarray, np.ndarray]) + """ + X = [] + y = [] + + logger.info("Starting frame processing") + + for dataset_dir in self.dataset_dirs: + # Load raw radar scans into new dataset here. + if new and self._is_new_dataset_valid(dataset_dir=dataset_dir): + X_new, y_new = self.extract_single_dataset(dataset_dir=dataset_dir) + # Try to load preprocessed dataset if raw scans unavailable. 
+ elif self._is_preprocessed_dataset_valid(dataset_dir): + X_new, y_new = self.load_preprocessed_dataset(dataset_dir) + else: + logger.error(f"Neither existing radar scans nor valid preprocessed " + f"dataset were found for the following dataset:\r\n" + f"\t+ Target:\t\t{self.target_dir}\r\n" + f"\t+ Dataset dir:\t{dataset_dir}") + sys.exit(1) + # Append the new radar scans and labels to the broader dataset. + X += X_new + y += y_new + + self.X = np.stack(X) + self.y = np.stack(y) + + return self.X, self.y def extract_data(self) -> tuple: """ diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/config.yaml index d8ef9c27..67e23cd9 100644 --- a/01_dsp/dspml_pipeline/scripts/config.yaml +++ b/01_dsp/dspml_pipeline/scripts/config.yaml @@ -1,28 +1,29 @@ # Data configuration data: label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv - new_dataset: false # Set to true if this is a new dataset + new_dataset: true # Set to true if this is a new dataset # Training dataset training: # Raw datasets to combine (list of directories) - dataset_dirs: + dataset_dirs: - "../data/wet-0-soil-compaction-dataset" - "../data/wet-1-soil-compaction-dataset" - "../data/wet-2-soil-compaction-dataset" - "../data/field-soil-compaction-dataset" + - "../data/field-2-soil-compaction-dataset" # Target combined training dataset directory - target_dir: "../data/combined-training-dataset" + target_dir: "../data/sensys-training-dataset" # Validation dataset validation: # Raw validation datasets to combine dataset_dirs: - "../data/pie-ranch-dataset" # Target combined validation dataset directory - target_dir: "../data/pie-ranch-dataset" + target_dir: "../data/sensys-val-dataset" # For handcrafted features handcrafted: - enabled: false # Enable or disable handcrafted features + enabled: true # Enable or disable handcrafted features new_features: true # Set to true if this is a new dataset or if features have not been generated yet pruning_method: all # 
Options: corr, mi, lasso, none top_n: 16 # Only used if pruning_method is not none @@ -32,10 +33,10 @@ learned: n_features: 8 # Desired number of features # PCA-based feature extraction pca: - enabled: false # Enable or disable PCA features + enabled: true # Enable or disable PCA features # Kernel-PCA-based feature extraction kpca: - enabled: false # Enable or disable kPCA features + enabled: true # Enable or disable kPCA features # Autoencoder-based feature extraction autoencoder: enabled: true @@ -44,7 +45,7 @@ learned: verbose: true # CNN-based feature extraction cnn: - enabled: true + enabled: false epochs: 20 batch_size: 32 verbose: true @@ -62,13 +63,13 @@ deep_learning: end-to-end: # LSTM-based end-to-end regression lstm: - enabled: true + enabled: false epochs: 50 batch_size: 32 verbose: false # CNN-based end-to-end regression cnn: - enabled: true + enabled: false epochs: 20 batch_size: 32 verbose: false diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index d58c94fc..3f62c8d6 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -79,6 +79,10 @@ def main(config_path: str): X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset']) X_val, y_val = validationFrameLoader.load(params['data']['new_dataset']) + # TODO: Only save dataset conditionally. + trainingFrameLoader.save_dataset() + validationFrameLoader.save_dataset() + """ # If new dataset, extract data. Otherwise, load from saved file. 
if params['data']['new_dataset']: From 7b1fe30cc7e5550fcc6305bf561558d6a6ff6bdc Mon Sep 17 00:00:00 2001 From: nubby Date: Sat, 28 Feb 2026 07:33:29 +0000 Subject: [PATCH 07/17] Updated frame_loader to allow for the use of preprocessed data (like that from pie_ranch) --- .../dspml_pipeline/data/frame_loader.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py index bea15f77..90dabe83 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py @@ -259,6 +259,22 @@ def extract_single_dataset(self, dataset_dir: Path) -> tuple[np.ndarray, np.ndar return new_frame_data, new_labels + def load_preprocessed_dataset(self, dataset_dir: str) -> tuple: + """ + Load proprocessed datasets. + """ + print(dataset_dir) + X_path = Path(dataset_dir) / "X.npy" + y_path = Path(dataset_dir) / "y.npy" + + # Load dataset if it has already been processed into .npy files. + X = np.load(X_path) + y = np.load(y_path) + + logger.info(f"Loaded from existing dataset: X={X.shape}, y={y.shape}") + + return X.tolist(), y.tolist() + def load(self, new: bool) -> tuple: """ Loads and combines the specified datasets based on both existence of raw data and user specs. From 9dac1ddab327796ffc6952ebb7841f3322b842d4 Mon Sep 17 00:00:00 2001 From: nubby Date: Mon, 2 Mar 2026 00:22:24 +0000 Subject: [PATCH 08/17] Updates default config.yaml file for Sensys first revision params. --- 01_dsp/dspml_pipeline/scripts/config.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/config.yaml index 67e23cd9..aa6a2e5e 100644 --- a/01_dsp/dspml_pipeline/scripts/config.yaml +++ b/01_dsp/dspml_pipeline/scripts/config.yaml @@ -1,4 +1,7 @@ -# Data configuration +# Data configuration - Sensys first submission configs. 
+# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. data: label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv new_dataset: true # Set to true if this is a new dataset @@ -10,14 +13,13 @@ data: - "../data/wet-1-soil-compaction-dataset" - "../data/wet-2-soil-compaction-dataset" - "../data/field-soil-compaction-dataset" - - "../data/field-2-soil-compaction-dataset" # Target combined training dataset directory target_dir: "../data/sensys-training-dataset" # Validation dataset validation: # Raw validation datasets to combine dataset_dirs: - - "../data/pie-ranch-dataset" + - "../data/field-2-soil-compaction-dataset" # Target combined validation dataset directory target_dir: "../data/sensys-val-dataset" From 2e711cf3bfe3c2b1e1432eacf9e8d500a8591a82 Mon Sep 17 00:00:00 2001 From: nubby Date: Wed, 4 Mar 2026 04:18:36 +0000 Subject: [PATCH 09/17] Adds duplicate hunter; also adds a bunch of preset configs. 
--- .../scripts/configs/f1-f2-config.yaml | 93 ++++++++++++++++++ .../scripts/configs/f1-pr-config.yaml | 93 ++++++++++++++++++ .../scripts/configs/f2-f1-config.yaml | 93 ++++++++++++++++++ .../scripts/configs/f2-pr-config.yaml | 93 ++++++++++++++++++ .../scripts/configs/il-f1-config.yaml | 95 ++++++++++++++++++ .../scripts/configs/il-f2-config.yaml | 95 ++++++++++++++++++ .../scripts/configs/il-pr-config.yaml | 95 ++++++++++++++++++ .../scripts/configs/il_f1_f2-pr-config.yaml | 97 +++++++++++++++++++ .../scripts/configs/pr-f1-config.yaml | 93 ++++++++++++++++++ .../scripts/configs/pr-f2-config.yaml | 93 ++++++++++++++++++ 01_dsp/dspml_pipeline/scripts/main.py | 30 ++++++ .../{config.yaml => template_config.yaml} | 6 +- 12 files changed, 974 insertions(+), 2 deletions(-) create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml create mode 100644 01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml rename 01_dsp/dspml_pipeline/scripts/{config.yaml => template_config.yaml} (93%) diff --git a/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml new file mode 100644 index 00000000..1afc76ac --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml @@ -0,0 +1,93 @@ +# f1-f2-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = 
"field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/f1_f2-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/f1_f2-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning 
model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml new file mode 100644 index 00000000..cebb2840 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml @@ -0,0 +1,93 @@ +# f1-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/f1-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/f1-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 
20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml new file mode 100644 index 00000000..878299ac --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml @@ -0,0 +1,93 @@ +# f2-f1-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# NOTE: as written, this file trains on field (1) only and validates on field-2, which is +# the f1-f2 split; the dataset_dirs and target_dir values below do not match the f2-f1 name. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/f1-f2-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/f1-f2-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + #
Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml new file mode 100644 index 00000000..312c1767 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml @@ -0,0 +1,93 @@ +# f2-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/f2-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/f2-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 
20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml new file mode 100644 index 00000000..0ee79789 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml @@ -0,0 +1,95 @@ +# il-f1-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il-f1-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/il-f1-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # 
PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml new file mode 100644 index 00000000..a1ecacfd --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml @@ -0,0 +1,95 @@ +# il-f2-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il-f2-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/il-f2-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + 
batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml new file mode 100644 index 00000000..3af5e158 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml @@ -0,0 +1,95 @@ +# il-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/il-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 
+ verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml new file mode 100644 index 00000000..c74a66e3 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml @@ -0,0 +1,97 @@ +# il_f1_f2-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + - "../data/field-soil-compaction-dataset" + - "../data/field-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il_f1_f2-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/il_f1_f2-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression 
+end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml new file mode 100644 index 00000000..d7da694a --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml @@ -0,0 +1,93 @@ +# pr-f1-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined training dataset directory + target_dir: "../data/pr-f1-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/pr-f1-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 
20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml new file mode 100644 index 00000000..ac7553c1 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml @@ -0,0 +1,93 @@ +# pr-f2-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined training dataset directory + target_dir: "../data/pr-f2-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/pr-f2-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # 
def _scans_equal(scan1, scan2) -> bool:
    """Return True when two scans hold identical contents."""
    try:
        # Plain (nested) sequences compare element-wise as tuples.
        return tuple(scan1) == tuple(scan2)
    except ValueError:
        # Multi-dimensional numpy scans make the tuple comparison raise
        # "truth value of an array is ambiguous"; compare them as arrays.
        import numpy as np

        return bool(np.array_equal(scan1, scan2))


def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool:
    """
    are_duplicate_examples_present(ds1, ds2)

    Check whether any scan is shared between the two datasets.

    Every line of ``ds1`` is paired with every line of ``ds2``. Lines of equal
    length are walked in lockstep, and scans at the same position are
    compared; two equal-length scans with identical contents count as a
    duplicate. Duplicates *within* a single dataset are not checked.

    Args:
        ds1 (tuple)     First dataset: a sequence of lines, each of which is
                        a sequence of scans.
        ds2 (tuple)     Second dataset, same layout.

    Returns:
        (bool)  Are duplicates present?
    """
    dups = False
    for i, line1 in enumerate(ds1):
        for j, line2 in enumerate(ds2):
            # Lines of different length are never compared.
            if len(line1) != len(line2):
                continue
            # Report each offending [i, j] pair once, however many of its
            # scans match; keep scanning so every pair gets reported.
            if any(
                len(scan1) == len(scan2) and _scans_equal(scan1, scan2)
                for scan1, scan2 in zip(line1, line2)
            ):
                print(f"Found duplicate at [{i},{j}]!")
                dups = True
    return dups
nubby Date: Wed, 4 Mar 2026 04:19:48 +0000 Subject: [PATCH 10/17] Removes virtual envs. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index f3c63fd9..f046e67f 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,6 @@ b1_ws/build/ b1_ws/install/ b1_ws/log/ b1_ws/src/ros2 + +# Virtual environments. +*.venv* From e7617a457ce858e6370fb88e425fd1ffa3fcae77 Mon Sep 17 00:00:00 2001 From: nubby Date: Wed, 4 Mar 2026 04:20:08 +0000 Subject: [PATCH 11/17] Removes unused import from transformer. --- .../dspml_pipeline/end_to_end_estimation/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py index 8920bafa..c99b193a 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py @@ -8,7 +8,7 @@ from torch import nn from torch.utils.data import Dataset, DataLoader from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score +from sklearn.metrics import mean_squared_error, mean_absolute_error import time import os from PIL import Image From 5dc3c5528a7591ebb6c60b76d81d75a2cd8c8e09 Mon Sep 17 00:00:00 2001 From: nubby Date: Fri, 6 Mar 2026 18:01:21 +0000 Subject: [PATCH 12/17] Adds random seed for consistent results. 
--- 01_dsp/dspml_pipeline/scripts/main.py | 48 +++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index dbf4d4c9..28ec44a3 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -21,7 +21,9 @@ import argparse import os +import random import sys +import torch sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) import numpy as np @@ -60,6 +62,48 @@ def load_config(path: str) -> dict: params = yaml.safe_load(f) return params +def plant_seeds(seed: int = 42): + """ + plant_seeds(seed) + + Configure "consistent randomness" in system settings. + + Args: + seed (int) Random seed. + """ + logging.info(f"Configuring random seed of {seed}...") + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + logging.info("DONE.") + +def split_dataset(ds: tuple, + train_split: float = 0.8, + test_split: float = 0.2, + random_seed: int = 42) -> tuple[tuple, tuple]: + """ + split_dataset(ds, train_split, test_split, random_seed) + + Divide a given dataset into a training set and testing set. In the event of an + imperfect split, the number of training data entries will be rounded up, while the + testing entries will be rounded down. + + Args: + ds (tuple) Dataset to split + train_split (float) Percentage of dataset to put into the new training dataset. + test_split (float) Percentage of dataset to put into the new testing dataset. + random_seed (int) Random seed for assigning dataset splits. + + Returns: + training_ds (tuple) New training dataset. + testing_ds (tuple) New testing dataset. 
+ """ + training_ds = [] + testing_ds = [] + def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool: """ are_duplicate_examples_present(ds1, ds2) @@ -92,6 +136,10 @@ def main(config_path: str): # Configure logging. setup_logging(verbose=params['advanced']['verbose']) + # Configure environment for consistent training/results. + seed = 42 # TODO: Import as config. + plant_seeds(seed=seed) + # Load data from training and validation datasets. trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], target_dir=params['data']['training']['target_dir'], From c2e3dafc24888f14093123b82474d0a8200b5b8d Mon Sep 17 00:00:00 2001 From: nubby Date: Fri, 6 Mar 2026 21:40:37 +0000 Subject: [PATCH 13/17] Adds an option to cross validate only on the training dataset. --- 01_dsp/dspml_pipeline/scripts/main.py | 137 ++++++++++++++------------ 1 file changed, 76 insertions(+), 61 deletions(-) diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index 28ec44a3..c776ed32 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -11,10 +11,10 @@ nubby Date: - 24 Feb 2026 + 6 Mar 2026 Version: - 1.0.9 + 1.0.11 """ import logging logger = logging.getLogger(__name__) @@ -81,9 +81,10 @@ def plant_seeds(seed: int = 42): logging.info("DONE.") def split_dataset(ds: tuple, + labels: tuple, train_split: float = 0.8, test_split: float = 0.2, - random_seed: int = 42) -> tuple[tuple, tuple]: + random_seed: int = 42) -> tuple[tuple, tuple, tuple, tuple]: """ split_dataset(ds, train_split, test_split, random_seed) @@ -92,17 +93,33 @@ def split_dataset(ds: tuple, testing entries will be rounded down. Args: - ds (tuple) Dataset to split + ds (tuple) Dataset to split. + labels (tuple) Labels to split. train_split (float) Percentage of dataset to put into the new training dataset. test_split (float) Percentage of dataset to put into the new testing dataset. 
random_seed (int) Random seed for assigning dataset splits. Returns: training_ds (tuple) New training dataset. + training_labels (tuple) New training labels. testing_ds (tuple) New testing dataset. + testing_labels (tuple) New testing labels. """ - training_ds = [] - testing_ds = [] + full_ds_size = len(ds) + training_ds_size = np.ceil(train_split * full_ds_size) + testing_ds_size = np.floor(test_split * full_ds_size) + + # Verify proper dataset split sizes. + assert (training_ds_size + testing_ds_size == full_ds_size), f"Splits of {training_ds_size} and {test_ds_size} are not of total size {full_ds_size}" + + # Split the dataset and labels into training and testing sets based on indices. + training_indices = random.sample(range(full_ds_size), training_ds_size) + testing_indices = [index for index in range(full_ds_size) if index not in training_indices] + training_ds = [ds[index] for index in training_indices] + training_labels = [labels[index] for index in training_indices] + testing_ds = [ds[index] for index in testing_indices] + testing_labels = [labels[index] for index in testing_indices] + return training_ds, training_labels, testing_ds, testing_labels def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool: """ @@ -129,7 +146,16 @@ def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool: dups = True return dups -def main(config_path: str): +def main(config_path: str, cross_val: bool = False): + """ + main(config_path, cross_val) + + Run the main training/validation pipeline. + + Args: + config_path (str) Path to selected configuration .yaml file. + cross_val (bool) Perform cross-validation on training dataset specified. + """ # Load training parameters from config file. params = load_config(path=config_path) @@ -140,17 +166,42 @@ def main(config_path: str): seed = 42 # TODO: Import as config. plant_seeds(seed=seed) - # Load data from training and validation datasets. 
- trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], - target_dir=params['data']['training']['target_dir'], - data_log="data-log.csv", - label_name=params['data']['label_name']) - validationFrameLoader = FrameLoader(dataset_dirs=params['data']['validation']['dataset_dirs'], - target_dir=params['data']['validation']['target_dir'], - data_log="data-log.csv", - label_name=params['data']['label_name']) - X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset']) - X_val, y_val = validationFrameLoader.load(params['data']['new_dataset']) + # Determine whether to split a single dataset into parts or validate on held-out datasets. + # Load only the training "dataset_dirs" for cross validation testing. + if cross_val: + fullFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], + target_dir=params['data']['training']['target_dir'], + data_log="data-log.csv", + label_name=params['data']['label_name']) + X_full, y_full = trainingFrameLoader.load(params['data']['new_dataset']) + + # Divide the full dataset into training/testing splits. + X_train, y_train, X_val, y_val = split_dataset(ds=X_full, labels=y_full, random_seed=seed) + + # NOTE: Currently, these frame loaders can only write/save each split. + trainingFrameLoader = FrameLoader(dataset=X_train, + data_log="data-log.csv", + label_name=params['data']['label_name'], + labels=y_train, + target_dir=params['data']['training']['target_dir']) + validationFrameLoader = FrameLoader(dataset=X_val, + data_log="data-log.csv", + label_name=params['data']['label_name'], + labels=y_val, + target_dir=params['data']['validation']['target_dir']) + # Load all datasets if not doing strict cross-validation. + else: + # Load data from training and validation datasets. 
+ trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], + target_dir=params['data']['training']['target_dir'], + data_log="data-log.csv", + label_name=params['data']['label_name']) + validationFrameLoader = FrameLoader(dataset_dirs=params['data']['validation']['dataset_dirs'], + target_dir=params['data']['validation']['target_dir'], + data_log="data-log.csv", + label_name=params['data']['label_name']) + X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset']) + X_val, y_val = validationFrameLoader.load(params['data']['new_dataset']) # Verify that there are no duplicate examples in dataset. if (are_duplicate_examples_present(X_train, X_val)): @@ -161,48 +212,6 @@ def main(config_path: str): trainingFrameLoader.save_dataset() validationFrameLoader.save_dataset() - """ - # If new dataset, extract data. Otherwise, load from saved file. - if params['data']['new_dataset']: - X_train, y_train = trainingFrameLoader.extract_data() - # Try to load previously-processed data if none found in raw form. - if len(X_train) > 0 and len(y_train) > 0: - trainingFrameLoader.save_dataset() - else: - print(f'Loading dataset from {params["data"]["training"]["target_dir"]}.') - X_train, y_train = load_dataset( - dataset_dir=params['data']['training']['target_dir'], - fl=trainingFrameLoader - ) - # Exit if we still cannot find training data. - if len(X_train) == 0 or len(y_train) == 0: - logger.error(f'Cannot load training data for {params["data"]["training"]["target_dir"]}! Exiting.') - sys.exit() - - # Try to load previously-processed data if none found in raw form. 
- X_val, y_val = validationFrameLoader.extract_data() - if len(X_val) > 0 and len(y_val) > 0: - validationFrameLoader.save_dataset() - else: - print(f'Loading dataset from {params["data"]["validation"]["target_dir"]}.') - X_val, y_val = load_dataset( - dataset_dir=params['data']['validation']['target_dir'], - fl=validationFrameLoader - ) - # Exit if we still cannot find validation data. - if len(X_val) == 0 or len(y_val) == 0: - logger.error(f'Cannot load training data for {params["data"]["validation"]["target_dir"]}! Exiting.') - sys.exit() - else: - X_train, y_train = load_dataset( - dataset_dir=params['data']['training']['target_dir'], - fl=trainingFrameLoader - ) - X_val, y_val = load_dataset( - dataset_dir=params['data']['validation']['target_dir'], - fl=validationFrameLoader - ) - """ # ======== Handcrafted Features ======== if params['handcrafted']['enabled']: @@ -679,5 +688,11 @@ def main(config_path: str): type=str, help="Path to desired config path." ) + parser.add_argument( + "--cross-validation", + "-x", + action="store_true", + help="Run cross-validation on the specified dataset (specified as the 'training' dataset in the config)?" + ) args = parser.parse_args() - main(config_path=args.config) + main(config_path=args.config, cross_val=args.cross_validation) From d9f8e725e6a6d0e3ae14fe20c333527c9dff941c Mon Sep 17 00:00:00 2001 From: nubby Date: Fri, 6 Mar 2026 21:40:57 +0000 Subject: [PATCH 14/17] Forgot to add updates to frame_loader. --- .../dspml_pipeline/data/frame_loader.py | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py index 90dabe83..20062625 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py @@ -59,9 +59,14 @@ class FrameLoader: y (np.ndarray): Corresponding labels (targets). 
""" - def __init__(self, dataset_dirs: list, target_dir: str, + def __init__(self, + target_dir: str, data_log: str = "data-log.csv", - folder_name: str = "Sample #", label_name: str = "Bulk Density (g/cm^3)", + dataset: tuple = (), + dataset_dirs: list = [], + folder_name: str = "Sample #", + label_name: str = "Bulk Density (g/cm^3)", + labels: tuple = (), verbose: bool = False): """ Initializes the FrameLoader instance based on the provided directories. @@ -78,26 +83,37 @@ def __init__(self, dataset_dirs: list, target_dir: str, self.dataset_dirs = dataset_dirs self.target_dir = target_dir self.data_log = data_log - self.X = None - self.y = None self.label_name = label_name self.folder_name = folder_name - # Validate dataset directory - for i in self.dataset_dirs: - if not Path(i).exists(): - logger.error(f"Dataset {i} does not exist.") - if not os.path.isdir(i): - logger.error(f"Path {i} does not point to a dataset directory.") - data_log_i = Path(i) / data_log - if not data_log_i.exists(): - logger.warning(f"Data log file {data_log_i} does not exist; " - f"checking for preprocessed dataset...") - if not self._is_dataset_preprocessed(i): - logger.warning(f"Dataset {i} is invalid.") - sys.exit(1) - else: - logger.info(f"Dataset {i} initialized.") + # Import data from the dataset directories if provided. + if dataset_dirs: + # No input datastreams given. 
+ self.X = None + self.y = None + + # Validate dataset directory + for i in self.dataset_dirs: + if not Path(i).exists(): + logger.error(f"Dataset {i} does not exist.") + if not os.path.isdir(i): + logger.error(f"Path {i} does not point to a dataset directory.") + data_log_i = Path(i) / data_log + if not data_log_i.exists(): + logger.warning(f"Data log file {data_log_i} does not exist; " + f"checking for preprocessed dataset...") + if not self._is_dataset_preprocessed(i): + logger.warning(f"Dataset {i} is invalid.") + sys.exit(1) + else: + logger.info(f"Dataset {i} initialized.") + # Directly import tuples of dataset and labels if given. + elif ((len(dataset) > 0 and len(labels) > 0) and (len(labels) == len(dataset))): + self.X = dataset + self.y = labels + else: + print(f"Cannot load dataset.") + sys.exit(1) def _is_dataset_preprocessed(self, path: str): """ From ace5063ad1112a9f3c8c586f318033316ea7b63b Mon Sep 17 00:00:00 2001 From: nubby Date: Fri, 6 Mar 2026 23:05:54 +0000 Subject: [PATCH 15/17] Allows for cross validation on the same dataset now. --- .../dspml_pipeline/data/frame_loader.py | 4 +-- 01_dsp/dspml_pipeline/scripts/main.py | 28 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py index 20062625..6cfe8c20 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py @@ -62,11 +62,11 @@ class FrameLoader: def __init__(self, target_dir: str, data_log: str = "data-log.csv", - dataset: tuple = (), + dataset: np.ndarray = None, dataset_dirs: list = [], folder_name: str = "Sample #", label_name: str = "Bulk Density (g/cm^3)", - labels: tuple = (), + labels: np.ndarray = None, verbose: bool = False): """ Initializes the FrameLoader instance based on the provided directories. 
diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index c776ed32..f75dc107 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -80,11 +80,11 @@ def plant_seeds(seed: int = 42): torch.backends.cudnn.benchmark = False logging.info("DONE.") -def split_dataset(ds: tuple, - labels: tuple, +def split_dataset(ds: np.ndarray, + labels: np.ndarray, train_split: float = 0.8, test_split: float = 0.2, - random_seed: int = 42) -> tuple[tuple, tuple, tuple, tuple]: + random_seed: int = 42) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ split_dataset(ds, train_split, test_split, random_seed) @@ -106,19 +106,25 @@ def split_dataset(ds: tuple, testing_labels (tuple) New testing labels. """ full_ds_size = len(ds) - training_ds_size = np.ceil(train_split * full_ds_size) - testing_ds_size = np.floor(test_split * full_ds_size) + training_ds_size = int(np.ceil(train_split * full_ds_size)) + testing_ds_size = int(np.floor(test_split * full_ds_size)) # Verify proper dataset split sizes. assert (training_ds_size + testing_ds_size == full_ds_size), f"Splits of {training_ds_size} and {test_ds_size} are not of total size {full_ds_size}" # Split the dataset and labels into training and testing sets based on indices. 
- training_indices = random.sample(range(full_ds_size), training_ds_size) + training_indices = np.random.choice(full_ds_size, training_ds_size, replace=False) testing_indices = [index for index in range(full_ds_size) if index not in training_indices] - training_ds = [ds[index] for index in training_indices] - training_labels = [labels[index] for index in training_indices] - testing_ds = [ds[index] for index in testing_indices] - testing_labels = [labels[index] for index in testing_indices] + """ + training_ds = np.ndarray([ds[index] for index in training_indices]) + training_labels = np.ndarray([labels[index] for index in training_indices]) + testing_ds = np.ndarray([ds[index] for index in testing_indices]) + testing_labels = np.ndarray([labels[index] for index in testing_indices]) + """ + training_ds = ds[training_indices] + training_labels = labels[training_indices] + testing_ds = ds[testing_indices] + testing_labels = labels[testing_indices] return training_ds, training_labels, testing_ds, testing_labels def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool: @@ -173,7 +179,7 @@ def main(config_path: str, cross_val: bool = False): target_dir=params['data']['training']['target_dir'], data_log="data-log.csv", label_name=params['data']['label_name']) - X_full, y_full = trainingFrameLoader.load(params['data']['new_dataset']) + X_full, y_full = fullFrameLoader.load(params['data']['new_dataset']) # Divide the full dataset into training/testing splits. X_train, y_train, X_val, y_val = split_dataset(ds=X_full, labels=y_full, random_seed=seed) From 15e69140dbd372cdd99986eeb1ca2b772f766126 Mon Sep 17 00:00:00 2001 From: nubby Date: Sun, 8 Mar 2026 02:09:48 +0000 Subject: [PATCH 16/17] Moved pretrained transformer model to new file. 
--- .../{transformer.py => pt_transformer.py} | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) rename 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/{transformer.py => pt_transformer.py} (98%) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py similarity index 98% rename from 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py rename to 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py index c99b193a..0e209bea 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py @@ -1,5 +1,19 @@ -# TODO: update docstrings +""" +pt_transformer.py +Pretrained, lightweight visual Transformer architecture (MobileViT) repurposed to explore +its ability at using transfer learning to detect soil compaction through radargrams. + +Authors: + jLab + Eric Vetha + +Date: + 7 Mar 2026 + +Version: + 1.0.0 +""" import logging logger = logging.getLogger(__name__) From 57eb5e587130b7dfe20cec85a82a9ad79d53cb43 Mon Sep 17 00:00:00 2001 From: nubby Date: Sun, 8 Mar 2026 18:10:23 +0000 Subject: [PATCH 17/17] Migrated transformer.py to pt_transformer.py and tested. 
--- .../dspml_pipeline/feature_estimation/eval_tools.py | 2 +- 01_dsp/dspml_pipeline/scripts/main.py | 2 +- 01_dsp/dspml_pipeline/tests/test_end_to_end.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py index f44ed0a2..d44364a6 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py @@ -24,7 +24,7 @@ from dspml_pipeline.feature_estimation.mlp import MLPRegression from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator -from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator +from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator from dspml_pipeline.parameters import num2label from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index f75dc107..2f848f29 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -38,7 +38,7 @@ from dspml_pipeline.feature_extraction.learned.autoencoder import AutoencoderLearnedFeatures from dspml_pipeline.feature_extraction.learned.cnn import CNNLearnedFeatures from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator -from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator +from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator from scipy import stats diff --git a/01_dsp/dspml_pipeline/tests/test_end_to_end.py b/01_dsp/dspml_pipeline/tests/test_end_to_end.py index a94df172..0a1cb2c0 100644 --- a/01_dsp/dspml_pipeline/tests/test_end_to_end.py +++ 
b/01_dsp/dspml_pipeline/tests/test_end_to_end.py @@ -13,7 +13,7 @@ from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator from dspml_pipeline.results import update_results from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator -from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator +from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator from scipy import stats @@ -59,4 +59,4 @@ def display_feature_importance(feature_array, feature_names, labels): X = np.abs(X) trans = TransformerEstimator(X, y, verbose=verbose) model, metrics = trans.full_monty() - # update_results(target_dir, "End-to-end", f"Transformer", metrics) \ No newline at end of file + # update_results(target_dir, "End-to-end", f"Transformer", metrics)