From a039ba9fef27cf0fa710d27ac3262406e1d2ffcc Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Tue, 24 Feb 2026 23:40:45 +0000
Subject: [PATCH 01/17] Adds a useful data viewer script for numpy files.

---
 01_dsp/dspml_pipeline/scripts/view_data.py | 69 ++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 01_dsp/dspml_pipeline/scripts/view_data.py

diff --git a/01_dsp/dspml_pipeline/scripts/view_data.py b/01_dsp/dspml_pipeline/scripts/view_data.py
new file mode 100644
index 00000000..5b975d5e
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/view_data.py
@@ -0,0 +1,69 @@
+"""
+File:
+    view_data
+
+Description:
+    View the contents of a saved numpy file.
+
+Author:
+    jLab
+    nubby
+    Perplexity.AI
+
+Date:
+    24 Feb 2026
+
+Version:
+    1.0.0
+"""
+import argparse
+import numpy as np
+import os
+
+from typing import Union
+
+
+def _load_npy_file(path: str) -> Union[np.array, None]:
+    """
+    _load_npy_file(path)
+    
+    Load the contents of a saved .npy file if proper format;
+    otherwise return None.
+
+    Args:
+        path    (str)   Path to file.
+
+    Returns:
+        data    (np.array, None)
+    """
+    try:
+        assert(os.path.isfile(path) and path.split(".")[-1] == "npy")
+        data = np.load(path)
+    except AssertionError:
+        data = None
+
+    return data
+
+
+def view_data(path: str):
+    # Load the file in question.
+    data = _load_npy_file(path=path)
+
+    try:
+        print(f"Contents: {data}")
+        print(f"Shape: {data.shape}")
+    except AttributeError:
+        print(f"ERROR: File {path} invalid; check the path!")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="View the contents of an input .npy file.")
+    parser.add_argument(
+            "--path",
+            "-p",
+            required=True,
+            type=str,
+            help="Path to the desired file."
+        )
+    args = parser.parse_args()
+    view_data(path=args.path)

From 14cd859682c1ce2ebc9ba44379a7e2874f59c322 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Wed, 25 Feb 2026 01:33:13 +0000
Subject: [PATCH 02/17] Updates transformer.py for newer transformers API.

---
 .../end_to_end_estimation/transformer.py           | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
index 84b05fc7..8920bafa 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
@@ -14,7 +14,11 @@
 from PIL import Image
 
 from ..parameters import RANDOM_SEED, KFOLD_SPLITS, num2label
-from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification
+try:
+    from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification
+except ImportError:
+    from transformers import MobileViTImageProcessor, MobileViTForImageClassification
+
 
 # Set seeds for reproducibility
 torch.manual_seed(RANDOM_SEED)
@@ -129,7 +133,11 @@ def __init__(self, X, y, epochs=10, batch_size=4, verbose=False):
         # Move to device
         self.mobilevit.to(self.device)
 
-        self.feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small")
+        try:
+            self.feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small")
+        except:
+            self.feature_extractor = MobileViTImageProcessor.from_pretrained("apple/mobilevit-small")
+            
         
         # Freeze the backbone, only train the classifier
         for name, param in self.mobilevit.named_parameters():
@@ -413,4 +421,4 @@ def evaluate(model, dataloader, loss_fn, device):
             total_loss += loss.item()
             preds.extend(outputs.cpu().numpy())
             trues.extend(targets.cpu().numpy())
-    return total_loss / len(dataloader), preds, trues
\ No newline at end of file
+    return total_loss / len(dataloader), preds, trues

From e2010f5943f535494a047ed77151297979793eda Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Wed, 25 Feb 2026 02:11:05 +0000
Subject: [PATCH 03/17] Updates Pie Ranch validation dataset directory name.

---
 01_dsp/dspml_pipeline/scripts/config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/config.yaml
index e64657ca..d8ef9c27 100644
--- a/01_dsp/dspml_pipeline/scripts/config.yaml
+++ b/01_dsp/dspml_pipeline/scripts/config.yaml
@@ -16,7 +16,7 @@ data:
   validation:
     # Raw validation datasets to combine
     dataset_dirs:
-      - "../data/field-pie-ranch-dataset"
+      - "../data/pie-ranch-dataset"
     # Target combined validation dataset directory
     target_dir: "../data/pie-ranch-dataset"
 
@@ -80,4 +80,4 @@ end-to-end:
     verbose: false
 
 advanced:
-  verbose: true               # Set to false to reduce logging output  
\ No newline at end of file
+  verbose: true               # Set to false to reduce logging output  

From 6220698e9501adc79e0a78205cb83cd5640a1a3b Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Wed, 25 Feb 2026 02:11:41 +0000
Subject: [PATCH 04/17] Allows for use of previously-processed datasets
 (without raw radar frame data).

---
 .../dspml_pipeline/data/frame_loader.py       | 80 ++++++++++++++++---
 01_dsp/dspml_pipeline/scripts/main.py         | 30 ++++++-
 2 files changed, 94 insertions(+), 16 deletions(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
index 6bb6d86f..2d86351d 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
@@ -1,3 +1,21 @@
+"""
+File:
+    frame_loader.py
+
+Description:
+    Tools for loading datasets from the GOPHERS pipeline.
+
+Authors:
+    jLab
+    Eric Vetha
+    nubby
+
+Date:
+    24 Feb 2026
+
+Version:
+    1.0.9
+"""
 import logging
 logger = logging.getLogger(__name__)
 
@@ -5,12 +23,15 @@
 from pathlib import Path
 from ..setup_logging import setup_logging
 import json
+import os
 import pandas as pd
 import sys
 from scipy import signal
 
+
 THRESHOLD = 50 # For anomoly removal
 
+
 class FrameLoader:
     """
     FrameLoader class for processing radar data into standardized input (X) and output (y) matrices for regression tasks.
@@ -38,10 +59,10 @@ class FrameLoader:
         y (np.ndarray):         Corresponding labels (targets).
     """
 
-    def __init__(self, dataset_dirs:list, target_dir:str,
-                 data_log:str = "data-log.csv", 
-                 folder_name:str = "Sample #", label_name:str = "Bulk Density (g/cm^3)", 
-                 verbose:bool = False):
+    def __init__(self, dataset_dirs: list, target_dir: str,
+                 data_log: str = "data-log.csv", 
+                 folder_name: str = "Sample #", label_name: str = "Bulk Density (g/cm^3)", 
+                 verbose: bool = False):
         """
         Initializes the FrameLoader instance based on the provided directories.
 
@@ -66,10 +87,36 @@ def __init__(self, dataset_dirs:list, target_dir:str,
         for i in self.dataset_dirs:
             if not Path(i).exists():
                 logger.error(f"Dataset {i} does not exist.")
+            if not os.path.isdir(i):
+                logger.error(f"Path {i} does not point to a dataset directory.")
             data_log_i = Path(i) / data_log
             if not data_log_i.exists():
-                logger.error(f"Data log file {data_log_i} does not exist.")
-                sys.exit(1)
+                logger.warning(f"Data log file {data_log_i} does not exist; "
+                               f"checking for preprocessed dataset...")
+                if not self._is_dataset_preprocessed(i):
+                    logger.warning(f"Dataset {i} is invalid.")
+                    sys.exit(1)
+                else:
+                    logger.info(f"Dataset {i} initialized.")
+
+    def _is_dataset_preprocessed(self, path: str):
+        """
+        is_dataset_preprocessed(path)
+
+        Check for the existence of X.npy, y.npy, features.csv, and results.csv files in the path provided.
+
+        Args:
+            path            (str)
+
+        Returns:
+            preprocessed?   (bool)
+        """
+        required_files = ["X.npy", "y.npy", "features.csv", "results.csv"]
+        current_files = os.listdir(path)
+        if not set(required_files) == set(current_files):
+            return False
+        # TODO: Extract data here?
+        return True
 
     def extract_data(self):
         """
@@ -185,13 +232,14 @@ def save_dataset(self):
         logger.info(f"Raw dataset saved as X.npy and y.npy")
         logger.info(f"Saved shapes: X={self.X.shape}, y={self.y.shape}")
 
-def load_dataset(dataset_dir:str):
+def load_dataset(dataset_dir: str, fl: FrameLoader):
     """
     Loads data that has already been processed. Assumes the features are named X.npy and the 
     labels are named y.npy.
 
     Args:
         dataset_dir:        Directory containing the capture file.
+        fl:                 FrameLoader object for given dataset.
 
     Returns:
         X (np.ndarray):     Processed radar data (features).
@@ -202,11 +250,17 @@ def load_dataset(dataset_dir:str):
     y_path = Path(dataset_dir) / "y.npy"
 
     if not X_path.exists() or not y_path.exists():
-        logger.error("X.npy and/or y.npy not found in the dataset directory")
-        sys.exit(1)
-
-    X = np.load(X_path)
-    y = np.load(y_path)
+        logger.warning("X.npy and/or y.npy not found in the dataset directory; generating...")
+        X, y = fl.extract_data()
+        fl.save_dataset()
+        # If the dataset still does not exists, exit.
+        if not X_path.exists() or not y_path.exists():
+            logger.error("X.npy and/or y.npy could not be generated.")
+            sys.exit(1)
+    else:
+        # Load dataset if it has already been processed into .npy files.
+        X = np.load(X_path)
+        y = np.load(y_path)
     
     logger.info(f"Loaded from existing dataset: X={X.shape}, y={y.shape}")
 
@@ -480,4 +534,4 @@ def novelda_digital_downconvert(raw_frame:np.ndarray):
     # Baseband signal using convolution (provides downcoverted, filtered analytic signal)
     baseband_signal = signal.convolve(mixed, window, mode='same')
 
-    return baseband_signal
\ No newline at end of file
+    return baseband_signal
diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index 44a6f3a0..42305ec8 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -1,3 +1,21 @@
+"""
+File:
+    main.py
+
+Description:
+    Launch file for WADAR dspml_pipeline.
+
+Authors:
+    jLab
+    Eric Vetha
+    nubby
+
+Date:
+    24 Feb 2026
+
+Version:
+    1.0.9
+"""
 import logging
 logger = logging.getLogger(__name__)
 
@@ -53,8 +71,14 @@ def main():
         X_val, y_val = validationFrameLoader.extract_data()
         validationFrameLoader.save_dataset()
     else:
-        X_train, y_train = load_dataset(dataset_dir=params['data']['training']['target_dir'])
-        X_val, y_val = load_dataset(dataset_dir=params['data']['validation']['target_dir'])
+        X_train, y_train = load_dataset(
+                dataset_dir=params['data']['training']['target_dir'],
+                fl=trainingFrameLoader
+            )
+        X_val, y_val = load_dataset(
+                dataset_dir=params['data']['validation']['target_dir'],
+                fl=validationFrameLoader
+            )
 
     # ======== Handcrafted Features ========
     if params['handcrafted']['enabled']:
@@ -523,4 +547,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From f91b9ee59349d753bb3336fbb0a3bd31f02c4ae8 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Wed, 25 Feb 2026 19:28:58 +0000
Subject: [PATCH 05/17] Checkpoint; still broken, but refactoring of
 frame_loader has begun; also updates eval_tools for updates to libs.

---
 .../dspml_pipeline/data/frame_loader.py       | 53 ++++++++++++--
 .../feature_estimation/eval_tools.py          | 20 +++++-
 01_dsp/dspml_pipeline/scripts/main.py         | 71 ++++++++++++++++---
 3 files changed, 128 insertions(+), 16 deletions(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
index 2d86351d..35369e58 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
@@ -101,8 +101,6 @@ def __init__(self, dataset_dirs: list, target_dir: str,
 
     def _is_dataset_preprocessed(self, path: str):
         """
-        is_dataset_preprocessed(path)
-
         Check for the existence of X.npy, y.npy, features.csv, and results.csv files in the path provided.
 
         Args:
@@ -118,7 +116,45 @@ def _is_dataset_preprocessed(self, path: str):
         # TODO: Extract data here?
         return True
 
-    def extract_data(self):
+    def load(self, new: bool) -> tuple:
+        """
+        Loads and combines the specified datasets based on both existence of raw data and user specs.
+
+        Args:
+            new     (bool)  Load raw radar frames? If False, load .npy files if they exist.
+
+        Returns:
+            X, y    (tuple[np.array, np.array])
+        """
+        for dataset_path in self.dataset_dirs:
+            if new:
+            # Try to load preprocessed dataset if raw scans unavailable.
+            if not self._is_new_dataset_valid():
+                if not self._is_preprocessed_dataset_valid():
+                    logger.error(f"Neither existing radar scans nor valid preprocessed "
+                                 f"dataset were found for the following dataset:\r\n"
+                                 f"\t+ Target:\t\t{self.target_dir}\r\n"
+                                 f"\t+ Dataset dirs:\t{self.dataset_dirs}")
+                    sys.exit(1)
+                # Load preprocessed dataset if it exists and raw scans do not here.
+                X, y = self.load_preprocessed_dataset()
+            # Load raw radar scans into new dataset here.
+            else:
+                X, y = self.load_new_dataset()
+        else:
+            X, y = self.load_preprocessed_dataset()
+
+
+    def extract_single_dataset(self, dataset_path: Path) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Extracts the features (X) and labels (y) from a given directory.
+
+        Returns:
+            X (np.ndarray):         Processed radar data (features).
+            y (np.ndarray):         Corresponding labels (targets).
+        """
+
+    def extract_data(self) -> tuple:
         """
         Extracts the features (X) and labels (y) from the provided directries.
 
@@ -134,6 +170,7 @@ def extract_data(self):
 
         # Iterate through each dataset dir
         for i in self.dataset_dirs:
+            logging.info(f"Extracting data from {i}.")
             dataset_dir = Path(i)
             subdirs = [d for d in dataset_dir.iterdir() 
                     if d.is_dir() and not d.name.startswith('.')]
@@ -146,8 +183,9 @@ def extract_data(self):
                 df[self.label_name] = df[self.label_name].astype(float)
                 logger.info(f"Loaded data log with {len(df)} samples")
             except Exception as e:
-                logger.error(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'")
-                sys.exit(1)
+                logger.warning(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'; "
+                               f"attempting to load preprocessed {self.folder_name} dataset...")
+                return [], []
             
             # In each subdirectory
             for i, folder in enumerate(subdirs):
@@ -252,6 +290,9 @@ def load_dataset(dataset_dir: str, fl: FrameLoader):
     if not X_path.exists() or not y_path.exists():
         logger.warning("X.npy and/or y.npy not found in the dataset directory; generating...")
         X, y = fl.extract_data()
+        if len(X) == 0 or len(y) == 0:
+            logger.error("X.npy and/or y.npy could not be generated.")
+            sys.exit(1)
         fl.save_dataset()
         # If the dataset still does not exists, exit.
         if not X_path.exists() or not y_path.exists():
@@ -259,6 +300,8 @@ def load_dataset(dataset_dir: str, fl: FrameLoader):
             sys.exit(1)
     else:
         # Load dataset if it has already been processed into .npy files.
+        print(X_path)
+        print(y_path)
         X = np.load(X_path)
         y = np.load(y_path)
     
diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py
index 8d2ebafc..f44ed0a2 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py
@@ -1,3 +1,21 @@
+"""
+File:
+    eval_tools.py
+
+Description:
+    ???
+
+Authors:
+    jLab
+    Eric Vetha
+    nubby
+
+Date:
+    24 Feb 2026
+
+Version:
+    1.0.9
+"""
 from dspml_pipeline.feature_estimation.ridge_regression import RidgeRegression
 from dspml_pipeline.feature_estimation.random_forest import RandomForest
 from dspml_pipeline.feature_estimation.xgboost_tree import XGBoostTree
@@ -263,4 +281,4 @@ def show_results_summary(feature_type: str, training_dir: str, validation_dir: s
     print(f"Validation Results for {feature_type}".center(40))
     print("="*40)
     results_df_amp = load_results(validation_dir)
-    display_feature_results(feature_type, results_df_amp)
\ No newline at end of file
+    display_feature_results(feature_type, results_df_amp)
diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index 42305ec8..d58c94fc 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -19,6 +19,7 @@
 import logging
 logger = logging.getLogger(__name__)
 
+import argparse
 import os
 import sys
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
@@ -42,19 +43,31 @@
 import matplotlib.pyplot as plt
 import yaml
 
-def main():
 
-    if len(sys.argv) < 2:
-        raise RuntimeError("Usage: python main.py <config_file.yaml>")
-    config_file = sys.argv[1]
+def load_config(path: str) -> dict:
+    """
+    load_config(path)
 
-    # Load configuration
-    with open(config_file, "r") as f:
+    Load a configuration file into a return dictionary.
+
+    Args:
+        path    (str)
+
+    Returns:
+        params  (dict)
+    """
+    with open(path, "r") as f:
         params = yaml.safe_load(f)
+    return params
 
+def main(config_path: str):
+    # Load training parameters from config file.
+    params = load_config(path=config_path)
+
+    # Configure logging.
     setup_logging(verbose=params['advanced']['verbose'])
 
-    # Load data from training and validation datasets
+    # Load data from training and validation datasets.
     trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'],
                               target_dir=params['data']['training']['target_dir'],
                               data_log="data-log.csv",
@@ -63,13 +76,41 @@ def main():
                               target_dir=params['data']['validation']['target_dir'],
                               data_log="data-log.csv",
                               label_name=params['data']['label_name'])
+    X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset'])
+    X_val, y_val = validationFrameLoader.load(params['data']['new_dataset'])
 
+    """
     # If new dataset, extract data. Otherwise, load from saved file.
     if params['data']['new_dataset']:
         X_train, y_train = trainingFrameLoader.extract_data()
-        trainingFrameLoader.save_dataset()
+        # Try to load previously-processed data if none found in raw form.
+        if len(X_train) > 0 and len(y_train) > 0:
+            trainingFrameLoader.save_dataset()
+        else:
+            print(f'Loading dataset from {params["data"]["training"]["target_dir"]}.')
+            X_train, y_train = load_dataset(
+                    dataset_dir=params['data']['training']['target_dir'],
+                    fl=trainingFrameLoader
+                )
+            # Exit if we still cannot find training data.
+            if len(X_train) == 0 or len(y_train) == 0:
+                logger.error(f'Cannot load training data for {params["data"]["training"]["target_dir"]}! Exiting.')
+                sys.exit()
+
+        # Try to load previously-processed data if none found in raw form.
         X_val, y_val = validationFrameLoader.extract_data()
-        validationFrameLoader.save_dataset()
+        if len(X_val) > 0 and len(y_val) > 0:
+            validationFrameLoader.save_dataset()
+        else:
+            print(f'Loading dataset from {params["data"]["validation"]["target_dir"]}.')
+            X_val, y_val = load_dataset(
+                    dataset_dir=params['data']['validation']['target_dir'],
+                    fl=validationFrameLoader
+                )
+            # Exit if we still cannot find validation data.
+            if len(X_val) == 0 or len(y_val) == 0:
+                logger.error(f'Cannot load training data for {params["data"]["validation"]["target_dir"]}! Exiting.')
+                sys.exit()
     else:
         X_train, y_train = load_dataset(
                 dataset_dir=params['data']['training']['target_dir'],
@@ -79,6 +120,7 @@ def main():
                 dataset_dir=params['data']['validation']['target_dir'],
                 fl=validationFrameLoader
             )
+    """
 
     # ======== Handcrafted Features ========
     if params['handcrafted']['enabled']:
@@ -547,4 +589,13 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(description="Launch training/evaluation of GOPHERS datasets.")
+    parser.add_argument(
+            "--config",
+            "-c",
+            required=True,
+            type=str,
+            help="Path to desired config path."
+        )
+    args = parser.parse_args()
+    main(config_path=args.config)

From 8c0d1593a3307bfc50f885657f0c30b8e8248c88 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Sat, 28 Feb 2026 07:09:36 +0000
Subject: [PATCH 06/17] Now selects whether to load individual dataset based on
 existence of valid files, but still needs loader for preprocessed data.

---
 .../dspml_pipeline/data/frame_loader.py       | 190 +++++++++++++++---
 01_dsp/dspml_pipeline/scripts/config.yaml     |  21 +-
 01_dsp/dspml_pipeline/scripts/main.py         |   4 +
 3 files changed, 181 insertions(+), 34 deletions(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
index 35369e58..bea15f77 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
@@ -116,43 +116,185 @@ def _is_dataset_preprocessed(self, path: str):
         # TODO: Extract data here?
         return True
 
-    def load(self, new: bool) -> tuple:
+    def _is_new_dataset_valid(self, dataset_dir: str) -> bool:
         """
-        Loads and combines the specified datasets based on both existence of raw data and user specs.
+        Confirm whether a dataset contains the required raw radar frames for processing.
 
         Args:
-            new     (bool)  Load raw radar frames? If False, load .npy files if they exist.
+            dataset_dir     (str):  Relative path to dataset in question.
 
         Returns:
-            X, y    (tuple[np.array, np.array])
+            valid           (bool): Is the dataset valid for extracting radar frame data?
         """
-        for dataset_path in self.dataset_dirs:
-            if new:
-            # Try to load preprocessed dataset if raw scans unavailable.
-            if not self._is_new_dataset_valid():
-                if not self._is_preprocessed_dataset_valid():
-                    logger.error(f"Neither existing radar scans nor valid preprocessed "
-                                 f"dataset were found for the following dataset:\r\n"
-                                 f"\t+ Target:\t\t{self.target_dir}\r\n"
-                                 f"\t+ Dataset dirs:\t{self.dataset_dirs}")
-                    sys.exit(1)
-                # Load preprocessed dataset if it exists and raw scans do not here.
-                X, y = self.load_preprocessed_dataset()
-            # Load raw radar scans into new dataset here.
-            else:
-                X, y = self.load_new_dataset()
-        else:
-            X, y = self.load_preprocessed_dataset()
+        required_file = "data-log.csv"
+        capture_files = []      # Keep track of the number of available radar captures.
+        
+        # First check that all required files are in the base directory.
+        current_files = set(os.listdir(dataset_dir))
+        if not required_file in current_files:
+            return False
+
+        # Next look for a number of raw radar scans greater than zero.
+        dataset_path = Path(dataset_dir)
+        subdirs = [d for d in dataset_path.iterdir() 
+                if d.is_dir() and not d.name.startswith('.')]
+        for i, folder in enumerate(subdirs):
+            capture_files.append(sorted(folder.glob("*.frames")))
+        if len(capture_files) == 0:
+            return False
 
+        return True
 
-    def extract_single_dataset(self, dataset_path: Path) -> tuple[np.ndarray, np.ndarray]:
+    def _is_preprocessed_dataset_valid(self, dataset_dir: str) -> bool:
+        """
+        Confirm whether a dataset contains the required preprocessed radar scans.
+
+        Args:
+            dataset_dir     (str):  Relative path to dataset in question.
+
+        Returns:
+            valid           (bool): Is the dataset valid for use of preprocessed radar data?
+
+        Todo:
+            * Check the contents of the numpy files for validity.
+        """
+        required_files = ["X.npy", "y.npy"]
+        current_files = set(os.listdir(dataset_dir))
+        if not set(required_files).issubset(current_files):
+            return False
+        return True
+
+    def extract_single_dataset(self, dataset_dir: Path) -> tuple[np.ndarray, np.ndarray]:
         """
         Extracts the features (X) and labels (y) from a given directory.
 
+        Args:
+            dataset_dir (str):          String of relative path to dataset source directory.
+
         Returns:
-            X (np.ndarray):         Processed radar data (features).
-            y (np.ndarray):         Corresponding labels (targets).
+            frame_data  (np.ndarray):   Processed radar data (features) from one dataset source.
+            labels      (np.ndarray):   Corresponding labels (targets) from one dataset source.
+        """
+        new_frame_data = []
+        new_labels = []
+
+        logging.info(f"Extracting data from {dataset_dir}.")
+
+        dataset_dir = Path(dataset_dir)
+        subdirs = [d for d in dataset_dir.iterdir() 
+                if d.is_dir() and not d.name.startswith('.')]
+        data_log = dataset_dir / self.data_log
+        
+        # Get the labels from the data log.
+        try:
+            df = pd.read_csv(data_log)
+            df[self.folder_name] = df[self.folder_name].astype(str)
+            df[self.label_name] = df[self.label_name].astype(float)
+            logger.info(f"Loaded data log with {len(df)} samples")
+        except Exception as e:
+            logger.warning(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'; "
+                           f"attempting to load preprocessed {self.folder_name} dataset...")
+            return [], []
+        
+        # In each subdirectory.
+        for i, folder in enumerate(subdirs):
+
+            capture_files = sorted(folder.glob("*.frames"))
+
+            logger.info(f"Processing {len(capture_files)} files in {folder.name}")
+
+            if not capture_files:
+                logger.warning(f"No .frames files found in {folder.name}")
+                continue
+
+            # Find the row in df corresponding to this folder name
+            sample_row = df[df['Sample #'] == folder.name]
+            if sample_row.empty:
+                logger.error(f"No matching sample for folder {folder.name} in data log")
+                sys.exit(1)
+            else:
+                bulk_density = sample_row.iloc[0][self.label_name]
+            
+            # Process each capture file
+            params = None
+            for capture_file in capture_files:
+                try:
+                    frame_data, params = process_frames(folder, capture_file.name)
+
+                    if frame_data is None:
+                        logger.warning(f"Failed to process: {capture_file.name}")
+                        continue
+
+                    # Anomoly removal. Replaces values that deviate from the median by more
+                    # than a threshold with the median. This has been done since the beginning 
+                    # of the project because of odd spikes in the raw DAC output that causes
+                    # large deviations in the data.
+                    median = np.median(frame_data, axis=1, keepdims=True)
+                    mask = np.abs(frame_data - median) > THRESHOLD
+                    frame_data_clean = frame_data.copy()
+                    frame_data_clean[mask] = np.broadcast_to(median, frame_data.shape)[mask]
+                    
+                    # DDC
+                    ddc_frame_data = np.zeros_like(frame_data_clean, dtype=np.complex64)
+                    for i in range(frame_data_clean.shape[1]):
+                        ddc_frame_data[:, i] = novelda_digital_downconvert(frame_data_clean[:, i])
+                    
+                    try:
+                        new_frame_data.append(ddc_frame_data)
+                        new_labels.append(bulk_density)
+                    except:
+                        logger.error(f"Failed to stack radar data from {capture_file.name}")
+                        sys.exit(1)
+
+                # Outputs warning when problem occurs while processing, but continues processing other radar data.
+                except Exception as e:
+                    logger.warning(f"Error processing {capture_file.name}: {e}")
+
+        # Save radar parameters
+        if params and len(capture_files) > 0:
+            params_file = folder / "radar_params.json"
+            with open(params_file, 'w') as f:
+                json.dump(params, f)
+            logger.info(f"Saved parameters: {params_file.name}")
+
+        return new_frame_data, new_labels
+
+    def load(self, new: bool) -> tuple:
         """
+        Loads and combines the specified datasets based on both existence of raw data and user specs.
+
+        Args:
+            new     (bool)  Load raw radar frames? If False, load .npy files if they exist.
+
+        Returns:
+            X, y    (tuple[np.ndarray, np.ndarray])
+        """
+        X = []
+        y = []
+
+        logger.info("Starting frame processing")
+
+        for dataset_dir in self.dataset_dirs:
+            # Load raw radar scans into new dataset here.
+            if new and self._is_new_dataset_valid(dataset_dir=dataset_dir):
+                X_new, y_new = self.extract_single_dataset(dataset_dir=dataset_dir)
+            # Try to load preprocessed dataset if raw scans unavailable.
+            elif self._is_preprocessed_dataset_valid(dataset_dir):
+                X_new, y_new = self.load_preprocessed_dataset(dataset_dir)
+            else:
+                logger.error(f"Neither existing radar scans nor valid preprocessed "
+                             f"dataset were found for the following dataset:\r\n"
+                             f"\t+ Target:\t\t{self.target_dir}\r\n"
+                             f"\t+ Dataset dir:\t{dataset_dir}")
+                sys.exit(1)
+            # Append the new radar scans and labels to the broader dataset.
+            X += X_new
+            y += y_new
+
+        self.X = np.stack(X)
+        self.y = np.stack(y)
+
+        return self.X, self.y
 
     def extract_data(self) -> tuple:
         """
diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/config.yaml
index d8ef9c27..67e23cd9 100644
--- a/01_dsp/dspml_pipeline/scripts/config.yaml
+++ b/01_dsp/dspml_pipeline/scripts/config.yaml
@@ -1,28 +1,29 @@
 # Data configuration
 data:       
   label_name: "Bulk Density (g/cm^3)"     # Name of label used data_log.csv
-  new_dataset: false                      # Set to true if this is a new dataset
+  new_dataset: true                      # Set to true if this is a new dataset
   # Training dataset 
   training:       
     # Raw datasets to combine (list of directories)
-    dataset_dirs:       
+    dataset_dirs: 
       - "../data/wet-0-soil-compaction-dataset"
       - "../data/wet-1-soil-compaction-dataset" 
       - "../data/wet-2-soil-compaction-dataset"
       - "../data/field-soil-compaction-dataset"
+      - "../data/field-2-soil-compaction-dataset"
     # Target combined training dataset directory
-    target_dir: "../data/combined-training-dataset"
+    target_dir: "../data/sensys-training-dataset"
   # Validation dataset
   validation:
     # Raw validation datasets to combine
     dataset_dirs:
       - "../data/pie-ranch-dataset"
     # Target combined validation dataset directory
-    target_dir: "../data/pie-ranch-dataset"
+    target_dir: "../data/sensys-val-dataset"
 
 # For handcrafted features
 handcrafted:
-  enabled: false               # Enable or disable handcrafted features
+  enabled: true               # Enable or disable handcrafted features
   new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
   pruning_method: all      # Options: corr, mi, lasso, none
   top_n: 16             # Only used if pruning_method is not none
@@ -32,10 +33,10 @@ learned:
   n_features: 8               # Desired number of features
   # PCA-based feature extraction
   pca:
-    enabled: false             # Enable or disable PCA features
+    enabled: true             # Enable or disable PCA features
   # Kernel-PCA-based feature extraction
   kpca:
-    enabled: false             # Enable or disable kPCA features
+    enabled: true             # Enable or disable kPCA features
   # Autoencoder-based feature extraction
   autoencoder:
     enabled: true
@@ -44,7 +45,7 @@ learned:
     verbose: true
   # CNN-based feature extraction
   cnn:
-    enabled: true
+    enabled: false
     epochs: 20
     batch_size: 32
     verbose: true
@@ -62,13 +63,13 @@ deep_learning:
 end-to-end:
   # LSTM-based end-to-end regression
   lstm:
-    enabled: true
+    enabled: false
     epochs: 50
     batch_size: 32
     verbose: false
   # CNN-based end-to-end regression
   cnn:
-    enabled: true
+    enabled: false
     epochs: 20
     batch_size: 32
     verbose: false
diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index d58c94fc..3f62c8d6 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -79,6 +79,10 @@ def main(config_path: str):
     X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset'])
     X_val, y_val = validationFrameLoader.load(params['data']['new_dataset'])
 
+    # TODO: Only save dataset conditionally.
+    trainingFrameLoader.save_dataset()
+    validationFrameLoader.save_dataset()
+
     """
     # If new dataset, extract data. Otherwise, load from saved file.
     if params['data']['new_dataset']:

From 7b1fe30cc7e5550fcc6305bf561558d6a6ff6bdc Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Sat, 28 Feb 2026 07:33:29 +0000
Subject: [PATCH 07/17] Updated frame_loader to allow for the use of
 preprocessed data (like that from pie_ranch)

---
 .../dspml_pipeline/data/frame_loader.py          | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
index bea15f77..90dabe83 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
@@ -259,6 +259,22 @@ def extract_single_dataset(self, dataset_dir: Path) -> tuple[np.ndarray, np.ndar
 
         return new_frame_data, new_labels
 
+    def load_preprocessed_dataset(self, dataset_dir: str) -> tuple:
+        """
+        Load a preprocessed dataset from its saved X.npy and y.npy files.
+
+        Args:
+            dataset_dir (str)   Directory containing X.npy and y.npy.
+
+        Returns:
+            X, y    (tuple[list, list])  Array contents converted to nested lists.
+        """
+        X = np.load(Path(dataset_dir) / "X.npy")
+        y = np.load(Path(dataset_dir) / "y.npy")
+        logger.info(f"Loaded from existing dataset: X={X.shape}, y={y.shape}")
+
+        return X.tolist(), y.tolist()
+
     def load(self, new: bool) -> tuple:
         """
         Loads and combines the specified datasets based on both existence of raw data and user specs.

From 9dac1ddab327796ffc6952ebb7841f3322b842d4 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Mon, 2 Mar 2026 00:22:24 +0000
Subject: [PATCH 08/17] Updates default config.yaml file for Sensys first
 revision params.

---
 01_dsp/dspml_pipeline/scripts/config.yaml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/config.yaml
index 67e23cd9..aa6a2e5e 100644
--- a/01_dsp/dspml_pipeline/scripts/config.yaml
+++ b/01_dsp/dspml_pipeline/scripts/config.yaml
@@ -1,4 +1,7 @@
-# Data configuration
+# Data configuration - Sensys first submission configs.
+# 
+# This configuration file trains on all controlled data as well as field (1), then
+# validates on field-2. Pie Ranch dataset is excluded.
 data:       
   label_name: "Bulk Density (g/cm^3)"     # Name of label used data_log.csv
   new_dataset: true                      # Set to true if this is a new dataset
@@ -10,14 +13,13 @@ data:
       - "../data/wet-1-soil-compaction-dataset" 
       - "../data/wet-2-soil-compaction-dataset"
       - "../data/field-soil-compaction-dataset"
-      - "../data/field-2-soil-compaction-dataset"
     # Target combined training dataset directory
     target_dir: "../data/sensys-training-dataset"
   # Validation dataset
   validation:
     # Raw validation datasets to combine
     dataset_dirs:
-      - "../data/pie-ranch-dataset"
+      - "../data/field-2-soil-compaction-dataset"
     # Target combined validation dataset directory
     target_dir: "../data/sensys-val-dataset"
 

From 2e711cf3bfe3c2b1e1432eacf9e8d500a8591a82 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Wed, 4 Mar 2026 04:18:36 +0000
Subject: [PATCH 09/17] Adds duplicate hunter; also adds a bunch of preset
 configs.

---
 .../scripts/configs/f1-f2-config.yaml         | 93 ++++++++++++++++++
 .../scripts/configs/f1-pr-config.yaml         | 93 ++++++++++++++++++
 .../scripts/configs/f2-f1-config.yaml         | 93 ++++++++++++++++++
 .../scripts/configs/f2-pr-config.yaml         | 93 ++++++++++++++++++
 .../scripts/configs/il-f1-config.yaml         | 95 ++++++++++++++++++
 .../scripts/configs/il-f2-config.yaml         | 95 ++++++++++++++++++
 .../scripts/configs/il-pr-config.yaml         | 95 ++++++++++++++++++
 .../scripts/configs/il_f1_f2-pr-config.yaml   | 97 +++++++++++++++++++
 .../scripts/configs/pr-f1-config.yaml         | 93 ++++++++++++++++++
 .../scripts/configs/pr-f2-config.yaml         | 93 ++++++++++++++++++
 01_dsp/dspml_pipeline/scripts/main.py         | 30 ++++++
 .../{config.yaml => template_config.yaml}     |  6 +-
 12 files changed, 974 insertions(+), 2 deletions(-)
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml
 create mode 100644 01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml
 rename 01_dsp/dspml_pipeline/scripts/{config.yaml => template_config.yaml} (93%)

diff --git a/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml
new file mode 100644
index 00000000..1afc76ac
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml
@@ -0,0 +1,93 @@
+# f1-f2-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on the field (1) dataset only, then validates
+# on field-2. In-lab ("wet*") and Pie Ranch datasets are excluded.
+data:
+  label_name: "Bulk Density (g/cm^3)"     # Name of label used in data_log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset
+  training:
+    # Raw datasets to combine (list of directories)
+    dataset_dirs:
+      - "../data/field-soil-compaction-dataset"
+    # Target combined training dataset directory (named after this config: f1-f2)
+    target_dir: "../data/f1-f2-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/field-2-soil-compaction-dataset"
+    # Target combined validation dataset directory (named after this config: f1-f2)
+    target_dir: "../data/f1-f2-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml
new file mode 100644
index 00000000..cebb2840
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml
@@ -0,0 +1,93 @@
+# f1-pr-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on the field (1) dataset only, then validates
+# on the Pie Ranch dataset. In-lab ("wet*") and field-2 datasets are excluded.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of label used data_log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/field-soil-compaction-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/f1-pr-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/pie-ranch-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/f1-pr-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml
new file mode 100644
index 00000000..878299ac
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml
@@ -0,0 +1,93 @@
+# f2-f1-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on the field-2 dataset only, then validates
+# on field (1). In-lab ("wet*") and Pie Ranch datasets are excluded.
+data:
+  label_name: "Bulk Density (g/cm^3)"     # Name of label used in data_log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset
+  training:
+    # Raw datasets to combine (list of directories)
+    dataset_dirs:
+      - "../data/field-2-soil-compaction-dataset"
+    # Target combined training dataset directory (named after this config: f2-f1)
+    target_dir: "../data/f2-f1-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/field-soil-compaction-dataset"
+    # Target combined validation dataset directory (named after this config: f2-f1)
+    target_dir: "../data/f2-f1-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml
new file mode 100644
index 00000000..312c1767
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml
@@ -0,0 +1,93 @@
+# f2-pr-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on the field-2 dataset only, then validates
+# on the Pie Ranch dataset. In-lab ("wet*") and field (1) datasets are excluded.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of label used data_log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/field-2-soil-compaction-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/f2-pr-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/pie-ranch-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/f2-pr-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml
new file mode 100644
index 00000000..0ee79789
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml
@@ -0,0 +1,95 @@
+# il-f1-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on all controlled (in-lab) data only, then
+# validates on field (1). Field-2 and Pie Ranch datasets are excluded.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of label used data_log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/wet-0-soil-compaction-dataset"
+      - "../data/wet-1-soil-compaction-dataset" 
+      - "../data/wet-2-soil-compaction-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/il-f1-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/field-soil-compaction-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/il-f1-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml
new file mode 100644
index 00000000..a1ecacfd
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml
@@ -0,0 +1,95 @@
+# il-f2-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on all controlled (in-lab) data only, then
+# validates on field-2. Field (1) and Pie Ranch datasets are excluded.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of label used data_log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/wet-0-soil-compaction-dataset"
+      - "../data/wet-1-soil-compaction-dataset" 
+      - "../data/wet-2-soil-compaction-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/il-f2-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/field-2-soil-compaction-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/il-f2-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml
new file mode 100644
index 00000000..3af5e158
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml
@@ -0,0 +1,95 @@
+# il-pr-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on all controlled (in-lab) data only, then
+# validates on the Pie Ranch dataset. Field (1) and field-2 datasets are excluded.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of label used data_log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/wet-0-soil-compaction-dataset"
+      - "../data/wet-1-soil-compaction-dataset" 
+      - "../data/wet-2-soil-compaction-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/il-pr-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/pie-ranch-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/il-pr-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml
new file mode 100644
index 00000000..c74a66e3
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml
@@ -0,0 +1,97 @@
+# il_f1_f2-pr-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on all in-lab data as well as field (1) and field-2,
+# then validates on the Pie Ranch dataset.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of the label as used in data-log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/wet-0-soil-compaction-dataset"
+      - "../data/wet-1-soil-compaction-dataset" 
+      - "../data/wet-2-soil-compaction-dataset"
+      - "../data/field-soil-compaction-dataset"
+      - "../data/field-2-soil-compaction-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/il_f1_f2-pr-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/pie-ranch-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/il_f1_f2-pr-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml
new file mode 100644
index 00000000..d7da694a
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml
@@ -0,0 +1,93 @@
+# pr-f1-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on the Pie Ranch dataset only, then validates on
+# field (1). In-lab and field-2 datasets are excluded.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of the label as used in data-log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/pie-ranch-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/pr-f1-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/field-soil-compaction-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/pr-f1-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml
new file mode 100644
index 00000000..ac7553c1
--- /dev/null
+++ b/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml
@@ -0,0 +1,93 @@
+# pr-f2-config.yaml
+#
+# Dataset labels:
+#   il = In-lab, "wet*" datasets
+#   f1 = "field" dataset
+#   f2 = "field2" dataset
+#   pr = Dataset from Pie Ranch
+#
+#   '_' separates included datasets; '-' separates training from validation datasets.
+#
+# Data configuration.
+# 
+# This configuration file trains on the Pie Ranch dataset only, then validates on
+# field-2. In-lab and field (1) datasets are excluded.
+data:       
+  label_name: "Bulk Density (g/cm^3)"     # Name of the label as used in data-log.csv
+  new_dataset: true                      # Set to true if this is a new dataset
+  # Training dataset 
+  training:       
+    # Raw datasets to combine (list of directories)
+    dataset_dirs: 
+      - "../data/pie-ranch-dataset"
+    # Target combined training dataset directory
+    target_dir: "../data/pr-f2-training-dataset"
+  # Validation dataset
+  validation:
+    # Raw validation datasets to combine
+    dataset_dirs:
+      - "../data/field-2-soil-compaction-dataset"
+    # Target combined validation dataset directory
+    target_dir: "../data/pr-f2-val-dataset"
+
+# For handcrafted features
+handcrafted:
+  enabled: true               # Enable or disable handcrafted features
+  new_features: true         # Set to true if this is a new dataset or if features have not been generated yet
+  pruning_method: all      # Options: corr, mi, lasso, none
+  top_n: 16             # Only used if pruning_method is not none
+
+# For learned features
+learned:
+  n_features: 8               # Desired number of features
+  # PCA-based feature extraction
+  pca:
+    enabled: true             # Enable or disable PCA features
+  # Kernel-PCA-based feature extraction
+  kpca:
+    enabled: true             # Enable or disable kPCA features
+  # Autoencoder-based feature extraction
+  autoencoder:
+    enabled: true
+    epochs: 1000
+    batch_size: 256
+    verbose: true
+  # CNN-based feature extraction
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: true
+    
+# Classical model configuration for feature regression
+classical:
+  enabled: true               # Enable or disable the evaluation and validation of classical models (on all features)
+  tune_model_params: true    # Set to true to tune the models, or false to save time
+
+# Deep learning model configuration for feature regression
+deep_learning:
+  enabled: true
+
+# End-to-end model configurations for raw data regression
+end-to-end:
+  # LSTM-based end-to-end regression
+  lstm:
+    enabled: false
+    epochs: 50
+    batch_size: 32
+    verbose: false
+  # CNN-based end-to-end regression
+  cnn:
+    enabled: false
+    epochs: 20
+    batch_size: 32
+    verbose: false
+  # Transformer-based end-to-end regression
+  transformer:
+    enabled: true
+    batch_size: 4
+    epochs: 10
+    verbose: false
+
+advanced:
+  verbose: true               # Set to false to reduce logging output  
diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index 3f62c8d6..dbf4d4c9 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -60,6 +60,31 @@ def load_config(path: str) -> dict:
         params = yaml.safe_load(f)
     return params
 
+def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool:
+    """
+    are_duplicate_examples_present(ds1, ds2)
+
+    Report whether any scan appears in both datasets (duplicates within a single dataset are not checked).
+
+    Args:
+        ds1 (tuple) First dataset.
+        ds2 (tuple) Second dataset.
+
+    Returns:
+            (bool)  Are duplicates present?
+    """
+    dups = False
+    # The dimension of each scan is (512x160), and there are many scans.
+    for i, line1 in enumerate(ds1):
+        for j, line2 in enumerate(ds2):
+            if (len(line1) == len(line2)):
+                for scan1, scan2 in zip(line1, line2):
+                    if (len(scan1) == len(scan2)):
+                        if tuple(scan1) == tuple(scan2):
+                            print(f"Found duplicate at [{i},{j}]!")
+                            dups = True
+    return dups
+
 def main(config_path: str):
     # Load training parameters from config file.
     params = load_config(path=config_path)
@@ -79,6 +104,11 @@ def main(config_path: str):
     X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset'])
     X_val, y_val = validationFrameLoader.load(params['data']['new_dataset'])
 
+    # Verify that there are no duplicate examples in dataset.
+    if (are_duplicate_examples_present(X_train, X_val)):
+        print("Found duplicates! Exiting.")
+        sys.exit(1)
+
     # TODO: Only save dataset conditionally.
     trainingFrameLoader.save_dataset()
     validationFrameLoader.save_dataset()
diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/template_config.yaml
similarity index 93%
rename from 01_dsp/dspml_pipeline/scripts/config.yaml
rename to 01_dsp/dspml_pipeline/scripts/template_config.yaml
index aa6a2e5e..1672bef7 100644
--- a/01_dsp/dspml_pipeline/scripts/config.yaml
+++ b/01_dsp/dspml_pipeline/scripts/template_config.yaml
@@ -13,15 +13,17 @@ data:
       - "../data/wet-1-soil-compaction-dataset" 
       - "../data/wet-2-soil-compaction-dataset"
       - "../data/field-soil-compaction-dataset"
+      - "../data/field-2-soil-compaction-dataset"
     # Target combined training dataset directory
-    target_dir: "../data/sensys-training-dataset"
+    target_dir: "../data/test-training-dataset"
   # Validation dataset
   validation:
     # Raw validation datasets to combine
     dataset_dirs:
+      #- "../data/pie-ranch-dataset"
       - "../data/field-2-soil-compaction-dataset"
     # Target combined validation dataset directory
-    target_dir: "../data/sensys-val-dataset"
+    target_dir: "../data/test-val-dataset"
 
 # For handcrafted features
 handcrafted:

From a4c7cd7cdfa4fff6806e3f0fe74a5bb8f10ba3ab Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Wed, 4 Mar 2026 04:19:48 +0000
Subject: [PATCH 10/17] Removes virtual envs.

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index f3c63fd9..f046e67f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,6 @@ b1_ws/build/
 b1_ws/install/
 b1_ws/log/
 b1_ws/src/ros2
+
+# Virtual environments.
+*.venv*

From e7617a457ce858e6370fb88e425fd1ffa3fcae77 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Wed, 4 Mar 2026 04:20:08 +0000
Subject: [PATCH 11/17] Removes unused import from transformer.

---
 .../dspml_pipeline/end_to_end_estimation/transformer.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
index 8920bafa..c99b193a 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
@@ -8,7 +8,7 @@
 from torch import nn
 from torch.utils.data import Dataset, DataLoader
 from sklearn.model_selection import KFold
-from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+from sklearn.metrics import mean_squared_error, mean_absolute_error
 import time
 import os
 from PIL import Image

From 5dc3c5528a7591ebb6c60b76d81d75a2cd8c8e09 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Fri, 6 Mar 2026 18:01:21 +0000
Subject: [PATCH 12/17] Adds random seed for consistent results.

---
 01_dsp/dspml_pipeline/scripts/main.py | 48 +++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index dbf4d4c9..28ec44a3 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -21,7 +21,9 @@
 
 import argparse
 import os
+import random
 import sys
+import torch
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 
 import numpy as np
@@ -60,6 +62,48 @@ def load_config(path: str) -> dict:
         params = yaml.safe_load(f)
     return params
 
+def plant_seeds(seed: int = 42):
+    """
+    plant_seeds(seed)
+
+    Configure "consistent randomness" in system settings.
+
+    Args:
+        seed    (int)   Random seed.
+    """
+    logging.info(f"Configuring random seed of {seed}...")
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    logging.info("DONE.")
+
+def split_dataset(ds: tuple,
+                  train_split: float = 0.8,
+                  test_split: float = 0.2,
+                  random_seed: int = 42) -> tuple[tuple, tuple]:
+    """
+    split_dataset(ds, train_split, test_split, random_seed)
+
+    Divide a given dataset into a training set and testing set. In the event of an
+    imperfect split, the number of training data entries will be rounded up, while the
+    testing entries will be rounded down.
+    
+    Args:
+        ds              (tuple) Dataset to split
+        train_split     (float) Percentage of dataset to put into the new training dataset.
+        test_split      (float) Percentage of dataset to put into the new testing dataset.
+        random_seed     (int)   Random seed for assigning dataset splits.
+
+    Returns:
+        training_ds     (tuple) New training dataset.
+        testing_ds      (tuple) New testing dataset.
+    """
+    training_ds = []
+    testing_ds = []
+
 def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool:
     """
     are_duplicate_examples_present(ds1, ds2)
@@ -92,6 +136,10 @@ def main(config_path: str):
     # Configure logging.
     setup_logging(verbose=params['advanced']['verbose'])
 
+    # Configure environment for consistent training/results.
+    seed = 42   # TODO: Import as config.
+    plant_seeds(seed=seed)
+
     # Load data from training and validation datasets.
     trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'],
                               target_dir=params['data']['training']['target_dir'],

From c2e3dafc24888f14093123b82474d0a8200b5b8d Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Fri, 6 Mar 2026 21:40:37 +0000
Subject: [PATCH 13/17] Adds an option to cross validate only on the training
 dataset.

---
 01_dsp/dspml_pipeline/scripts/main.py | 137 ++++++++++++++------------
 1 file changed, 76 insertions(+), 61 deletions(-)

diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index 28ec44a3..c776ed32 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -11,10 +11,10 @@
     nubby
 
 Date:
-    24 Feb 2026
+    6 Mar 2026
 
 Version:
-    1.0.9
+    1.0.11
 """
 import logging
 logger = logging.getLogger(__name__)
@@ -81,9 +81,10 @@ def plant_seeds(seed: int = 42):
     logging.info("DONE.")
 
 def split_dataset(ds: tuple,
+                  labels: tuple,
                   train_split: float = 0.8,
                   test_split: float = 0.2,
-                  random_seed: int = 42) -> tuple[tuple, tuple]:
+                  random_seed: int = 42) -> tuple[tuple, tuple, tuple, tuple]:
     """
     split_dataset(ds, train_split, test_split, random_seed)
 
@@ -92,17 +93,33 @@ def split_dataset(ds: tuple,
     testing entries will be rounded down.
     
     Args:
-        ds              (tuple) Dataset to split
+        ds              (tuple) Dataset to split.
+        labels          (tuple) Labels to split.
         train_split     (float) Percentage of dataset to put into the new training dataset.
         test_split      (float) Percentage of dataset to put into the new testing dataset.
         random_seed     (int)   Random seed for assigning dataset splits.
 
     Returns:
         training_ds     (tuple) New training dataset.
+        training_labels (tuple) New training labels.
         testing_ds      (tuple) New testing dataset.
+        testing_labels  (tuple) New testing labels.
     """
-    training_ds = []
-    testing_ds = []
+    full_ds_size = len(ds)
+    training_ds_size = np.ceil(train_split * full_ds_size)
+    testing_ds_size = np.floor(test_split * full_ds_size)
+
+    # Verify proper dataset split sizes.
+    assert (training_ds_size + testing_ds_size == full_ds_size), f"Splits of {training_ds_size} and {test_ds_size} are not of total size {full_ds_size}"
+
+    # Split the dataset and labels into training and testing sets based on indices.
+    training_indices = random.sample(range(full_ds_size), training_ds_size)
+    testing_indices = [index for index in range(full_ds_size) if index not in training_indices]
+    training_ds = [ds[index] for index in training_indices]
+    training_labels = [labels[index] for index in training_indices]
+    testing_ds = [ds[index] for index in testing_indices]
+    testing_labels = [labels[index] for index in testing_indices]
+    return training_ds, training_labels, testing_ds, testing_labels
 
 def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool:
     """
@@ -129,7 +146,16 @@ def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool:
                             dups = True
     return dups
 
-def main(config_path: str):
+def main(config_path: str, cross_val: bool = False):
+    """
+    main(config_path, cross_val)
+
+    Run the main training/validation pipeline.
+
+    Args:
+        config_path (str)   Path to selected configuration .yaml file.
+        cross_val   (bool)  Perform cross-validation on training dataset specified.
+    """
     # Load training parameters from config file.
     params = load_config(path=config_path)
 
@@ -140,17 +166,42 @@ def main(config_path: str):
     seed = 42   # TODO: Import as config.
     plant_seeds(seed=seed)
 
-    # Load data from training and validation datasets.
-    trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'],
-                              target_dir=params['data']['training']['target_dir'],
-                              data_log="data-log.csv",
-                              label_name=params['data']['label_name'])
-    validationFrameLoader = FrameLoader(dataset_dirs=params['data']['validation']['dataset_dirs'],
-                              target_dir=params['data']['validation']['target_dir'],
-                              data_log="data-log.csv",
-                              label_name=params['data']['label_name'])
-    X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset'])
-    X_val, y_val = validationFrameLoader.load(params['data']['new_dataset'])
+    # Determine whether to split a single dataset into parts or validate on held-out datasets.
+    # Load only the training "dataset_dirs"  for cross validation testing.
+    if cross_val:
+        fullFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'],
+                                  target_dir=params['data']['training']['target_dir'],
+                                  data_log="data-log.csv",
+                                  label_name=params['data']['label_name'])
+        X_full, y_full = trainingFrameLoader.load(params['data']['new_dataset'])
+
+        # Divide the full dataset into training/testing splits.
+        X_train, y_train, X_val, y_val = split_dataset(ds=X_full, labels=y_full, random_seed=seed)
+
+        # NOTE: Currently, these frame loaders can only write/save each split.
+        trainingFrameLoader = FrameLoader(dataset=X_train,
+                                          data_log="data-log.csv",
+                                          label_name=params['data']['label_name'],
+                                          labels=y_train,
+                                          target_dir=params['data']['training']['target_dir'])
+        validationFrameLoader = FrameLoader(dataset=X_val,
+                                            data_log="data-log.csv",
+                                            label_name=params['data']['label_name'],
+                                            labels=y_val,
+                                            target_dir=params['data']['validation']['target_dir'])
+    # Load all datasets if not doing strict cross-validation.
+    else:
+        # Load data from training and validation datasets.
+        trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'],
+                                  target_dir=params['data']['training']['target_dir'],
+                                  data_log="data-log.csv",
+                                  label_name=params['data']['label_name'])
+        validationFrameLoader = FrameLoader(dataset_dirs=params['data']['validation']['dataset_dirs'],
+                                  target_dir=params['data']['validation']['target_dir'],
+                                  data_log="data-log.csv",
+                                  label_name=params['data']['label_name'])
+        X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset'])
+        X_val, y_val = validationFrameLoader.load(params['data']['new_dataset'])
 
     # Verify that there are no duplicate examples in dataset.
     if (are_duplicate_examples_present(X_train, X_val)):
@@ -161,48 +212,6 @@ def main(config_path: str):
     trainingFrameLoader.save_dataset()
     validationFrameLoader.save_dataset()
 
-    """
-    # If new dataset, extract data. Otherwise, load from saved file.
-    if params['data']['new_dataset']:
-        X_train, y_train = trainingFrameLoader.extract_data()
-        # Try to load previously-processed data if none found in raw form.
-        if len(X_train) > 0 and len(y_train) > 0:
-            trainingFrameLoader.save_dataset()
-        else:
-            print(f'Loading dataset from {params["data"]["training"]["target_dir"]}.')
-            X_train, y_train = load_dataset(
-                    dataset_dir=params['data']['training']['target_dir'],
-                    fl=trainingFrameLoader
-                )
-            # Exit if we still cannot find training data.
-            if len(X_train) == 0 or len(y_train) == 0:
-                logger.error(f'Cannot load training data for {params["data"]["training"]["target_dir"]}! Exiting.')
-                sys.exit()
-
-        # Try to load previously-processed data if none found in raw form.
-        X_val, y_val = validationFrameLoader.extract_data()
-        if len(X_val) > 0 and len(y_val) > 0:
-            validationFrameLoader.save_dataset()
-        else:
-            print(f'Loading dataset from {params["data"]["validation"]["target_dir"]}.')
-            X_val, y_val = load_dataset(
-                    dataset_dir=params['data']['validation']['target_dir'],
-                    fl=validationFrameLoader
-                )
-            # Exit if we still cannot find validation data.
-            if len(X_val) == 0 or len(y_val) == 0:
-                logger.error(f'Cannot load training data for {params["data"]["validation"]["target_dir"]}! Exiting.')
-                sys.exit()
-    else:
-        X_train, y_train = load_dataset(
-                dataset_dir=params['data']['training']['target_dir'],
-                fl=trainingFrameLoader
-            )
-        X_val, y_val = load_dataset(
-                dataset_dir=params['data']['validation']['target_dir'],
-                fl=validationFrameLoader
-            )
-    """
 
     # ======== Handcrafted Features ========
     if params['handcrafted']['enabled']:
@@ -679,5 +688,11 @@ def main(config_path: str):
             type=str,
             help="Path to desired config path."
         )
+    parser.add_argument(
+            "--cross-validation",
+            "-x",
+            action="store_true",
+            help="Run cross-validation on the specified dataset (specified as the 'training' dataset in the config)?"
+        )
     args = parser.parse_args()
-    main(config_path=args.config)
+    main(config_path=args.config, cross_val=args.cross_validation)

From d9f8e725e6a6d0e3ae14fe20c333527c9dff941c Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Fri, 6 Mar 2026 21:40:57 +0000
Subject: [PATCH 14/17] Forgot to add updates to frame_loader.

---
 .../dspml_pipeline/data/frame_loader.py       | 54 ++++++++++++-------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
index 90dabe83..20062625 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
@@ -59,9 +59,14 @@ class FrameLoader:
         y (np.ndarray):         Corresponding labels (targets).
     """
 
-    def __init__(self, dataset_dirs: list, target_dir: str,
+    def __init__(self,
+                 target_dir: str,
                  data_log: str = "data-log.csv", 
-                 folder_name: str = "Sample #", label_name: str = "Bulk Density (g/cm^3)", 
+                 dataset: tuple = (),
+                 dataset_dirs: list = [],
+                 folder_name: str = "Sample #",
+                 label_name: str = "Bulk Density (g/cm^3)", 
+                 labels: tuple = (),
                  verbose: bool = False):
         """
         Initializes the FrameLoader instance based on the provided directories.
@@ -78,26 +83,37 @@ def __init__(self, dataset_dirs: list, target_dir: str,
         self.dataset_dirs = dataset_dirs
         self.target_dir = target_dir
         self.data_log = data_log
-        self.X = None
-        self.y = None
         self.label_name = label_name
         self.folder_name = folder_name
 
-        # Validate dataset directory
-        for i in self.dataset_dirs:
-            if not Path(i).exists():
-                logger.error(f"Dataset {i} does not exist.")
-            if not os.path.isdir(i):
-                logger.error(f"Path {i} does not point to a dataset directory.")
-            data_log_i = Path(i) / data_log
-            if not data_log_i.exists():
-                logger.warning(f"Data log file {data_log_i} does not exist; "
-                               f"checking for preprocessed dataset...")
-                if not self._is_dataset_preprocessed(i):
-                    logger.warning(f"Dataset {i} is invalid.")
-                    sys.exit(1)
-                else:
-                    logger.info(f"Dataset {i} initialized.")
+        # Import data from the dataset directories if provided.
+        if dataset_dirs:
+            # No input datastreams given.
+            self.X = None
+            self.y = None
+
+            # Validate dataset directory
+            for i in self.dataset_dirs:
+                if not Path(i).exists():
+                    logger.error(f"Dataset {i} does not exist.")
+                if not os.path.isdir(i):
+                    logger.error(f"Path {i} does not point to a dataset directory.")
+                data_log_i = Path(i) / data_log
+                if not data_log_i.exists():
+                    logger.warning(f"Data log file {data_log_i} does not exist; "
+                                   f"checking for preprocessed dataset...")
+                    if not self._is_dataset_preprocessed(i):
+                        logger.warning(f"Dataset {i} is invalid.")
+                        sys.exit(1)
+                    else:
+                        logger.info(f"Dataset {i} initialized.")
+        # Directly import tuples of dataset and labels if given.
+        elif ((len(dataset) > 0 and len(labels) > 0) and (len(labels) == len(dataset))):
+            self.X = dataset
+            self.y = labels
+        else:
+            print(f"Cannot load dataset.")
+            sys.exit(1)
 
     def _is_dataset_preprocessed(self, path: str):
         """

From ace5063ad1112a9f3c8c586f318033316ea7b63b Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Fri, 6 Mar 2026 23:05:54 +0000
Subject: [PATCH 15/17] Allows for cross validation on the same dataset now.

---
 .../dspml_pipeline/data/frame_loader.py       |  4 +--
 01_dsp/dspml_pipeline/scripts/main.py         | 28 +++++++++++--------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
index 20062625..6cfe8c20 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py
@@ -62,11 +62,11 @@ class FrameLoader:
     def __init__(self,
                  target_dir: str,
                  data_log: str = "data-log.csv", 
-                 dataset: tuple = (),
+                 dataset: np.ndarray = None,
                  dataset_dirs: list = [],
                  folder_name: str = "Sample #",
                  label_name: str = "Bulk Density (g/cm^3)", 
-                 labels: tuple = (),
+                 labels: np.ndarray = None,
                  verbose: bool = False):
         """
         Initializes the FrameLoader instance based on the provided directories.
diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index c776ed32..f75dc107 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -80,11 +80,11 @@ def plant_seeds(seed: int = 42):
     torch.backends.cudnn.benchmark = False
     logging.info("DONE.")
 
-def split_dataset(ds: tuple,
-                  labels: tuple,
+def split_dataset(ds: np.ndarray,
+                  labels: np.ndarray,
                   train_split: float = 0.8,
                   test_split: float = 0.2,
-                  random_seed: int = 42) -> tuple[tuple, tuple, tuple, tuple]:
+                  random_seed: int = 42) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     """
     split_dataset(ds, train_split, test_split, random_seed)
 
@@ -106,19 +106,25 @@ def split_dataset(ds: tuple,
         testing_labels  (tuple) New testing labels.
     """
     full_ds_size = len(ds)
-    training_ds_size = np.ceil(train_split * full_ds_size)
-    testing_ds_size = np.floor(test_split * full_ds_size)
+    training_ds_size = int(np.ceil(train_split * full_ds_size))
+    testing_ds_size = int(np.floor(test_split * full_ds_size))
 
     # Verify proper dataset split sizes.
     assert (training_ds_size + testing_ds_size == full_ds_size), f"Splits of {training_ds_size} and {test_ds_size} are not of total size {full_ds_size}"
 
     # Split the dataset and labels into training and testing sets based on indices.
-    training_indices = random.sample(range(full_ds_size), training_ds_size)
+    training_indices = np.random.choice(full_ds_size, training_ds_size, replace=False)
     testing_indices = [index for index in range(full_ds_size) if index not in training_indices]
-    training_ds = [ds[index] for index in training_indices]
-    training_labels = [labels[index] for index in training_indices]
-    testing_ds = [ds[index] for index in testing_indices]
-    testing_labels = [labels[index] for index in testing_indices]
+    """
+    training_ds = np.ndarray([ds[index] for index in training_indices])
+    training_labels = np.ndarray([labels[index] for index in training_indices])
+    testing_ds = np.ndarray([ds[index] for index in testing_indices])
+    testing_labels = np.ndarray([labels[index] for index in testing_indices])
+    """
+    training_ds = ds[training_indices]
+    training_labels = labels[training_indices]
+    testing_ds = ds[testing_indices]
+    testing_labels = labels[testing_indices]
     return training_ds, training_labels, testing_ds, testing_labels
 
 def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool:
@@ -173,7 +179,7 @@ def main(config_path: str, cross_val: bool = False):
                                   target_dir=params['data']['training']['target_dir'],
                                   data_log="data-log.csv",
                                   label_name=params['data']['label_name'])
-        X_full, y_full = trainingFrameLoader.load(params['data']['new_dataset'])
+        X_full, y_full = fullFrameLoader.load(params['data']['new_dataset'])
 
         # Divide the full dataset into training/testing splits.
         X_train, y_train, X_val, y_val = split_dataset(ds=X_full, labels=y_full, random_seed=seed)

From 15e69140dbd372cdd99986eeb1ca2b772f766126 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Sun, 8 Mar 2026 02:09:48 +0000
Subject: [PATCH 16/17] Moved pretrained transformer model to new file.

---
 .../{transformer.py => pt_transformer.py}        | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
 rename 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/{transformer.py => pt_transformer.py} (98%)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py
similarity index 98%
rename from 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
rename to 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py
index c99b193a..0e209bea 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py
@@ -1,5 +1,19 @@
-# TODO: update docstrings
+"""
+pt_transformer.py
 
+Pretrained, lightweight vision Transformer architecture (MobileViT), repurposed to explore
+whether transfer learning can be used to detect soil compaction from radargrams.
+
+Authors:
+    jLab
+    Eric Vetha
+    
+Date:
+    7 Mar 2026
+
+Version:
+    1.0.0
+"""
 import logging
 logger = logging.getLogger(__name__)
 

From 57eb5e587130b7dfe20cec85a82a9ad79d53cb43 Mon Sep 17 00:00:00 2001
From: nubby <nubby.stegosaurus@gmail.com>
Date: Sun, 8 Mar 2026 18:10:23 +0000
Subject: [PATCH 17/17] Migrated transformer.py to pt_transformer.py and
 tested.

---
 .../dspml_pipeline/feature_estimation/eval_tools.py           | 2 +-
 01_dsp/dspml_pipeline/scripts/main.py                         | 2 +-
 01_dsp/dspml_pipeline/tests/test_end_to_end.py                | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py
index f44ed0a2..d44364a6 100644
--- a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py
+++ b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py
@@ -24,7 +24,7 @@
 from dspml_pipeline.feature_estimation.mlp import MLPRegression
 from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator
 from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator
-from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator
+from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator
 from dspml_pipeline.parameters import num2label
 
 from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py
index f75dc107..2f848f29 100644
--- a/01_dsp/dspml_pipeline/scripts/main.py
+++ b/01_dsp/dspml_pipeline/scripts/main.py
@@ -38,7 +38,7 @@
 from dspml_pipeline.feature_extraction.learned.autoencoder import AutoencoderLearnedFeatures
 from dspml_pipeline.feature_extraction.learned.cnn import CNNLearnedFeatures
 from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator
-from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator
+from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator
 from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator
 
 from scipy import stats
diff --git a/01_dsp/dspml_pipeline/tests/test_end_to_end.py b/01_dsp/dspml_pipeline/tests/test_end_to_end.py
index a94df172..0a1cb2c0 100644
--- a/01_dsp/dspml_pipeline/tests/test_end_to_end.py
+++ b/01_dsp/dspml_pipeline/tests/test_end_to_end.py
@@ -13,7 +13,7 @@
 from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator
 from dspml_pipeline.results import update_results
 from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator
-from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator
+from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator
 
 from scipy import stats
 
@@ -59,4 +59,4 @@ def display_feature_importance(feature_array, feature_names, labels):
     X = np.abs(X)
     trans = TransformerEstimator(X, y, verbose=verbose)
     model, metrics = trans.full_monty()
-    # update_results(target_dir, "End-to-end", f"Transformer", metrics)
\ No newline at end of file
+    # update_results(target_dir, "End-to-end", f"Transformer", metrics)