diff --git a/.gitignore b/.gitignore index f3c63fd9..f046e67f 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,6 @@ b1_ws/build/ b1_ws/install/ b1_ws/log/ b1_ws/src/ros2 + +# Virtual environments. +*.venv* diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py index 6bb6d86f..6cfe8c20 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/data/frame_loader.py @@ -1,3 +1,21 @@ +""" +File: + frame_loader.py + +Description: + Tools for loading datasets from the GOPHERS pipeline. + +Authors: + jLab + Eric Vetha + nubby + +Date: + 24 Feb 2026 + +Version: + 1.0.9 +""" import logging logger = logging.getLogger(__name__) @@ -5,12 +23,15 @@ from pathlib import Path from ..setup_logging import setup_logging import json +import os import pandas as pd import sys from scipy import signal + THRESHOLD = 50 # For anomoly removal + class FrameLoader: """ FrameLoader class for processing radar data into standardized input (X) and output (y) matrices for regression tasks. @@ -38,10 +59,15 @@ class FrameLoader: y (np.ndarray): Corresponding labels (targets). """ - def __init__(self, dataset_dirs:list, target_dir:str, - data_log:str = "data-log.csv", - folder_name:str = "Sample #", label_name:str = "Bulk Density (g/cm^3)", - verbose:bool = False): + def __init__(self, + target_dir: str, + data_log: str = "data-log.csv", + dataset: np.ndarray = None, + dataset_dirs: list = [], + folder_name: str = "Sample #", + label_name: str = "Bulk Density (g/cm^3)", + labels: np.ndarray = None, + verbose: bool = False): """ Initializes the FrameLoader instance based on the provided directories. 
@@ -57,21 +83,252 @@ def __init__(self, dataset_dirs:list, target_dir:str, self.dataset_dirs = dataset_dirs self.target_dir = target_dir self.data_log = data_log - self.X = None - self.y = None self.label_name = label_name self.folder_name = folder_name - # Validate dataset directory - for i in self.dataset_dirs: - if not Path(i).exists(): - logger.error(f"Dataset {i} does not exist.") - data_log_i = Path(i) / data_log - if not data_log_i.exists(): - logger.error(f"Data log file {data_log_i} does not exist.") + # Import data from the dataset directories if provided. + if dataset_dirs: + # No input datastreams given. + self.X = None + self.y = None + + # Validate dataset directory + for i in self.dataset_dirs: + if not Path(i).exists(): + logger.error(f"Dataset {i} does not exist.") + if not os.path.isdir(i): + logger.error(f"Path {i} does not point to a dataset directory.") + data_log_i = Path(i) / data_log + if not data_log_i.exists(): + logger.warning(f"Data log file {data_log_i} does not exist; " + f"checking for preprocessed dataset...") + if not self._is_dataset_preprocessed(i): + logger.warning(f"Dataset {i} is invalid.") + sys.exit(1) + else: + logger.info(f"Dataset {i} initialized.") + # Directly import tuples of dataset and labels if given. + elif ((len(dataset) > 0 and len(labels) > 0) and (len(labels) == len(dataset))): + self.X = dataset + self.y = labels + else: + print(f"Cannot load dataset.") + sys.exit(1) + + def _is_dataset_preprocessed(self, path: str): + """ + Check for the existence of X.npy, y.npy, features.csv, and results.csv files in the path provided. + + Args: + path (str) + + Returns: + preprocessed? (bool) + """ + required_files = ["X.npy", "y.npy", "features.csv", "results.csv"] + current_files = os.listdir(path) + if not set(required_files) == set(current_files): + return False + # TODO: Extract data here? 
+ return True + + def _is_new_dataset_valid(self, dataset_dir: str) -> bool: + """ + Confirm whether a dataset contains the required raw radar frames for processing. + + Args: + dataset_dir (str): Relative path to dataset in question. + + Returns: + valid (bool): Is the dataset valid for extracting radar frame data? + """ + required_file = "data-log.csv" + capture_files = [] # Keep track of the number of available radar captures. + + # First check that all required files are in the base directory. + current_files = set(os.listdir(dataset_dir)) + if not required_file in current_files: + return False + + # Next look for a number of raw radar scans greater than zero. + dataset_path = Path(dataset_dir) + subdirs = [d for d in dataset_path.iterdir() + if d.is_dir() and not d.name.startswith('.')] + for i, folder in enumerate(subdirs): + capture_files.append(sorted(folder.glob("*.frames"))) + if len(capture_files) == 0: + return False + + return True + + def _is_preprocessed_dataset_valid(self, dataset_dir: str) -> bool: + """ + Confirm whether a dataset contains the required preprocessed radar scans. + + Args: + dataset_dir (str): Relative path to dataset in question. + + Returns: + valid (bool): Is the dataset valid for use of preprocessed radar data? + + Todo: + * Check the contents of the numpy files for validity. + """ + required_files = ["X.npy", "y.npy"] + current_files = set(os.listdir(dataset_dir)) + if not set(required_files).issubset(current_files): + return False + return True + + def extract_single_dataset(self, dataset_dir: Path) -> tuple[np.ndarray, np.ndarray]: + """ + Extracts the features (X) and labels (y) from a given directory. + + Args: + dataset_dir (str): String of relative path to dataset source directory. + + Returns: + frame_data (np.ndarray): Processed radar data (features) from one dataset source. + labels (np.ndarray): Corresponding labels (targets) from one dataset source. 
+ """ + new_frame_data = [] + new_labels = [] + + logging.info(f"Extracting data from {dataset_dir}.") + + dataset_dir = Path(dataset_dir) + subdirs = [d for d in dataset_dir.iterdir() + if d.is_dir() and not d.name.startswith('.')] + data_log = dataset_dir / self.data_log + + # Get the labels from the data log. + try: + df = pd.read_csv(data_log) + df[self.folder_name] = df[self.folder_name].astype(str) + df[self.label_name] = df[self.label_name].astype(float) + logger.info(f"Loaded data log with {len(df)} samples") + except Exception as e: + logger.warning(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'; " + f"attempting to load preprocessed {self.folder_name} dataset...") + return [], [] + + # In each subdirectory. + for i, folder in enumerate(subdirs): + + capture_files = sorted(folder.glob("*.frames")) + + logger.info(f"Processing {len(capture_files)} files in {folder.name}") + + if not capture_files: + logger.warning(f"No .frames files found in {folder.name}") + continue + + # Find the row in df corresponding to this folder name + sample_row = df[df['Sample #'] == folder.name] + if sample_row.empty: + logger.error(f"No matching sample for folder {folder.name} in data log") sys.exit(1) + else: + bulk_density = sample_row.iloc[0][self.label_name] + + # Process each capture file + params = None + for capture_file in capture_files: + try: + frame_data, params = process_frames(folder, capture_file.name) + + if frame_data is None: + logger.warning(f"Failed to process: {capture_file.name}") + continue + + # Anomaly removal. Replaces values that deviate from the median by more + # than a threshold with the median. This has been done since the beginning + # of the project because of odd spikes in the raw DAC output that cause + # large deviations in the data. 
+ median = np.median(frame_data, axis=1, keepdims=True) + mask = np.abs(frame_data - median) > THRESHOLD + frame_data_clean = frame_data.copy() + frame_data_clean[mask] = np.broadcast_to(median, frame_data.shape)[mask] + + # DDC (digital down-conversion), applied column by column. + ddc_frame_data = np.zeros_like(frame_data_clean, dtype=np.complex64) + for i in range(frame_data_clean.shape[1]): + ddc_frame_data[:, i] = novelda_digital_downconvert(frame_data_clean[:, i]) + + try: + new_frame_data.append(ddc_frame_data) + new_labels.append(bulk_density) + except: + logger.error(f"Failed to stack radar data from {capture_file.name}") + sys.exit(1) + + # Outputs warning when problem occurs while processing, but continues processing other radar data. + except Exception as e: + logger.warning(f"Error processing {capture_file.name}: {e}") + + # Save radar parameters + if params and len(capture_files) > 0: + params_file = folder / "radar_params.json" + with open(params_file, 'w') as f: + json.dump(params, f) + logger.info(f"Saved parameters: {params_file.name}") + + return new_frame_data, new_labels + + def load_preprocessed_dataset(self, dataset_dir: str) -> tuple: + """ + Load preprocessed datasets. + """ + print(dataset_dir) + X_path = Path(dataset_dir) / "X.npy" + y_path = Path(dataset_dir) / "y.npy" - def extract_data(self): + # Load dataset if it has already been processed into .npy files. + X = np.load(X_path) + y = np.load(y_path) + + logger.info(f"Loaded from existing dataset: X={X.shape}, y={y.shape}") + + return X.tolist(), y.tolist() + + def load(self, new: bool) -> tuple: + """ + Loads and combines the specified datasets based on both existence of raw data and user specs. + + Args: + new (bool): Load raw radar frames? If False, load .npy files if they exist. + + Returns: + X, y (tuple[np.ndarray, np.ndarray]) + """ + X = [] + y = [] + + logger.info("Starting frame processing") + + for dataset_dir in self.dataset_dirs: + # Load raw radar scans into new dataset here. 
+ if new and self._is_new_dataset_valid(dataset_dir=dataset_dir): + X_new, y_new = self.extract_single_dataset(dataset_dir=dataset_dir) + # Try to load preprocessed dataset if raw scans unavailable. + elif self._is_preprocessed_dataset_valid(dataset_dir): + X_new, y_new = self.load_preprocessed_dataset(dataset_dir) + else: + logger.error(f"Neither existing radar scans nor valid preprocessed " + f"dataset were found for the following dataset:\r\n" + f"\t+ Target:\t\t{self.target_dir}\r\n" + f"\t+ Dataset dir:\t{dataset_dir}") + sys.exit(1) + # Append the new radar scans and labels to the broader dataset. + X += X_new + y += y_new + + self.X = np.stack(X) + self.y = np.stack(y) + + return self.X, self.y + + def extract_data(self) -> tuple: """ Extracts the features (X) and labels (y) from the provided directries. @@ -87,6 +344,7 @@ def extract_data(self): # Iterate through each dataset dir for i in self.dataset_dirs: + logging.info(f"Extracting data from {i}.") dataset_dir = Path(i) subdirs = [d for d in dataset_dir.iterdir() if d.is_dir() and not d.name.startswith('.')] @@ -99,8 +357,9 @@ def extract_data(self): df[self.label_name] = df[self.label_name].astype(float) logger.info(f"Loaded data log with {len(df)} samples") except Exception as e: - logger.error(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'") - sys.exit(1) + logger.warning(f"Expected CSV format: columns include '{self.folder_name}' and '{self.label_name}'; " + f"attempting to load preprocessed {self.folder_name} dataset...") + return [], [] # In each subdirectory for i, folder in enumerate(subdirs): @@ -185,13 +444,14 @@ def save_dataset(self): logger.info(f"Raw dataset saved as X.npy and y.npy") logger.info(f"Saved shapes: X={self.X.shape}, y={self.y.shape}") -def load_dataset(dataset_dir:str): +def load_dataset(dataset_dir: str, fl: FrameLoader): """ Loads data that has already been processed. Assumes the features are named X.npy and the labels are named y.npy. 
Args: dataset_dir: Directory containing the capture file. + fl: FrameLoader object for given dataset. Returns: X (np.ndarray): Processed radar data (features). @@ -202,11 +462,22 @@ def load_dataset(dataset_dir:str): y_path = Path(dataset_dir) / "y.npy" if not X_path.exists() or not y_path.exists(): - logger.error("X.npy and/or y.npy not found in the dataset directory") - sys.exit(1) - - X = np.load(X_path) - y = np.load(y_path) + logger.warning("X.npy and/or y.npy not found in the dataset directory; generating...") + X, y = fl.extract_data() + if len(X) == 0 or len(y) == 0: + logger.error("X.npy and/or y.npy could not be generated.") + sys.exit(1) + fl.save_dataset() + # If the dataset still does not exists, exit. + if not X_path.exists() or not y_path.exists(): + logger.error("X.npy and/or y.npy could not be generated.") + sys.exit(1) + else: + # Load dataset if it has already been processed into .npy files. + print(X_path) + print(y_path) + X = np.load(X_path) + y = np.load(y_path) logger.info(f"Loaded from existing dataset: X={X.shape}, y={y.shape}") @@ -480,4 +751,4 @@ def novelda_digital_downconvert(raw_frame:np.ndarray): # Baseband signal using convolution (provides downcoverted, filtered analytic signal) baseband_signal = signal.convolve(mixed, window, mode='same') - return baseband_signal \ No newline at end of file + return baseband_signal diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py similarity index 94% rename from 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py rename to 01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py index 84b05fc7..0e209bea 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/transformer.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/end_to_end_estimation/pt_transformer.py @@ -1,5 +1,19 @@ -# TODO: update docstrings +""" +pt_transformer.py 
+Pretrained, lightweight visual Transformer architecture (MobileViT) repurposed to explore +its ability at using transfer learning to detect soil compaction through radargrams. + +Authors: + jLab + Eric Vetha + +Date: + 7 Mar 2026 + +Version: + 1.0.0 +""" import logging logger = logging.getLogger(__name__) @@ -8,13 +22,17 @@ from torch import nn from torch.utils.data import Dataset, DataLoader from sklearn.model_selection import KFold -from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score +from sklearn.metrics import mean_squared_error, mean_absolute_error import time import os from PIL import Image from ..parameters import RANDOM_SEED, KFOLD_SPLITS, num2label -from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification +try: + from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification +except ImportError: + from transformers import MobileViTImageProcessor, MobileViTForImageClassification + # Set seeds for reproducibility torch.manual_seed(RANDOM_SEED) @@ -129,7 +147,11 @@ def __init__(self, X, y, epochs=10, batch_size=4, verbose=False): # Move to device self.mobilevit.to(self.device) - self.feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small") + try: + self.feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small") + except: + self.feature_extractor = MobileViTImageProcessor.from_pretrained("apple/mobilevit-small") + # Freeze the backbone, only train the classifier for name, param in self.mobilevit.named_parameters(): @@ -413,4 +435,4 @@ def evaluate(model, dataloader, loss_fn, device): total_loss += loss.item() preds.extend(outputs.cpu().numpy()) trues.extend(targets.cpu().numpy()) - return total_loss / len(dataloader), preds, trues \ No newline at end of file + return total_loss / len(dataloader), preds, trues diff --git a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py 
b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py index 8d2ebafc..d44364a6 100644 --- a/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py +++ b/01_dsp/dspml_pipeline/dspml_pipeline/feature_estimation/eval_tools.py @@ -1,3 +1,21 @@ +""" +File: + eval_tools.py + +Description: + Tools for evaluating the feature-estimation and end-to-end models and displaying result summaries. + +Authors: + jLab + Eric Vetha + nubby + +Date: + 24 Feb 2026 + +Version: + 1.0.9 +""" from dspml_pipeline.feature_estimation.ridge_regression import RidgeRegression from dspml_pipeline.feature_estimation.random_forest import RandomForest from dspml_pipeline.feature_estimation.xgboost_tree import XGBoostTree @@ -6,7 +24,7 @@ from dspml_pipeline.feature_estimation.mlp import MLPRegression from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator -from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator +from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator from dspml_pipeline.parameters import num2label from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score @@ -263,4 +281,4 @@ def show_results_summary(feature_type: str, training_dir: str, validation_dir: s print(f"Validation Results for {feature_type}".center(40)) print("="*40) results_df_amp = load_results(validation_dir) - display_feature_results(feature_type, results_df_amp) \ No newline at end of file + display_feature_results(feature_type, results_df_amp) diff --git a/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml new file mode 100644 index 00000000..1afc76ac --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f1-f2-config.yaml @@ -0,0 +1,93 @@ +# f1-f2-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from 
validation datasets. +# +# Data configuration. +# +# This configuration file trains on the field (1) dataset only, then +# validates on field-2. In-lab and Pie Ranch datasets are excluded. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined training dataset directory (NOTE(review): the "f1_f2-pr" prefix here and under validation does not match this f1-f2 config; confirm) + target_dir: "../data/f1_f2-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/f1_f2-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw 
data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml new file mode 100644 index 00000000..cebb2840 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f1-pr-config.yaml @@ -0,0 +1,93 @@ +# f1-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the field (1) dataset only, then +# validates on the Pie Ranch dataset. In-lab and field-2 datasets are excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/f1-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/f1-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 
20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml new file mode 100644 index 00000000..878299ac --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f2-f1-config.yaml @@ -0,0 +1,93 @@ +# f2-f1-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# NOTE(review): the file name implies training on field-2 and validating on field (1), but the +# dataset_dirs and target_dir values below train on field (1) and validate on field-2; confirm. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/f1-f2-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/f1-f2-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # 
Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml new file mode 100644 index 00000000..312c1767 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/f2-pr-config.yaml @@ -0,0 +1,93 @@ +# f2-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the field-2 dataset only, then +# validates on the Pie Ranch dataset. In-lab and field (1) datasets are excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/f2-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/f2-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 
20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml new file mode 100644 index 00000000..0ee79789 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il-f1-config.yaml @@ -0,0 +1,95 @@ +# il-f1-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the in-lab (wet-0 through wet-2) datasets, then +# validates on field (1). Field-2 and Pie Ranch datasets are excluded. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il-f1-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/il-f1-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # 
PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml new file mode 100644 index 00000000..a1ecacfd --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il-f2-config.yaml @@ -0,0 +1,95 @@ +# il-f2-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the in-lab (wet-0 through wet-2) datasets, then +# validates on field-2. Field (1) and Pie Ranch datasets are excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il-f2-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/il-f2-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + 
batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml new file mode 100644 index 00000000..3af5e158 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il-pr-config.yaml @@ -0,0 +1,95 @@ +# il-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the in-lab (wet-*) datasets only, then +# validates on the Pie Ranch dataset. The field (1) and field-2 datasets are excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/il-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 
+ verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml new file mode 100644 index 00000000..c74a66e3 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/il_f1_f2-pr-config.yaml @@ -0,0 +1,97 @@ +# il_f1_f2-pr-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the in-lab (wet-*), field (1), and field-2 datasets, +# then validates on the held-out Pie Ranch dataset. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/wet-0-soil-compaction-dataset" + - "../data/wet-1-soil-compaction-dataset" + - "../data/wet-2-soil-compaction-dataset" + - "../data/field-soil-compaction-dataset" + - "../data/field-2-soil-compaction-dataset" + # Target combined training dataset directory + target_dir: "../data/il_f1_f2-pr-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined validation dataset directory + target_dir: "../data/il_f1_f2-pr-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression 
+end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml new file mode 100644 index 00000000..d7da694a --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/pr-f1-config.yaml @@ -0,0 +1,93 @@ +# pr-f1-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the Pie Ranch dataset only, then +# validates on the field (1) dataset. The in-lab and field-2 datasets are excluded. 
+data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined training dataset directory + target_dir: "../data/pr-f1-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/pr-f1-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 
20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml b/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml new file mode 100644 index 00000000..ac7553c1 --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/configs/pr-f2-config.yaml @@ -0,0 +1,93 @@ +# pr-f2-config.yaml +# +# Dataset labels: +# il = In-lab, "wet*" datasets +# f1 = "field" dataset +# f2 = "field2" dataset +# pr = Dataset from Pie Ranch +# +# '_' separates included datasets; '-' separates training from validation datasets. +# +# Data configuration. +# +# This configuration file trains on the Pie Ranch dataset only, then +# validates on the field-2 dataset. The in-lab and field (1) datasets are excluded. +data: + label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv + new_dataset: true # Set to true if this is a new dataset + # Training dataset + training: + # Raw datasets to combine (list of directories) + dataset_dirs: + - "../data/pie-ranch-dataset" + # Target combined training dataset directory + target_dir: "../data/pr-f2-training-dataset" + # Validation dataset + validation: + # Raw validation datasets to combine + dataset_dirs: + - "../data/field-2-soil-compaction-dataset" + # Target combined validation dataset directory + target_dir: "../data/pr-f2-val-dataset" + +# For handcrafted features +handcrafted: + enabled: true # Enable or disable handcrafted features + new_features: true # Set to true if this is a new dataset or if features have not been generated yet + pruning_method: all # Options: corr, mi, lasso, none + top_n: 16 # Only used if pruning_method is not none + +# For learned features +learned: + n_features: 8 # Desired number of features + # PCA-based feature extraction + pca: + enabled: true # Enable or disable PCA features + # 
Kernel-PCA-based feature extraction + kpca: + enabled: true # Enable or disable kPCA features + # Autoencoder-based feature extraction + autoencoder: + enabled: true + epochs: 1000 + batch_size: 256 + verbose: true + # CNN-based feature extraction + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: true + +# Classical model configuration for feature regression +classical: + enabled: true # Enable or disable the evaluation and validation of classical models (on all features) + tune_model_params: true # Set to true to tune the models, or false to save time + +# Deep learning model configuration for feature regression +deep_learning: + enabled: true + +# End-to-end model configurations for raw data regression +end-to-end: + # LSTM-based end-to-end regression + lstm: + enabled: false + epochs: 50 + batch_size: 32 + verbose: false + # CNN-based end-to-end regression + cnn: + enabled: false + epochs: 20 + batch_size: 32 + verbose: false + # Transformer-based end-to-end regression + transformer: + enabled: true + batch_size: 4 + epochs: 10 + verbose: false + +advanced: + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/main.py b/01_dsp/dspml_pipeline/scripts/main.py index 44a6f3a0..2f848f29 100644 --- a/01_dsp/dspml_pipeline/scripts/main.py +++ b/01_dsp/dspml_pipeline/scripts/main.py @@ -1,8 +1,29 @@ +""" +File: + main.py + +Description: + Launch file for WADAR dspml_pipeline. 
+ +Authors: + jLab + Eric Vetha + nubby + +Date: + 6 Mar 2026 + +Version: + 1.0.11 +""" import logging logger = logging.getLogger(__name__) +import argparse import os +import random import sys +import torch sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) import numpy as np @@ -17,44 +38,186 @@ from dspml_pipeline.feature_extraction.learned.autoencoder import AutoencoderLearnedFeatures from dspml_pipeline.feature_extraction.learned.cnn import CNNLearnedFeatures from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator -from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator +from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator from scipy import stats import matplotlib.pyplot as plt import yaml -def main(): - if len(sys.argv) < 2: - raise RuntimeError("Usage: python main.py ") - config_file = sys.argv[1] +def load_config(path: str) -> dict: + """ + load_config(path) - # Load configuration - with open(config_file, "r") as f: - params = yaml.safe_load(f) + Load a configuration file into a return dictionary. + + Args: + path (str) + Returns: + params (dict) + """ + with open(path, "r") as f: + params = yaml.safe_load(f) + return params + +def plant_seeds(seed: int = 42): + """ + plant_seeds(seed) + + Configure "consistent randomness" in system settings. + + Args: + seed (int) Random seed. 
+ """ + logging.info(f"Configuring random seed of {seed}...") + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + logging.info("DONE.") + +def split_dataset(ds: np.ndarray, + labels: np.ndarray, + train_split: float = 0.8, + test_split: float = 0.2, + random_seed: int = 42) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + split_dataset(ds, train_split, test_split, random_seed) + + Divide a given dataset into a training set and testing set. In the event of an + imperfect split, the number of training data entries will be rounded up, while the + testing entries will be rounded down. + + Args: + ds (tuple) Dataset to split. + labels (tuple) Labels to split. + train_split (float) Percentage of dataset to put into the new training dataset. + test_split (float) Percentage of dataset to put into the new testing dataset. + random_seed (int) Random seed for assigning dataset splits. + + Returns: + training_ds (tuple) New training dataset. + training_labels (tuple) New training labels. + testing_ds (tuple) New testing dataset. + testing_labels (tuple) New testing labels. + """ + full_ds_size = len(ds) + training_ds_size = int(np.ceil(train_split * full_ds_size)) + testing_ds_size = int(np.floor(test_split * full_ds_size)) + + # Verify proper dataset split sizes. + assert (training_ds_size + testing_ds_size == full_ds_size), f"Splits of {training_ds_size} and {test_ds_size} are not of total size {full_ds_size}" + + # Split the dataset and labels into training and testing sets based on indices. 
+ training_indices = np.random.choice(full_ds_size, training_ds_size, replace=False) + testing_indices = [index for index in range(full_ds_size) if index not in training_indices] + """ + training_ds = np.ndarray([ds[index] for index in training_indices]) + training_labels = np.ndarray([labels[index] for index in training_indices]) + testing_ds = np.ndarray([ds[index] for index in testing_indices]) + testing_labels = np.ndarray([labels[index] for index in testing_indices]) + """ + training_ds = ds[training_indices] + training_labels = labels[training_indices] + testing_ds = ds[testing_indices] + testing_labels = labels[testing_indices] + return training_ds, training_labels, testing_ds, testing_labels + +def are_duplicate_examples_present(ds1: tuple, ds2: tuple) -> bool: + """ + are_duplicate_examples_present(ds1, ds2) + + Confirm that there are no duplicated examples both within and between each dataset. + + Args: + ds1 (tuple) First dataset. + ds2 (tuple) Second dataset. + + Returns: + (bool) Are duplicates present? + """ + dups = False + # The dimension of each scan is (512x160), and there are many scans. + for i, line1 in enumerate(ds1): + for j, line2 in enumerate(ds2): + if (len(line1) == len(line2)): + for scan1, scan2 in zip(line1, line2): + if (len(scan1) == len(scan2)): + if tuple(scan1) == tuple(scan2): + print(f"Found duplicate at [{i},{j}]!") + dups = True + return dups + +def main(config_path: str, cross_val: bool = False): + """ + main(config_path, cross_val) + + Run the main training/validation pipeline. + + Args: + config_path (str) Path to selected configuration .yaml file. + cross_val (bool) Perform cross-validation on training dataset specified. + """ + # Load training parameters from config file. + params = load_config(path=config_path) + + # Configure logging. 
setup_logging(verbose=params['advanced']['verbose']) - # Load data from training and validation datasets - trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], - target_dir=params['data']['training']['target_dir'], - data_log="data-log.csv", - label_name=params['data']['label_name']) - validationFrameLoader = FrameLoader(dataset_dirs=params['data']['validation']['dataset_dirs'], - target_dir=params['data']['validation']['target_dir'], - data_log="data-log.csv", - label_name=params['data']['label_name']) - - # If new dataset, extract data. Otherwise, load from saved file. - if params['data']['new_dataset']: - X_train, y_train = trainingFrameLoader.extract_data() - trainingFrameLoader.save_dataset() - X_val, y_val = validationFrameLoader.extract_data() - validationFrameLoader.save_dataset() + # Configure environment for consistent training/results. + seed = 42 # TODO: Import as config. + plant_seeds(seed=seed) + + # Determine whether to split a single dataset into parts or validate on held-out datasets. + # Load only the training "dataset_dirs" for cross validation testing. + if cross_val: + fullFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], + target_dir=params['data']['training']['target_dir'], + data_log="data-log.csv", + label_name=params['data']['label_name']) + X_full, y_full = fullFrameLoader.load(params['data']['new_dataset']) + + # Divide the full dataset into training/testing splits. + X_train, y_train, X_val, y_val = split_dataset(ds=X_full, labels=y_full, random_seed=seed) + + # NOTE: Currently, these frame loaders can only write/save each split. 
+ trainingFrameLoader = FrameLoader(dataset=X_train, + data_log="data-log.csv", + label_name=params['data']['label_name'], + labels=y_train, + target_dir=params['data']['training']['target_dir']) + validationFrameLoader = FrameLoader(dataset=X_val, + data_log="data-log.csv", + label_name=params['data']['label_name'], + labels=y_val, + target_dir=params['data']['validation']['target_dir']) + # Load all datasets if not doing strict cross-validation. else: - X_train, y_train = load_dataset(dataset_dir=params['data']['training']['target_dir']) - X_val, y_val = load_dataset(dataset_dir=params['data']['validation']['target_dir']) + # Load data from training and validation datasets. + trainingFrameLoader = FrameLoader(dataset_dirs=params['data']['training']['dataset_dirs'], + target_dir=params['data']['training']['target_dir'], + data_log="data-log.csv", + label_name=params['data']['label_name']) + validationFrameLoader = FrameLoader(dataset_dirs=params['data']['validation']['dataset_dirs'], + target_dir=params['data']['validation']['target_dir'], + data_log="data-log.csv", + label_name=params['data']['label_name']) + X_train, y_train = trainingFrameLoader.load(params['data']['new_dataset']) + X_val, y_val = validationFrameLoader.load(params['data']['new_dataset']) + + # Verify that there are no duplicate examples in dataset. + if (are_duplicate_examples_present(X_train, X_val)): + print("Found duplicates! Exiting.") + sys.exit(1) + + # TODO: Only save dataset conditionally. + trainingFrameLoader.save_dataset() + validationFrameLoader.save_dataset() + # ======== Handcrafted Features ======== if params['handcrafted']['enabled']: @@ -523,4 +686,19 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + parser = argparse.ArgumentParser(description="Launch training/evaluation of GOPHERS datasets.") + parser.add_argument( + "--config", + "-c", + required=True, + type=str, + help="Path to desired config path." 
+ ) + parser.add_argument( + "--cross-validation", + "-x", + action="store_true", + help="Run cross-validation on the specified dataset (specified as the 'training' dataset in the config)?" + ) + args = parser.parse_args() + main(config_path=args.config, cross_val=args.cross_validation) diff --git a/01_dsp/dspml_pipeline/scripts/config.yaml b/01_dsp/dspml_pipeline/scripts/template_config.yaml similarity index 72% rename from 01_dsp/dspml_pipeline/scripts/config.yaml rename to 01_dsp/dspml_pipeline/scripts/template_config.yaml index e64657ca..1672bef7 100644 --- a/01_dsp/dspml_pipeline/scripts/config.yaml +++ b/01_dsp/dspml_pipeline/scripts/template_config.yaml @@ -1,28 +1,33 @@ -# Data configuration +# Data configuration - Sensys first submission configs. +# +# This configuration file trains on all controlled data as well as field (1), then +# validates on field-2. Pie Ranch dataset is excluded. data: label_name: "Bulk Density (g/cm^3)" # Name of label used data_log.csv - new_dataset: false # Set to true if this is a new dataset + new_dataset: true # Set to true if this is a new dataset # Training dataset training: # Raw datasets to combine (list of directories) - dataset_dirs: + dataset_dirs: - "../data/wet-0-soil-compaction-dataset" - "../data/wet-1-soil-compaction-dataset" - "../data/wet-2-soil-compaction-dataset" - "../data/field-soil-compaction-dataset" + - "../data/field-2-soil-compaction-dataset" # Target combined training dataset directory - target_dir: "../data/combined-training-dataset" + target_dir: "../data/test-training-dataset" # Validation dataset validation: # Raw validation datasets to combine dataset_dirs: - - "../data/field-pie-ranch-dataset" + #- "../data/pie-ranch-dataset" + - "../data/field-2-soil-compaction-dataset" # Target combined validation dataset directory - target_dir: "../data/pie-ranch-dataset" + target_dir: "../data/test-val-dataset" # For handcrafted features handcrafted: - enabled: false # Enable or disable handcrafted features + 
enabled: true # Enable or disable handcrafted features new_features: true # Set to true if this is a new dataset or if features have not been generated yet pruning_method: all # Options: corr, mi, lasso, none top_n: 16 # Only used if pruning_method is not none @@ -32,10 +37,10 @@ learned: n_features: 8 # Desired number of features # PCA-based feature extraction pca: - enabled: false # Enable or disable PCA features + enabled: true # Enable or disable PCA features # Kernel-PCA-based feature extraction kpca: - enabled: false # Enable or disable kPCA features + enabled: true # Enable or disable kPCA features # Autoencoder-based feature extraction autoencoder: enabled: true @@ -44,7 +49,7 @@ learned: verbose: true # CNN-based feature extraction cnn: - enabled: true + enabled: false epochs: 20 batch_size: 32 verbose: true @@ -62,13 +67,13 @@ deep_learning: end-to-end: # LSTM-based end-to-end regression lstm: - enabled: true + enabled: false epochs: 50 batch_size: 32 verbose: false # CNN-based end-to-end regression cnn: - enabled: true + enabled: false epochs: 20 batch_size: 32 verbose: false @@ -80,4 +85,4 @@ end-to-end: verbose: false advanced: - verbose: true # Set to false to reduce logging output \ No newline at end of file + verbose: true # Set to false to reduce logging output diff --git a/01_dsp/dspml_pipeline/scripts/view_data.py b/01_dsp/dspml_pipeline/scripts/view_data.py new file mode 100644 index 00000000..5b975d5e --- /dev/null +++ b/01_dsp/dspml_pipeline/scripts/view_data.py @@ -0,0 +1,69 @@ +""" +File: + view_data + +Description: + View the contents of a saved numpy file. + +Author: + jLab + nubby + Perplexity.AI + +Date: + 24 Feb 2026 + +Version: + 1.0.0 +""" +import argparse +import numpy as np +import os + +from typing import Union + + +def _load_npy_file(path: str) -> Union[np.array, None]: + """ + _load_npy_file(path) + + Load the contents of a saved .npy file if proper format; + otherwise return None. + + Args: + path (str) Path to file. 
+ + Returns: + data (np.array, None) + """ + try: + assert(os.path.isfile(path) and path.split(".")[-1] == "npy") + data = np.load(path) + except AssertionError: + data = None + + return data + + +def view_data(path: str): + # Load the file in question. + data = _load_npy_file(path=path) + + try: + print(f"Contents: {data}") + print(f"Shape: {data.shape}") + except AttributeError: + print(f"ERROR: File {path} invalid; check the path!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="View the contents of an input .npy file.") + parser.add_argument( + "--path", + "-p", + required=True, + type=str, + help="Path to the desired file." + ) + args = parser.parse_args() + view_data(path=args.path) diff --git a/01_dsp/dspml_pipeline/tests/test_end_to_end.py b/01_dsp/dspml_pipeline/tests/test_end_to_end.py index a94df172..0a1cb2c0 100644 --- a/01_dsp/dspml_pipeline/tests/test_end_to_end.py +++ b/01_dsp/dspml_pipeline/tests/test_end_to_end.py @@ -13,7 +13,7 @@ from dspml_pipeline.end_to_end_estimation.lstm import LSTMEstimator from dspml_pipeline.results import update_results from dspml_pipeline.end_to_end_estimation.cnn import CNNEstimator -from dspml_pipeline.end_to_end_estimation.transformer import TransformerEstimator +from dspml_pipeline.end_to_end_estimation.pt_transformer import TransformerEstimator from scipy import stats @@ -59,4 +59,4 @@ def display_feature_importance(feature_array, feature_names, labels): X = np.abs(X) trans = TransformerEstimator(X, y, verbose=verbose) model, metrics = trans.full_monty() - # update_results(target_dir, "End-to-end", f"Transformer", metrics) \ No newline at end of file + # update_results(target_dir, "End-to-end", f"Transformer", metrics)