Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
505 changes: 505 additions & 0 deletions notebooks/Evaluation - noise.ipynb

Large diffs are not rendered by default.

150 changes: 150 additions & 0 deletions notebooks/Evaluation - time.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"#Basic Imports\n",
"import os,sys\n",
"os.chdir(\"..\")\n",
"\n",
"from tqdm import tqdm,trange\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay\n",
"import torch\n",
"import pandas\n",
"\n",
"from datasets.ssl_dataset import SSL_Dataset\n",
"from datasets.data_utils import get_data_loader\n",
"from utils import get_model_checkpoints_and_timing_info\n",
"from utils import net_builder\n",
"from utils import clean_results_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Path to the runs to load\n",
"csv_folder = \"/home/gabrielemeoni/project/SSLRS/\"\n",
"dataset=\"eurosat_ms\"\n",
"folder = \"/scratch/fixmatch_results/sandy_runs/nr_of_labels/\"\n",
"sort_criterion = \"net\" # Accepted net, numlabels\n",
"seed_wanted = 2# Seed wanted (the others will be filtered)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"folder=os.path.join(folder, dataset)\n",
"checkpoints, run_args, timing_info = get_model_checkpoints_and_timing_info(folder)\n",
"timing_info_dict = dict(zip(checkpoints, timing_info))\n",
"if os.name == 'nt':\n",
" [print(_.split(\"\\\\\")[1]) for _ in checkpoints];\n",
"else:\n",
" [print(_.split(\"/\")[1]) for _ in checkpoints];"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run all models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results=[]\n",
"for checkpoint, args in zip(checkpoints,run_args):\n",
" print(\"------------ RUNNING \", checkpoint, \" -----------------\")\n",
" args[\"batch_size\"] = 256\n",
" args[\"data_dir\"] = \"./data/\"\n",
" args[\"use_train_model\"] = False\n",
" args[\"load_path\"] = checkpoint\n",
" if args[\"seed\"] == seed_wanted:\n",
" results.append({\"params\" : args, \"aumgent_time\" : timing_info_dict[checkpoint][0], \"run_time\" : timing_info_dict[checkpoint][1], \"training_time\" : timing_info_dict[checkpoint][2]})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"big_df = pd.DataFrame()\n",
"pd.set_option('display.max_columns', None)\n",
"for result in results:\n",
" result_dict=result[\"params\"].copy()\n",
" result_dict.update({\"aumgent_time\" : result[\"aumgent_time\"]})\n",
" result_dict.update({\"run_time\" : result[\"run_time\"]} )\n",
" result_dict.update({\"training_time\" : result[\"training_time\"]})\n",
" df = pd.DataFrame(result_dict, index=[0])\n",
" df = df.set_index([\"dataset\"])\n",
" df = df.drop(labels=[\"batch_size\",\"data_dir\", \"opt\", \"use_train_model\", \"seed\", \"confidence\", \"lr\"], axis=1)\n",
" big_df = big_df.append(df)\n",
"\n",
"big_df=big_df.sort_values(by=[sort_criterion], axis=0)\n",
"big_df.to_csv(os.path.join(csv_folder,\"time_results_\"+dataset+\"_seed_\"+str(seed_wanted)+\".csv\"))\n",
"print(\"File saved at: \", os.path.join(csv_folder,\"time_results_\"+dataset+\"_seed_\"+str(seed_wanted)+\".csv\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
83 changes: 78 additions & 5 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from datetime import datetime


def plot_cmatrix(preds,labels,encoding, figsize=(8, 5),dpi=150, class_names_font_scale=1.2, matrix_font_size=12, save_fig_name=None):
"""Plotting the confusion matrix for one or three dataset seeds.
Expand Down Expand Up @@ -275,6 +277,32 @@ def get_model_checkpoints(folderpath):
return checkpoints, params


def get_model_checkpoints_and_timing_info(folderpath):
    """Gather every best-model checkpoint below *folderpath* plus run metadata.

    Recursively searches for ``model_best.pth`` files, decoding each run's
    parameters from its folder path and reading its timing information from
    the run's log file.

    Args:
        folderpath (str): root directory to search (searched recursively).

    Returns:
        list, list, list: checkpoint file paths, decoded parameter dicts,
        and timing-info tuples, index-aligned with each other.
    """
    # Normalize Windows-style separators so the glob pattern behaves uniformly.
    normalized_path = folderpath.replace("\\", "/")
    checkpoints = glob.glob(normalized_path + "/**/model_best.pth", recursive=True)
    run_folders = [ckpt.split("model_best.pth")[0] for ckpt in checkpoints]

    params = [decode_parameters_from_path(run_folder) for run_folder in run_folders]
    timing_info = [_return_training_time_info(run_folder) for run_folder in run_folders]

    return checkpoints, params, timing_info



def _read_best_iteration_number(folder):
"""Reads from the run log file at which iteration the best result was obtained.

Expand All @@ -294,6 +322,49 @@ def _read_best_iteration_number(folder):
return int(iteration_str.split(" iters")[0])


def _return_training_time_info(folder):
    """Return per-iteration timing averages and total training time from a run log.

    Parses ``log.txt`` inside *folder*, averaging the prefetch (augmentation)
    and run (forward + backward) times reported on each iteration line, and
    computing the wall-clock span between the first and last logged timestamp.

    Args:
        folder (str): results folder path, including a trailing separator
            (the log is opened as ``folder + "log.txt"``).

    Returns:
        float: average augmentation (prefetch) time per iteration [ms].
        float: average runtime (forward + backward) per iteration [ms].
        int: total training time [s].
    """
    with open(folder + "log.txt", "r") as file:
        # Drop the first 5 header lines and the last 2 summary lines.
        # NOTE(review): assumes the fixed layout of the training log — confirm.
        lines = file.read().splitlines()[5:-2]

    dates = []
    timestamps = []
    run_time_ms = []
    prefetch_time_ms = []
    for line in lines:
        # Checkpoint / bookkeeping lines carry no timing columns.
        if "model saved" in line or "data loader keys" in line:
            continue

        tokens = line.split(' ')  # split once instead of re-splitting per field
        dates.append(tokens[0])        # e.g. "[2021-06-01" (leading bracket kept)
        timestamps.append(tokens[1])   # e.g. "12:34:56,789"
        # Columns 22 and 24 hold the comma-terminated prefetch and run times.
        # NOTE(review): positional — breaks if the log line format changes.
        prefetch_time_ms.append(float(tokens[22].replace(",", "")))
        run_time_ms.append(float(tokens[24].replace(",", "")))

    first_date = dates[0].split('-')
    last_date = dates[-1].split('-')
    first_time = timestamps[0].split(':')
    last_time = timestamps[-1].split(':')
    # [1:] strips the leading "[" from the year; [:2] keeps only whole seconds
    # (drops the ",ms" fraction appended to the seconds field).
    start = datetime(int(first_date[0][1:]), int(first_date[1]), int(first_date[2]),
                     int(first_time[0]), int(first_time[1]), int(first_time[2][:2]))
    end = datetime(int(last_date[0][1:]), int(last_date[1]), int(last_date[2]),
                   int(last_time[0]), int(last_time[1]), int(last_time[2][:2]))
    training_time_s = int((end - start).total_seconds())

    return np.mean(np.array(prefetch_time_ms)), np.mean(np.array(run_time_ms)), training_time_s


def decode_parameters_from_path(filepath):
"""Decodes the parameters encoded in the filepath to a checkpoint

Expand Down Expand Up @@ -333,15 +404,16 @@ def decode_parameters_from_path(filepath):


def clean_results_df(
original_df, data_folder_name, sort_criterion="net", keep_per_class=False
original_df, data_folder_name, sort_criterion="net", keep_per_class=False, swap_accuracy_position=True
):
"""Removing unnecessary columns to save into the csv file, sorting rows according to the sort_criterion, sorting colums according to the csv file format.

Args:
original_df ([df]): original dataframe to clean.
data_folder_name ([str]): string containing experiment results
sort_criterion (str, optional): Default criterion for rows sorting. Defaults to "net".
keep_per_class (bool, optional): If True will not discard class-wise accuracy
keep_per_class (bool, optional): If True will not discard class-wise accuracy. Defaults to False.
swap_accuracy_position (bool, optional): If True, accuracy position is swapped. Defaults to True.

Returns:
[cleaned outputdata]: [df]
Expand Down Expand Up @@ -422,9 +494,10 @@ def clean_results_df(
)

# Swap accuracy positions to sort it as in the final results file
keys = new_df.columns.tolist()
keys = keys[1:-1] + [keys[0]] + [keys[-1]]
new_df = new_df.reindex(columns=keys)
if swap_accuracy_position:
keys = new_df.columns.tolist()
keys = keys[1:-1] + [keys[0]] + [keys[-1]]
new_df = new_df.reindex(columns=keys)

net = new_df["net"]
if "pretrained" in new_df:
Expand Down