diff --git a/.gitignore b/.gitignore
index 62952767..3405711d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,16 +1,26 @@
-**__pycache__**
-**build**
-**egg-info**
-**dist**
-data/
-*.pyc
-venv/
-*.idea/
+data
+*.log
+
+.DS_Store
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
 *.so
-*.yaml
-*.sh 
-*.pth
-*.pkl
-*.zip
-*.bin
-.vscode/
+
+# Virtual Environments
+venv/
+env/
+.env
+
+# Distribution / packaging
+build/
+dist/
+work_dirs/
+
+*.egg-info/
+*.log.json
+nohup.out
diff --git a/Changes.md b/Changes.md
new file mode 100644
index 00000000..78bdf3db
--- /dev/null
+++ b/Changes.md
@@ -0,0 +1,34 @@
+# Changes: Knowledge Distillation (KD)
+
+Knowledge distillation was added on top of the existing CenterPoint training stack. Two variants exist; only one is used per config:
+
+- **Response-based KD** (`kd.type = "heatmap_mse"`): MSE between student and teacher **heatmap** outputs, weighted by `lambda_kd`.
+- **Feature-based KD** (`kd.type = "feature_mse"`): MSE on **`CenterHead.shared_conv`** features, weighted by `lambda_feat`.
+
+The original detection loss (focal heatmap + L1 regression from `centernet_loss.py`) is unchanged. KD terms are added in `CenterHead.loss`. At test time only the student checkpoint is used.
+
+---
+
+## Code changes
+
+| File | Change |
+|------|--------|
+| `det3d/torchie/apis/train.py` | If `cfg.kd.enabled`, build teacher from `teacher_config`, load `teacher_checkpoint`, freeze it, pass `teacher_model` and `kd_cfg` to the trainer. |
+| `det3d/torchie/trainer/trainer.py` | Each training step: teacher forward under `no_grad` (`return_preds` and/or `return_feats`), then student forward with teacher outputs and `kd_cfg`. |
+| `det3d/models/detectors/point_pillars.py` | Return `head_shared` from the head; support `return_preds` / `return_feats` for the teacher; pass KD arguments into `bbox_head.loss`. |
+| `det3d/models/detectors/voxelnet.py` | Same KD-related forward/loss wiring as PointPillars. |
+| `det3d/models/bbox_heads/center_head.py` | In `loss()`: compute `hm_kd_loss` (heatmap MSE) or `feat_kd_loss` (shared-feature MSE) and add to the per-task loss; log both metrics. |
+| `det3d/torchie/trainer/hooks/logger/text.py` | Log `hm_kd_loss` and `feat_kd_loss` with 6 decimal places. |
+| `det3d/torchie/apis/env.py` | Device selection is now limited to CUDA or CPU; MPS support was removed. |
+
+**Unchanged:** `det3d/models/losses/centernet_loss.py` (baseline `FastFocalLoss`, `RegLoss`).
+
+---
+
+## New configs (`configs/nusc/pp/`)
+
+Each file sets a slimmer student (reduced reader/neck/head channels) and a `kd` block (`enabled`, `type`, `lambda_kd` or `lambda_feat`, `teacher_config`, `teacher_checkpoint`).
+
+**Response-based:** `response_based_kd.py`, `response_based_kd_05.py`, `response_based_kd_08.py`, `response_based_kd_smoke.py`, `response_based_kd_resnet.py`, `response_based_kd_resnet_smoke.py`
+
+**Feature-based:** `feature_based_kd.py`, `feature_based_kd_05.py`, `feature_based_kd_08.py`, `feature_based_kd_smoke.py`
diff --git a/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale.py b/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale.py
index c58a6046..4ab92871 100644
--- a/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale.py
+++ b/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -87,12 +88,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -165,8 +166,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale_virtual.py b/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale_virtual.py
index 2071f2ac..3790bc97 100644
--- a/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale_virtual.py
+++ b/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale_virtual.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -88,12 +89,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo_virtual.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo_virtual.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -166,8 +167,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale.py b/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale.py
index 40c3425c..9e90ce5a 100644
--- a/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale.py
+++ b/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -87,12 +88,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -156,8 +157,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_debug.py b/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_debug.py
index dd7e0fc0..5a4be6ec 100644
--- a/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_debug.py
+++ b/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_debug.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -87,12 +88,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -156,8 +157,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual.py b/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual.py
index 39b8e4cf..9ada7854 100644
--- a/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual.py
+++ b/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -88,12 +89,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo_virtual.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo_virtual.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -157,8 +158,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/mvp/nusc_two_stage_base_with_virtual.py b/configs/mvp/nusc_two_stage_base_with_virtual.py
index 6344e21c..a8987a1b 100644
--- a/configs/mvp/nusc_two_stage_base_with_virtual.py
+++ b/configs/mvp/nusc_two_stage_base_with_virtual.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -138,12 +139,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo_virtual.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo_virtual.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -213,8 +214,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_painted_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_painted_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_painted_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_painted_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_demo.py b/configs/nusc/pp/.ipynb_checkpoints/demo-checkpoint.py
similarity index 100%
rename from configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_demo.py
rename to configs/nusc/pp/.ipynb_checkpoints/demo-checkpoint.py
diff --git a/configs/nusc/pp/baseline.py b/configs/nusc/pp/baseline.py
new file mode 100644
index 00000000..22e87b46
--- /dev/null
+++ b/configs/nusc/pp/baseline.py
@@ -0,0 +1,248 @@
+import os
+import itertools
+import logging
+from det3d.utils.config_tool import get_downsample_factor
+import os
+from dotenv import load_dotenv
+
+print(f"Found ENV: {load_dotenv()}")
+samples_per_gpu = int(os.getenv("GPU_SAMPLES", 2))
+workers_per_gpu = int(os.getenv("GPU_WORKERS", 4))
+gpu_ids = os.getenv("SELECTED_GPUS", "0")
+use_subset = os.getenv("USE_SUBSET", "True").lower() in ("true", "1", "yes")
+
+print(f"Samples per gpu: {samples_per_gpu}")
+print(f"Workers per gpu: {workers_per_gpu}")
+print(f"Using GPUs: {gpu_ids}")
+print(f"Using {'subset' if use_subset else 'full'}")
+
+tasks = [
+    dict(num_class=1, class_names=["car"]),
+    dict(num_class=2, class_names=["truck", "construction_vehicle"]),
+    dict(num_class=2, class_names=["bus", "trailer"]),
+    dict(num_class=1, class_names=["barrier"]),
+    dict(num_class=2, class_names=["motorcycle", "bicycle"]),
+    dict(num_class=2, class_names=["pedestrian", "traffic_cone"]),
+]
+
+class_names = list(itertools.chain(*[t["class_names"] for t in tasks]))
+
+# training and testing settings
+target_assigner = dict(
+    tasks=tasks,
+)
+
+
+# model settings
+model = dict(
+    type="PointPillars",
+    pretrained=None,
+    reader=dict(
+        type="PillarFeatureNet",
+        num_filters=[64, 64],
+        num_input_features=5,
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 8),
+        pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
+    ),
+    backbone=dict(type="PointPillarsScatter", ds_factor=1),
+    neck=dict(
+        type="RPN",
+        layer_nums=[3, 5, 5],
+        ds_layer_strides=[2, 2, 2],
+        ds_num_filters=[64, 128, 256],
+        us_layer_strides=[0.5, 1, 2],
+        us_num_filters=[128, 128, 128],
+        num_input_features=64,
+        logger=logging.getLogger("RPN"),
+    ),
+    bbox_head=dict(
+        # type='RPNHead',
+        type="CenterHead",
+        in_channels=sum([128, 128, 128]),
+        tasks=tasks,
+        dataset='nuscenes',
+        weight=0.25,
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
+        common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv)
+    ),
+)
+
+assigner = dict(
+    target_assigner=target_assigner,
+    out_size_factor=get_downsample_factor(model),
+    gaussian_overlap=0.1,
+    max_objs=500,
+    min_radius=2,
+)
+
+
+train_cfg = dict(assigner=assigner)
+
+test_cfg = dict(
+    post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+    max_per_img=500,
+    nms=dict(
+        nms_pre_max_size=1000,
+        nms_post_max_size=83,
+        nms_iou_threshold=0.2,
+    ),
+    score_threshold=0.1,
+    pc_range=[-51.2, -51.2],
+    out_size_factor=get_downsample_factor(model),
+    voxel_size=[0.2, 0.2]
+)
+
+# dataset settings
+dataset_type = "NuScenesDataset"
+nsweeps = 10
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
+
+db_sampler = dict(
+    type="GT-AUG",
+    enable=False,
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
+    sample_groups=[
+        dict(car=2),
+        dict(truck=3),
+        dict(construction_vehicle=7),
+        dict(bus=4),
+        dict(trailer=6),
+        dict(barrier=2),
+        dict(motorcycle=6),
+        dict(bicycle=6),
+        dict(pedestrian=2),
+        dict(traffic_cone=2),
+    ],
+    db_prep_steps=[
+        dict(
+            filter_by_min_num_points=dict(
+                car=5,
+                truck=5,
+                bus=5,
+                trailer=5,
+                construction_vehicle=5,
+                traffic_cone=5,
+                barrier=5,
+                motorcycle=5,
+                bicycle=5,
+                pedestrian=5,
+            )
+        ),
+        dict(filter_by_difficulty=[-1],),
+    ],
+    global_random_rotation_range_per_object=[0, 0],
+    rate=1.0,
+)
+train_preprocessor = dict(
+    mode="train",
+    shuffle_points=True,
+    global_rot_noise=[-0.3925, 0.3925],
+    global_scale_noise=[0.95, 1.05],
+    db_sampler=None,
+    class_names=class_names,
+)
+
+val_preprocessor = dict(
+    mode="val",
+    shuffle_points=False,
+)
+
+voxel_generator = dict(
+    range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+    voxel_size=[0.2, 0.2, 8],
+    max_points_in_voxel=20,
+    max_voxel_num=[30000, 60000],
+)
+
+train_pipeline = [
+    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
+    dict(type="LoadPointCloudAnnotations", with_bbox=True),
+    dict(type="Preprocess", cfg=train_preprocessor),
+    dict(type="Voxelization", cfg=voxel_generator),
+    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
+    dict(type="Reformat"),
+]
+test_pipeline = [
+    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
+    dict(type="LoadPointCloudAnnotations", with_bbox=True),
+    dict(type="Preprocess", cfg=val_preprocessor),
+    dict(type="Voxelization", cfg=voxel_generator),
+    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
+    dict(type="Reformat"),
+]
+
+if use_subset:
+    train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True_subset_10.pkl"
+else:
+    train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+# USE_SUBSET must pick the `_subset_10` infos for validation too (branches were swapped).
+if use_subset:
+    val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True_subset_10.pkl"
+else:
+    val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+# The test split always points at the full validation infos, regardless of USE_SUBSET.
+test_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+
+data = dict(
+    samples_per_gpu=samples_per_gpu,
+    workers_per_gpu=workers_per_gpu,
+    train=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=train_anno,
+        ann_file=train_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=train_pipeline,
+    ),
+    val=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=val_anno,
+        test_mode=True,
+        ann_file=val_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=test_pipeline,
+    ),
+    test=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=test_anno,
+        ann_file=test_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=test_pipeline,
+    ),
+)
+
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# optimizer
+optimizer = dict(
+    type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False,
+)
+lr_config = dict(
+    type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4,
+)
+
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=5,
+    hooks=[
+        dict(type="TextLoggerHook"),
+        # dict(type='TensorboardLoggerHook')
+    ],
+)
+# yapf:enable
+# runtime settings
+total_epochs = 20
+device_ids = [int(x) for x in gpu_ids.split(",") if x.strip().isdigit()]  # any numeric id from SELECTED_GPUS, not only 0/1
+dist_params = dict(backend="nccl", init_method="env://")
+log_level = "INFO"
+work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3])
+load_from = None
+resume_from = None 
+workflow = [('train', 1)]
diff --git a/configs/nusc/pp/baseline_smoke.py b/configs/nusc/pp/baseline_smoke.py
new file mode 100644
index 00000000..a5e1c125
--- /dev/null
+++ b/configs/nusc/pp/baseline_smoke.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+
+# Keep smoke runs aligned with the real training setup by loading
+# the main PointPillars config first, then overriding only runtime knobs.
+_base_cfg = Path(__file__).with_name("baseline.py")
+exec(_base_cfg.read_text(), globals(), globals())
+
+# Run a tiny local loop quickly while preserving model/data pipeline structure.
+total_epochs = 1
+
+data["samples_per_gpu"] = 1
+data["workers_per_gpu"] = 0
+
+# Subsample infos to reduce the number of train/val samples for smoke tests.
+for split in ("train", "val", "test"):
+    if split in data and isinstance(data[split], dict):
+        data[split]["load_interval"] = 20
+
+log_config["interval"] = 1
+checkpoint_config["interval"] = 1
+work_dir = "./work_dirs/baseline_smoke/"
diff --git a/configs/nusc/pp/feature_based_kd.py b/configs/nusc/pp/feature_based_kd.py
new file mode 100644
index 00000000..ef6957cc
--- /dev/null
+++ b/configs/nusc/pp/feature_based_kd.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+# Uses the base config and overrides for student architecture and feature-based KD
+_base_cfg = Path(__file__).with_name("baseline.py")
+exec(_base_cfg.read_text(), globals(), globals())
+
+# Smaller student architecture
+model["reader"]["num_filters"] = [32, 32]
+model["backbone"]["num_input_features"] = 32
+model["neck"]["num_input_features"] = 32
+model["neck"]["ds_num_filters"] = [32, 64, 128]
+model["neck"]["us_num_filters"] = [64, 64, 64]
+model["bbox_head"]["in_channels"] = sum([64, 64, 64])
+
+# Feature KD (CenterHead shared_conv output MSE vs teacher)
+kd = dict(
+    enabled=True,
+    type="feature_mse",
+    lambda_feat=0.2,
+    teacher_config="./configs/nusc/pp/baseline.py",
+    # teacher_checkpoint="./work_dirs/baseline_smoke/latest.pth",
+    teacher_checkpoint="../Computer-Vision/work_dirs/nusc_centerpoint_pp_02voxel_two_pfn_10sweep/latest.pth",
+)
+
+work_dir = "./work_dirs/feature_based_kd/"
diff --git a/configs/nusc/pp/feature_based_kd_05.py b/configs/nusc/pp/feature_based_kd_05.py
new file mode 100644
index 00000000..875eee06
--- /dev/null
+++ b/configs/nusc/pp/feature_based_kd_05.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+# Uses the base config and overrides for student architecture and feature-based KD
+_base_cfg = Path(__file__).with_name("baseline.py")
+exec(_base_cfg.read_text(), globals(), globals())
+
+# Smaller student architecture
+model["reader"]["num_filters"] = [32, 32]
+model["backbone"]["num_input_features"] = 32
+model["neck"]["num_input_features"] = 32
+model["neck"]["ds_num_filters"] = [32, 64, 128]
+model["neck"]["us_num_filters"] = [64, 64, 64]
+model["bbox_head"]["in_channels"] = sum([64, 64, 64])
+
+# Feature KD (CenterHead shared_conv output MSE vs teacher)
+kd = dict(
+    enabled=True,
+    type="feature_mse",
+    lambda_feat=0.5,
+    teacher_config="./configs/nusc/pp/baseline.py",
+    # teacher_checkpoint="./work_dirs/baseline_smoke/latest.pth",
+    teacher_checkpoint="../Computer-Vision/work_dirs/nusc_centerpoint_pp_02voxel_two_pfn_10sweep/latest.pth",
+)
+# Dedicated output dir for lambda_feat=0.5 so it does not share work_dir with feature_based_kd.py.
+work_dir = "./work_dirs/feature_based_kd_05/"
diff --git a/configs/nusc/pp/feature_based_kd_08.py b/configs/nusc/pp/feature_based_kd_08.py
new file mode 100644
index 00000000..4a07591b
--- /dev/null
+++ b/configs/nusc/pp/feature_based_kd_08.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+# Uses the base config and overrides for student architecture and feature-based KD
+_base_cfg = Path(__file__).with_name("baseline.py")
+exec(_base_cfg.read_text(), globals(), globals())
+
+# Smaller student architecture
+model["reader"]["num_filters"] = [32, 32]
+model["backbone"]["num_input_features"] = 32
+model["neck"]["num_input_features"] = 32
+model["neck"]["ds_num_filters"] = [32, 64, 128]
+model["neck"]["us_num_filters"] = [64, 64, 64]
+model["bbox_head"]["in_channels"] = sum([64, 64, 64])
+
+# Feature KD (CenterHead shared_conv output MSE vs teacher)
+kd = dict(
+    enabled=True,
+    type="feature_mse",
+    lambda_feat=0.8,
+    teacher_config="./configs/nusc/pp/baseline.py",
+    # teacher_checkpoint="./work_dirs/baseline_smoke/latest.pth",
+    teacher_checkpoint="../Computer-Vision/work_dirs/nusc_centerpoint_pp_02voxel_two_pfn_10sweep/latest.pth",
+)
+# Dedicated output dir for lambda_feat=0.8 so it does not share work_dir with feature_based_kd.py.
+work_dir = "./work_dirs/feature_based_kd_08/"
diff --git a/configs/nusc/pp/feature_based_kd_smoke.py b/configs/nusc/pp/feature_based_kd_smoke.py
new file mode 100644
index 00000000..b8264980
--- /dev/null
+++ b/configs/nusc/pp/feature_based_kd_smoke.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+_base_cfg = Path(__file__).with_name("feature_based_kd.py")
+exec(_base_cfg.read_text(), globals(), globals())
+
+total_epochs = 1  # smoke run: one epoch instead of the base config's value
+
+data["samples_per_gpu"] = 1  # smallest batch per GPU
+data["workers_per_gpu"] = 0  # NOTE(review): presumably disables loader worker processes — confirm
+
+for split in ("train", "val", "test"):  # only touch splits that exist and are dict-configured
+    if split in data and isinstance(data[split], dict):
+        data[split]["load_interval"] = 20  # presumably keeps every 20th info entry — confirm in the dataset class
+
+log_config["interval"] = 1  # log at every interval step
+checkpoint_config["interval"] = 1  # same value as the base config; kept explicit
+work_dir = "./work_dirs/feature_based_kd_smoke/"  # separate output dir from the full KD run
diff --git a/configs/nusc/pp/resnet.py b/configs/nusc/pp/resnet.py
new file mode 100644
index 00000000..10af1194
--- /dev/null
+++ b/configs/nusc/pp/resnet.py
@@ -0,0 +1,248 @@
+import os
+import itertools
+import logging
+from det3d.utils.config_tool import get_downsample_factor
+import os
+from dotenv import load_dotenv
+
+print(f"Found ENV: {load_dotenv()}")
+samples_per_gpu = int(os.getenv("GPU_SAMPLES", 2))
+workers_per_gpu = int(os.getenv("GPU_WORKERS", 4))
+gpu_ids = os.getenv("SELECTED_GPUS", "0")
+use_subset = os.getenv("USE_SUBSET", "True").lower() in ("true", "1", "yes")
+
+print(f"Samples per gpu: {samples_per_gpu}")
+print(f"Workers per gpu: {workers_per_gpu}")
+print(f"Using GPUs: {gpu_ids}")
+print(f"Using {'subset' if use_subset else 'full'}")
+
+tasks = [
+    dict(num_class=1, class_names=["car"]),
+    dict(num_class=2, class_names=["truck", "construction_vehicle"]),
+    dict(num_class=2, class_names=["bus", "trailer"]),
+    dict(num_class=1, class_names=["barrier"]),
+    dict(num_class=2, class_names=["motorcycle", "bicycle"]),
+    dict(num_class=2, class_names=["pedestrian", "traffic_cone"]),
+]
+
+class_names = list(itertools.chain(*[t["class_names"] for t in tasks]))
+
+# training and testing settings
+target_assigner = dict(
+    tasks=tasks,
+)
+
+
+# model settings
+model = dict(
+    type="PointPillars",
+    pretrained=None,
+    reader=dict(
+        type="PillarFeatureNet",
+        num_filters=[64, 64],
+        num_input_features=5,
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 8),
+        pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
+    ),
+    backbone=dict(type="PointPillarsScatter", ds_factor=1),
+    neck=dict(
+        type="ResNetNeck",
+        layer_nums=[2, 2, 2],
+        ds_layer_strides=[2, 2, 2],
+        ds_num_filters=[64, 128, 256],
+        us_layer_strides=[0.5, 1, 2],
+        us_num_filters=[128, 128, 128],
+        num_input_features=64,
+        logger=logging.getLogger("ResNetNeck"),
+    ),
+    bbox_head=dict(
+        type="CenterHead",
+        in_channels=sum([128, 128, 128]),
+        tasks=tasks,
+        dataset='nuscenes',
+        weight=0.25,
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
+        common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv)
+    ),
+)
+
+assigner = dict(
+    target_assigner=target_assigner,
+    out_size_factor=get_downsample_factor(model),
+    gaussian_overlap=0.1,
+    max_objs=500,
+    min_radius=2,
+)
+
+
+train_cfg = dict(assigner=assigner)
+
+test_cfg = dict(
+    post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+    max_per_img=500,
+    nms=dict(
+        nms_pre_max_size=1000,
+        nms_post_max_size=83,
+        nms_iou_threshold=0.2,
+    ),
+    score_threshold=0.1,
+    pc_range=[-51.2, -51.2],
+    out_size_factor=get_downsample_factor(model),
+    voxel_size=[0.2, 0.2]
+)
+
+# dataset settings
+dataset_type = "NuScenesDataset"
+nsweeps = 10
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
+
+db_sampler = dict(
+    type="GT-AUG",
+    enable=False,
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
+    sample_groups=[
+        dict(car=2),
+        dict(truck=3),
+        dict(construction_vehicle=7),
+        dict(bus=4),
+        dict(trailer=6),
+        dict(barrier=2),
+        dict(motorcycle=6),
+        dict(bicycle=6),
+        dict(pedestrian=2),
+        dict(traffic_cone=2),
+    ],
+    db_prep_steps=[
+        dict(
+            filter_by_min_num_points=dict(
+                car=5,
+                truck=5,
+                bus=5,
+                trailer=5,
+                construction_vehicle=5,
+                traffic_cone=5,
+                barrier=5,
+                motorcycle=5,
+                bicycle=5,
+                pedestrian=5,
+            )
+        ),
+        dict(filter_by_difficulty=[-1],),
+    ],
+    global_random_rotation_range_per_object=[0, 0],
+    rate=1.0,
+)
+train_preprocessor = dict(
+    mode="train",
+    shuffle_points=True,
+    global_rot_noise=[-0.3925, 0.3925],
+    global_scale_noise=[0.95, 1.05],
+    db_sampler=None,
+    class_names=class_names,
+)
+
+val_preprocessor = dict(
+    mode="val",
+    shuffle_points=False,
+)
+
+voxel_generator = dict(
+    range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+    voxel_size=[0.2, 0.2, 8],
+    max_points_in_voxel=20,
+    max_voxel_num=[30000, 60000],
+)
+
+train_pipeline = [
+    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
+    dict(type="LoadPointCloudAnnotations", with_bbox=True),
+    dict(type="Preprocess", cfg=train_preprocessor),
+    dict(type="Voxelization", cfg=voxel_generator),
+    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
+    dict(type="Reformat"),
+]
+test_pipeline = [
+    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
+    dict(type="LoadPointCloudAnnotations", with_bbox=True),
+    dict(type="Preprocess", cfg=val_preprocessor),
+    dict(type="Voxelization", cfg=voxel_generator),
+    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
+    dict(type="Reformat"),
+]
+
+if use_subset:
+    train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True_subset_10.pkl"
+else:
+    train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+# USE_SUBSET must pick the `_subset_10` infos for validation too (branches were swapped).
+if use_subset:
+    val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True_subset_10.pkl"
+else:
+    val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+# The test split always points at the full validation infos, regardless of USE_SUBSET.
+test_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+
+
+data = dict(
+    samples_per_gpu=samples_per_gpu,
+    workers_per_gpu=workers_per_gpu,
+    train=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=train_anno,
+        ann_file=train_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=train_pipeline,
+    ),
+    val=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=val_anno,
+        test_mode=True,
+        ann_file=val_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=test_pipeline,
+    ),
+    test=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=test_anno,
+        ann_file=test_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=test_pipeline,
+    ),
+)
+
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# optimizer
+optimizer = dict(
+    type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False,
+)
+lr_config = dict(
+    type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4,
+)
+
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=5,
+    hooks=[
+        dict(type="TextLoggerHook"),
+        # dict(type='TensorboardLoggerHook')
+    ],
+)
+# yapf:enable
+# runtime settings
+total_epochs = 20
+device_ids = [int(x) for x in gpu_ids.split(",") if x.strip().isdigit()]  # any numeric id from SELECTED_GPUS, not only 0/1
+dist_params = dict(backend="nccl", init_method="env://")
+log_level = "INFO"
+work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3])
+load_from = None
+resume_from = None 
+workflow = [('train', 1)]
diff --git a/configs/nusc/pp/resnet_fastpillars.py b/configs/nusc/pp/resnet_fastpillars.py
new file mode 100644
index 00000000..f6dfb506
--- /dev/null
+++ b/configs/nusc/pp/resnet_fastpillars.py
@@ -0,0 +1,248 @@
+import os
+import itertools
+import logging
+from det3d.utils.config_tool import get_downsample_factor
+import os
+from dotenv import load_dotenv
+
+print(f"Found ENV: {load_dotenv()}")
+samples_per_gpu = int(os.getenv("GPU_SAMPLES", 2))
+workers_per_gpu = int(os.getenv("GPU_WORKERS", 4))
+gpu_ids = os.getenv("SELECTED_GPUS", "0")
+use_subset = os.getenv("USE_SUBSET", "True").lower() in ("true", "1", "yes")
+
+print(f"Samples per gpu: {samples_per_gpu}")
+print(f"Workers per gpu: {workers_per_gpu}")
+print(f"Using GPUs: {gpu_ids}")
+print(f"Using {'subset' if use_subset else 'full'}")
+
+tasks = [
+    dict(num_class=1, class_names=["car"]),
+    dict(num_class=2, class_names=["truck", "construction_vehicle"]),
+    dict(num_class=2, class_names=["bus", "trailer"]),
+    dict(num_class=1, class_names=["barrier"]),
+    dict(num_class=2, class_names=["motorcycle", "bicycle"]),
+    dict(num_class=2, class_names=["pedestrian", "traffic_cone"]),
+]
+
+class_names = list(itertools.chain(*[t["class_names"] for t in tasks]))
+
+# training and testing settings
+target_assigner = dict(
+    tasks=tasks,
+)
+
+
+# model settings
+model = dict(
+    type="PointPillars",
+    pretrained=None,
+    reader=dict(
+        type="FastPillarFeatureNet",
+        num_filters=[64],
+        num_input_features=5,
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 8),
+        pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
+    ),
+    backbone=dict(type="PointPillarsScatter", ds_factor=1),
+    neck=dict(
+        type="ResNetNeck",
+        layer_nums=[2, 2, 2],
+        ds_layer_strides=[2, 2, 2],
+        ds_num_filters=[64, 128, 256],
+        us_layer_strides=[0.5, 1, 2],
+        us_num_filters=[128, 128, 128],
+        num_input_features=64,
+        logger=logging.getLogger("ResNetNeck"),
+    ),
+    bbox_head=dict(
+        type="CenterHead",
+        in_channels=sum([128, 128, 128]),
+        tasks=tasks,
+        dataset='nuscenes',
+        weight=0.25,
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
+        common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv)
+    ),
+)
+
+assigner = dict(
+    target_assigner=target_assigner,
+    out_size_factor=get_downsample_factor(model),
+    gaussian_overlap=0.1,
+    max_objs=500,
+    min_radius=2,
+)
+
+
+train_cfg = dict(assigner=assigner)
+
+test_cfg = dict(
+    post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+    max_per_img=500,
+    nms=dict(
+        nms_pre_max_size=1000,
+        nms_post_max_size=83,
+        nms_iou_threshold=0.2,
+    ),
+    score_threshold=0.1,
+    pc_range=[-51.2, -51.2],
+    out_size_factor=get_downsample_factor(model),
+    voxel_size=[0.2, 0.2]
+)
+
+# dataset settings
+dataset_type = "NuScenesDataset"
+nsweeps = 10
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
+
+db_sampler = dict(
+    type="GT-AUG",
+    enable=False,
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
+    sample_groups=[
+        dict(car=2),
+        dict(truck=3),
+        dict(construction_vehicle=7),
+        dict(bus=4),
+        dict(trailer=6),
+        dict(barrier=2),
+        dict(motorcycle=6),
+        dict(bicycle=6),
+        dict(pedestrian=2),
+        dict(traffic_cone=2),
+    ],
+    db_prep_steps=[
+        dict(
+            filter_by_min_num_points=dict(
+                car=5,
+                truck=5,
+                bus=5,
+                trailer=5,
+                construction_vehicle=5,
+                traffic_cone=5,
+                barrier=5,
+                motorcycle=5,
+                bicycle=5,
+                pedestrian=5,
+            )
+        ),
+        dict(filter_by_difficulty=[-1],),
+    ],
+    global_random_rotation_range_per_object=[0, 0],
+    rate=1.0,
+)
+train_preprocessor = dict(
+    mode="train",
+    shuffle_points=True,
+    global_rot_noise=[-0.3925, 0.3925],
+    global_scale_noise=[0.95, 1.05],
+    db_sampler=None,
+    class_names=class_names,
+)
+
+val_preprocessor = dict(
+    mode="val",
+    shuffle_points=False,
+)
+
+voxel_generator = dict(
+    range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+    voxel_size=[0.2, 0.2, 8],
+    max_points_in_voxel=20,
+    max_voxel_num=[30000, 60000],
+)
+
+train_pipeline = [
+    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
+    dict(type="LoadPointCloudAnnotations", with_bbox=True),
+    dict(type="Preprocess", cfg=train_preprocessor),
+    dict(type="Voxelization", cfg=voxel_generator),
+    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
+    dict(type="Reformat"),
+]
+test_pipeline = [
+    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
+    dict(type="LoadPointCloudAnnotations", with_bbox=True),
+    dict(type="Preprocess", cfg=val_preprocessor),
+    dict(type="Voxelization", cfg=voxel_generator),
+    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
+    dict(type="Reformat"),
+]
+
+if use_subset:  # USE_SUBSET=true: train on the *_subset_10.pkl infos
+    train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True_subset_10.pkl"
+else:
+    train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+
+if use_subset:  # validation uses the same subset/full switch as training
+    val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True_subset_10.pkl"
+else:
+    val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+
+test_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+
+
+data = dict(
+    samples_per_gpu=samples_per_gpu,
+    workers_per_gpu=workers_per_gpu,
+    train=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=train_anno,
+        ann_file=train_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=train_pipeline,
+    ),
+    val=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=val_anno,
+        test_mode=True,
+        ann_file=val_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=test_pipeline,
+    ),
+    test=dict(
+        type=dataset_type,
+        root_path=data_root,
+        info_path=test_anno,
+        ann_file=test_anno,
+        nsweeps=nsweeps,
+        class_names=class_names,
+        pipeline=test_pipeline,
+    ),
+)
+
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# optimizer
+optimizer = dict(
+    type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False,
+)
+lr_config = dict(
+    type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4,
+)
+
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=5,
+    hooks=[
+        dict(type="TextLoggerHook"),
+        # dict(type='TensorboardLoggerHook')
+    ],
+)
+# yapf:enable
+# runtime settings
+total_epochs = 20
+device_ids = [int(x) for x in gpu_ids.split(",") if x.strip() in ["0", "1"]]
+dist_params = dict(backend="nccl", init_method="env://")
+log_level = "INFO"
+work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3])
+load_from = None
+resume_from = None 
+workflow = [('train', 1)]
diff --git a/configs/nusc/pp/response_based_kd.py b/configs/nusc/pp/response_based_kd.py
new file mode 100644
index 00000000..e1af4dfb
--- /dev/null
+++ b/configs/nusc/pp/response_based_kd.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+# Student config for response-based KD: runs baseline.py in this namespace, then overrides the model widths and adds `kd`.
+_base_cfg = Path(__file__).with_name("baseline.py")  # sibling file in this config's directory
+exec(_base_cfg.read_text(), globals(), globals())  # names defined by the base become this module's globals
+
+# Smaller student architecture
+model["reader"]["num_filters"] = [32, 32]
+model["backbone"]["num_input_features"] = 32
+model["neck"]["num_input_features"] = 32
+model["neck"]["ds_num_filters"] = [32, 64, 128]
+model["neck"]["us_num_filters"] = [64, 64, 64]
+model["bbox_head"]["in_channels"] = sum([64, 64, 64])  # equals the sum of us_num_filters set above
+
+# Heatmap KD teacher settings
+kd = dict(
+    enabled=True,
+    type="heatmap_mse",  # KD variant selector; the head also recognises "feature_mse"
+    lambda_kd=0.2,  # weight applied to the heatmap KD term
+    teacher_config="./configs/nusc/pp/baseline.py",  # NOTE(review): relative path, presumably resolved from the launch directory - confirm
+    # teacher_checkpoint="./work_dirs/baseline_smoke/latest.pth",
+    teacher_checkpoint="../Computer-Vision/work_dirs/nusc_centerpoint_pp_02voxel_two_pfn_10sweep/latest.pth",
+)
+
+work_dir = "./work_dirs/response_based_kd/"
diff --git a/configs/nusc/pp/response_based_kd_05.py b/configs/nusc/pp/response_based_kd_05.py
new file mode 100644
index 00000000..5adafb3c
--- /dev/null
+++ b/configs/nusc/pp/response_based_kd_05.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+# Student config for response-based KD (lambda_kd=0.5): runs baseline.py in this namespace, then overrides the model widths and adds `kd`.
+_base_cfg = Path(__file__).with_name("baseline.py")  # sibling file in this config's directory
+exec(_base_cfg.read_text(), globals(), globals())  # names defined by the base become this module's globals
+
+# Smaller student architecture
+model["reader"]["num_filters"] = [32, 32]
+model["backbone"]["num_input_features"] = 32
+model["neck"]["num_input_features"] = 32
+model["neck"]["ds_num_filters"] = [32, 64, 128]
+model["neck"]["us_num_filters"] = [64, 64, 64]
+model["bbox_head"]["in_channels"] = sum([64, 64, 64])  # equals the sum of us_num_filters set above
+
+# Heatmap KD teacher settings
+kd = dict(
+    enabled=True,
+    type="heatmap_mse",  # KD variant selector; the head also recognises "feature_mse"
+    lambda_kd=0.5,  # weight applied to the heatmap KD term
+    teacher_config="./configs/nusc/pp/baseline.py",  # NOTE(review): relative path, presumably resolved from the launch directory - confirm
+    # teacher_checkpoint="./work_dirs/baseline_smoke/latest.pth",
+    teacher_checkpoint="../Computer-Vision/work_dirs/nusc_centerpoint_pp_02voxel_two_pfn_10sweep/latest.pth",
+)
+
+work_dir = "./work_dirs/response_based_kd_05/"  # own directory so this run does not overwrite response_based_kd outputs
diff --git a/configs/nusc/pp/response_based_kd_08.py b/configs/nusc/pp/response_based_kd_08.py
new file mode 100644
index 00000000..467efd2f
--- /dev/null
+++ b/configs/nusc/pp/response_based_kd_08.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+# Student config for response-based KD (lambda_kd=0.8): runs baseline.py in this namespace, then overrides the model widths and adds `kd`.
+_base_cfg = Path(__file__).with_name("baseline.py")  # sibling file in this config's directory
+exec(_base_cfg.read_text(), globals(), globals())  # names defined by the base become this module's globals
+
+# Smaller student architecture
+model["reader"]["num_filters"] = [32, 32]
+model["backbone"]["num_input_features"] = 32
+model["neck"]["num_input_features"] = 32
+model["neck"]["ds_num_filters"] = [32, 64, 128]
+model["neck"]["us_num_filters"] = [64, 64, 64]
+model["bbox_head"]["in_channels"] = sum([64, 64, 64])  # equals the sum of us_num_filters set above
+
+# Heatmap KD teacher settings
+kd = dict(
+    enabled=True,
+    type="heatmap_mse",  # KD variant selector; the head also recognises "feature_mse"
+    lambda_kd=0.8,  # weight applied to the heatmap KD term
+    teacher_config="./configs/nusc/pp/baseline.py",  # NOTE(review): relative path, presumably resolved from the launch directory - confirm
+    # teacher_checkpoint="./work_dirs/baseline_smoke/latest.pth",
+    teacher_checkpoint="../Computer-Vision/work_dirs/nusc_centerpoint_pp_02voxel_two_pfn_10sweep/latest.pth",
+)
+
+work_dir = "./work_dirs/response_based_kd_08/"  # own directory so this run does not overwrite response_based_kd outputs
diff --git a/configs/nusc/pp/response_based_kd_resnet.py b/configs/nusc/pp/response_based_kd_resnet.py
new file mode 100644
index 00000000..a88f06c1
--- /dev/null
+++ b/configs/nusc/pp/response_based_kd_resnet.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+# Response-based KD: teacher = PointPillars (PFN) + ResNetNeck (matches cluster
+# nusc_centerpoint_pp_02voxel_two_pfn_10sweep_resnet). Student = narrower PFN + ResNetNeck.
+_base_cfg = Path(__file__).with_name("resnet.py")  # sibling file in this config's directory
+exec(_base_cfg.read_text(), globals(), globals())  # names defined by the base become this module's globals
+
+model["reader"]["num_filters"] = [32, 32]
+model["backbone"]["num_input_features"] = 32
+model["neck"]["num_input_features"] = 32
+model["neck"]["ds_num_filters"] = [32, 64, 128]
+model["neck"]["us_num_filters"] = [64, 64, 64]
+model["bbox_head"]["in_channels"] = sum([64, 64, 64])  # equals the sum of us_num_filters set above
+
+kd = dict(
+    enabled=True,
+    type="heatmap_mse",  # KD variant selector; the head also recognises "feature_mse"
+    lambda_kd=0.2,  # weight applied to the heatmap KD term
+    teacher_config="./configs/nusc/pp/resnet.py",  # NOTE(review): relative path, presumably resolved from the launch directory - confirm
+    teacher_checkpoint="../Computer-Vision/work_dirs/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_resnet/latest.pth",
+)
+
+work_dir = "./work_dirs/response_based_kd_resnet/"
diff --git a/configs/nusc/pp/response_based_kd_resnet_smoke.py b/configs/nusc/pp/response_based_kd_resnet_smoke.py
new file mode 100644
index 00000000..b0b4cbe6
--- /dev/null
+++ b/configs/nusc/pp/response_based_kd_resnet_smoke.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+_base_cfg = Path(__file__).with_name("response_based_kd_resnet.py")  # smoke variant: reuse the KD config, then shrink the run
+exec(_base_cfg.read_text(), globals(), globals())  # __file__ stays this file's path, so nested with_name() lookups use the same directory
+
+total_epochs = 1  # single epoch
+
+data["samples_per_gpu"] = 1
+data["workers_per_gpu"] = 0
+
+for split in ("train", "val", "test"):  # only splits that exist as dict entries are touched
+    if split in data and isinstance(data[split], dict):
+        data[split]["load_interval"] = 20  # NOTE(review): presumably keeps every 20th sample - confirm against the dataset class
+
+log_config["interval"] = 1  # override the logging interval inherited from the base config
+checkpoint_config["interval"] = 1
+work_dir = "./work_dirs/response_based_kd_resnet_smoke/"
diff --git a/configs/nusc/pp/response_based_kd_smoke.py b/configs/nusc/pp/response_based_kd_smoke.py
new file mode 100644
index 00000000..4e83aeb1
--- /dev/null
+++ b/configs/nusc/pp/response_based_kd_smoke.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+_base_cfg = Path(__file__).with_name("response_based_kd.py")  # smoke variant: reuse the KD config, then shrink the run
+exec(_base_cfg.read_text(), globals(), globals())  # __file__ stays this file's path, so nested with_name() lookups use the same directory
+
+total_epochs = 1  # single epoch
+
+data["samples_per_gpu"] = 1
+data["workers_per_gpu"] = 0
+
+for split in ("train", "val", "test"):  # only splits that exist as dict entries are touched
+    if split in data and isinstance(data[split], dict):
+        data[split]["load_interval"] = 20  # NOTE(review): presumably keeps every 20th sample - confirm against the dataset class
+
+log_config["interval"] = 1  # override the logging interval inherited from the base config
+checkpoint_config["interval"] = 1
+work_dir = "./work_dirs/response_based_kd_smoke/"
diff --git a/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_circular_nms.py b/configs/nusc/pp/unused/circular_nms.py
similarity index 94%
rename from configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_circular_nms.py
rename to configs/nusc/pp/unused/circular_nms.py
index 71c22461..02042783 100644
--- a/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_circular_nms.py
+++ b/configs/nusc/pp/unused/circular_nms.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -85,12 +86,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -161,13 +162,13 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
-    samples_per_gpu=4,
-    workers_per_gpu=8,
+    samples_per_gpu=1,
+    workers_per_gpu=2,
     train=dict(
         type=dataset_type,
         root_path=data_root,
diff --git a/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py b/configs/nusc/pp/unused/demo.py
similarity index 95%
rename from configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py
rename to configs/nusc/pp/unused/demo.py
index 1a89adf3..6ffefdd3 100644
--- a/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py
+++ b/configs/nusc/pp/unused/demo.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -83,12 +84,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -159,8 +160,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = "demo/nuScenes/demo_infos.pkl"
+val_anno = "demo/nuScenes/demo_infos.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py
index e51f4c28..efb537f2 100644
--- a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py
+++ b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -85,12 +86,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -162,8 +163,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn_flip.py b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn_flip.py
index b5e729e5..80c82f3f 100644
--- a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn_flip.py
+++ b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn_flip.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -87,12 +88,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -166,9 +167,9 @@
     dict(type="Reformat", double_flip=DOUBLE_FLIP),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
-test_anno = "data/nuScenes/infos_test_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+test_anno = f"{data_root}/infos_test_10sweeps_withvelo_filter_True.pkl"
 
 data = dict(
     samples_per_gpu=4,
diff --git a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py
index a4b8db5a..0affb717 100644
--- a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py
+++ b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -85,12 +86,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -163,8 +164,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py
index 97858927..bde36417 100644
--- a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py
+++ b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -87,12 +88,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -167,9 +168,9 @@
     dict(type="Reformat", double_flip=DOUBLE_FLIP),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
-test_anno = "data/nuScenes/infos_test_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
+test_anno = f"{data_root}/infos_test_10sweeps_withvelo_filter_True.pkl"
 
 data = dict(
     samples_per_gpu=4,
diff --git a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py
index c3c499cd..0eb35975 100644
--- a/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py
+++ b/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py
@@ -1,3 +1,4 @@
+import os
 import itertools
 import logging
 
@@ -81,12 +82,12 @@
 # dataset settings
 dataset_type = "NuScenesDataset"
 nsweeps = 10
-data_root = "data/nuScenes"
+data_root = os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes")
 
 db_sampler = dict(
     type="GT-AUG",
     enable=False,
-    db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
+    db_info_path=f"{data_root}/dbinfos_train_10sweeps_withvelo.pkl",
     sample_groups=[
         dict(car=2),
         dict(truck=3),
@@ -157,8 +158,8 @@
     dict(type="Reformat"),
 ]
 
-train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
-val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
+train_anno = f"{data_root}/infos_train_10sweeps_withvelo_filter_True.pkl"
+val_anno = f"{data_root}/infos_val_10sweeps_withvelo_filter_True.pkl"
 test_anno = None
 
 data = dict(
diff --git a/create_subset.py b/create_subset.py
new file mode 100644
index 00000000..ff52a428
--- /dev/null
+++ b/create_subset.py
@@ -0,0 +1,62 @@
+"""Create 1-in-N subsets of the nuScenes info pickles for quick experiments."""
+import os
+import pickle
+
+# Working directory at import time; kept as a module-level name for compatibility.
+base_path = os.getcwd()
+
+# Directory holding the info pickles. Honours NUSCENES_DATA_ROOT like the
+# training configs do, falling back to <cwd>/data/nuScenes.
+default_data_root = os.environ.get(
+    "NUSCENES_DATA_ROOT", os.path.join(base_path, 'data/nuScenes')
+)
+
+
+def create_subset(filename, ratio=10, data_root=None):
+    """Write every ``ratio``-th entry of an info pickle to a sibling file.
+
+    The output is ``<stem>_subset_<ratio><ext>`` next to the input, so the
+    default ``ratio=10`` produces the ``*_subset_10.pkl`` names the configs read.
+
+    Args:
+        filename: Name of the pickle inside ``data_root``.
+        ratio: Keep one entry out of every ``ratio``; must be >= 1.
+        data_root: Directory containing ``filename``; defaults to
+            ``default_data_root``.
+
+    Raises:
+        ValueError: If ``ratio`` is smaller than 1.
+    """
+    if ratio < 1:
+        raise ValueError(f"ratio must be >= 1, got {ratio}")
+
+    root = default_data_root if data_root is None else data_root
+    input_path = os.path.join(root, filename)
+    # splitext touches only the final extension; str.replace would leave a name
+    # without '.pkl' unchanged, making the output overwrite the input.
+    stem, ext = os.path.splitext(input_path)
+    output_path = f"{stem}_subset_{ratio}{ext}"
+
+    if not os.path.exists(input_path):
+        print(f"ERROR: Cannot find {input_path}")
+        return
+
+    print(f"Processing {filename}...")
+    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
+    with open(input_path, 'rb') as f:
+        data = pickle.load(f)
+
+    # Slice the data: either a dict wrapping an 'infos' list or a bare sequence.
+    if isinstance(data, dict) and 'infos' in data:
+        data['infos'] = data['infos'][::ratio]
+    else:
+        data = data[::ratio]
+
+    with open(output_path, 'wb') as f:
+        pickle.dump(data, f)
+
+    print(f"SUCCESS: Saved to {output_path}")
+
+
+create_subset('infos_train_10sweeps_withvelo_filter_True.pkl')
+create_subset('infos_val_10sweeps_withvelo_filter_True.pkl')
diff --git a/cv29f26@gpucluster.st.lab.au.dk b/cv29f26@gpucluster.st.lab.au.dk
new file mode 100644
index 00000000..055e52da
--- /dev/null
+++ b/cv29f26@gpucluster.st.lab.au.dk
@@ -0,0 +1,289 @@
+name: centerpoint
+channels:
+  - defaults
+  - nvidia
+  - nvidia/label/cuda-11.3.0
+  - pytorch
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - binutils=2.40=h1680402_0
+  - binutils_impl_linux-64=2.40=h5293946_0
+  - binutils_linux-64=2.40.0=h06a4308_3
+  - blas=1.0=mkl
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2025.12.2=h06a4308_0
+  - certifi=2022.12.7=py37h06a4308_0
+  - cuda-cccl=12.4.127=0
+  - cuda-command-line-tools=12.4.1=0
+  - cuda-cudart=12.4.127=0
+  - cuda-cudart-dev=12.4.127=0
+  - cuda-cudart-static=12.4.127=0
+  - cuda-cuobjdump=12.4.127=0
+  - cuda-cupti=12.4.127=0
+  - cuda-cupti-static=12.4.127=0
+  - cuda-cuxxfilt=12.4.127=0
+  - cuda-documentation=12.4.127=0
+  - cuda-driver-dev=12.4.127=0
+  - cuda-gdb=12.4.127=0
+  - cuda-libraries=12.4.1=h06a4308_1
+  - cuda-libraries-dev=12.4.1=h06a4308_1
+  - cuda-libraries-static=12.4.1=0
+  - cuda-nsight=12.4.127=0
+  - cuda-nvcc=11.3.58=h2467b9f_0
+  - cuda-nvdisasm=12.4.127=0
+  - cuda-nvml-dev=12.4.127=0
+  - cuda-nvprof=12.4.127=0
+  - cuda-nvprune=12.4.127=0
+  - cuda-nvrtc=12.4.127=0
+  - cuda-nvrtc-dev=12.4.127=0
+  - cuda-nvrtc-static=12.4.127=0
+  - cuda-nvtx=12.4.127=0
+  - cuda-nvvp=12.4.127=0
+  - cuda-opencl=12.4.127=0
+  - cuda-opencl-dev=12.4.127=0
+  - cuda-profiler-api=12.4.127=0
+  - cuda-sanitizer-api=12.4.127=0
+  - cuda-tools=12.4.1=0
+  - cuda-visual-tools=12.4.1=h06a4308_1
+  - cudatoolkit=11.3.1=ha36c431_9
+  - ffmpeg=4.2.2=h20bf706_0
+  - flit-core=3.6.0=pyhd3eb1b0_0
+  - freetype=2.14.1=hf5b9546_0
+  - gcc_impl_linux-64=11.2.0=h1234567_1
+  - gcc_linux-64=11.2.0=h931ca3c_3
+  - gds-tools=1.9.1.3=0
+  - giflib=5.2.2=h5eee18b_0
+  - gmp=6.3.0=h6a678d5_0
+  - gnutls=3.6.15=he1e5248_0
+  - gxx_impl_linux-64=11.2.0=h1234567_1
+  - gxx_linux-64=11.2.0=h06a4308_3
+  - intel-openmp=2021.4.0=h06a4308_3561
+  - jpeg=9f=h5ce9db8_0
+  - kernel-headers_linux-64=4.18.0=h3108a97_1
+  - lame=3.100=hbd0596d_1
+  - lcms2=2.16=hb9589c4_0
+  - ld_impl_linux-64=2.40=h12ee557_0
+  - lerc=4.0.0=h6a678d5_0
+  - libcublas=12.4.5.8=0
+  - libcublas-dev=12.4.5.8=0
+  - libcublas-static=12.4.5.8=0
+  - libcufft=11.2.1.3=0
+  - libcufft-dev=11.2.1.3=0
+  - libcufft-static=11.2.1.3=0
+  - libcufile=1.9.1.3=0
+  - libcufile-dev=1.9.1.3=0
+  - libcufile-static=1.9.1.3=0
+  - libcurand=10.3.5.147=0
+  - libcurand-dev=10.3.5.147=0
+  - libcurand-static=10.3.5.147=0
+  - libcusolver=11.6.1.9=0
+  - libcusolver-dev=11.6.1.9=0
+  - libcusolver-static=11.6.1.9=0
+  - libcusparse=12.3.1.170=0
+  - libcusparse-dev=12.3.1.170=0
+  - libcusparse-static=12.3.1.170=0
+  - libdeflate=1.22=h5eee18b_0
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc=15.2.0=h69a1729_7
+  - libgcc-devel_linux-64=11.2.0=h1234567_1
+  - libgcc-ng=15.2.0=h166f726_7
+  - libgomp=15.2.0=h4751f2c_7
+  - libidn2=2.3.4=h5eee18b_0
+  - libnpp=12.2.5.30=0
+  - libnpp-dev=12.2.5.30=0
+  - libnpp-static=12.2.5.30=0
+  - libnvfatbin=12.4.127=0
+  - libnvfatbin-dev=12.4.127=0
+  - libnvjitlink=12.4.127=0
+  - libnvjitlink-dev=12.4.127=0
+  - libnvjpeg=12.3.1.117=0
+  - libnvjpeg-dev=12.3.1.117=0
+  - libnvjpeg-static=12.3.1.117=0
+  - libopus=1.6.1=h9f10d21_0
+  - libpng=1.6.55=h22898a0_0
+  - libstdcxx=15.2.0=h39759b7_7
+  - libstdcxx-devel_linux-64=11.2.0=h1234567_1
+  - libstdcxx-ng=15.2.0=hc03a8fd_7
+  - libtasn1=4.21.0=h27ab2c4_0
+  - libtiff=4.5.1=hffd6297_1
+  - libunistring=0.9.10=h27cfd23_0
+  - libuv=1.52.0=heb5a705_0
+  - libvpx=1.7.0=h439df22_0
+  - libwebp=1.2.4=h11a3e52_1
+  - libwebp-base=1.2.4=h5eee18b_1
+  - libxcb=1.17.0=h9b100fa_0
+  - libzlib=1.3.1=hb25bd0a_0
+  - lz4-c=1.9.4=h6a678d5_1
+  - mkl=2021.4.0=h06a4308_640
+  - mkl-service=2.4.0=py37h7f8727e_0
+  - mkl_fft=1.3.1=py37hd3c417c_0
+  - mkl_random=1.2.2=py37h51133e4_0
+  - ncurses=6.5=h7934f7d_0
+  - nettle=3.7.3=hbbd107a_1
+  - nsight-compute=2024.1.1.4=0
+  - numpy=1.21.5=py37h6c91a56_3
+  - numpy-base=1.21.5=py37ha15fc14_3
+  - openh264=2.1.1=h4ff587b_0
+  - openssl=1.1.1w=h7f8727e_0
+  - pillow=9.4.0=py37h6a678d5_0
+  - pip=22.3.1=py37h06a4308_0
+  - pthread-stubs=0.3=h0ce48e5_1
+  - python=3.7.16=h7a1cb2a_0
+  - pytorch=1.10.1=py3.7_cuda11.3_cudnn8.2.0_0
+  - pytorch-mutex=1.0=cuda
+  - readline=8.3=hc2a1206_0
+  - setuptools=65.6.3=py37h06a4308_0
+  - six=1.16.0=pyhd3eb1b0_1
+  - sqlite=3.51.2=h3e8d24a_0
+  - sysroot_linux-64=2.28=h3108a97_1
+  - tk=8.6.15=h54e0aa7_0
+  - torchvision=0.11.2=py37_cu113
+  - tzdata=2026a=he532380_0
+  - wheel=0.38.4=py37h06a4308_0
+  - x264=1!157.20191217=h7b6447c_0
+  - xorg-libx11=1.8.12=h9b100fa_1
+  - xorg-libxau=1.0.12=h9b100fa_0
+  - xorg-libxdmcp=1.1.5=h9b100fa_0
+  - xorg-xorgproto=2024.1=h5eee18b_1
+  - xz=5.8.2=h448239c_0
+  - zlib=1.3.1=hb25bd0a_0
+  - zstd=1.5.7=h11fc155_0
+  - pip:
+      - addict==2.4.0
+      - aiofiles==22.1.0
+      - aiosqlite==0.19.0
+      - anyio==3.7.1
+      - argon2-cffi==23.1.0
+      - argon2-cffi-bindings==21.2.0
+      - arrow==1.2.3
+      - attrs==24.2.0
+      - babel==2.14.0
+      - backcall==0.2.0
+      - beautifulsoup4==4.14.3
+      - bleach==6.0.0
+      - cached-property==1.5.2
+      - cachetools==5.5.2
+      - ccimport==0.4.4
+      - cffi==1.15.1
+      - charset-normalizer==3.4.6
+      - comm==0.1.4
+      - cumm-cu113==0.4.11
+      - cumm-cu114==0.4.11
+      - cycler==0.11.0
+      - debugpy==1.7.0
+      - decorator==5.1.1
+      - defusedxml==0.7.1
+      - descartes==1.1.0
+      - dictor==0.1.12
+      - entrypoints==0.4
+      - exceptiongroup==1.3.1
+      - fastjsonschema==2.21.2
+      - fire==0.7.1
+      - fonttools==4.38.0
+      - fqdn==1.5.1
+      - idna==3.10
+      - importlib-metadata==6.7.0
+      - importlib-resources==5.12.0
+      - ipykernel==6.16.2
+      - ipython==7.34.0
+      - ipython-genutils==0.2.0
+      - ipywidgets==8.1.8
+      - isoduration==20.11.0
+      - jedi==0.19.2
+      - jinja2==3.1.6
+      - joblib==1.3.2
+      - json5==0.9.16
+      - jsonpointer==3.0.0
+      - jsonschema==4.17.3
+      - jupyter==1.1.1
+      - jupyter-client==7.4.9
+      - jupyter-console==6.6.3
+      - jupyter-core==4.12.0
+      - jupyter-events==0.6.3
+      - jupyter-server==1.24.0
+      - jupyter-server-fileid==0.9.3
+      - jupyter-server-ydoc==0.8.0
+      - jupyter-ydoc==0.2.5
+      - jupyterlab==3.6.8
+      - jupyterlab-pygments==0.2.2
+      - jupyterlab-server==2.24.0
+      - jupyterlab-widgets==3.0.16
+      - kiwisolver==1.4.5
+      - lark==1.1.9
+      - llvmlite==0.39.1
+      - markupsafe==2.1.5
+      - matplotlib==3.5.2
+      - matplotlib-inline==0.1.6
+      - mistune==3.0.2
+      - moves==0.1
+      - nbclassic==1.2.0
+      - nbclient==0.7.4
+      - nbconvert==7.6.0
+      - nbformat==5.8.0
+      - nest-asyncio==1.6.0
+      - ninja==1.11.1.4
+      - notebook==6.5.7
+      - notebook-shim==0.2.4
+      - numba==0.56.4
+      - nuscenes-devkit==1.1.10
+      - opencv-python-headless==4.3.0.36
+      - packaging==24.0
+      - pandocfilters==1.5.1
+      - parso==0.8.6
+      - pccm==0.4.16
+      - pexpect==4.9.0
+      - pickleshare==0.7.5
+      - pkgutil-resolve-name==1.3.10
+      - portalocker==2.7.0
+      - prometheus-client==0.17.1
+      - prompt-toolkit==3.0.48
+      - protobuf==4.24.4
+      - psutil==7.2.2
+      - ptyprocess==0.7.0
+      - pybind11==2.13.6
+      - pycocotools==2.0.7
+      - pycparser==2.21
+      - pygments==2.17.2
+      - pyparsing==3.1.4
+      - pyquaternion==0.9.9
+      - pyrsistent==0.19.3
+      - python-dateutil==2.9.0.post0
+      - python-json-logger==3.0.1
+      - pytz==2026.1.post1
+      - pyyaml==6.0.1
+      - pyzmq==26.2.1
+      - requests==2.31.0
+      - rfc3339-validator==0.1.4
+      - rfc3986-validator==0.1.1
+      - scikit-learn==1.0.2
+      - scipy==1.7.3
+      - send2trash==1.8.3
+      - shapely==1.8.5
+      - sharedarray==3.2.4
+      - sniffio==1.3.1
+      - soupsieve==2.4.1
+      - spconv-cu113==2.3.6
+      - spconv-cu114==2.3.6
+      - tensorboardx==2.6.2.2
+      - termcolor==2.3.0
+      - terminado==0.17.1
+      - terminaltables==3.1.10
+      - threadpoolctl==3.1.0
+      - tinycss2==1.2.1
+      - tomli==2.0.1
+      - tornado==6.2
+      - tqdm==4.67.3
+      - traitlets==5.9.0
+      - typing-extensions==4.7.1
+      - uri-template==1.3.0
+      - urllib3==2.0.7
+      - wcwidth==0.2.14
+      - webcolors==1.13
+      - webencodings==0.5.1
+      - websocket-client==1.6.1
+      - widgetsnbextension==4.0.15
+      - y-py==0.6.2
+      - ypy-websocket==0.8.4
+      - zipp==3.15.0
+prefix: /home/acer/miniconda3/envs/centerpoint
diff --git a/det3d/models/bbox_heads/center_head.py b/det3d/models/bbox_heads/center_head.py
index 87e85416..691498f0 100644
--- a/det3d/models/bbox_heads/center_head.py
+++ b/det3d/models/bbox_heads/center_head.py
@@ -9,6 +9,7 @@
 from collections import defaultdict
 from det3d.core import box_torch_ops
 import torch
+import torch.nn.functional as F
 from det3d.torchie.cnn import kaiming_init
 from torch import double, nn
 from det3d.models.losses.centernet_loss import FastFocalLoss, RegLoss
@@ -248,6 +249,47 @@ def _sigmoid(self, x):
         return y
 
     def loss(self, example, preds_dicts, test_cfg, **kwargs):
+        teacher_preds_dicts = kwargs.get("teacher_preds_dicts")
+        kd_cfg = kwargs.get("kd_cfg")
+        kd_enabled = bool(kd_cfg and kd_cfg.get("enabled", False))
+        kd_type = (kd_cfg.get("type", "heatmap_mse") if kd_cfg else "heatmap_mse")
+        kd_weight = float(kd_cfg.get("lambda_kd", 0.0)) if kd_enabled else 0.0
+        lambda_feat = float(kd_cfg.get("lambda_feat", 0.0)) if kd_enabled else 0.0
+        student_feats = kwargs.get("student_feats") or {}
+        teacher_feats = kwargs.get("teacher_feats") or {}
+
+        if kd_enabled and kd_type == "heatmap_mse":
+            if teacher_preds_dicts is None:
+                raise ValueError("KD is enabled but teacher_preds_dicts is None.")
+            if len(teacher_preds_dicts) != len(preds_dicts):
+                raise ValueError(
+                    "Teacher/student task count mismatch: "
+                    f"{len(teacher_preds_dicts)} vs {len(preds_dicts)}"
+                )
+
+        feat_kd_loss = None
+        if kd_enabled and kd_type == "feature_mse" and lambda_feat > 0.0:
+            if "head_shared" not in student_feats or "head_shared" not in teacher_feats:
+                raise ValueError(
+                    "feature_mse KD requires student_feats['head_shared'] and "
+                    "teacher_feats['head_shared']."
+                )
+            s_feat = student_feats["head_shared"]
+            t_feat = teacher_feats["head_shared"].detach()
+            if s_feat.shape[2:] != t_feat.shape[2:]:
+                s_feat = F.interpolate(
+                    s_feat,
+                    size=t_feat.shape[2:],
+                    mode="bilinear",
+                    align_corners=False,
+                )
+            if s_feat.shape != t_feat.shape:
+                raise ValueError(
+                    "Teacher/student head_shared shape mismatch after align: "
+                    f"{s_feat.shape} vs {t_feat.shape}"
+                )
+            feat_kd_loss = F.mse_loss(s_feat, t_feat)
+
         rets = []
         for task_id, preds_dict in enumerate(preds_dicts):
             # heatmap focal loss
@@ -276,8 +318,36 @@ def loss(self, example, preds_dicts, test_cfg, **kwargs):
             loc_loss = (box_loss*box_loss.new_tensor(self.code_weights)).sum()
 
             loss = hm_loss + self.weight*loc_loss
+            hm_kd_loss = loss.new_tensor(0.0)
+            if kd_enabled and kd_type == "heatmap_mse" and kd_weight > 0.0:
+                teacher_hm = teacher_preds_dicts[task_id]['hm']
+                if teacher_hm.shape != preds_dict['hm'].shape:
+                    raise ValueError(
+                        "Teacher/student hm shape mismatch at task "
+                        f"{task_id}: {teacher_hm.shape} vs {preds_dict['hm'].shape}"
+                    )
+                teacher_hm_prob = torch.clamp(
+                    torch.sigmoid(teacher_hm.detach()), min=1e-4, max=1 - 1e-4
+                )
+                hm_kd_loss = F.mse_loss(preds_dict['hm'], teacher_hm_prob)
+                loss = loss + kd_weight * hm_kd_loss
+
+            if (
+                task_id == 0
+                and kd_enabled
+                and kd_type == "feature_mse"
+                and lambda_feat > 0.0
+                and feat_kd_loss is not None
+            ):
+                loss = loss + lambda_feat * feat_kd_loss
+
+            feat_kd_loss_log = (
+                feat_kd_loss.detach().cpu()
+                if feat_kd_loss is not None
+                else torch.tensor(0.0)
+            )
 
-            ret.update({'loss': loss, 'hm_loss': hm_loss.detach().cpu(), 'loc_loss':loc_loss, 'loc_loss_elem': box_loss.detach().cpu(), 'num_positive': example['mask'][task_id].float().sum()})
+            ret.update({'loss': loss, 'hm_loss': hm_loss.detach().cpu(), 'loc_loss':loc_loss, 'loc_loss_elem': box_loss.detach().cpu(), 'hm_kd_loss': hm_kd_loss.detach().cpu(), 'feat_kd_loss': feat_kd_loss_log, 'num_positive': example['mask'][task_id].float().sum()})
 
             rets.append(ret)
         
diff --git a/det3d/models/detectors/point_pillars.py b/det3d/models/detectors/point_pillars.py
index 00045ab0..0e41c6bd 100644
--- a/det3d/models/detectors/point_pillars.py
+++ b/det3d/models/detectors/point_pillars.py
@@ -46,10 +46,25 @@ def forward(self, example, return_loss=True, **kwargs):
         )
 
         x = self.extract_feat(data)
-        preds, _ = self.bbox_head(x)
+        preds, head_shared = self.bbox_head(x)
 
         if return_loss:
-            return self.bbox_head.loss(example, preds, self.test_cfg)
+            if kwargs.get("return_preds", False):
+                if kwargs.get("return_feats", False):
+                    return {
+                        "preds": preds,
+                        "feats": {"head_shared": head_shared},
+                    }
+                return preds
+            return self.bbox_head.loss(
+                example,
+                preds,
+                self.test_cfg,
+                teacher_preds_dicts=kwargs.get("teacher_preds_dicts"),
+                kd_cfg=kwargs.get("kd_cfg"),
+                student_feats={"head_shared": head_shared},
+                teacher_feats=kwargs.get("teacher_feats"),
+            )
         else:
             return self.bbox_head.predict(example, preds, self.test_cfg)
 
@@ -71,7 +86,7 @@ def forward_two_stage(self, example, return_loss=True, **kwargs):
 
         x = self.extract_feat(data)
         bev_feature = x 
-        preds, _ = self.bbox_head(x)
+        preds, _head_shared = self.bbox_head(x)
 
         # manual deepcopy ...
         new_preds = []
diff --git a/det3d/models/detectors/voxelnet.py b/det3d/models/detectors/voxelnet.py
index 5a8218fb..24e5bd24 100644
--- a/det3d/models/detectors/voxelnet.py
+++ b/det3d/models/detectors/voxelnet.py
@@ -54,10 +54,25 @@ def extract_feat(self, data):
 
     def forward(self, example, return_loss=True, **kwargs):
         x, _ = self.extract_feat(example)
-        preds, _ = self.bbox_head(x)
+        preds, head_shared = self.bbox_head(x)  # second head output is kept for feature KD
 
         if return_loss:
-            return self.bbox_head.loss(example, preds, self.test_cfg)
+            if kwargs.get("return_preds", False):  # raw-output mode: return before any loss is computed
+                if kwargs.get("return_feats", False):  # only consulted when return_preds is set
+                    return {
+                        "preds": preds,
+                        "feats": {"head_shared": head_shared},
+                    }
+                return preds
+            return self.bbox_head.loss(  # KD kwargs are forwarded as-is (None when the caller omits them)
+                example,
+                preds,
+                self.test_cfg,
+                teacher_preds_dicts=kwargs.get("teacher_preds_dicts"),
+                kd_cfg=kwargs.get("kd_cfg"),
+                student_feats={"head_shared": head_shared},
+                teacher_feats=kwargs.get("teacher_feats"),
+            )
         else:
             return self.bbox_head.predict(example, preds, self.test_cfg)
 
diff --git a/det3d/models/necks/__init__.py b/det3d/models/necks/__init__.py
index 1a1db7e4..7e94b63d 100644
--- a/det3d/models/necks/__init__.py
+++ b/det3d/models/necks/__init__.py
@@ -1,3 +1,4 @@
 from .rpn import RPN
+from .res_net import ResNetNeck
 
-__all__ = ["RPN"]
+__all__ = ["RPN", "ResNetNeck"]
diff --git a/det3d/models/necks/res_net.py b/det3d/models/necks/res_net.py
new file mode 100644
index 00000000..ca43b1a8
--- /dev/null
+++ b/det3d/models/necks/res_net.py
@@ -0,0 +1,115 @@
+import numpy as np
+import torch
+from torch import nn
+
+from det3d.models.utils import Sequential
+from ..registry import NECKS
+from ..utils import build_norm_layer
+
+class BasicBlock(nn.Module):
+    """Residual unit: 3x3 conv -> norm -> ReLU -> 3x3 conv -> norm, added to a shortcut.
+
+    Args:
+        inplanes: channel count of the incoming feature map.
+        planes: channel count produced by both 3x3 convolutions.
+        stride: stride of the first convolution and of the projection shortcut.
+        norm_cfg: config passed to ``build_norm_layer``; ``None`` selects
+            ``dict(type="BN", eps=1e-3, momentum=0.01)``.
+    """
+
+    def __init__(self, inplanes, planes, stride=1, norm_cfg=None):
+        super().__init__()
+        cfg = dict(type="BN", eps=1e-3, momentum=0.01) if norm_cfg is None else norm_cfg
+        needs_projection = not (stride == 1 and inplanes == planes)
+
+        # Main branch. Attribute names double as state_dict keys, so they stay as-is.
+        self.conv1 = nn.Conv2d(inplanes, planes, 3, stride=stride, padding=1, bias=False)
+        self.bn1 = build_norm_layer(cfg, planes)[1]
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, 3, stride=1, padding=1, bias=False)
+        self.bn2 = build_norm_layer(cfg, planes)[1]
+
+        # Shortcut: a 1x1 conv + norm only when stride or channel count changes.
+        if needs_projection:
+            self.downsample = Sequential(
+                nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False),
+                build_norm_layer(cfg, planes)[1],
+            )
+        else:
+            self.downsample = None
+
+    def forward(self, x):
+        """Return ``relu(main_branch(x) + shortcut(x))``."""
+        y = self.relu(self.bn1(self.conv1(x)))
+        y = self.bn2(self.conv2(y))
+
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        # In-place accumulate, then the shared in-place ReLU.
+        y += shortcut
+        return self.relu(y)
+
+@NECKS.register_module
+class ResNetNeck(nn.Module):
+    """Neck of ``BasicBlock`` stages whose selected outputs are resampled and concatenated.
+
+    Stage ``i`` stacks ``layer_nums[i]`` blocks with ``ds_num_filters[i]`` channels; only its
+    first block applies ``ds_layer_strides[i]``. Stages from index
+    ``len(layer_nums) - len(us_layer_strides)`` onward each get a resampling head whose
+    outputs ``forward`` concatenates along dim 1. Extra ``**kwargs`` are accepted and ignored.
+    """
+
+    def __init__(self, layer_nums, ds_layer_strides, ds_num_filters, us_layer_strides, us_num_filters, num_input_features, norm_cfg=None, logger=None, **kwargs):
+        super().__init__()
+        self._layer_nums = layer_nums
+        self._layer_strides = ds_layer_strides
+        self._num_filters = ds_num_filters
+        self._upsample_strides = us_layer_strides
+        self._num_upsample_filters = us_num_filters
+        self._num_input_features = num_input_features
+        self._norm_cfg = dict(type="BN", eps=1e-3, momentum=0.01) if norm_cfg is None else norm_cfg
+        # Index of the first stage that owns a resampling head.
+        self._upsample_start_idx = len(layer_nums) - len(us_layer_strides)
+
+        stage_inputs = [num_input_features, *ds_num_filters[:-1]]
+        stages, heads = [], []
+        for idx, depth in enumerate(layer_nums):
+            width = ds_num_filters[idx]
+            # The first block changes channels/stride; the rest keep the shape.
+            units = [BasicBlock(stage_inputs[idx], width, stride=ds_layer_strides[idx], norm_cfg=self._norm_cfg)]
+            units.extend(
+                BasicBlock(width, width, stride=1, norm_cfg=self._norm_cfg)
+                for _ in range(1, depth)
+            )
+            stages.append(Sequential(*units))
+
+            head_idx = idx - self._upsample_start_idx
+            if head_idx < 0:
+                continue
+            factor = us_layer_strides[head_idx]
+            head_width = us_num_filters[head_idx]
+            if factor > 1:
+                resample = nn.ConvTranspose2d(width, head_width, factor, stride=factor, bias=False)
+            else:
+                # Factor <= 1: strided conv with kernel == stride == round(1 / factor).
+                factor = np.round(1 / factor).astype(np.int64)
+                resample = nn.Conv2d(width, head_width, factor, stride=factor, bias=False)
+            heads.append(
+                Sequential(resample, build_norm_layer(self._norm_cfg, head_width)[1], nn.ReLU())
+            )
+
+        self.blocks = nn.ModuleList(stages)
+        self.deblocks = nn.ModuleList(heads)
+
+        if logger:
+            logger.info("Finish ResNetNeck Initialization")
+
+    def forward(self, x):
+        """Run the stages in order; return the concatenated head outputs (or the last stage output)."""
+        outputs = []
+        for idx, stage in enumerate(self.blocks):
+            x = stage(x)
+            head_idx = idx - self._upsample_start_idx
+            if head_idx >= 0:
+                outputs.append(self.deblocks[head_idx](x))
+        return torch.cat(outputs, dim=1) if outputs else x
\ No newline at end of file
diff --git a/det3d/models/readers/__init__.py b/det3d/models/readers/__init__.py
index 94ed32c7..cf8e51b7 100644
--- a/det3d/models/readers/__init__.py
+++ b/det3d/models/readers/__init__.py
@@ -1,4 +1,4 @@
-from .pillar_encoder import PillarFeatureNet, PointPillarsScatter
+from .pillar_encoder import PillarFeatureNet, PointPillarsScatter, FastPillarFeatureNet
 from .voxel_encoder import VoxelFeatureExtractorV3
 from .dynamic_voxel_encoder import DynamicVoxelEncoder
 
diff --git a/det3d/models/readers/pillar_encoder.py b/det3d/models/readers/pillar_encoder.py
index 1a2553a3..b11d32c3 100644
--- a/det3d/models/readers/pillar_encoder.py
+++ b/det3d/models/readers/pillar_encoder.py
@@ -11,6 +11,56 @@
 from ..registry import BACKBONES, READERS
 from ..utils import build_norm_layer
 
+class MAPELayer(nn.Module):
+    """Max-and-Attention Pillar Encoding (MAPE).
+
+    Based on the FastPillars (2023) architecture. Turns ``[M, P, in_channels]`` point
+    features (``M`` pillars, ``P`` points each) into one ``[M, out_channels]`` vector per
+    pillar: point-wise linear + BatchNorm + ReLU, a max-pooled summary that drives a
+    sigmoid channel gate, then max pooling of the gated point features.
+    """
+
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        # Point-wise embedding. Attribute names double as state_dict keys.
+        self.linear = nn.Linear(in_channels, out_channels, bias=False)
+        self.norm = nn.BatchNorm1d(out_channels)
+
+        # Gate branch, fed with the pooled pillar feature.
+        self.attention_fc = nn.Linear(out_channels, out_channels, bias=False)
+        self.attention_norm = nn.BatchNorm1d(out_channels)
+
+    def forward(self, inputs):
+        """Encode each pillar into a single feature vector.
+
+        Args:
+            inputs: tensor of shape ``[M, max_points_per_pillar, in_channels]``,
+                where ``M`` is the total number of non-empty pillars.
+
+        Returns:
+            Tensor of shape ``[M, out_channels]``.
+        """
+        embedded = self.linear(inputs)
+        n_pillars, n_points, n_ch = embedded.shape
+
+        # BatchNorm1d treats dim 1 as channels, so fold pillars and points together
+        # into dim 0 (with a dummy trailing dim) before normalizing.
+        flat = embedded.view(n_pillars * n_points, n_ch).unsqueeze(-1)
+        normed = self.norm(flat).view(n_pillars, n_points, n_ch)
+        point_feats = F.relu(normed)
+
+        # Pillar summary: channel-wise max over the points -> [M, 1, C].
+        pooled = torch.max(point_feats, dim=1, keepdim=True)[0]
+
+        # Channel gate in (0, 1), computed from the summary -> [M, 1, C].
+        gate = self.attention_fc(pooled).view(n_pillars, n_ch).unsqueeze(-1)
+        gate = self.attention_norm(gate).view(n_pillars, 1, n_ch)
+        gate = torch.sigmoid(gate)
+
+        # Scale every point feature by the gate, then max-pool again -> [M, C].
+        gated = point_feats * gate
+        pillar_feats = torch.max(gated, dim=1)[0]
+        return pillar_feats
 
 class PFNLayer(nn.Module):
     def __init__(self, in_channels, out_channels, norm_cfg=None, last_layer=False):
@@ -216,3 +266,101 @@ def forward(self, voxel_features, coords, batch_size, input_shape):
         # Undo the column stacking to final 4-dim tensor
         batch_canvas = batch_canvas.view(batch_size, self.nchannels, self.ny, self.nx)
         return batch_canvas
+
+@READERS.register_module
+class FastPillarFeatureNet(nn.Module):
+    def __init__(
+        self,
+        num_input_features=4,
+        num_filters=(64,),
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 4),
+        pc_range=(0, -40, -3, 70.4, 40, 1),
+        norm_cfg=None,
+        virtual=False
+    ):
+        """FastPillars-style pillar feature net: PFN layers followed by a final MAPE layer.
+        Each point gets 5 extra values (offset to the pillar's mean xyz and to the pillar
+        center xy), plus its xyz norm when ``with_distance`` is set."""
+        super().__init__()
+        self.name = "FastPillarFeatureNet"
+        assert len(num_filters) > 0
+
+        self.num_input = num_input_features
+        num_input_features += 5
+        if with_distance:
+            num_input_features += 1
+        self._with_distance = with_distance
+
+        # Create FastPillars layers
+        num_filters = [num_input_features] + list(num_filters)
+        pfn_layers = []
+        for i in range(len(num_filters) - 1):
+            in_filters = num_filters[i]
+            out_filters = num_filters[i + 1]
+
+            # Intermediate layers use standard PFN
+            if i < len(num_filters) - 2:
+                pfn_layers.append(
+                    PFNLayer(in_filters, out_filters, norm_cfg=norm_cfg, last_layer=False)
+                )
+            # Final layer uses MAPE Attention
+            else:
+                pfn_layers.append(
+                    MAPELayer(in_filters, out_filters)
+                )
+
+        self.pfn_layers = nn.ModuleList(pfn_layers)
+
+        self.virtual = virtual
+
+        # Need pillar (voxel) size and x/y offset in order to calculate pillar offset
+        self.vx = voxel_size[0]
+        self.vy = voxel_size[1]
+        self.x_offset = self.vx / 2 + pc_range[0]
+        self.y_offset = self.vy / 2 + pc_range[1]
+
+    def forward(self, features, num_voxels, coors):
+        """Return one feature row per pillar; ``features`` is modified in place when ``virtual``."""
+
+        if self.virtual:
+            virtual_point_mask = features[..., -2] == -1
+            virtual_points = features[virtual_point_mask]
+            virtual_points[..., -2] = 1
+            features[..., -2] = 0
+            features[virtual_point_mask] = virtual_points
+
+        dtype = features.dtype
+        # Find distance of x, y, and z from cluster center
+        points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_voxels.type_as(
+            features
+        ).view(-1, 1, 1)
+        f_cluster = features[:, :, :3] - points_mean
+
+        # Find distance of x, y, and z from pillar center
+        f_center = torch.zeros_like(features[:, :, :2])
+        f_center[:, :, 0] = features[:, :, 0] - (
+            coors[:, 3].to(dtype).unsqueeze(1) * self.vx + self.x_offset
+        )
+        f_center[:, :, 1] = features[:, :, 1] - (
+            coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset
+        )
+
+        # Combine together feature decorations
+        features_ls = [features, f_cluster, f_center]
+        if self._with_distance:
+            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
+            features_ls.append(points_dist)
+        features = torch.cat(features_ls, dim=-1)
+
+        # The feature decorations were calculated without regard to whether pillar was empty
+        voxel_count = features.shape[1]
+        mask = get_paddings_indicator(num_voxels, voxel_count, axis=0)
+        mask = torch.unsqueeze(mask, -1).type_as(features)
+        features *= mask
+
+        # Forward pass through layers; the final MAPE layer already yields [num_pillars, C].
+        for pfn in self.pfn_layers:
+            features = pfn(features)
+        # No squeeze(): it would also drop the pillar axis when only one pillar is non-empty.
+        return features
\ No newline at end of file
diff --git a/det3d/ops/dcn/build/lib.linux-x86_64-cpython-37/deform_conv_cuda.cpython-37m-x86_64-linux-gnu.so b/det3d/ops/dcn/build/lib.linux-x86_64-cpython-37/deform_conv_cuda.cpython-37m-x86_64-linux-gnu.so
new file mode 100755
index 00000000..f8aa1957
Binary files /dev/null and b/det3d/ops/dcn/build/lib.linux-x86_64-cpython-37/deform_conv_cuda.cpython-37m-x86_64-linux-gnu.so differ
diff --git a/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/.ninja_deps b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/.ninja_deps
new file mode 100644
index 00000000..0d39d7f0
Binary files /dev/null and b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/.ninja_deps differ
diff --git a/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log
new file mode 100644
index 00000000..64a5ab3f
--- /dev/null
+++ b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/.ninja_log
@@ -0,0 +1,3 @@
+# ninja log v5
+1	9883	1776796852868130717	/home/cv29f26/Computer-Vision/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o	a316ef61b475275
+1	10800	1776796853740129920	/home/cv29f26/Computer-Vision/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda.o	5666843f3bf74a42
diff --git a/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/build.ninja b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/build.ninja
new file mode 100644
index 00000000..b131b4ed
--- /dev/null
+++ b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/build.ninja
@@ -0,0 +1,29 @@
+ninja_required_version = 1.3
+cxx = g++-10
+nvcc = /home/cv29f26/.conda/envs/centerpoint/bin/nvcc
+
+cflags = -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -Wstrict-prototypes -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -pipe -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -pipe -DAT_CHECK=TORCH_CHECK -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /home/cv29f26/.conda/envs/centerpoint/include -fPIC -DWITH_CUDA -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/TH -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/THC -I/home/cv29f26/.conda/envs/centerpoint/include -I/home/cv29f26/.conda/envs/centerpoint/include/python3.7m -c
+post_cflags = -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14
+cuda_cflags = -DWITH_CUDA -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/TH -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/THC -I/home/cv29f26/.conda/envs/centerpoint/include -I/home/cv29f26/.conda/envs/centerpoint/include/python3.7m -c
+cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=deform_conv_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -ccbin gcc-10 -std=c++14
+ldflags = 
+
+rule compile
+  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
+  depfile = $out.d
+  deps = gcc
+
+rule cuda_compile
+  depfile = $out.d
+  deps = gcc
+  command = $nvcc  $cuda_cflags -c $in -o $out $cuda_post_cflags
+
+
+
+build /home/cv29f26/Computer-Vision/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda.o: compile /home/cv29f26/Computer-Vision/det3d/ops/dcn/src/deform_conv_cuda.cpp
+build /home/cv29f26/Computer-Vision/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o: cuda_compile /home/cv29f26/Computer-Vision/det3d/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+
+
+
+
diff --git a/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda.o b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda.o
new file mode 100644
index 00000000..5343a6bc
Binary files /dev/null and b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda.o differ
diff --git a/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o
new file mode 100644
index 00000000..09db43f6
Binary files /dev/null and b/det3d/ops/dcn/build/temp.linux-x86_64-cpython-37/src/deform_conv_cuda_kernel.o differ
diff --git a/det3d/ops/dcn/deform_conv_cuda.cpython-37m-x86_64-linux-gnu.so b/det3d/ops/dcn/deform_conv_cuda.cpython-37m-x86_64-linux-gnu.so
new file mode 100755
index 00000000..f8aa1957
Binary files /dev/null and b/det3d/ops/dcn/deform_conv_cuda.cpython-37m-x86_64-linux-gnu.so differ
diff --git a/det3d/ops/iou3d_nms/__init__.py b/det3d/ops/iou3d_nms/__init__.py
index c267f071..4e9c5a0e 100644
--- a/det3d/ops/iou3d_nms/__init__.py
+++ b/det3d/ops/iou3d_nms/__init__.py
@@ -1 +1,6 @@
-from det3d.ops.iou3d_nms import iou3d_nms_cuda, iou3d_nms_utils
+from . import iou3d_nms_utils
+
+try:
+    from . import iou3d_nms_cuda
+except Exception:
+    iou3d_nms_cuda = None
diff --git a/det3d/ops/iou3d_nms/build/lib.linux-x86_64-cpython-37/iou3d_nms_cuda.cpython-37m-x86_64-linux-gnu.so b/det3d/ops/iou3d_nms/build/lib.linux-x86_64-cpython-37/iou3d_nms_cuda.cpython-37m-x86_64-linux-gnu.so
new file mode 100755
index 00000000..a9d1e5a2
Binary files /dev/null and b/det3d/ops/iou3d_nms/build/lib.linux-x86_64-cpython-37/iou3d_nms_cuda.cpython-37m-x86_64-linux-gnu.so differ
diff --git a/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/.ninja_deps b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/.ninja_deps
new file mode 100644
index 00000000..b73a148d
Binary files /dev/null and b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/.ninja_deps differ
diff --git a/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/.ninja_log b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/.ninja_log
new file mode 100644
index 00000000..dbff32ff
--- /dev/null
+++ b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/.ninja_log
@@ -0,0 +1,5 @@
+# ninja log v5
+3	1666	1776796662828301919	/home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_kernel.o	6278d9d101f3f9d
+2	8010	1776796669172296297	/home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_cpu.o	a1924083def66e03
+3	8198	1776796669356296133	/home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms.o	f5d5ce7a772ac0a2
+3	13073	1776796674228291810	/home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_api.o	629ee59c12c50298
diff --git a/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/build.ninja b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/build.ninja
new file mode 100644
index 00000000..942230c1
--- /dev/null
+++ b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/build.ninja
@@ -0,0 +1,31 @@
+ninja_required_version = 1.3
+cxx = g++-10
+nvcc = /home/cv29f26/.conda/envs/centerpoint/bin/nvcc
+
+cflags = -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -Wstrict-prototypes -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -pipe -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -pipe -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /home/cv29f26/.conda/envs/centerpoint/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /home/cv29f26/.conda/envs/centerpoint/include -fPIC -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/TH -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/THC -I/home/cv29f26/.conda/envs/centerpoint/include -I/home/cv29f26/.conda/envs/centerpoint/include/python3.7m -c
+post_cflags = -g '-I /usr/local/cuda/include' -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=iou3d_nms_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14
+cuda_cflags = -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/TH -I/home/cv29f26/.conda/envs/centerpoint/lib/python3.7/site-packages/torch/include/THC -I/home/cv29f26/.conda/envs/centerpoint/include -I/home/cv29f26/.conda/envs/centerpoint/include/python3.7m -c
+cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -O2 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=iou3d_nms_cuda -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -ccbin gcc-10 -std=c++14
+ldflags = 
+
+rule compile
+  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
+  depfile = $out.d
+  deps = gcc
+
+rule cuda_compile
+  depfile = $out.d
+  deps = gcc
+  command = $nvcc  $cuda_cflags -c $in -o $out $cuda_post_cflags
+
+
+
+build /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_cpu.o: compile /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/src/iou3d_cpu.cpp
+build /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms.o: compile /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/src/iou3d_nms.cpp
+build /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_api.o: compile /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/src/iou3d_nms_api.cpp
+build /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_kernel.o: cuda_compile /home/cv29f26/Computer-Vision/det3d/ops/iou3d_nms/src/iou3d_nms_kernel.cu
+
+
+
+
+
diff --git a/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_cpu.o b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_cpu.o
new file mode 100644
index 00000000..3ae30abb
Binary files /dev/null and b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_cpu.o differ
diff --git a/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms.o b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms.o
new file mode 100644
index 00000000..9d9794ae
Binary files /dev/null and b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms.o differ
diff --git a/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_api.o b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_api.o
new file mode 100644
index 00000000..786bc608
Binary files /dev/null and b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_api.o differ
diff --git a/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_kernel.o b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_kernel.o
new file mode 100644
index 00000000..bb5e0c52
Binary files /dev/null and b/det3d/ops/iou3d_nms/build/temp.linux-x86_64-cpython-37/src/iou3d_nms_kernel.o differ
diff --git a/det3d/ops/iou3d_nms/iou3d_nms_cuda.cpython-37m-x86_64-linux-gnu.so b/det3d/ops/iou3d_nms/iou3d_nms_cuda.cpython-37m-x86_64-linux-gnu.so
new file mode 100755
index 00000000..a9d1e5a2
Binary files /dev/null and b/det3d/ops/iou3d_nms/iou3d_nms_cuda.cpython-37m-x86_64-linux-gnu.so differ
diff --git a/det3d/ops/iou3d_nms/iou3d_nms_utils.py b/det3d/ops/iou3d_nms/iou3d_nms_utils.py
index 4d71e33a..fa5742b1 100644
--- a/det3d/ops/iou3d_nms/iou3d_nms_utils.py
+++ b/det3d/ops/iou3d_nms/iou3d_nms_utils.py
@@ -5,7 +5,10 @@
 """
 import torch
 
-from . import iou3d_nms_cuda
+try:
+    from . import iou3d_nms_cuda
+except Exception:
+    iou3d_nms_cuda = None
 import numpy as np 
 
 
diff --git a/det3d/solver/fastai_optim.py b/det3d/solver/fastai_optim.py
index a5434478..7eb72d49 100644
--- a/det3d/solver/fastai_optim.py
+++ b/det3d/solver/fastai_optim.py
@@ -1,4 +1,9 @@
-from collections import Iterable, defaultdict
+from collections import defaultdict
+
+try:
+    from collections.abc import Iterable
+except ImportError:
+    from collections import Iterable
 from copy import deepcopy
 from itertools import chain
 
diff --git a/det3d/solver/optim.py b/det3d/solver/optim.py
index 224ada0b..914c14d4 100644
--- a/det3d/solver/optim.py
+++ b/det3d/solver/optim.py
@@ -1,4 +1,9 @@
-from collections import Iterable, defaultdict
+from collections import defaultdict
+
+try:
+    from collections.abc import Iterable
+except ImportError:
+    from collections import Iterable
 from copy import deepcopy
 from itertools import chain
 
diff --git a/det3d/torchie/apis/__init__.py b/det3d/torchie/apis/__init__.py
index 952d978c..fca90aa1 100644
--- a/det3d/torchie/apis/__init__.py
+++ b/det3d/torchie/apis/__init__.py
@@ -1,4 +1,4 @@
-from .env import get_root_logger, init_dist, set_random_seed
+from .env import get_root_logger, get_train_device, init_dist, set_random_seed
 from .train import batch_processor, batch_processor_ensemble, build_optimizer, train_detector
 
 # from .inference import init_detector, inference_detector, show_result
@@ -6,6 +6,7 @@
 __all__ = [
     "init_dist",
     "get_root_logger",
+    "get_train_device",
     "set_random_seed",
     "train_detector",
     "build_optimizer",
diff --git a/det3d/torchie/apis/env.py b/det3d/torchie/apis/env.py
index 75dc44e5..4d4da533 100644
--- a/det3d/torchie/apis/env.py
+++ b/det3d/torchie/apis/env.py
@@ -10,6 +10,18 @@
 from det3d.torchie.trainer import get_dist_info
 
 
+def get_train_device(local_rank=0):
+    """Return the ``torch.device`` to train on.
+
+    ``CENTERPOINT_DEVICE=cpu`` forces CPU. Otherwise (unset, ``cuda``, or any
+    other value) CUDA device ``local_rank`` is used when CUDA is available,
+    and CPU when it is not.
+    """
+    choice = os.environ.get("CENTERPOINT_DEVICE", "").strip().lower()
+    use_cuda = choice != "cpu" and torch.cuda.is_available()
+    return torch.device("cuda", int(local_rank)) if use_cuda else torch.device("cpu")
+
+
 def init_dist(launcher, backend="nccl", **kwargs):
     if mp.get_start_method(allow_none=True) is None:
         mp.set_start_method("spawn")
@@ -52,7 +64,8 @@ def set_random_seed(seed):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
 
 
 def get_root_logger(log_level=logging.INFO):
diff --git a/det3d/torchie/apis/train.py b/det3d/torchie/apis/train.py
index 9c3e5fb7..bc8886d7 100644
--- a/det3d/torchie/apis/train.py
+++ b/det3d/torchie/apis/train.py
@@ -12,17 +12,24 @@
 import numpy as np
 import torch
 from det3d.builder import _create_learning_rate_scheduler
+from det3d.models import build_detector
 
 # from det3d.datasets.kitti.eval_hooks import KittiDistEvalmAPHook, KittiEvalmAPHookV2
 from det3d.core import DistOptimizerHook
 from det3d.datasets import DATASETS, build_dataloader
 from det3d.solver.fastai_optim import OptimWrapper
-from det3d.torchie.trainer import DistSamplerSeedHook, Trainer, obj_from_dict
+from det3d.torchie import Config
+from det3d.torchie.trainer import (
+    DistSamplerSeedHook,
+    Trainer,
+    load_checkpoint,
+    obj_from_dict,
+)
 from det3d.utils.print_utils import metric_to_str
 from torch import nn
 from torch.nn.parallel import DistributedDataParallel
 
-from .env import get_root_logger
+from .env import get_root_logger, get_train_device
 
 
 def example_to_device(example, device=None, non_blocking=False) -> dict:
@@ -91,10 +98,13 @@ def parse_second_losses(losses):
 
 def batch_processor(model, data, train_mode, **kwargs):
 
-    if "local_rank" in kwargs:
-        device = torch.device(kwargs["local_rank"])
-    else:
-        device = None
+    device = kwargs.get("train_device")
+    if device is None:
+        if torch.cuda.is_available():
+            lr = int(kwargs.get("local_rank", 0))
+            device = torch.device("cuda", lr)
+        else:
+            device = torch.device("cpu")
 
     # data = example_convert_to_torch(data, device=device)
     example = example_to_device(data, device, non_blocking=False)
@@ -159,7 +169,7 @@ def build_one_cycle_optimizer(model, optimizer_config):
             torch.optim.Adam, betas=(0.9, 0.99), amsgrad=optimizer_config.amsgrad
         )
     else:
-        optimizer_func = partial(torch.optim.Adam, amsgrad=optimizer_cfg.amsgrad)
+        optimizer_func = partial(torch.optim.Adam, amsgrad=optimizer_config.amsgrad)
 
     optimizer = OptimWrapper.create(
         optimizer_func,
@@ -252,6 +262,9 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, logge
     if logger is None:
         logger = get_root_logger(cfg.log_level)
 
+    train_device = get_train_device(cfg.local_rank if distributed else 0)
+    logger.info("Training device: %s", train_device)
+
     # start training
     # prepare data loaders
     dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
@@ -264,8 +277,11 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, logge
 
     total_steps = cfg.total_epochs * len(data_loaders[0])
     # print(f"total_steps: {total_steps}")
-    if distributed:
-        model = apex.parallel.convert_syncbn_model(model)
+    if distributed and torch.cuda.is_available():
+        try:
+            model = apex.parallel.convert_syncbn_model(model)
+        except Exception:
+            logger.warning("apex convert_syncbn_model skipped (apex unavailable or incompatible).")
     if cfg.lr_config.type == "one_cycle":
         # build trainer
         optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
@@ -279,22 +295,61 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, logge
         # lr_scheduler = None
         cfg.lr_config = None 
 
-    # put model on gpus
+    # put model on device (CUDA / CPU)
     if distributed:
+        if not torch.cuda.is_available():
+            raise RuntimeError(
+                "Distributed training requires CUDA in this codebase. "
+                "Run without torch.distributed (single process) for CPU."
+            )
         model = DistributedDataParallel(
-            model.cuda(cfg.local_rank),
+            model.to(train_device),
             device_ids=[cfg.local_rank],
             output_device=cfg.local_rank,
             # broadcast_buffers=False,
             find_unused_parameters=True,
         )
     else:
-        model = model.cuda()
+        model = model.to(train_device)
 
     logger.info(f"model structure: {model}")
 
+    kd_cfg = cfg.get("kd", None)
+    kd_enabled = bool(kd_cfg and kd_cfg.get("enabled", False))
+    teacher_model = None
+    if kd_enabled:
+        teacher_cfg_path = kd_cfg.get("teacher_config", None)
+        teacher_ckpt_path = kd_cfg.get("teacher_checkpoint", None)
+        if not teacher_ckpt_path:
+            raise ValueError("KD is enabled but kd.teacher_checkpoint is not set.")
+
+        if teacher_cfg_path:
+            teacher_cfg = Config.fromfile(teacher_cfg_path)
+            teacher_model_cfg = teacher_cfg.model
+            teacher_train_cfg = teacher_cfg.train_cfg
+            teacher_test_cfg = teacher_cfg.test_cfg
+        else:
+            teacher_model_cfg = cfg.model
+            teacher_train_cfg = cfg.train_cfg
+            teacher_test_cfg = cfg.test_cfg
+
+        teacher_model = build_detector(
+            teacher_model_cfg, train_cfg=teacher_train_cfg, test_cfg=teacher_test_cfg
+        )
+        teacher_model = teacher_model.to(train_device)
+        load_checkpoint(teacher_model, teacher_ckpt_path, map_location=str(train_device))
+        teacher_model.eval()
+        teacher_model.requires_grad_(False)
+        logger.info("KD enabled with teacher checkpoint: %s", teacher_ckpt_path)
+
     trainer = Trainer(
-        model, batch_processor, optimizer, lr_scheduler, cfg.work_dir, cfg.log_level
+        model,
+        batch_processor,
+        optimizer,
+        lr_scheduler,
+        cfg.work_dir,
+        cfg.log_level,
+        train_device=train_device,
     )
 
     if distributed:
@@ -323,4 +378,12 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, logge
     elif cfg.load_from:
         trainer.load_checkpoint(cfg.load_from)
 
-    trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)
+    trainer.run(
+        data_loaders,
+        cfg.workflow,
+        cfg.total_epochs,
+        local_rank=cfg.local_rank,
+        train_device=train_device,
+        teacher_model=teacher_model,
+        kd_cfg=kd_cfg,
+    )
diff --git a/det3d/torchie/trainer/checkpoint.py b/det3d/torchie/trainer/checkpoint.py
index 61c1f670..946543c3 100644
--- a/det3d/torchie/trainer/checkpoint.py
+++ b/det3d/torchie/trainer/checkpoint.py
@@ -43,8 +43,12 @@
 
 try:
     import spconv.pytorch as spconv
-except:
-    import spconv as spconv
+except Exception:
+    try:
+        import spconv as spconv
+    except Exception:
+        spconv = None
+
 
 def find_all_spconv_keys(model: nn.Module, prefix="") -> Set[str]:
     """
@@ -52,6 +56,8 @@ def find_all_spconv_keys(model: nn.Module, prefix="") -> Set[str]:
     from https://github.com/acivgin1/OpenPCDet/blob/8fc1a5d57bcb418d71d5118fb3df4b58d4ea0244/pcdet/utils/spconv_utils.py
     """
     found_keys: Set[str] = set()
+    if spconv is None:
+        return found_keys
     for name, child in model.named_children():
         new_prefix = f"{prefix}.{name}" if prefix != "" else name
 
diff --git a/det3d/torchie/trainer/hooks/logger/text.py b/det3d/torchie/trainer/hooks/logger/text.py
index 32f38570..686bd00c 100644
--- a/det3d/torchie/trainer/hooks/logger/text.py
+++ b/det3d/torchie/trainer/hooks/logger/text.py
@@ -30,11 +30,12 @@ def _get_max_memory(self, trainer):
             dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX)
         return mem_mb.item()
 
-    def _convert_to_precision4(self, val):
+    def _format_metric_value(self, metric_name, val):  # format floats / lists for logging; other types pass through unchanged
+        precision = 6 if metric_name in ("hm_kd_loss", "feat_kd_loss") else 4  # KD loss terms print with 6 decimals, all other metrics with 4
         if isinstance(val, float):
-            val = "{:.4f}".format(val)
+            val = f"{val:.{precision}f}"
         elif isinstance(val, list):
-            val = [self._convert_to_precision4(v) for v in val]
+            val = [self._format_metric_value(metric_name, v) for v in val]  # recurse so each list element uses the same per-metric precision
 
         return val
 
@@ -60,7 +61,8 @@ def _log_info(self, log_dict, trainer):
                     log_dict["forward_time"] - log_dict["transfer_time"],
                     log_dict["loss_parse_time"] - log_dict["forward_time"],
                 )
-                log_str += "memory: {}, ".format(log_dict["memory"])
+                if "memory" in log_dict:
+                    log_str += "memory: {}, ".format(log_dict["memory"])
         else:
             log_str = "Epoch({}) [{}][{}]\t".format(
                 log_dict["mode"], log_dict["epoch"] - 1, log_dict["iter"]
@@ -94,11 +96,11 @@ def _log_info(self, log_dict, trainer):
                     continue
 
                 if isinstance(val, float):
-                    val = "{:.4f}".format(val)
+                    val = self._format_metric_value(name, val)
 
                 if isinstance(val, list):
                     log_items.append(
-                        "{}: {}".format(name, self._convert_to_precision4(val[idx]))
+                        "{}: {}".format(name, self._format_metric_value(name, val[idx]))
                     )
                 else:
                     log_items.append("{}: {}".format(name, val))
diff --git a/det3d/torchie/trainer/hooks/memory.py b/det3d/torchie/trainer/hooks/memory.py
index 990f8cec..763a8ab3 100644
--- a/det3d/torchie/trainer/hooks/memory.py
+++ b/det3d/torchie/trainer/hooks/memory.py
@@ -10,13 +10,13 @@ def __init__(self, before_epoch=False, after_epoch=True, after_iter=False):
         self._after_iter = after_iter
 
     def after_iter(self, trainer):
-        if self._after_iter:
+        if self._after_iter and torch.cuda.is_available():  # only empty the CUDA cache when enabled for this stage and CUDA is present
             torch.cuda.empty_cache()
 
     def before_epoch(self, trainer):
-        if self._before_epoch:
+        if self._before_epoch and torch.cuda.is_available():  # only empty the CUDA cache when enabled for this stage and CUDA is present
             torch.cuda.empty_cache()
 
     def after_epoch(self, trainer):
-        if self._after_epoch:
+        if self._after_epoch and torch.cuda.is_available():  # only empty the CUDA cache when enabled for this stage and CUDA is present
             torch.cuda.empty_cache()
diff --git a/det3d/torchie/trainer/trainer.py b/det3d/torchie/trainer/trainer.py
index 18bfc967..af56bae8 100644
--- a/det3d/torchie/trainer/trainer.py
+++ b/det3d/torchie/trainer/trainer.py
@@ -147,6 +147,7 @@ def __init__(
         work_dir=None,
         log_level=logging.INFO,
         logger=None,
+        train_device=None,
         **kwargs,
     ):
         assert callable(batch_processor)
@@ -155,6 +156,11 @@ def __init__(
         self.lr_scheduler = lr_scheduler
 
         self.batch_processor = batch_processor
+        self.train_device = (
+            train_device
+            if train_device is not None
+            else (torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu"))
+        )
 
         # Create work_dir
         if torchie.is_str(work_dir):
@@ -351,20 +357,43 @@ def save_checkpoint(
 
     def batch_processor_inline(self, model, data, train_mode, **kwargs):
 
-        if "local_rank" in kwargs:
-            device = torch.device(kwargs["local_rank"])
-        else:
-            device = None
+        device = kwargs.get("train_device", self.train_device)
 
         # data = example_convert_to_torch(data, device=device)
-        example = example_to_device(
-            data, torch.cuda.current_device(), non_blocking=False
-        )
+        example = example_to_device(data, device, non_blocking=False)
 
         self.call_hook("after_data_to_device")
 
         if train_mode:
-            losses = model(example, return_loss=True)
+            teacher_model = kwargs.get("teacher_model")
+            kd_cfg = kwargs.get("kd_cfg")
+            kd_enabled = bool(kd_cfg and kd_cfg.get("enabled", False))
+            teacher_preds_dicts = None
+            teacher_feats = None
+            kd_type = kd_cfg.get("type", "heatmap_mse") if kd_cfg else "heatmap_mse"
+
+            if teacher_model is not None and kd_enabled:
+                with torch.no_grad():
+                    if kd_type == "feature_mse":
+                        teacher_out = teacher_model(
+                            example,
+                            return_loss=True,
+                            return_preds=True,
+                            return_feats=True,
+                        )
+                        teacher_feats = teacher_out["feats"]
+                    else:
+                        teacher_preds_dicts = teacher_model(
+                            example, return_loss=True, return_preds=True
+                        )
+
+            losses = model(
+                example,
+                return_loss=True,
+                teacher_preds_dicts=teacher_preds_dicts,
+                teacher_feats=teacher_feats,
+                kd_cfg=kd_cfg,
+            )
             self.call_hook("after_forward")
             loss, log_vars = parse_second_losses(losses)
             del losses
@@ -480,9 +509,15 @@ def val(self, data_loader, **kwargs):
 
     def resume(self, checkpoint, resume_optimizer=True, map_location="default"):
         if map_location == "default":
-            checkpoint = self.load_checkpoint(
-                checkpoint , map_location='cuda:{}'.format(torch.cuda.current_device()) # TODO: FIX THIS!!
-            )
+            if torch.cuda.is_available():
+                loc = "cuda:{}".format(torch.cuda.current_device())
+            else:
+                loc = (
+                    self.train_device
+                    if isinstance(self.train_device, str)
+                    else str(self.train_device)
+                )
+            checkpoint = self.load_checkpoint(checkpoint, map_location=loc)
         else:
             checkpoint = self.load_checkpoint(checkpoint, map_location=map_location)
 
diff --git a/tools/nms_better.py b/tools/nms_better.py
index bc2b3bc7..2191b9b2 100644
--- a/tools/nms_better.py
+++ b/tools/nms_better.py
@@ -23,7 +23,7 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Ensemble Models")
     parser.add_argument("ensemble_dir", help="path to a dir that contains all prediction file")
     parser.add_argument("--output_path", help="the path to save ensemble output")    
-    parser.add_argument("--data_root", type=str, default="data/nuScenes/v1.0-trainval") 
+    parser.add_argument("--data_root", type=str, default=os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes") + "/v1.0-trainval") 
     
     args = parser.parse_args()
 
diff --git a/tools/nusc_tracking/pub_test.py b/tools/nusc_tracking/pub_test.py
index 4234ae66..57d903d6 100644
--- a/tools/nusc_tracking/pub_test.py
+++ b/tools/nusc_tracking/pub_test.py
@@ -26,7 +26,7 @@ def parse_args():
         "--checkpoint", help="the dir to checkpoint which the model read from"
     )
     parser.add_argument("--hungarian", action='store_true')
-    parser.add_argument("--root", type=str, default="data/nuScenes")
+    parser.add_argument("--root", type=str, default=os.environ.get("NUSCENES_DATA_ROOT", "data/nuScenes"))
     parser.add_argument("--version", type=str, default='v1.0-trainval')
     parser.add_argument("--max_age", type=int, default=3)
 
diff --git a/tools/train.py b/tools/train.py
index a3a4354e..32ca6b05 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -71,6 +71,9 @@ def main():
 
     cfg = Config.fromfile(args.config)
 
+    if not hasattr(cfg, "gpus") or cfg.gpus is None:
+        cfg.gpus = args.gpus
+
     # update configs according to CLI args
     if args.work_dir is not None:
         cfg.work_dir = args.work_dir
@@ -83,8 +86,11 @@ def main():
 
     if distributed:
         if args.launcher == "pytorch":
-            torch.cuda.set_device(args.local_rank)
-            torch.distributed.init_process_group(backend="nccl", init_method="env://")
+            if torch.cuda.is_available():
+                torch.cuda.set_device(args.local_rank)
+                torch.distributed.init_process_group(backend="nccl", init_method="env://")
+            else:
+                torch.distributed.init_process_group(backend="gloo", init_method="env://")
             cfg.local_rank = args.local_rank
         elif args.launcher == "slurm":
             proc_id = int(os.environ["SLURM_PROCID"])