Skip to content

Commit 4f11d26

Browse files
zakiscoding and claude committed
fix: bypass raw preprocessing in API for pre-engineered data
The /predict endpoint was passing already feature-engineered data through the raw-data preprocessing pipeline, which caused: (1) drop_duplicates removing valid rows, (2) TRAIN_FEATURE_COLUMNS always being None due to import-before-download ordering, and (3) lat/lng missing from schema alignment. Now the endpoint loads the model once at startup, derives the expected feature names directly from the booster, and uses reindex to align the input columns without any preprocessing.

Separately, regenerated feature_engineered_holdout.csv with lat/lng (previously dropped) and city_encoded naming matching the trained model, and uploaded it to S3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 3ced33d commit 4f11d26

1 file changed

Lines changed: 13 additions & 19 deletions

File tree

src/api/main.py

Lines changed: 13 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,7 @@
88
from typing import List, Dict, Any
99
import pandas as pd
1010
import boto3, os
11-
12-
from src.inference_pipeline.inference import predict
11+
from joblib import load
1312

1413
S3_BUCKET = os.getenv("S3_BUCKET", "model-regression-data")
1514
REGION = os.getenv("AWS_REGION", "us-east-2")
@@ -27,13 +26,10 @@ def load_from_s3(key, local_path):
2726

2827

2928
MODEL_PATH = Path(load_from_s3("models/xgb_best_model.pkl", "models/xgb_best_model.pkl"))
30-
TRAIN_FE_PATH = Path(load_from_s3("processed/feature_engineered_train.csv", "data/processed/feature_engineered_train.csv"))
3129

32-
if TRAIN_FE_PATH.exists():
33-
_train_cols = pd.read_csv(TRAIN_FE_PATH, nrows=1)
34-
TRAIN_FEATURE_COLUMNS = [c for c in _train_cols.columns if c != "price"]
35-
else:
36-
TRAIN_FEATURE_COLUMNS = None
30+
# Load model once at startup and derive expected features from booster
31+
_model = load(MODEL_PATH)
32+
FEATURE_NAMES = _model.get_booster().feature_names
3733

3834
app = FastAPI(title="Housing Regression API")
3935

@@ -45,32 +41,30 @@ def root():
4541

4642
@app.get("/health")
4743
def health():
48-
status: Dict[str, Any] = {"model_path": str(MODEL_PATH)}
44+
status: Dict[str, Any] = {"model_path": str(MODEL_PATH), "status": "healthy"}
4945
if not MODEL_PATH.exists():
5046
status["status"] = "unhealthy"
5147
status["error"] = "Model not found"
5248
else:
53-
status["status"] = "healthy"
54-
if TRAIN_FEATURE_COLUMNS:
55-
status["n_features_expected"] = len(TRAIN_FEATURE_COLUMNS)
49+
status["n_features_expected"] = len(FEATURE_NAMES) if FEATURE_NAMES else 0
5650
return status
5751

5852

5953
@app.post("/predict")
6054
def predict_batch(data: List[dict]):
61-
if not MODEL_PATH.exists():
62-
return {"error": f"Model not found at {str(MODEL_PATH)}"}
63-
6455
df = pd.DataFrame(data)
6556
if df.empty:
6657
return {"error": "No data provided"}
6758

68-
preds_df = predict(df, model_path=MODEL_PATH)
59+
y_true = df.pop("price").tolist() if "price" in df.columns else None
6960

70-
resp = {"predictions": preds_df["predicted_price"].astype(float).tolist()}
71-
if "actual_price" in preds_df.columns:
72-
resp["actuals"] = preds_df["actual_price"].astype(float).tolist()
61+
# Align to exact features the model was trained on
62+
df = df.reindex(columns=FEATURE_NAMES, fill_value=0)
7363

64+
preds = _model.predict(df).tolist()
65+
resp = {"predictions": preds}
66+
if y_true is not None:
67+
resp["actuals"] = y_true
7468
return resp
7569

7670

0 commit comments

Comments
 (0)