datascience/Makefile at main · Mo-Khater/datascience · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# ---------------------------------------------------------------------------
# Settings. Anything assigned with ?= can be overridden from the environment
# or the command line, e.g. `make train REPORTS_DIR=out/reports`.
# ---------------------------------------------------------------------------

# Poetry executable, and the interpreter invocation every recipe goes through.
POETRY ?= poetry
PYTHON ?= $(POETRY) run python
# Import root passed to the module-running targets as PYTHONPATH=...
# Because this uses ?=, a PYTHONPATH already present in the environment wins.
PYTHONPATH ?= src
# Flags shared by `make format` and `make check`.
BLACK_ARGS ?= --target-version py311 --fast
# NOTE(review): SRC_DIR is not referenced by any rule in this file.
SRC_DIR := src/house_price_class_prediction
DATA_DIR := data
PROCESSED_DIR := $(DATA_DIR)/processed
# Dataset that `validate` reads; `pipeline` runs `acquisition` when it is absent.
CLEANED_DATA ?= $(PROCESSED_DIR)/cleaned_data.csv
# Directory that receives the validation reports.
REPORTS_DIR ?= reports
# Extra argument appended to cleaning / feature_engineering / feature_selection.
OVERWRITE ?=
# Value `pipeline` passes as OVERWRITE to those three stages.
PIPELINE_OVERWRITE ?= --overwrite
# Timestamp identifying one run, exported to every recipe's environment.
# It is computed exactly once: a plain `?= $(shell ...)` would be recursively
# expanded and re-run `date` on every reference, so the ID could change in the
# middle of a run. The origin test keeps the "only if not already set"
# behaviour of ?= while allowing a simply-expanded (:=) assignment.
ifeq ($(origin PIPELINE_RUNTIME_ID),undefined)
PIPELINE_RUNTIME_ID := $(shell date -u +%Y%m%dT%H%M%SZ)
endif
# Python sources handed to Black by `format` and `check`.
PY_FILES := $(shell find src tests -type f -name "*.py")
export PIPELINE_RUNTIME_ID

# Running plain `make` prints the command overview.
.DEFAULT_GOAL := help

# All names below are commands rather than files to build.
# NOTE(review): check-final-data, coverage and test-report have no rule in
# this file - confirm whether they are defined elsewhere or are stale entries.
.PHONY: help setup check-python \
        acquisition cleaning feature_engineering feature_selection eda \
        validate reports evaluate train check-final-data \
        test coverage test-report \
        format check lint isort clean \
        pipeline all

# Print a one-line description of each user-facing target.
# Keep this list in sync with the rules below when adding or removing targets.
help:
	@echo "Available commands:"
	@echo "  make setup         - Install project dependencies"
	@echo "  make check-python  - Verify Python version requirements"
	@echo "  make acquisition   - Run raw data acquisition and cleaning"
	@echo "  make cleaning      - Run preprocessing to create train/test splits"
	@echo "  make feature_engineering - Encode and engineer features"
	@echo "  make feature_selection   - Run feature selection and write final model files"
	@echo "  make eda           - Generate exploratory data analysis figures"
	@echo "  make validate      - Validate cleaned dataset and write reports"
	@echo "  make reports       - Alias for validate"
	@echo "  make evaluate      - Evaluate the saved model on the test split"
	@echo "  make train         - Train models with existing processed data"
	@echo "  make test          - Run tests"
	@echo "  make format        - Format source code with Black"
	@echo "  make check         - Check Black formatting"
	@echo "  make lint          - Run flake8 on source and tests"
	@echo "  make isort         - Sort imports with isort"
	@echo "  make clean         - Remove Python cache files"
	@echo "  make pipeline      - Run validation, preprocessing, feature, EDA, training, and evaluation stages"
	@echo "  make all           - Run quality checks, tests, and modeling pipeline"
	@echo "  Run make setup once before make all on a fresh environment"
	@echo "  Set OVERWRITE=--overwrite to replace generated data outputs"

# Refresh the Poetry lock file, then install the project dependencies.
# Both steps go through $(POETRY) so that overriding POETRY (for example with
# an absolute path) applies to the lock step as well as the install step.
setup:
	$(POETRY) lock && $(POETRY) install

# Run test_environment.py with the project interpreter; per the help text this
# verifies the Python version requirements.
check-python:
	$(PYTHON) test_environment.py

# Run the data_acquisition module. For this stage only, the current directory
# is prepended to the import path (".:$(PYTHONPATH)").
acquisition:
	PYTHONPATH=.:$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.data.data_acquisition

# Run the preprocessing module; $(OVERWRITE) is forwarded as an extra argument
# (empty by default, "--overwrite" when invoked from `pipeline`).
cleaning:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.features.preprocessing \
		$(OVERWRITE)

# Run the build_features module; $(OVERWRITE) is forwarded as an extra argument.
feature_engineering:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.features.build_features \
		$(OVERWRITE)

# Run the feature_selection module; $(OVERWRITE) is forwarded as an extra argument.
feature_selection:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.features.feature_selection \
		$(OVERWRITE)

# Run the visualize module (exploratory data analysis figures, per the help text).
eda:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.visualization.visualize

# Validate the cleaned dataset and write a JSON and a text report into
# REPORTS_DIR. Stops early with a hint when CLEANED_DATA does not exist.
# All paths are quoted (as `pipeline` already does for CLEANED_DATA) so that
# overridden locations containing spaces reach the shell as single arguments.
validate:
	@test -f "$(CLEANED_DATA)" || { \
		echo "Missing $(CLEANED_DATA). Run make acquisition first or set CLEANED_DATA to an existing cleaned dataset."; \
		exit 1; \
	}
	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.validation.validate_data \
		--input "$(CLEANED_DATA)" \
		--out-json "$(REPORTS_DIR)/cleaned_data_validation_report.json" \
		--out-txt "$(REPORTS_DIR)/cleaned_data_validation_report.txt"

# Alias: `make reports` runs exactly the validation above.
reports: validate

# Run the predict module (evaluates the saved model on the test split, per the
# help text).
evaluate:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.models.predict

# Run the train module against the already-processed data.
train:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.models.train

# Run the pytest suite in tests/ with verbose output.
# NOTE(review): unlike the module-running targets, PYTHONPATH is not set here -
# presumably the package is importable via the Poetry environment; confirm.
test:
	$(PYTHON) -m pytest tests -v

# Format every *.py file found under src/ and tests/ with Black.
# Black is invoked once with the whole file list: a shell `for` loop returns
# only the status of its last iteration, so a failure on any earlier file
# would be silently ignored and the target would still succeed.
format:
	$(PYTHON) -m black $(BLACK_ARGS) $(PY_FILES)

# Verify Black formatting of every *.py file found under src/ and tests/
# without modifying anything.
# Black is invoked once with the whole file list so its non-zero exit status
# fails the target whenever any file needs reformatting; a shell `for` loop
# would report only the status of the last file checked.
check:
	$(PYTHON) -m black --check $(BLACK_ARGS) $(PY_FILES)

# Run flake8 over the src and tests directories.
lint:
	$(PYTHON) -m flake8 src tests

# Sort imports in place under src and tests with isort.
isort:
	$(PYTHON) -m isort src tests

# Remove Python bytecode caches below the current directory.
clean:
	# Delete each __pycache__ directory; -prune stops find from descending into
	# a directory that is about to be removed.
	find . -type d -name "__pycache__" -prune -exec rm -rf {} +
	# Delete any remaining stray .pyc / .pyo files.
	find . -type f -name "*.py[co]" -delete

# Run the whole workflow as a fixed sequence of sub-makes:
#   acquisition (only when CLEANED_DATA is missing) -> validate -> cleaning ->
#   feature_engineering -> feature_selection -> eda -> train -> evaluate.
# Each sub-make receives this run's PIPELINE_RUNTIME_ID explicitly, and the
# three data-writing stages receive PIPELINE_OVERWRITE as their OVERWRITE value.
pipeline:
	@test -f "$(CLEANED_DATA)" || { \
		echo "Missing $(CLEANED_DATA). Running acquisition first."; \
		$(MAKE) acquisition PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"; \
	}
	$(MAKE) validate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
	$(MAKE) cleaning \
		OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
	$(MAKE) feature_engineering \
		OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
	$(MAKE) feature_selection \
		OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
	$(MAKE) eda PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
	$(MAKE) train PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
	$(MAKE) evaluate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"

# Umbrella target: environment check, formatting check, lint, tests, then the
# modeling pipeline. Run `make setup` once beforehand on a fresh environment.
# NOTE(review): no ordering is declared between these prerequisites, so under
# `make -j` they may run concurrently - confirm this target is only used serially.
all: check-python check lint test pipeline