-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
118 lines (96 loc) · 4.47 KB
/
Makefile
File metadata and controls
118 lines (96 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# ---------------------------------------------------------------------------
# User-tunable knobs (override on the command line or via the environment).
# ---------------------------------------------------------------------------
# Command used to invoke Poetry.
POETRY ?= poetry
# Interpreter launcher used by the Python recipes.
PYTHON ?= $(POETRY) run python
# Value assigned to the PYTHONPATH environment variable in the Python recipes.
PYTHONPATH ?= src
# Options shared by the Black invocations in `format` and `check`.
BLACK_ARGS ?= --target-version py311 --fast
# Package directory; not referenced by the rules visible in this part of the file.
SRC_DIR := src/house_price_class_prediction
DATA_DIR := data
PROCESSED_DIR := $(DATA_DIR)/processed
# Cleaned dataset checked by `validate` and `pipeline`.
CLEANED_DATA ?= $(PROCESSED_DIR)/cleaned_data.csv
# Directory that receives the validation reports written by `validate`.
REPORTS_DIR ?= reports
# Extra argument forwarded to the data-producing stages; empty by default.
OVERWRITE ?=
# Value `pipeline` passes as OVERWRITE to the stages it drives.
PIPELINE_OVERWRITE ?= --overwrite
# UTC timestamp identifying one invocation. It is computed exactly once, with
# a simple (:=) assignment: a recursive `?=` would re-run `date` on every
# expansion (each recipe line and each exported child environment), so a single
# run could observe different IDs. The $(origin) guard keeps the `?=` contract:
# a value supplied by the environment or the command line always wins.
ifeq ($(origin PIPELINE_RUNTIME_ID),undefined)
PIPELINE_RUNTIME_ID := $(shell date -u +%Y%m%dT%H%M%SZ)
endif
# Python sources under src/ and tests/, collected once at parse time.
PY_FILES := $(shell find src tests -type f -name "*.py")
# Make the run ID visible to every recipe's environment (including sub-makes).
export PIPELINE_RUNTIME_ID
.DEFAULT_GOAL := help
# All command-style targets are phony. NOTE(review): check-final-data, coverage
# and test-report have no rule in the portion of the file shown here - confirm
# they are defined elsewhere or drop them.
.PHONY: help setup check-python acquisition cleaning feature_engineering feature_selection eda validate reports evaluate train check-final-data test coverage test-report format check lint isort clean pipeline all
# Print a short description of the available targets and of the OVERWRITE
# knob. This is the default goal. The `pipeline` entry was missing from the
# listing even though the target exists, so it is added here.
help:
	@echo "Available commands:"
	@echo " make setup - Install project dependencies"
	@echo " make check-python - Verify Python version requirements"
	@echo " make acquisition - Run raw data acquisition and cleaning"
	@echo " make cleaning - Run preprocessing to create train/test splits"
	@echo " make feature_engineering - Encode and engineer features"
	@echo " make feature_selection - Run feature selection and write final model files"
	@echo " make eda - Generate exploratory data analysis figures"
	@echo " make validate - Validate cleaned dataset and write reports"
	@echo " make reports - Alias for validate"
	@echo " make evaluate - Evaluate the saved model on the test split"
	@echo " make train - Train models with existing processed data"
	@echo " make test - Run tests"
	@echo " make format - Format source code with Black"
	@echo " make check - Check Black formatting"
	@echo " make lint - Run flake8 on source and tests"
	@echo " make isort - Sort imports with isort"
	@echo " make clean - Remove Python cache files"
	@echo " make pipeline - Run validation and all data/modeling stages in order"
	@echo " make all - Run quality checks, tests, and modeling pipeline"
	@echo " Run make setup once before make all on a fresh environment"
	@echo " Set OVERWRITE=--overwrite to replace generated data outputs"
# Refresh the Poetry lock file, then install the project dependencies.
# Both steps go through $(POETRY) so that overriding POETRY (for example to a
# pinned binary) affects the lock step as well as the install step.
setup:
	$(POETRY) lock && $(POETRY) install
# Run test_environment.py with the project interpreter; the help text
# describes this as the Python version requirement check.
check-python:
	$(PYTHON) test_environment.py
# Run the data_acquisition module. Unlike the other stages, the current
# directory is prepended to PYTHONPATH for this one.
# NOTE(review): `pipeline` runs this stage when CLEANED_DATA is missing, so it
# is presumably what produces that file - confirm against the module.
acquisition:
	PYTHONPATH=.:$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.data.data_acquisition
# Run the preprocessing module, forwarding $(OVERWRITE) as its only argument
# (empty unless the caller sets it).
cleaning:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.features.preprocessing \
		$(OVERWRITE)
# Run the build_features module, forwarding $(OVERWRITE) as its only argument
# (empty unless the caller sets it).
feature_engineering:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.features.build_features \
		$(OVERWRITE)
# Run the feature_selection module, forwarding $(OVERWRITE) as its only
# argument (empty unless the caller sets it).
feature_selection:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.features.feature_selection \
		$(OVERWRITE)
# Run the visualization module; the help text describes its output as
# exploratory data analysis figures.
eda:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.visualization.visualize
# Validate the cleaned dataset and write a JSON and a text report under
# $(REPORTS_DIR). The recipe stops early with a hint when the dataset is
# missing.
#
# All paths are quoted: an unquoted `test -f $(CLEANED_DATA)` is TRUE when the
# variable expands to nothing (a lone `-f` operand is a non-empty string) and
# breaks on paths containing spaces.
validate:
	@test -f "$(CLEANED_DATA)" || { \
		echo "Missing $(CLEANED_DATA). Run make acquisition first or set CLEANED_DATA to an existing cleaned dataset."; \
		exit 1; \
	}
	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.validation.validate_data \
		--input "$(CLEANED_DATA)" \
		--out-json "$(REPORTS_DIR)/cleaned_data_validation_report.json" \
		--out-txt "$(REPORTS_DIR)/cleaned_data_validation_report.txt"
# Alias: `make reports` has no recipe of its own and simply triggers `validate`.
reports: validate
# Run the models.predict module; the help text describes this as evaluating
# the saved model on the test split.
evaluate:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.models.predict
# Run the models.train module; the help text describes this as training on
# the existing processed data.
train:
	PYTHONPATH=$(PYTHONPATH) \
		$(PYTHON) -m house_price_class_prediction.models.train
# Run the pytest suite under tests/ in verbose mode. PYTHONPATH is not set
# by this recipe, unlike the stage targets.
test:
	$(PYTHON) -m pytest -v tests
# Reformat each file in $(PY_FILES) with Black, one invocation per file.
#
# A shell `for` loop returns the status of its LAST command only, so a Black
# failure on an earlier file used to be silently discarded. Failures are now
# remembered in `status`; every file is still processed, and the recipe exits
# non-zero if any invocation failed.
format:
	@status=0; \
	for file in $(PY_FILES); do \
		$(PYTHON) -m black $(BLACK_ARGS) "$$file" || status=1; \
	done; \
	exit $$status
# Verify Black formatting for each file in $(PY_FILES) without modifying it,
# one invocation per file.
#
# A shell `for` loop returns the status of its LAST command only, so the check
# used to pass whenever the final file happened to be clean. Failures are now
# accumulated in `status`: every file is still reported, and the recipe exits
# non-zero if any file fails the check (which also makes `all` stop).
check:
	@status=0; \
	for file in $(PY_FILES); do \
		$(PYTHON) -m black --check $(BLACK_ARGS) "$$file" || status=1; \
	done; \
	exit $$status
# Run flake8 over the src and tests trees.
lint:
	$(PYTHON) -m flake8 src tests
# Run isort over the src and tests trees (this rewrites import order in place
# unless isort is configured otherwise).
isort:
	$(PYTHON) -m isort src tests
# Remove Python bytecode artefacts below the current directory:
#   1. every __pycache__ directory (pruned so find does not descend into a
#      directory it is about to delete);
#   2. any remaining *.pyc / *.pyo files.
clean:
	find . -type d -name '__pycache__' -prune -exec rm -rf {} +
	find . -type f -name '*.py[co]' -delete
# Drive the workflow in order, one sub-make per stage:
# acquisition (only when the cleaned dataset is absent), validate, cleaning,
# feature_engineering, feature_selection, eda, train, evaluate.
# Every sub-make receives the current PIPELINE_RUNTIME_ID explicitly, and the
# data-producing stages receive $(PIPELINE_OVERWRITE) as their OVERWRITE value.
pipeline:
	@test -f "$(CLEANED_DATA)" || { \
		echo "Missing $(CLEANED_DATA). Running acquisition first."; \
		$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" acquisition; \
	}
	$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" validate
	$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" OVERWRITE="$(PIPELINE_OVERWRITE)" cleaning
	$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" OVERWRITE="$(PIPELINE_OVERWRITE)" feature_engineering
	$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" OVERWRITE="$(PIPELINE_OVERWRITE)" feature_selection
	$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" eda
	$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" train
	$(MAKE) PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" evaluate
# Full gate: environment check, formatting check, lint, tests, then the
# pipeline. The target has no recipe; it only lists prerequisites.
all: \
    check-python \
    check \
    lint \
    test \
    pipeline