-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
363 lines (299 loc) · 14.6 KB
/
Copy pathMakefile
File metadata and controls
363 lines (299 loc) · 14.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# Makefile for Flashchat — Pure C/Metal MoE inference engine
#
# Targets:
# make — build main benchmark and inference binaries
# make run — single expert forward pass
# make verify — verify Metal vs CPU reference
# make bench — benchmark single expert (10 iterations)
# make moe — full MoE forward pass (K experts, single layer)
# make moebench — benchmark MoE (10 iterations)
# make full — full model MoE forward pass (K=4)
# make fullbench — benchmark full model forward (3 iterations)
# make chat — build interactive chat TUI
# make api-smoke — run HTTP API smoke test
# make cli-smoke — run Flashchat CLI smoke test
# make manage-smoke — run model management integration test (Python core via launcher)
# make tool-template-smoke — run native tool template render/parser smoke test
# make quant-helper-smoke — run native checkpoint quantization helper tests
# make tokenizer-export-smoke — run tokenizer export helper tests
# make native-qwen-compile-smoke — run native Qwen BF16 compiler smoke test
# make mtp-config-smoke — run MTP config/profile precedence smoke test
# make py-tests — run modelmgr unit tests
# make registry / registry-check — regenerate/verify assets/model_configs.json
# make test — run all functional smoke tests
# make help — list available targets
# make clean — remove build artifacts and archive repo-local debug contents
# make archive-debug — archive repo-local debug contents under debug/.archived
# make clean-venv — remove Python setup virtual environment
# make distclean — remove build artifacts, repo-local debug, and setup venv
#
# Note: Metal shaders are compiled from source at runtime via
# MTLDevice newLibraryWithSource:, so no offline metal compiler needed.
# Run recipes and $(shell ...) probes with bash.
SHELL := /bin/bash

# Directory that holds the engine sources and the binaries built from them.
BUILD_DIR := metal_infer

# Replace make's built-in default compiler with clang; a CC supplied by the
# environment or on the command line is left alone.
ifeq (default,$(origin CC))
CC := clang
endif

# Optimization profile; must be one of the names listed in VALID_OPTS.
OPT ?= aggressive

# Frameworks linked into the engine binaries.
FRAMEWORKS := -framework Metal -framework Foundation -framework Accelerate

# Compiler and linker flags shared by every optimization profile.
BASE_CFLAGS := -Wall -Wextra -fobjc-arc -DACCELERATE_NEW_LAPACK
BASE_LDFLAGS := -lpthread -lcompression -ldl
# Optimization profiles accepted for OPT.
VALID_OPTS = aggressive conservative debug

# OPT must be exactly one word and that word must be a known profile.
# The profile names are used as the filter patterns (not OPT), so a value such
# as OPT=% cannot match every profile, and multi-word values are rejected up
# front; either would otherwise pass validation yet match none of the
# per-profile conditionals, silently producing a build with no OPT flags.
ifneq ($(words $(OPT)),1)
$(error Unknown OPT='$(OPT)'. Use one of: $(VALID_OPTS))
endif
ifeq ($(filter $(VALID_OPTS),$(OPT)),)
$(error Unknown OPT='$(OPT)'. Use one of: $(VALID_OPTS))
endif
# $(call cc-option,FLAG)
# Expands to FLAG when $(CC) accepts it, otherwise to the empty string.
# A trivial program is piped to the compiler as Objective-C and compiled (-c)
# with $(BASE_CFLAGS) plus FLAG into a mktemp scratch file; FLAG is printed
# only when that compile succeeds. The scratch file is removed afterwards and
# the result is stripped of surrounding whitespace.
cc-option = $(strip $(shell tmp=$$(mktemp /tmp/flashchat-cc-option.XXXXXX); \
printf 'int main(void){return 0;}\n' | $(CC) -x objective-c $(BASE_CFLAGS) $(1) -c -o "$$tmp" - >/dev/null 2>&1 && printf '%s' '$(1)'; \
rm -f "$$tmp"))
# $(call link-option,FLAG)
# Same probe as cc-option, but performs a full compile-and-link using
# $(BASE_CFLAGS) $(BASE_LDFLAGS) plus FLAG. The mktemp file is deleted before
# the link so only its reserved name is used as the output path, and the
# output is deleted again once the probe finishes.
link-option = $(strip $(shell tmp=$$(mktemp /tmp/flashchat-link-option.XXXXXX); rm -f "$$tmp"; \
printf 'int main(void){return 0;}\n' | $(CC) -x objective-c $(BASE_CFLAGS) $(BASE_LDFLAGS) $(1) -o "$$tmp" - >/dev/null 2>&1 && printf '%s' '$(1)'; \
rm -f "$$tmp"))
# Lower-cased word that follows "apple " in the sysctl CPU brand string
# (empty when sysctl fails or the brand string has a different shape).
# Left recursive so sysctl only runs if the fallback below is needed.
APPLE_CPU_NAME = $(shell sysctl -n machdep.cpu.brand_string 2>/dev/null | tr '[:upper:]' '[:lower:]' | sed -n 's/^apple \([a-z0-9]*\).*/\1/p')

# -mcpu=apple-<name> when a chip name was detected, otherwise empty.
DETECTED_MCPU_FLAG = $(if $(APPLE_CPU_NAME),-mcpu=apple-$(APPLE_CPU_NAME))

# CPU tuning flag, resolved once at parse time: prefer -mcpu=native and fall
# back to the detected chip flag. $(or ...) stops at the first non-empty
# result, so the fallback probe runs only when -mcpu=native is rejected.
CPU_CFLAGS := $(or $(call cc-option,-mcpu=native),$(call cc-option,$(DETECTED_MCPU_FLAG)))
# Flags used only by the aggressive profile. Each one is probed against $(CC)
# and dropped when the compiler/linker rejects it.
#
# Both variables are memoized: they start out recursive so no probe runs
# unless the value is actually needed, and on first expansion the $(eval)
# rebinds the variable to a simple (already expanded) value. Without this,
# every expansion of CFLAGS or LDFLAGS would re-run all of the compiler probes.

# -flto when the toolchain can link with it, otherwise empty.
AGGRESSIVE_LTO = $(eval AGGRESSIVE_LTO := $$(call link-option,-flto))$(AGGRESSIVE_LTO)

# Extra speed-oriented compile flags accepted by $(CC).
AGGRESSIVE_EXTRA_CFLAGS = $(eval AGGRESSIVE_EXTRA_CFLAGS := \
    $$(call cc-option,-ffast-math) \
    $$(call cc-option,-funroll-loops) \
    $$(call cc-option,-fvectorize) \
    $$(call cc-option,-fslp-vectorize) \
    $$(call cc-option,-ftree-vectorize) \
    $$(call cc-option,-falign-functions=16))$(AGGRESSIVE_EXTRA_CFLAGS)
# Per-profile optimization flags. The assignments stay recursive so the
# AGGRESSIVE_* values are only expanded when the flags are actually used.
ifeq ($(OPT),aggressive)
OPT_CFLAGS = -O3 $(CPU_CFLAGS) $(AGGRESSIVE_LTO) $(AGGRESSIVE_EXTRA_CFLAGS)
OPT_LDFLAGS = $(AGGRESSIVE_LTO)
else ifeq ($(OPT),conservative)
OPT_CFLAGS = -O2 $(CPU_CFLAGS)
OPT_LDFLAGS =
else ifeq ($(OPT),debug)
OPT_CFLAGS = -O0 -g $(CPU_CFLAGS)
OPT_LDFLAGS =
endif

# Final flag sets for the engine binaries: shared base flags + profile flags.
CFLAGS = $(BASE_CFLAGS) $(OPT_CFLAGS)
LDFLAGS = $(BASE_LDFLAGS) $(OPT_LDFLAGS)
# Chat client flags: -O0 -g under OPT=debug, otherwise -O2 plus the CPU flag.
CHAT_CFLAGS := $(if $(filter debug,$(OPT)),-O0 -g,-O2 $(CPU_CFLAGS)) -Wall -fobjc-arc

# Main benchmark binary
TARGET := $(BUILD_DIR)/metal_infer
MAIN_SRC := $(BUILD_DIR)/main.m

# Optional: offline shader compilation (faster startup, but not required)
METALC := xcrun -sdk macosx metal
METALLIB_TOOL := xcrun -sdk macosx metallib
SHADER_SRC := $(BUILD_DIR)/shaders.metal
SHADER_AIR := $(BUILD_DIR)/shaders.air
SHADER_LIB := $(BUILD_DIR)/shaders.metallib

# Inference engine (complete forward pass)
INFER_TARGET := $(BUILD_DIR)/infer
INFER_SRC := $(BUILD_DIR)/infer.m

# Chat TUI (interactive multi-turn)
CHAT_TARGET := $(BUILD_DIR)/chat
CHAT_SRC := $(BUILD_DIR)/chat.m
LINENOISE_SRC := $(BUILD_DIR)/linenoise.c
LINENOISE_HDR := $(BUILD_DIR)/linenoise.h

# RAM pressure utility
RAM_PRESSURE_TARGET := tools/ram_pressure
RAM_PRESSURE_SRC := tools/ram_pressure.c
# Targets that are commands rather than files, grouped by purpose.
.PHONY: \
    all metal_infer infer chat ram-pressure build-infer build-chat metallib \
    help print-build-config \
    clean archive-debug clean-venv distclean \
    run verify fast bench moe moebench full fullbench \
    infer-run chat-run \
    api-smoke cli-smoke manage-smoke chat-render-smoke tool-template-smoke \
    cache-roundtrip-smoke quant-helper-smoke tokenizer-export-smoke \
    native-qwen-compile-smoke mtp-config-smoke py-tests test \
    bench-api bench-report registry registry-check
# $(call RUN_ENGINE_BENCH,ARGS)
# Recipe template that runs ./metal_infer from $(BUILD_DIR) against the
# configured Flashchat model, appending ARGS to the command line.
#
# The embedded bash script:
#   1. sources ./lib/config.sh, calls flashchat_load_config, and exports
#      FLASHCHAT_MODEL, FLASHCHAT_MODEL_PATH and FLASHCHAT_MODEL_CONFIG from
#      flashchat_get;
#   2. fails with a setup hint when the model path is empty, still contains
#      the "<snapshot>" placeholder, or is not a directory;
#   3. resolves the quantization width with flashchat_model_quant_bits
#      (falling back to 4 when it prints nothing) and requires
#      <model path>/flashchat/q<bits>/packed_experts/layer_00.bin to exist;
#   4. changes into $(BUILD_DIR) and runs metal_infer with --model-id,
#      --model and ARGS.
# "set -eo pipefail" makes any failing step abort the recipe.
# ARGS is pasted inside the single-quoted script, so it must not contain
# single quotes.
define RUN_ENGINE_BENCH
@bash -c 'set -eo pipefail; \
source ./lib/config.sh; \
flashchat_load_config; \
export FLASHCHAT_MODEL="$$(flashchat_get MODEL)"; \
export FLASHCHAT_MODEL_PATH="$$(flashchat_get MODEL_PATH)"; \
export FLASHCHAT_MODEL_CONFIG="$$(flashchat_get MODEL_CONFIG)"; \
if [[ -z "$$FLASHCHAT_MODEL_PATH" || "$$FLASHCHAT_MODEL_PATH" == *"<snapshot>"* || ! -d "$$FLASHCHAT_MODEL_PATH" ]]; then \
echo "ERROR: Model is not downloaded for $$FLASHCHAT_MODEL."; \
echo "Expected model snapshot: $$FLASHCHAT_MODEL_PATH"; \
echo "Run ./flashchat setup first, or select a configured model with downloaded weights."; \
exit 1; \
fi; \
bits="$$(flashchat_model_quant_bits "$$FLASHCHAT_MODEL")"; \
packed_dir="$$FLASHCHAT_MODEL_PATH/flashchat/q$${bits:-4}/packed_experts"; \
if [[ ! -f "$$packed_dir/layer_00.bin" ]]; then \
echo "ERROR: Engine benchmark artifacts are not available for $$FLASHCHAT_MODEL."; \
echo "Expected: $$packed_dir/layer_00.bin"; \
echo "Run ./flashchat setup first, or select a configured model with packed experts."; \
exit 1; \
fi; \
echo "Using Flashchat model $$FLASHCHAT_MODEL"; \
cd $(BUILD_DIR); \
./metal_infer --model-id "$$FLASHCHAT_MODEL" --model "$$FLASHCHAT_MODEL_PATH" $(1)'
endef
# Default goal: build the benchmark binary and the inference engine.
all: $(TARGET) $(INFER_TARGET)

# Print the list of user-facing targets and build options.
# One printf call emits every line; each argument is one output line and an
# empty argument produces a blank separator line.
help:
	@printf '%s\n' \
		'Flashchat make targets' \
		'' \
		'Build:' \
		' make Build main benchmark and inference binaries' \
		' make all Build main benchmark and inference binaries' \
		' make metal_infer Build main benchmark binary' \
		' make infer Build inference server/engine' \
		' make build-infer Alias for infer' \
		' make chat Build interactive chat client' \
		' make build-chat Alias for chat' \
		' make ram-pressure Build RAM pressure utility for memory-constrained testing' \
		' make metallib Precompile Metal shaders' \
		' make print-build-config Show compiler and optimization settings' \
		'' \
		'Build options:' \
		' OPT=aggressive Fastest probed local build (default)' \
		' OPT=conservative Native CPU, fewer risky optimization flags' \
		' OPT=debug Debug symbols, no speed-oriented flags' \
		' CC=clang Override compiler command' \
		'' \
		'Run:' \
		' make infer-run Run a short inference prompt' \
		' make chat-run Launch the chat client' \
		'' \
		'Benchmarks:' \
		' make run Single expert forward pass' \
		' make verify Metal vs CPU reference verification' \
		' make fast Fast path verification' \
		' make bench Single expert benchmark' \
		' make moe MoE forward pass' \
		' make moebench MoE benchmark' \
		' make full Full model forward pass' \
		' make fullbench Full model benchmark' \
		'' \
		'Tests:' \
		' make cli-smoke Run Flashchat CLI smoke test' \
		' make manage-smoke Run model management integration test' \
		' make chat-render-smoke Run chat TUI render smoke test' \
		' make tool-template-smoke Run native tool template render/parser smoke test' \
		' make cache-roundtrip-smoke Run disk-cache save/load roundtrip self-test' \
		' make quant-helper-smoke Run native checkpoint quantization helper tests' \
		' make tokenizer-export-smoke Run tokenizer export helper tests' \
		' make native-qwen-compile-smoke Run native Qwen BF16 compiler smoke test' \
		' make mtp-config-smoke Run MTP config/profile precedence smoke test' \
		' make py-tests Run modelmgr unit tests' \
		' make registry-check Verify assets/model_configs.json matches the manifests' \
		' make api-smoke Run HTTP API smoke test' \
		' make test Run all functional smoke tests' \
		' make bench-api Run API performance regression benchmark (per registry model)' \
		' make bench-report Compare latest benchmark vs prior commits, flag regressions' \
		'' \
		'Maintenance:' \
		' make clean Remove build artifacts and archive repo-local ./debug contents' \
		' make archive-debug Archive repo-local ./debug contents under debug/.archived' \
		' make clean-venv Remove Python setup virtual environment' \
		' make distclean Remove build artifacts, repo-local ./debug, and setup venv'
# Report the compiler, the optimization profile, the detected CPU and the
# resolved flag sets.
print-build-config:
	@printf "Compiler command: %s\n" "$(CC)"
	@printf "Compiler path: "; \
	command -v $(firstword $(CC)) 2>/dev/null || printf "%s\n" "$(firstword $(CC))"
	@$(CC) --version | head -n 1
	@printf "Optimization profile: %s\n" "$(OPT)"
	@cpu_brand="$$(sysctl -n machdep.cpu.brand_string 2>/dev/null || printf unknown)"; \
	printf "Detected CPU: %s\n" "$$cpu_brand"
	@printf "CPU flags: %s\n" "$(CPU_CFLAGS)"
	@printf "CFLAGS: %s\n" "$(CFLAGS)"
	@printf "LDFLAGS: %s\n" "$(LDFLAGS)"
	@printf "CHAT_CFLAGS: %s\n" "$(CHAT_CFLAGS)"
# Short names for the binaries built below.
metal_infer: $(TARGET)
infer: $(INFER_TARGET)
chat: $(CHAT_TARGET)
ram-pressure: $(RAM_PRESSURE_TARGET)

# Build the benchmark binary (shaders compiled at runtime from source).
# The shader source is a prerequisite so editing it triggers a rebuild, but
# only the first prerequisite ($<, the Objective-C source) is compiled.
$(TARGET): $(MAIN_SRC) $(SHADER_SRC)
	@$(MAKE) --no-print-directory print-build-config
	$(CC) $(CFLAGS) $(FRAMEWORKS) $(LDFLAGS) $< -o $@
# Optional: pre-compile shaders (not required — runtime compilation is the default)
metallib: $(SHADER_LIB)

# Shader source -> AIR intermediate.
$(SHADER_AIR): $(SHADER_SRC)
	$(METALC) -c $< -o $@

# AIR intermediate -> Metal library.
$(SHADER_LIB): $(SHADER_AIR)
	$(METALLIB_TOOL) $< -o $@
# Build the inference engine; the effective build configuration is printed first.
$(INFER_TARGET): $(INFER_SRC)
	@$(MAKE) --no-print-directory print-build-config
	$(CC) $(CFLAGS) $(FRAMEWORKS) $(LDFLAGS) $< -o $@
# Build the chat client (thin HTTP/SSE client + linenoise line editor).
# The linenoise header is tracked as a prerequisite but filtered out of the
# compile line, leaving only the two source files.
$(CHAT_TARGET): $(CHAT_SRC) $(LINENOISE_SRC) $(LINENOISE_HDR)
	@$(MAKE) --no-print-directory print-build-config
	$(CC) $(CHAT_CFLAGS) -framework Foundation $(filter-out %.h,$^) -o $@
# Build the RAM pressure utility with fixed flags (independent of OPT).
$(RAM_PRESSURE_TARGET): $(RAM_PRESSURE_SRC)
	$(CC) -O2 -Wall -Wextra $< -o $@
# Remove every binary and shader artifact this Makefile builds (including the
# RAM pressure utility), after archiving repo-local debug contents.
clean: archive-debug
	rm -f $(TARGET) $(INFER_TARGET) $(CHAT_TARGET) $(RAM_PRESSURE_TARGET) $(SHADER_AIR) $(SHADER_LIB)

# Move everything directly under ./debug (except the .archived directory
# itself) into a new timestamped directory debug/.archived/logs-<date>-<time>.
# Does nothing when ./debug is missing or has nothing to archive.
archive-debug:
	@if [ -d debug ]; then \
		entries="$$(find debug -mindepth 1 -maxdepth 1 ! -name .archived -print)"; \
		if [ -n "$$entries" ]; then \
			dest="debug/.archived/logs-$$(date +%Y%m%d-%H%M%S)"; \
			mkdir -p "$$dest"; \
			find debug -mindepth 1 -maxdepth 1 ! -name .archived -exec mv {} "$$dest"/ \;; \
			echo "Archived debug contents to $$dest"; \
		fi; \
	fi

# Remove the Python setup virtual environment.
clean-venv:
	rm -rf $(BUILD_DIR)/.venv

# Remove build artifacts, the setup venv, and ./debug outright. This does not
# depend on clean, so debug contents are deleted rather than archived first.
distclean: clean-venv
	rm -f $(TARGET) $(INFER_TARGET) $(CHAT_TARGET) $(RAM_PRESSURE_TARGET) $(SHADER_AIR) $(SHADER_LIB)
	rm -rf debug
# Run targets
# Every target below runs the benchmark binary through RUN_ENGINE_BENCH and
# differs only in the arguments it forwards, listed here as target-specific
# values.
run:       engine_bench_args := --layer 0 --expert 0
verify:    engine_bench_args := --layer 0 --expert 0 --verify
fast:      engine_bench_args := --layer 0 --expert 0 --fast --verify
bench:     engine_bench_args := --layer 0 --expert 0 --fast --benchmark
moe:       engine_bench_args := --layer 0 --fast --moe
moebench:  engine_bench_args := --layer 0 --fast --moe --benchmark
full:      engine_bench_args := --fast --full --k 4
fullbench: engine_bench_args := --fast --full --k 4 --benchmark

run verify fast bench moe moebench full fullbench: $(TARGET)
	$(call RUN_ENGINE_BENCH,$(engine_bench_args))
# Inference engine targets
build-infer: $(INFER_TARGET)

# Run a short fixed prompt; the binary is started from its own directory
# ($(<D) is the directory part of the prerequisite, $(<F) its file name).
infer-run: $(INFER_TARGET)
	cd $(<D) && ./$(<F) --prompt "Hello, what is" --tokens 20 --k 4

# Chat TUI targets (use: make chat)
build-chat: $(CHAT_TARGET)

# Launch the chat client from its own directory.
chat-run: $(CHAT_TARGET)
	cd $(<D) && ./$(<F) --k 4
# HTTP API smoke test; requires the inference binary.
api-smoke: $(INFER_TARGET)
	bash tests/test_api_smoke.sh

# Performance regression benchmark — iterates the model registry (installed
# models not opted out via "benchmark": false), runs the uniform spec per model
# in its default config, and appends prefill/decode metrics to
# assets/api_perf_log.tsv. Kept out of `make test` because it starts a real
# server per model and is minutes-long.
bench-api: $(INFER_TARGET)
	bash tests/bench_api.sh

# Compare the latest benchmark rows against prior commits and flag regressions.
bench-report:
	python3 tests/bench_report.py
# assets/model_configs.json is GENERATED from assets/models/*.json (the
# per-model manifests). Edit the manifests, then run `make registry`.
registry:
	python3 -m modelmgr resolve --all -o assets/model_configs.json

# Regenerate the registry into a private scratch directory and compare it,
# as parsed JSON, with the checked-in assets/model_configs.json.
#   - The scratch directory comes from mktemp -d and is removed by the EXIT
#     trap, so concurrent runs cannot collide and nothing is left in /tmp.
#   - sys.exit() is given the message string on mismatch: Python prints it to
#     stderr and exits with status 1, so a failing check explains itself.
registry-check:
	@tmpdir="$$(mktemp -d /tmp/flashchat-registry-check.XXXXXX)" || exit 1; \
	trap 'rm -rf "$$tmpdir"' EXIT; \
	python3 -m modelmgr resolve --all -q -o "$$tmpdir/model_configs.json" && \
	python3 -c "import json,sys; a=json.load(open(sys.argv[1])); b=json.load(open(sys.argv[2])); sys.exit(0 if a==b else 'assets/model_configs.json is out of sync with assets/models/*.json -- run: make registry')" assets/model_configs.json "$$tmpdir/model_configs.json" && \
	echo "registry in sync"
# Run the Python unit tests with FLASHCHAT_CONFIG_DIR pointing at a freshly
# created temporary directory.
py-tests:
	@export FLASHCHAT_CONFIG_DIR="$$(mktemp -d /tmp/flashchat-py-tests.XXXXXX)"; \
	python3 -m unittest discover -s tests/python -t tests/python
# Script-driven smoke tests. Targets that list a binary as a prerequisite
# build it before the test script runs.

# Flashchat CLI
cli-smoke:
	bash tests/test_flashchat_cli.sh

# Model management CLI
manage-smoke:
	bash tests/test_modelmgr_cli.sh

# Chat TUI rendering
chat-render-smoke: $(CHAT_TARGET)
	bash tests/test_chat_tui_render.sh

# Native tool template render/parser
tool-template-smoke: $(INFER_TARGET)
	bash tests/test_tool_template_render.sh

# Disk-cache save/load roundtrip
cache-roundtrip-smoke: $(INFER_TARGET)
	bash tests/test_disk_cache_roundtrip.sh

# Checkpoint quantization helper
quant-helper-smoke:
	python3 tests/test_flashchat_quant.py

# Tokenizer export helper
tokenizer-export-smoke:
	python3 tests/test_tokenizer_export.py

# Native Qwen BF16 compiler
native-qwen-compile-smoke: $(INFER_TARGET)
	bash tests/test_native_qwen_compile.sh

# MTP config/profile precedence
mtp-config-smoke:
	bash tests/test_mtp_config.sh
# Aggregate target: registry check, Python unit tests and every smoke test.
test: \
    registry-check \
    py-tests \
    cli-smoke \
    manage-smoke \
    chat-render-smoke \
    tool-template-smoke \
    cache-roundtrip-smoke \
    quant-helper-smoke \
    tokenizer-export-smoke \
    native-qwen-compile-smoke \
    mtp-config-smoke \
    api-smoke