flashchat/Makefile at main · fiveangle/flashchat · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# Makefile for Flashchat — Pure C/Metal MoE inference engine
#
# Targets:
#   make           — build inference binaries
#   make run       — single expert forward pass
#   make verify    — verify Metal vs CPU reference
#   make bench     — benchmark single expert (10 iterations)
#   make moe       — full MoE forward pass (K experts, single layer)
#   make moebench  — benchmark MoE (10 iterations)
#   make full      — full model MoE forward pass (K=4)
#   make fullbench — benchmark full model forward (3 iterations)
#   make chat      — build interactive chat TUI
#   make api-smoke — run HTTP API smoke test
#   make cli-smoke — run Flashchat CLI smoke test
#   make manage-smoke — run model management integration test (Python core via launcher)
#   make tool-template-smoke — run native tool template render/parser smoke test
#   make quant-helper-smoke — run native checkpoint quantization helper tests
#   make tokenizer-export-smoke — run tokenizer export helper tests
#   make native-qwen-compile-smoke — run native Qwen BF16 compiler smoke test
#   make mtp-config-smoke — run MTP config/profile precedence smoke test
#   make py-tests — run modelmgr unit tests
#   make registry / registry-check — regenerate/verify assets/model_configs.json
#   make test      — run all functional smoke tests
#   make help      — list available targets
#   make clean     — remove build artifacts and archive repo-local debug contents
#   make archive-debug — archive repo-local debug contents under debug/.archived
#   make clean-venv — remove Python setup virtual environment
#   make distclean — remove build artifacts, repo-local debug, and setup venv
#
# Note: Metal shaders are compiled from source at runtime via
# MTLDevice newLibraryWithSource:, so no offline metal compiler needed.

# Run every recipe and every $(shell ...) probe under bash.
SHELL := /bin/bash

# Directory holding the engine sources; the built binaries are written here too.
BUILD_DIR = metal_infer

# Use clang unless the user supplied CC (environment, command line, ...):
# "default" is the origin make reports for its own built-in CC value.
ifeq ($(origin CC),default)
CC = clang
endif
# Optimization profile; must be one of the names listed in VALID_OPTS.
OPT ?= aggressive

# Frameworks passed when building the engine binaries.
FRAMEWORKS = -framework Metal -framework Foundation -framework Accelerate
# Flags common to every optimization profile; the cc-option / link-option
# probes also compile with these.
BASE_CFLAGS = -Wall -Wextra -fobjc-arc -DACCELERATE_NEW_LAPACK
BASE_LDFLAGS = -lpthread -lcompression -ldl

# Accept exactly one optimization profile name.
# The comparison pairs the word count with the filtered value and checks it
# against "1|<OPT verbatim>". That rejects empty, multi-word (e.g.
# "debug aggressive"), whitespace-padded and '%'-pattern values, all of which
# could otherwise pass validation yet match none of the per-profile
# conditionals and silently build without any optimization flags.
VALID_OPTS = aggressive conservative debug
ifneq ($(words $(OPT))|$(filter $(VALID_OPTS),$(OPT)),1|$(OPT))
$(error Unknown OPT='$(OPT)'. Use one of: $(VALID_OPTS))
endif

# cc-option: expands to compiler flag $(1) if $(CC) accepts it, else to nothing.
# The probe compiles a trivial program from stdin as Objective-C with
# $(BASE_CFLAGS) plus the flag, writing the object to a mktemp scratch file
# that is removed afterwards. Compiler output is discarded.
# Usage: $(call cc-option,-ffast-math)
cc-option = $(strip $(shell tmp=$$(mktemp /tmp/flashchat-cc-option.XXXXXX); \
	printf 'int main(void){return 0;}\n' | $(CC) -x objective-c $(BASE_CFLAGS) $(1) -c -o "$$tmp" - >/dev/null 2>&1 && printf '%s' '$(1)'; \
	rm -f "$$tmp"))

# link-option: expands to flag $(1) if a full compile-and-link of a trivial
# program succeeds with $(BASE_CFLAGS) $(BASE_LDFLAGS) plus the flag, else to
# nothing. The mktemp file is deleted up front so only its unique name is
# reused as the output path; whatever the link produces is removed afterwards.
# Usage: $(call link-option,-flto)
link-option = $(strip $(shell tmp=$$(mktemp /tmp/flashchat-link-option.XXXXXX); rm -f "$$tmp"; \
	printf 'int main(void){return 0;}\n' | $(CC) -x objective-c $(BASE_CFLAGS) $(BASE_LDFLAGS) $(1) -o "$$tmp" - >/dev/null 2>&1 && printf '%s' '$(1)'; \
	rm -f "$$tmp"))

# Lower-cased chip name parsed from the sysctl CPU brand string: a brand of the
# form "Apple <name> ..." yields <name>. Empty for any other brand string or
# when sysctl fails.
APPLE_CPU_NAME = $(shell sysctl -n machdep.cpu.brand_string 2>/dev/null | tr '[:upper:]' '[:lower:]' | sed -n 's/^apple \([a-z0-9]*\).*/\1/p')
# -mcpu=apple-<name> when a chip name was detected, otherwise empty.
DETECTED_MCPU_FLAG = $(if $(APPLE_CPU_NAME),-mcpu=apple-$(APPLE_CPU_NAME))
# Prefer -mcpu=native. If the compiler rejects it, fall back to the detected
# Apple CPU flag, which is probed as well and may leave CPU_CFLAGS empty.
CPU_CFLAGS := $(call cc-option,-mcpu=native)
ifeq ($(CPU_CFLAGS),)
CPU_CFLAGS := $(call cc-option,$(DETECTED_MCPU_FLAG))
endif

# Extra flags for OPT=aggressive. Each flag is kept only if the corresponding
# probe (link-option for LTO, cc-option for the rest) accepts it.
# NOTE(review): both variables are recursively expanded ("="), so the probes
# run again on every expansion of CFLAGS / LDFLAGS rather than once per make
# invocation. Consider caching the results if build start-up time matters.
AGGRESSIVE_LTO = $(call link-option,-flto)
AGGRESSIVE_EXTRA_CFLAGS = \
	$(call cc-option,-ffast-math) \
	$(call cc-option,-funroll-loops) \
	$(call cc-option,-fvectorize) \
	$(call cc-option,-fslp-vectorize) \
	$(call cc-option,-ftree-vectorize) \
	$(call cc-option,-falign-functions=16)

# Per-profile compiler and linker flags. The branches are mutually exclusive,
# so at most one of them defines OPT_CFLAGS / OPT_LDFLAGS.
ifeq ($(OPT),aggressive)
OPT_CFLAGS = -O3 $(CPU_CFLAGS) $(AGGRESSIVE_LTO) $(AGGRESSIVE_EXTRA_CFLAGS)
OPT_LDFLAGS = $(AGGRESSIVE_LTO)
else ifeq ($(OPT),conservative)
OPT_CFLAGS = -O2 $(CPU_CFLAGS)
OPT_LDFLAGS =
else ifeq ($(OPT),debug)
OPT_CFLAGS = -O0 -g $(CPU_CFLAGS)
OPT_LDFLAGS =
endif

# Final flags for the engine binaries: base flags plus the selected profile.
CFLAGS = $(BASE_CFLAGS) $(OPT_CFLAGS)
LDFLAGS = $(BASE_LDFLAGS) $(OPT_LDFLAGS)
# Flags for the chat client: -O0 -g under OPT=debug, otherwise -O2 plus the
# CPU flags. Built independently of BASE_CFLAGS / OPT_CFLAGS.
CHAT_CFLAGS = $(if $(filter debug,$(OPT)),-O0 -g,-O2 $(CPU_CFLAGS)) -Wall -fobjc-arc

# Benchmark binary (the ./metal_infer executable used by the run/bench targets)
TARGET = $(BUILD_DIR)/metal_infer
MAIN_SRC = $(BUILD_DIR)/main.m

# Optional: offline shader compilation (faster startup, but not required)
METALC = xcrun -sdk macosx metal
METALLIB_TOOL = xcrun -sdk macosx metallib
SHADER_SRC = $(BUILD_DIR)/shaders.metal
SHADER_AIR = $(BUILD_DIR)/shaders.air
SHADER_LIB = $(BUILD_DIR)/shaders.metallib

# Inference engine (complete forward pass)
INFER_TARGET = $(BUILD_DIR)/infer
INFER_SRC = $(BUILD_DIR)/infer.m

# Chat TUI (interactive multi-turn)
CHAT_TARGET = $(BUILD_DIR)/chat
CHAT_SRC = $(BUILD_DIR)/chat.m
LINENOISE_SRC = $(BUILD_DIR)/linenoise.c
LINENOISE_HDR = $(BUILD_DIR)/linenoise.h
# RAM pressure utility (lives under tools/, not $(BUILD_DIR))
RAM_PRESSURE_TARGET = tools/ram_pressure
RAM_PRESSURE_SRC = tools/ram_pressure.c

# Command-style targets, grouped by purpose. None of them names a file to build.
# Build / information
.PHONY: all metal_infer infer chat ram-pressure metallib build-infer build-chat print-build-config help
# Run / benchmark
.PHONY: run verify fast bench moe moebench full fullbench infer-run chat-run bench-api bench-report
# Tests and registry
.PHONY: test api-smoke cli-smoke manage-smoke chat-render-smoke tool-template-smoke cache-roundtrip-smoke
.PHONY: quant-helper-smoke tokenizer-export-smoke native-qwen-compile-smoke mtp-config-smoke py-tests
.PHONY: registry registry-check
# Maintenance
.PHONY: clean archive-debug clean-venv distclean

# RUN_ENGINE_BENCH: recipe template that runs ./metal_infer against the
# configured model.
#   $(1) — extra command-line arguments appended to the metal_infer invocation.
# Everything runs in a single `bash -c` script with `set -eo pipefail`:
#   1. Source ./lib/config.sh, call flashchat_load_config, and export
#      FLASHCHAT_MODEL, FLASHCHAT_MODEL_PATH and FLASHCHAT_MODEL_CONFIG from
#      flashchat_get (helpers presumably defined by lib/config.sh — confirm).
#   2. Fail unless the model path is non-empty, contains no "<snapshot>"
#      placeholder, and is an existing directory.
#   3. Fail unless <model path>/flashchat/q<bits>/packed_experts/layer_00.bin
#      exists; <bits> comes from flashchat_model_quant_bits and falls back to 4
#      when that prints nothing.
#   4. cd into $(BUILD_DIR) and run metal_infer with --model-id / --model
#      followed by $(1).
# $(1) is pasted inside the single-quoted script, so it must not contain a
# single quote.
define RUN_ENGINE_BENCH
	@bash -c 'set -eo pipefail; \
	source ./lib/config.sh; \
	flashchat_load_config; \
	export FLASHCHAT_MODEL="$$(flashchat_get MODEL)"; \
	export FLASHCHAT_MODEL_PATH="$$(flashchat_get MODEL_PATH)"; \
	export FLASHCHAT_MODEL_CONFIG="$$(flashchat_get MODEL_CONFIG)"; \
	if [[ -z "$$FLASHCHAT_MODEL_PATH" || "$$FLASHCHAT_MODEL_PATH" == *"<snapshot>"* || ! -d "$$FLASHCHAT_MODEL_PATH" ]]; then \
		echo "ERROR: Model is not downloaded for $$FLASHCHAT_MODEL."; \
		echo "Expected model snapshot: $$FLASHCHAT_MODEL_PATH"; \
		echo "Run ./flashchat setup first, or select a configured model with downloaded weights."; \
		exit 1; \
	fi; \
	bits="$$(flashchat_model_quant_bits "$$FLASHCHAT_MODEL")"; \
	packed_dir="$$FLASHCHAT_MODEL_PATH/flashchat/q$${bits:-4}/packed_experts"; \
	if [[ ! -f "$$packed_dir/layer_00.bin" ]]; then \
		echo "ERROR: Engine benchmark artifacts are not available for $$FLASHCHAT_MODEL."; \
		echo "Expected: $$packed_dir/layer_00.bin"; \
		echo "Run ./flashchat setup first, or select a configured model with packed experts."; \
		exit 1; \
	fi; \
	echo "Using Flashchat model $$FLASHCHAT_MODEL"; \
	cd $(BUILD_DIR); \
	./metal_infer --model-id "$$FLASHCHAT_MODEL" --model "$$FLASHCHAT_MODEL_PATH" $(1)'
endef

# Default goal (first rule in the file): build the benchmark binary and the
# inference engine. The chat client and ram-pressure utility are not included.
all: $(TARGET) $(INFER_TARGET)

# Print a categorized list of the user-facing targets and build options.
# Keep this in sync with .PHONY and the header comment when adding targets.
help:
	@printf "Flashchat make targets\n"
	@printf "\n"
	@printf "Build:\n"
	@printf "  make               Build main benchmark and inference binaries\n"
	@printf "  make all           Build main benchmark and inference binaries\n"
	@printf "  make metal_infer   Build main benchmark binary\n"
	@printf "  make infer         Build inference server/engine\n"
	@printf "  make build-infer   Alias for infer\n"
	@printf "  make chat          Build interactive chat client\n"
	@printf "  make build-chat    Alias for chat\n"
	@printf "  make ram-pressure  Build RAM pressure utility for memory-constrained testing\n"
	@printf "  make metallib      Precompile Metal shaders\n"
	@printf "  make print-build-config  Show compiler and optimization settings\n"
	@printf "\n"
	@printf "Build options:\n"
	@printf "  OPT=aggressive     Fastest probed local build (default)\n"
	@printf "  OPT=conservative   Native CPU, fewer risky optimization flags\n"
	@printf "  OPT=debug          Debug symbols, no speed-oriented flags\n"
	@printf "  CC=clang           Override compiler command\n"
	@printf "\n"
	@printf "Run:\n"
	@printf "  make infer-run     Run a short inference prompt\n"
	@printf "  make chat-run      Launch the chat client\n"
	@printf "\n"
	@printf "Benchmarks:\n"
	@printf "  make run           Single expert forward pass\n"
	@printf "  make verify        Metal vs CPU reference verification\n"
	@printf "  make fast          Fast path verification\n"
	@printf "  make bench         Single expert benchmark\n"
	@printf "  make moe           MoE forward pass\n"
	@printf "  make moebench      MoE benchmark\n"
	@printf "  make full          Full model forward pass\n"
	@printf "  make fullbench     Full model benchmark\n"
	@printf "\n"
	@printf "Tests:\n"
	@printf "  make cli-smoke     Run Flashchat CLI smoke test\n"
	@printf "  make manage-smoke  Run model management integration test\n"
	@printf "  make chat-render-smoke  Run chat TUI render smoke test\n"
	@printf "  make tool-template-smoke  Run native tool template render/parser smoke test\n"
	@printf "  make cache-roundtrip-smoke  Run disk-cache save/load roundtrip self-test\n"
	@printf "  make quant-helper-smoke  Run native checkpoint quantization helper tests\n"
	@printf "  make tokenizer-export-smoke  Run tokenizer export helper tests\n"
	@printf "  make native-qwen-compile-smoke  Run native Qwen BF16 compiler smoke test\n"
	@printf "  make mtp-config-smoke  Run MTP config/profile precedence smoke test\n"
	@printf "  make py-tests  Run modelmgr unit tests\n"
	@printf "  make registry-check  Verify assets/model_configs.json matches the manifests\n"
	@printf "  make api-smoke     Run HTTP API smoke test\n"
	@printf "  make test          Run all functional smoke tests\n"
	@printf "  make bench-api     Run API performance regression benchmark (per registry model)\n"
	@printf "  make bench-report  Compare latest benchmark vs prior commits, flag regressions\n"
	@printf "\n"
	@printf "Maintenance:\n"
	@printf "  make registry      Regenerate assets/model_configs.json from the manifests\n"
	@printf "  make clean         Remove build artifacts and archive repo-local ./debug contents\n"
	@printf "  make archive-debug Archive repo-local ./debug contents under debug/.archived\n"
	@printf "  make clean-venv    Remove Python setup virtual environment\n"
	@printf "  make distclean     Remove build artifacts, repo-local ./debug, and setup venv\n"

# Report the compiler in use and the flags the current OPT profile resolves to.
print-build-config:
	@printf "Compiler command: %s\n" "$(CC)"
	@printf "Compiler path: %s\n" "$$(command -v $(firstword $(CC)) 2>/dev/null || printf "%s" "$(firstword $(CC))")"
	@$(CC) --version | head -n 1
	@printf "%s: %s\n" \
		"Optimization profile" "$(OPT)" \
		"Detected CPU" "$$(sysctl -n machdep.cpu.brand_string 2>/dev/null || printf unknown)" \
		"CPU flags" "$(CPU_CFLAGS)" \
		"CFLAGS" "$(CFLAGS)" \
		"LDFLAGS" "$(LDFLAGS)" \
		"CHAT_CFLAGS" "$(CHAT_CFLAGS)"

# Short phony names for the real binary paths, so `make infer` etc. work.
# `metal_infer` is also the name of $(BUILD_DIR); its .PHONY declaration keeps
# make from treating that directory as an up-to-date target.
metal_infer: $(TARGET)

infer: $(INFER_TARGET)

chat: $(CHAT_TARGET)

ram-pressure: $(RAM_PRESSURE_TARGET)

# Benchmark binary. Shaders are compiled from source at runtime, but the shader
# source is still listed so that editing it triggers a rebuild.
$(TARGET): $(MAIN_SRC) $(SHADER_SRC)
	@$(MAKE) --no-print-directory print-build-config
	$(CC) $(CFLAGS) $(FRAMEWORKS) $(LDFLAGS) $< -o $@

# Optional offline shader build (runtime compilation remains the default):
# shaders.metal -> shaders.air -> shaders.metallib
metallib: $(SHADER_LIB)

$(SHADER_LIB): $(SHADER_AIR)
	$(METALLIB_TOOL) $< -o $@

$(SHADER_AIR): $(SHADER_SRC)
	$(METALC) -c $< -o $@

# Inference engine binary, built from its single Objective-C source.
$(INFER_TARGET): $(INFER_SRC)
	@$(MAKE) --no-print-directory print-build-config
	$(CC) $(CFLAGS) $(FRAMEWORKS) $(LDFLAGS) $< -o $@

# Chat client: thin HTTP/SSE client plus the linenoise line editor.
# The header is a prerequisite for rebuild tracking only, so it is filtered out
# of the compiler's input list.
$(CHAT_TARGET): $(CHAT_SRC) $(LINENOISE_SRC) $(LINENOISE_HDR)
	@$(MAKE) --no-print-directory print-build-config
	$(CC) $(CHAT_CFLAGS) -framework Foundation $(filter-out %.h,$^) -o $@

# RAM pressure utility: plain C, built with fixed flags (independent of OPT).
$(RAM_PRESSURE_TARGET): $(RAM_PRESSURE_SRC)
	$(CC) -O2 -Wall -Wextra $< -o $@

# Archive repo-local debug output, then remove every binary and shader artifact
# this Makefile builds, including the ram-pressure utility.
clean: archive-debug
	rm -f $(TARGET) $(INFER_TARGET) $(CHAT_TARGET) $(RAM_PRESSURE_TARGET) $(SHADER_AIR) $(SHADER_LIB)

# Move everything directly under ./debug, except the .archived directory
# itself, into a new debug/.archived/logs-<YYYYmmdd-HHMMSS> directory.
# Does nothing when ./debug is missing or holds no other entries.
archive-debug:
	@if [ -d debug ]; then \
		entries="$$(find debug -mindepth 1 -maxdepth 1 ! -name .archived -print)"; \
		if [ -n "$$entries" ]; then \
			dest="debug/.archived/logs-$$(date +%Y%m%d-%H%M%S)"; \
			mkdir -p "$$dest"; \
			find debug -mindepth 1 -maxdepth 1 ! -name .archived -exec mv {} "$$dest"/ \;; \
			echo "Archived debug contents to $$dest"; \
		fi; \
	fi

# Remove the Python setup virtual environment kept at $(BUILD_DIR)/.venv.
clean-venv:
	rm -rf $(BUILD_DIR)/.venv

# Full reset: remove the setup venv, every built artifact (including the
# ram-pressure utility) and the whole ./debug tree. This deliberately does not
# depend on `clean`, which would archive ./debug instead of deleting it.
distclean: clean-venv
	rm -f $(TARGET) $(INFER_TARGET) $(CHAT_TARGET) $(RAM_PRESSURE_TARGET) $(SHADER_AIR) $(SHADER_LIB)
	rm -rf debug

# Run targets
# Each target builds $(TARGET) if needed, then expands RUN_ENGINE_BENCH with
# the metal_infer arguments shown. The model checks inside that template run
# before the binary is invoked.

# Single expert forward pass
run: $(TARGET)
	$(call RUN_ENGINE_BENCH,--layer 0 --expert 0)

# Metal vs CPU reference verification
verify: $(TARGET)
	$(call RUN_ENGINE_BENCH,--layer 0 --expert 0 --verify)

# Same verification with --fast added
fast: $(TARGET)
	$(call RUN_ENGINE_BENCH,--layer 0 --expert 0 --fast --verify)

# Single expert benchmark
bench: $(TARGET)
	$(call RUN_ENGINE_BENCH,--layer 0 --expert 0 --fast --benchmark)

# MoE forward pass on layer 0
moe: $(TARGET)
	$(call RUN_ENGINE_BENCH,--layer 0 --fast --moe)

# MoE benchmark on layer 0
moebench: $(TARGET)
	$(call RUN_ENGINE_BENCH,--layer 0 --fast --moe --benchmark)

# Full model forward pass with --k 4
full: $(TARGET)
	$(call RUN_ENGINE_BENCH,--fast --full --k 4)

# Full model benchmark with --k 4
fullbench: $(TARGET)
	$(call RUN_ENGINE_BENCH,--fast --full --k 4 --benchmark)

# Inference engine targets
# build-infer is an alias for the infer binary.
build-infer: $(INFER_TARGET)

# Run a short fixed prompt (20 tokens, --k 4) from inside $(BUILD_DIR).
infer-run: $(INFER_TARGET)
	cd $(BUILD_DIR) && ./infer --prompt "Hello, what is" --tokens 20 --k 4

# Chat TUI targets (use: make chat)

# build-chat is an alias for the chat binary.
build-chat: $(CHAT_TARGET)

# Launch the chat client from inside $(BUILD_DIR).
chat-run: $(CHAT_TARGET)
	cd $(BUILD_DIR) && ./chat --k 4

# HTTP API smoke test; builds the inference engine first.
api-smoke: $(INFER_TARGET)
	bash tests/test_api_smoke.sh

# Performance regression benchmark — iterates the model registry (installed models not
# opted out via "benchmark": false), runs the uniform spec per model in its default config,
# appends prefill/decode metrics to assets/api_perf_log.tsv. Separate from `make test`
# because it starts a real server per model and is minutes-long.
bench-api: $(INFER_TARGET)
	bash tests/bench_api.sh

# Compare the latest benchmark rows against prior commits and flag regressions.
bench-report:
	python3 tests/bench_report.py

# assets/model_configs.json is GENERATED from assets/models/*.json (the
# per-model manifests). Edit the manifests, then run `make registry`.
registry:
	python3 -m modelmgr resolve --all -o assets/model_configs.json

# Verify that the committed assets/model_configs.json matches what
# `modelmgr resolve --all` generates from the manifests right now.
# The fresh output goes to a private mktemp file that is removed on every exit
# path. On a mismatch, sys.exit() receives the message string, so Python
# prints it to stderr and exits with status 1.
registry-check:
	@tmp="$$(mktemp /tmp/flashchat_registry_check.XXXXXX)" || exit 1; \
	python3 -m modelmgr resolve --all -q -o "$$tmp" && \
	python3 -c "import json,sys; a=json.load(open('assets/model_configs.json')); b=json.load(open(sys.argv[1])); sys.exit(0 if a==b else 'assets/model_configs.json is out of sync with assets/models/*.json -- run: make registry')" "$$tmp" && \
	echo "registry in sync"; \
	status=$$?; rm -f "$$tmp"; exit $$status

# Run the modelmgr unit tests with FLASHCHAT_CONFIG_DIR pointing at a fresh
# scratch directory. The recipe aborts if that directory cannot be created,
# rather than running with an empty FLASHCHAT_CONFIG_DIR, and removes it
# afterwards while preserving the test runner's exit status.
py-tests:
	@cfg_dir="$$(mktemp -d /tmp/flashchat-py-tests.XXXXXX)" || exit 1; \
	FLASHCHAT_CONFIG_DIR="$$cfg_dir" \
		python3 -m unittest discover -s tests/python -t tests/python; \
	status=$$?; rm -rf "$$cfg_dir"; exit $$status

# Individual smoke tests. Each one runs a single script from tests/. Targets
# that list a binary as a prerequisite build it before the script runs; the
# others run their script directly.

# Flashchat CLI smoke test
cli-smoke:
	bash tests/test_flashchat_cli.sh

# Model management integration test
manage-smoke:
	bash tests/test_modelmgr_cli.sh

# Chat TUI render smoke test (builds the chat client first)
chat-render-smoke: $(CHAT_TARGET)
	bash tests/test_chat_tui_render.sh

# Native tool template render/parser smoke test
tool-template-smoke: $(INFER_TARGET)
	bash tests/test_tool_template_render.sh

# Disk-cache save/load roundtrip self-test
cache-roundtrip-smoke: $(INFER_TARGET)
	bash tests/test_disk_cache_roundtrip.sh

# Native checkpoint quantization helper tests
quant-helper-smoke:
	python3 tests/test_flashchat_quant.py

# Tokenizer export helper tests
tokenizer-export-smoke:
	python3 tests/test_tokenizer_export.py

# Native Qwen BF16 compiler smoke test
native-qwen-compile-smoke: $(INFER_TARGET)
	bash tests/test_native_qwen_compile.sh

# MTP config/profile precedence smoke test
mtp-config-smoke:
	bash tests/test_mtp_config.sh

# Aggregate target: registry check, Python unit tests and every smoke test.
# bench-api is intentionally not part of this list.
test: registry-check py-tests \
      cli-smoke manage-smoke chat-render-smoke \
      tool-template-smoke cache-roundtrip-smoke \
      quant-helper-smoke tokenizer-export-smoke \
      native-qwen-compile-smoke mtp-config-smoke \
      api-smoke