Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions kompile/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def filter_cmd(ctx):
click.echo(f"Filtering {len(unfiltered)} sources with {model}...")

results = []
from kompile.models import Source
from kompile.models import Source, FilterResult
for i, sid in enumerate(sorted(unfiltered), 1):
src_file = raw_dir / f"{sid}.txt"
if not src_file.exists():
Expand All @@ -155,12 +155,17 @@ def filter_cmd(ctx):
metadata=src_data.get("metadata", {}),
)

result = filter_source(source, client, model)
try:
result = filter_source(source, client, model)
except ValueError as exc:
click.echo(f" [{i}/{len(unfiltered)}] {source.title[:60]} → ✗ skip (parse error: {exc})", err=True)
result = FilterResult(source_id=source.id, keep=False, topics=[], summary=f"[filter error: {exc}]")
results.append(result)
state_add_filter_results(state, [result])
save_state(root, state)
status = "✓ keep" if result.keep else "✗ discard"
click.echo(f" [{i}/{len(unfiltered)}] {source.title[:60]} → {status} | {result.summary[:80]}")

state_add_filter_results(state, results)
kept = sum(1 for r in results if r.keep)
click.echo(f"\nFiltered {len(results)} sources: {kept} kept, {len(results)-kept} discarded.")

Expand All @@ -182,7 +187,7 @@ def filter_cmd(ctx):
@click.pass_context
def compile(ctx, incremental):
"""Run full tiered compilation pipeline → wiki/ directory."""
from kompile.models import Source, TieredWiki, SurfaceNote, Concept, Insight, Gap
from kompile.models import Source, TieredWiki, SurfaceNote, Concept, Insight, Gap, FilterResult
from kompile.compiler.filter import filter_source
from kompile.compiler.classify import classify_topics
from kompile.compiler.summarize import summarize_source
Expand Down Expand Up @@ -219,12 +224,16 @@ def compile(ctx, incremental):
url=src_data.get("metadata", {}).get("url"),
metadata=src_data.get("metadata", {}),
)
result = filter_source(source, client, cfg["models"]["filter"])
try:
result = filter_source(source, client, cfg["models"]["filter"])
except ValueError as exc:
click.echo(f" ✗ skip {source.title[:60]} (parse error: {exc})", err=True)
result = FilterResult(source_id=source.id, keep=False, topics=[], summary=f"[filter error: {exc}]")
results.append(result)
state_add_filter_results(state, [result])
save_state(root, state)
status = "✓" if result.keep else "✗"
click.echo(f" {status} {source.title[:60]}")
state_add_filter_results(state, results)
save_state(root, state)

# ------------------------------------------------------------------ #
# Step 2: Classify topics (if not already done or new sources added)
Expand Down
38 changes: 25 additions & 13 deletions kompile/compiler/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,43 @@ def filter_source(
date=source.date,
content=content,
)
response = client.messages.create(
model=model,
max_tokens=1024,
system=FILTER_SYSTEM,
messages=[{"role": "user", "content": user_msg}],
)
data = parse_llm_json(response.content[0].text)
return FilterResult(
source_id=source.id,
keep=bool(data.get("keep", False)),
topics=data.get("topics", []),
summary=data.get("summary", ""),
)
last_exc: Exception | None = None
for attempt in range(3):
response = client.messages.create(
model=model,
max_tokens=1024,
system=FILTER_SYSTEM,
messages=[{"role": "user", "content": user_msg}],
)
try:
data = parse_llm_json(response.content[0].text)
if not isinstance(data, dict):
raise ValueError(f"expected dict, got {type(data).__name__}")
except ValueError as exc:
last_exc = exc
continue
return FilterResult(
source_id=source.id,
keep=bool(data.get("keep", False)),
topics=data.get("topics", []),
summary=data.get("summary", ""),
)
raise ValueError(f"filter_source failed after 3 attempts for {source.id!r}: {last_exc}")


def filter_sources(
    sources: list[Source],
    client: anthropic.Anthropic,
    model: str,
    progress_cb: Callable[[int, int, FilterResult], None] | None = None,
    checkpoint_cb: Callable[[FilterResult], None] | None = None,
) -> list[FilterResult]:
    """Run ``filter_source`` over every source, in order, and collect the results.

    Args:
        sources: Sources to filter; processed sequentially in list order.
        client: Client object forwarded unchanged to ``filter_source``.
        model: Model name forwarded unchanged to ``filter_source``.
        progress_cb: Optional callback invoked after each source with
            ``(1-based position, len(sources), result)``.
        checkpoint_cb: Optional callback invoked with each result right
            after it is produced, before ``progress_cb``.

    Returns:
        The results, one per source, in the same order as ``sources``.

    Nothing is caught here: an exception from ``filter_source`` or from a
    callback propagates to the caller and stops the loop.
    """
    collected = []
    for position, item in enumerate(sources, start=1):
        outcome = filter_source(item, client, model)
        collected.append(outcome)
        # Checkpoint runs ahead of progress reporting, so the result has been
        # handed to the checkpoint hook before anything is reported.
        if checkpoint_cb:
            checkpoint_cb(outcome)
        if progress_cb:
            progress_cb(position, len(sources), outcome)
    return collected