Skip to content

Commit 4e081da

Browse files
committed
chore: reset history with clean release snapshot
0 parents  commit 4e081da

22 files changed

Lines changed: 2683 additions & 0 deletions

.github/workflows/validate.yml

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
name: Schema & Quality Validation

# Run when the validated data, the schema, the validator, or this workflow
# itself changes. Each event carries its own `paths` filter, so anything that
# should re-run validation on main has to be listed under `push` as well.
on:
  pull_request:
    paths:
      - 'data/latest/*.jsonl'
      - 'data/latest/*.jsonl.gz'
      - 'schema/bigv.schema.json'
      - 'tools/influx-validate'
      - '.github/workflows/validate.yml'
  push:
    branches:
      - main
    paths:
      - 'data/latest/*.jsonl'
      - 'data/latest/*.jsonl.gz'
      - 'schema/bigv.schema.json'
      # Mirrors the pull_request filter: a validator or workflow change that
      # lands on main must also re-run the checks (including the main-only
      # strict gate), otherwise it goes unverified until the next data push.
      - 'tools/influx-validate'
      - '.github/workflows/validate.yml'
jobs:
  # Validates the schema, the test fixtures and (when present) the latest data
  # files. On pushes to main the data files are additionally re-checked with
  # the validator's --strict flag.
  validate:
    runs-on: ubuntu-latest
    # The job only reads the checked-out repository, so restrict the token.
    permissions:
      contents: read
    # Single definition of the paths that every validation step below uses.
    env:
      SCHEMA: schema/bigv.schema.json
      VALIDATOR: tools/influx-validate

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      # Runs before any step that consumes the schema, so a malformed schema
      # is reported as such instead of surfacing as data-validation failures.
      - name: Validate schema itself
        run: |
          echo "=== Checking schema validity ==="
          # Quoted heredoc: the Python source reaches the interpreter verbatim,
          # with no shell expansion and no nested-quote pitfalls.
          python3 - <<'PY'
          import json
          import os

          from jsonschema import Draft7Validator

          with open(os.environ['SCHEMA']) as f:
              schema = json.load(f)

          Draft7Validator.check_schema(schema)
          print('✓ Schema is valid JSON Schema Draft-07')
          PY

      - name: Validate test fixtures
        run: |
          echo "=== Validating test fixtures ==="
          python3 "$VALIDATOR" -s "$SCHEMA" test/fixtures/valid.jsonl

          echo ""
          echo "=== Testing invalid fixture (should fail) ==="
          # A missing fixture must not be mistaken for a correct rejection:
          # the check below treats any non-zero exit as "failed validation".
          if [ ! -f test/fixtures/invalid.jsonl ]; then
            echo "ERROR: test/fixtures/invalid.jsonl is missing"
            exit 1
          fi
          if python3 "$VALIDATOR" -s "$SCHEMA" test/fixtures/invalid.jsonl; then
            echo "ERROR: invalid.jsonl should have failed validation"
            exit 1
          else
            echo "✓ invalid.jsonl correctly failed validation"
          fi

      - name: Validate latest data (if exists)
        run: |
          # Validate plain JSONL
          if [ -f data/latest/latest.jsonl ]; then
            echo "=== Validating data/latest/latest.jsonl ==="
            python3 "$VALIDATOR" -s "$SCHEMA" data/latest/latest.jsonl
          else
            echo "⚠ data/latest/latest.jsonl not found (OK for initial commits)"
          fi

          # Validate compressed JSONL
          if [ -f data/latest/latest.jsonl.gz ]; then
            echo "=== Validating data/latest/latest.jsonl.gz ==="
            python3 "$VALIDATOR" -s "$SCHEMA" data/latest/latest.jsonl.gz

            # Second pass with the manifest, only when one is present.
            if [ -f data/latest/manifest.json ]; then
              echo "=== Validating with manifest ==="
              python3 "$VALIDATOR" -s "$SCHEMA" -m data/latest/manifest.json data/latest/latest.jsonl.gz
            fi
          else
            echo "⚠ data/latest/latest.jsonl.gz not found (OK for initial commits)"
          fi

      - name: STRICT Quality Gate Validation (main branch only)
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        run: |
          echo "🔒 ENFORCING SINGLE-PATH PIPELINE COMPLIANCE"
          echo "=========================================="
          echo ""
          echo "All data merged to main branch MUST pass through influx-harvest pipeline"
          echo "and comply with strict quality gates."
          echo ""

          # Strict validation of plain JSONL (if exists)
          if [ -f data/latest/latest.jsonl ]; then
            echo "=== STRICT VALIDATION: data/latest/latest.jsonl ==="
            python3 "$VALIDATOR" --strict -s "$SCHEMA" data/latest/latest.jsonl
            echo "✓ Plain JSONL passed strict quality gate validation"
          else
            echo "⚠ data/latest/latest.jsonl not found"
          fi

          # Strict validation of compressed JSONL (if exists)
          if [ -f data/latest/latest.jsonl.gz ]; then
            echo "=== STRICT VALIDATION: data/latest/latest.jsonl.gz ==="
            python3 "$VALIDATOR" --strict -s "$SCHEMA" data/latest/latest.jsonl.gz

            if [ -f data/latest/manifest.json ]; then
              echo "=== STRICT VALIDATION with manifest ==="
              python3 "$VALIDATOR" --strict -s "$SCHEMA" -m data/latest/manifest.json data/latest/latest.jsonl.gz
            fi
            echo "✓ Compressed JSONL passed strict quality gate validation"
          else
            echo "⚠ data/latest/latest.jsonl.gz not found"
          fi

          echo ""
          echo "🎉 ALL STRICT VALIDATIONS PASSED"
          echo "Data quality and single-path pipeline compliance verified"

.gitignore

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
# Ignore CCCC runtime domain (Ephemeral mode)
/.cccc/**

# Python
.venv/
__pycache__/
*.pyc
*.pyo
*.pyd
.Python

# State and data (only release tracked)
# data/** ignores the contents of data/ but not the directory itself, which is
# what allows the negations below (and further down) to re-include paths.
state/
archive/
temp_work/
*.log
data/**
!data/release/
!data/release/**

# Docs / audit / QA (not tracked)
docs/**
AGENTS.md

# Lists and seeds (track only core rules)
lists/**
!lists/rules/
!lists/rules/brand_heuristics.yml
!lists/rules/risk_terms.yml

# Processed/working batches (not tracked)
# The data/ entries here are already covered by data/** above; they are kept
# as an explicit list of the working directories.
processed_batches/**
data/batches/**
data/uncompressed/**
data/test/**
data/samples/**
# NOTE(review): .github/workflows/validate.yml triggers on data/latest/*.jsonl
# and validates data/latest/latest.jsonl; while this stays ignored those files
# only reach CI if force-added. Confirm this is intended.
data/latest/**

# Sample input: tracked even though data/** ignores the rest of data/
!data/prefetched.sample.jsonl

# Scripts/tools: track only minimal publish set
scripts/**
!scripts/pipeline_guard.sh
tools/**
!tools/influx-harvest
# Executed by .github/workflows/validate.yml, so it has to be in the repository
!tools/influx-validate

# Schema extras (not tracked)
schema/state_db.sql
schema/schema.md

# Reports / audit outputs
# Already covered by docs/** above; listed for explicitness.
docs/qa_reports/
docs/audit_trail/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

FOREMAN_TASK.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Title: Foreman Task Brief (Evergreen)
2+
3+
Purpose
4+
- 监工、审核、阻断造假:只认可“严格通过且去重后的唯一作者数”进度;禁止旁路、行数灌水、未校验产出。
5+
- 发布守门:真相源 = data/latest/latest.jsonl,发布 = data/release/*;二者必须一致、可审计。
6+
7+
Core duties (directional, not status-specific)
8+
1) 质量闸执行:所有批次必须跑 `./scripts/pipeline_guard.sh data/latest/latest.jsonl data/latest/manifest.json schema/bigv.schema.json`(去重 handle + 去重 id、占位/非数字 ID 拒绝、mock/test/tmp 前缀拒绝、粉丝数尾数“000”拒绝、sources.evidence+fetched_at 必填、manifest 对齐、strict 校验)。未通过不得写入 latest/release。
9+
2) 证据与真实性:抽查 `sources.evidence` 与外部 lookup(handle→id/粉丝数/状态/活跃度),偏差或缺证据即退回整批;记录违规人。
10+
3) 评分与 manifest:确认 manifest 的 `score_version/score_formula/score_note` 与实际模型一致,count/sha 与文件一致;不符即拒绝发布。
11+
4) 入口与旁路:仅允许“prefetched JSONL + influx-harvest 过滤”流程入库;发现手工编辑/旁路文件或本机直连 MCP 尝试,立即回滚、归档、记录违规。
12+
5) 发布同步:只有在质量闸通过后,才允许同步 data/release;发布内容必须与真相源哈希一致。
13+
6) 审计与留痕:为每批生成 QA 记录(包含输入文件、pipeline_guard 输出、抽检结果、决定),便于追责与回滚。
14+
15+
Working posture
16+
- 保持流程稳定,不写入进度数字;一旦发现规则无法覆盖的新型造假,立刻升级 pipeline_guard 并通知全体。
17+
- 进度只按“合规唯一作者数”计算;任何以行数冲量、填充占位的行为视为造假。
18+
19+
References
20+
- PROJECT.md(流程与原则)
21+
- data/latest/latest.jsonl + manifest.json(真相源)
22+
- data/release/(发布版)

0 commit comments

Comments
 (0)