theapprenticeproject · ravencore06 · May 3, 2026 · May 9, 2026 · May 14, 2026 · May 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+__pycache__/
+*.pyc
+vlm_env/
+.ruff_cache/
+.vscode/
+results.json
diff --git a/README.md b/README.md
@@ -1 +1,8 @@
-# C4GT_2026
+# theApprenticeProject (C4GT 2026)
+
+This repository contains the VLM Evaluation Pipeline developed for The Apprentice Project.
+
+## VLM Evaluation Pipeline
+A cost-efficient Vision Language Model (VLM) pipeline designed to evaluate student artifacts (images/videos) against 21st-century skills rubrics.
+- **Key Directory**: `vlm_evaluation/`
+- **Dependencies**: See `vlm_evaluation/requirements.txt`
diff --git a/vlm_evaluation/dataset.py b/vlm_evaluation/dataset.py
@@ -0,0 +1,49 @@
+import json
+import os
+from PIL import Image
+
+class ArtifactDataset:
+    """Index-addressable list of student artifacts described by a JSON file.
+
+    Only the "image_path" key of each record is interpreted here; every other
+    key is handed back untouched under "metadata".
+    """
+
+    def __init__(self, data_path: str):
+        """
+        Initializes the dataset loader.
+        Assumes data_path points to a JSON file containing evaluation metadata:
+        [
+            {
+                "image_path": "data/images/student1.jpg",
+                "student_id": "123",
+                "artifact_type": "Origami",
+                "rubric": "1: No effort, 5: Perfect folds and presentation",
+                "ground_truth_score": 4
+            }, ...
+        ]
+        The "rubric" value is not inspected by this class, so it may be a plain
+        string as above or any other JSON value.
+
+        A missing file is not an error: a warning is printed and the dataset
+        is left empty.
+        """
+        self.data_path = data_path
+        self.data = []
+
+        if os.path.exists(data_path):
+            # JSON is UTF-8 by specification; do not depend on the platform's
+            # default locale encoding.
+            with open(data_path, 'r', encoding="utf-8") as f:
+                self.data = json.load(f)
+        else:
+            print(f"Warning: Dataset file {data_path} not found. Returning empty dataset.")
+            print("Please create this file or generate a sample dataset.")
+
+    def __len__(self):
+        """Return the number of records loaded from the JSON file."""
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        """Return ``{"image": <RGB PIL image or None>, "metadata": <record>}``.
+
+        Relative image paths are resolved against the directory that contains
+        the dataset JSON. Any failure to load the image is reported on stdout
+        and yields ``"image": None`` instead of raising, so one bad record
+        does not abort a whole run.
+
+        Raises:
+            IndexError: if ``idx`` is outside the loaded records.
+        """
+        item = self.data[idx]
+        image_path = item.get("image_path")
+        image = None
+
+        if not image_path:
+            # Report the real problem instead of a TypeError from os.path.
+            print(f"Error loading image {image_path}: record has no 'image_path'")
+            return {"image": image, "metadata": item}
+
+        try:
+            # Handle absolute or relative paths gracefully based on the json directory
+            if os.path.isabs(image_path):
+                full_image_path = image_path
+            else:
+                base_dir = os.path.dirname(self.data_path)
+                full_image_path = os.path.join(base_dir, image_path)
+            # The context manager closes the underlying file as soon as the
+            # converted copy exists, instead of leaving it to garbage collection.
+            with Image.open(full_image_path) as source:
+                image = source.convert("RGB")
+        except Exception as e:
+            # Deliberately broad: loading is best effort per sample.
+            print(f"Error loading image {image_path}: {e}")
+            image = None
+
+        return {
+            "image": image,
+            "metadata": item
+        }
diff --git a/vlm_evaluation/evaluate.py b/vlm_evaluation/evaluate.py
@@ -0,0 +1,189 @@
+import argparse
+import json
+import os
+import re
+import torch
+from tqdm import tqdm
+from pydantic import BaseModel
+from lmformatenforcer import JsonSchemaParser
+from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn
+from transformers import (
+    LlavaForConditionalGeneration,
+    AutoProcessor,
+    BitsAndBytesConfig,
+)
+from dataset import ArtifactDataset
+from prompts import SYSTEM_PROMPT, generate_evaluation_prompt
+
+
+# Structured answer requested from the model. main() derives a JSON schema
+# from this class and passes it to JsonSchemaParser to constrain generation;
+# extract_score() only reads the "score" key of the resulting object.
+class EvaluationOutput(BaseModel):
+    skill: str  # presumably mirrors the rubric's "skill" entry -- confirm
+    dimension: str  # presumably mirrors the rubric's "dimension" entry -- confirm
+    score: int  # the value extract_score() validates and returns
+    max: int  # presumably the top of the rubric scale -- confirm
+
+
+def parse_args():
+    """Build the command-line interface and parse ``sys.argv``.
+
+    Returns:
+        argparse.Namespace with ``data_path`` (required), ``model_name``,
+        ``quantize``, ``output_path`` and ``max_new_tokens``.
+    """
+    cli = argparse.ArgumentParser(description="VLM Evaluation Pipeline")
+
+    cli.add_argument("--data_path", type=str, required=True, help="Path to dataset JSON")
+    cli.add_argument("--model_name", type=str, default="llava-hf/llava-1.5-7b-hf")
+
+    # Both switches write to the same destination: quantization is on unless
+    # --no_quantize is given.
+    cli.add_argument("--quantize", action="store_true", default=True)
+    cli.add_argument("--no_quantize", dest="quantize", action="store_false")
+
+    cli.add_argument("--output_path", type=str, default="results.json")
+    cli.add_argument("--max_new_tokens", type=int, default=256)
+
+    namespace = cli.parse_args()
+    return namespace
+
+
+def load_model(model_name, quantize=True):
+    """Load a LLaVA checkpoint together with its processor.
+
+    Args:
+        model_name: checkpoint identifier forwarded to ``from_pretrained``.
+        quantize: when true, a 4-bit ``BitsAndBytesConfig`` (nf4, double
+            quantization, float16 compute) is passed to the model loader;
+            otherwise no quantization config is passed.
+
+    Returns:
+        ``(model, processor)``.
+    """
+    bnb_config = None
+    if quantize:
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+    loader_kwargs = {
+        "quantization_config": bnb_config,
+        "device_map": "auto",
+        "torch_dtype": torch.float16,
+    }
+    model = LlavaForConditionalGeneration.from_pretrained(model_name, **loader_kwargs)
+    processor = AutoProcessor.from_pretrained(model_name)
+    return model, processor
+
+
+def extract_score(text, min_score=1, max_score=5):
+    """Pull a valid integer score out of the model's JSON response.
+
+    Args:
+        text: raw response, expected to be a JSON object with a "score" key.
+        min_score: lowest accepted score (inclusive).
+        max_score: highest accepted score (inclusive).
+
+    Returns:
+        The score as an ``int``, or ``None`` when the response is not valid
+        JSON, is not a JSON object, has no integer "score", or the score is
+        outside ``[min_score, max_score]``.
+    """
+    try:
+        # Parse the JSON directly instead of using Regex
+        parsed = json.loads(text)
+    except (ValueError, TypeError):
+        # ValueError covers json.JSONDecodeError; TypeError covers None or
+        # other non-string input.
+        return None
+
+    # Valid JSON need not be an object ("5", "[1, 2]"), and only objects
+    # have keys to look up.
+    if not isinstance(parsed, dict):
+        return None
+
+    score = parsed.get("score")
+    # bool is a subclass of int, so `true` would otherwise pass as score 1.
+    if isinstance(score, bool) or not isinstance(score, int):
+        return None
+    if min_score <= score <= max_score:
+        return score
+    return None
+
+
+def compute_metrics(predictions, ground_truths):
+    """Summarise how well predicted scores match the ground truth.
+
+    ``predictions`` and ``ground_truths`` are parallel sequences. A prediction
+    of ``None`` means "no score could be parsed": it lowers ``parse_rate`` and
+    counts as a miss for both accuracies, but is left out of the mean absolute
+    error because no distance can be computed for it.
+
+    Returns:
+        ``{}`` when ``ground_truths`` is empty, otherwise a dict with
+        ``total_samples``, ``exact_accuracy``, ``within_1_accuracy`` and
+        ``parse_rate`` (percentages of ``total_samples``) and
+        ``mean_absolute_error`` (``None`` if no pair could be scored).
+    """
+    total = len(ground_truths)
+    if total == 0:
+        return {}
+
+    # Only pairs with both values present have a distance; subtracting from
+    # None would raise TypeError.
+    errors = [
+        abs(p - g)
+        for p, g in zip(predictions, ground_truths)
+        if p is not None and g is not None
+    ]
+    exact = sum(1 for err in errors if err == 0)
+    within_1 = sum(1 for err in errors if err <= 1)
+    mae = round(sum(errors) / len(errors), 4) if errors else None
+    parsed = sum(1 for p in predictions if p is not None)
+
+    return {
+        "total_samples": total,
+        "exact_accuracy": round(exact / total * 100, 2),
+        "within_1_accuracy": round(within_1 / total * 100, 2),
+        "mean_absolute_error": mae,
+        "parse_rate": round(parsed / total * 100, 2),
+    }
+
+
+def main():
+    """Run the evaluation end to end and write a JSON report.
+
+    Parses the CLI arguments, loads the dataset and the model, generates a
+    schema-constrained JSON answer for every sample whose image could be
+    loaded, compares the extracted score with ``ground_truth_score``, then
+    writes config, metrics and per-sample results to ``args.output_path`` and
+    prints the metrics.
+
+    Returns early, writing nothing, when the dataset is empty.
+    """
+    args = parse_args()
+
+    cuda_available = torch.cuda.is_available()
+    if not cuda_available:
+        print("Warning: CUDA not available. Inference will be slow on CPU.")
+    device = "cuda" if cuda_available else "cpu"
+
+    print(f"Loading dataset from {args.data_path}...")
+    dataset = ArtifactDataset(args.data_path)
+    if len(dataset) == 0:
+        print("Dataset is empty. Exiting.")
+        return
+
+    print(f"Loading model {args.model_name} (quantize={args.quantize})...")
+    model, processor = load_model(args.model_name, quantize=args.quantize)
+
+    # The schema does not depend on the sample, so build it once.
+    # model_json_schema() is the pydantic v2 name; schema() is the v1 fallback.
+    try:
+        schema = EvaluationOutput.model_json_schema()
+    except AttributeError:
+        schema = EvaluationOutput.schema()
+
+    results = []
+    preds = []
+    truths = []
+    skipped = 0
+
+    for i in tqdm(range(len(dataset)), desc="Evaluating"):
+        sample = dataset[i]
+        meta = sample["metadata"]
+        image = sample["image"]
+
+        if image is None:
+            # The dataset already printed why; keep a count so the report
+            # shows that not every record was evaluated.
+            skipped += 1
+            continue
+
+        prompt_text = generate_evaluation_prompt(
+            student_id=meta.get("student_id", "unknown"),
+            artifact_type=meta.get("artifact_type", "unknown"),
+            rubric=meta.get("rubric", {}),
+        )
+
+        inputs = processor(text=prompt_text, images=image, return_tensors="pt").to(device)
+
+        # NOTE(review): parser and prefix function are rebuilt per sample, as
+        # before; they may be reusable across generate() calls -- confirm
+        # against the lm-format-enforcer documentation before hoisting.
+        parser = JsonSchemaParser(schema)
+        prefix_function = build_transformers_prefix_allowed_tokens_fn(processor.tokenizer, parser)
+
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=args.max_new_tokens,
+                do_sample=False,
+                prefix_allowed_tokens_fn=prefix_function,
+            )
+
+        # Drop the prompt tokens so only the newly generated answer is decoded.
+        prompt_length = inputs['input_ids'].shape[1]
+        decoded = processor.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
+        response = decoded.strip()
+
+        predicted_score = extract_score(response)
+        ground_truth = meta.get("ground_truth_score")
+
+        results.append(
+            {
+                "student_id": meta.get("student_id", "unknown"),
+                "predicted_score": predicted_score,
+                "ground_truth_score": ground_truth,
+                "raw_response": response,
+                "artifact_type": meta.get("artifact_type", "unknown"),
+            }
+        )
+
+        if predicted_score is not None and ground_truth is not None:
+            preds.append(predicted_score)
+            truths.append(ground_truth)
+
+    metrics = compute_metrics(preds, truths)
+
+    # compute_metrics() only receives samples whose score was parsed (see the
+    # filter above), so its own parse_rate can never drop below 100. Recompute
+    # it over every sample that was actually sent to the model.
+    if results:
+        parsed_count = sum(1 for r in results if r["predicted_score"] is not None)
+        metrics["parse_rate"] = round(parsed_count / len(results) * 100, 2)
+    metrics["skipped_samples"] = skipped
+
+    output = {
+        "config": {
+            "model_name": args.model_name,
+            "quantize": args.quantize,
+            "dataset": args.data_path,
+        },
+        "metrics": metrics,
+        "results": results,
+    }
+
+    with open(args.output_path, "w", encoding="utf-8") as f:
+        json.dump(output, f, indent=2)
+
+    print("\n" + "=" * 50)
+    print("EVALUATION METRICS")
+    print("=" * 50)
+    for k, v in metrics.items():
+        print(f"  {k}: {v}")
+    print("=" * 50)
+    print(f"Results saved to {args.output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vlm_evaluation/generate_sample_data.py b/vlm_evaluation/generate_sample_data.py
@@ -0,0 +1,72 @@
+import json
+import os
+from PIL import Image, ImageDraw
+
+# Three hand-written records used to smoke-test the pipeline. "image_path"
+# holds a bare file name here; main() below creates a placeholder image under
+# that name inside sample_data/ and prefixes the path before writing the JSON.
+# Each "rubric" is a dict (skill / dimension / max / criteria), not a string.
+SAMPLE_DATA = [
+    {
+        "image_path": "sample_origami.jpg",
+        "student_id": "S001",
+        "artifact_type": "Origami",
+        "rubric": {
+            "skill": "creativity",
+            "dimension": "originality",
+            "max": 5,
+            "criteria": "1: No recognizable shape, 5: Perfect folds with clean edges and symmetry"
+        },
+        "ground_truth_score": 4,
+    },
+    {
+        "image_path": "sample_drawing.jpg",
+        "student_id": "S002",
+        "artifact_type": "Drawing",
+        "rubric": {
+            "skill": "creativity",
+            "dimension": "composition",
+            "max": 5,
+            "criteria": "1: No effort, 5: Detailed and creative composition"
+        },
+        "ground_truth_score": 3,
+    },
+    {
+        "image_path": "sample_model.jpg",
+        "student_id": "S003",
+        "artifact_type": "Clay Model",
+        "rubric": {
+            "skill": "problem_solving",
+            "dimension": "execution",
+            "max": 5,
+            "criteria": "1: Unrecognizable, 5: Realistic and well-finished model"
+        },
+        "ground_truth_score": 5,
+    },
+]
+
+
+def create_dummy_image(path, size=(224, 224), color=(200, 100, 50)):
+    """Write a placeholder picture to ``path`` and report it on stdout.
+
+    The picture is a solid ``color`` canvas of ``size`` with a white square
+    outline and a green filled circle drawn at fixed pixel coordinates.
+    """
+    canvas = Image.new("RGB", size, color)
+    pen = ImageDraw.Draw(canvas)
+
+    square_box = [50, 50, 174, 174]
+    circle_box = [80, 80, 144, 144]
+    pen.rectangle(square_box, outline=(255, 255, 255), width=3)
+    pen.ellipse(circle_box, fill=(100, 200, 100))
+
+    canvas.save(path)
+    print(f"Created {path}")
+
+
+def main():
+    """Create the placeholder images and write ``sample_dataset.json``.
+
+    Images go into ``sample_data/`` next to this script; the JSON is written
+    next to this script too, with each "image_path" rewritten to
+    ``sample_data/<file name>`` so it resolves relative to the JSON file.
+    """
+    output_dir = os.path.dirname(os.path.abspath(__file__))
+    data_dir = os.path.join(output_dir, "sample_data")
+    os.makedirs(data_dir, exist_ok=True)
+
+    records = []
+    for item in SAMPLE_DATA:
+        file_name = item["image_path"]
+        create_dummy_image(os.path.join(data_dir, file_name))
+        # Build a new record instead of editing SAMPLE_DATA in place, so a
+        # second call does not prepend "sample_data" twice. A literal "/"
+        # keeps the stored path identical on every platform.
+        records.append({**item, "image_path": f"sample_data/{file_name}"})
+
+    json_path = os.path.join(output_dir, "sample_dataset.json")
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2)
+
+    print(f"Sample dataset saved to {json_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vlm_evaluation/prompts.py b/vlm_evaluation/prompts.py
@@ -0,0 +1,17 @@
+import json
+# Instruction block that generate_evaluation_prompt() places at the top of
+# every prompt, right after the "USER:" marker.
+SYSTEM_PROMPT = """You are an expert evaluator assessing student artifacts for The Apprentice Project.
+You must output your evaluation STRICTLY as a valid JSON object. Do not include any other conversational text."""
+
+def generate_evaluation_prompt(student_id: str, artifact_type: str, rubric: "str | dict") -> str:
+    """Build the single-turn LLaVA-1.5 prompt for one artifact.
+
+    Args:
+        student_id: identifier shown to the model as "Artifact ID".
+        artifact_type: category label shown as "Category".
+        rubric: scoring rubric; a string is inserted verbatim, anything else
+            is rendered as indented JSON (falling back to ``str()`` when it
+            is not JSON-serialisable).
+
+    Returns:
+        Prompt text in the ``USER: <image>\\n... ASSISTANT:`` layout. The
+        ``<image>`` placeholder marks where the processor inserts the image;
+        without it the picture is not part of the model input.
+    """
+    if isinstance(rubric, str):
+        rubric_text = rubric
+    else:
+        try:
+            # JSON rather than Python repr: double quotes, stable layout.
+            rubric_text = json.dumps(rubric, indent=2, ensure_ascii=False)
+        except TypeError:
+            rubric_text = str(rubric)
+
+    # The requested format lists the same keys as the EvaluationOutput schema
+    # that constrains decoding in evaluate.py, so the instructions and the
+    # enforced output agree.
+    return f"""USER: <image>
+{SYSTEM_PROMPT}
+
+Artifact ID: {student_id}
+Category: {artifact_type}
+Rubric Schema:
+{rubric_text}
+
+Please evaluate the artifact based on the rubric.
+Output strictly in this JSON format:
+{{"skill": "<skill name>", "dimension": "<dimension name>", "score": <int>, "max": <int>}}
+ASSISTANT:"""
diff --git a/vlm_evaluation/requirements.txt b/vlm_evaluation/requirements.txt
@@ -0,0 +1,9 @@
+transformers>=4.38.2
+torch
+peft
+bitsandbytes
+Pillow
+accelerate
+datasets
+lm-format-enforcer
+pydantic
diff --git a/vlm_evaluation/run_benchmark.ps1 b/vlm_evaluation/run_benchmark.ps1
@@ -0,0 +1,29 @@
+# Runs evaluate.py with the given settings and reports success or failure.
+# The exit code of the Python process is passed on to the caller.
+param(
+    [string]$DataPath = "sample_dataset.json",
+    [string]$ModelName = "llava-hf/llava-1.5-7b-hf",
+    [switch]$NoQuantize = $false,
+    [string]$OutputPath = "results.json",
+    [int]$MaxNewTokens = 256
+)
+
+# Collect the arguments in an array and append --no_quantize only when it is
+# requested. Interpolating an empty-string variable into the command line can,
+# depending on the PowerShell version, reach Python as a literal empty
+# argument, which argparse rejects as unrecognized.
+$PythonArgs = @(
+    "evaluate.py",
+    "--data_path", $DataPath,
+    "--model_name", $ModelName,
+    "--output_path", $OutputPath,
+    "--max_new_tokens", $MaxNewTokens
+)
+if ($NoQuantize) {
+    $PythonArgs += "--no_quantize"
+}
+
+Write-Host "=== VLM Evaluation Benchmark ===" -ForegroundColor Cyan
+Write-Host "Dataset : $DataPath"
+Write-Host "Model   : $ModelName"
+Write-Host "Quantize: $(-not $NoQuantize)"
+Write-Host "Output  : $OutputPath"
+Write-Host ""
+
+python @PythonArgs
+
+if ($LASTEXITCODE -eq 0) {
+    Write-Host "Benchmark completed successfully." -ForegroundColor Green
+} else {
+    Write-Host "Benchmark failed with exit code $LASTEXITCODE." -ForegroundColor Red
+    # Let calling scripts and CI see the failure.
+    exit $LASTEXITCODE
+}
diff --git a/vlm_evaluation/sample_data/sample_drawing.jpg b/vlm_evaluation/sample_data/sample_drawing.jpg
diff --git a/vlm_evaluation/sample_data/sample_model.jpg b/vlm_evaluation/sample_data/sample_model.jpg
diff --git a/vlm_evaluation/sample_data/sample_origami.jpg b/vlm_evaluation/sample_data/sample_origami.jpg