Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
__pycache__/
*.pyc
vlm_env/
.ruff_cache/
.vscode/
results.json
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
# C4GT_2026
# theApprenticeProject (C4GT 2026)

This repository contains the VLM Evaluation Pipeline developed for The Apprentice Project.

## VLM Evaluation Pipeline
A cost-efficient Vision Language Model (VLM) pipeline designed to evaluate student artifacts (images/videos) against 21st-century skills rubrics.
- **Key Directory**: `vlm_evaluation/`
- **Dependencies**: See `vlm_evaluation/requirements.txt`
49 changes: 49 additions & 0 deletions vlm_evaluation/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import json
import os
from PIL import Image

class ArtifactDataset:
    """Index-addressable collection of student artifacts described by a JSON file.

    Each record of the JSON list is kept verbatim as ``metadata``; the image it
    points to is opened lazily when the record is indexed.
    """

    def __init__(self, data_path: str):
        """
        Initializes the dataset loader.
        Assumes data_path points to a JSON file containing evaluation metadata:
        [
            {
                "image_path": "data/images/student1.jpg",
                "student_id": "123",
                "artifact_type": "Origami",
                "rubric": "1: No effort, 5: Perfect folds and presentation",
                "ground_truth_score": 4
            }, ...
        ]
        The "rubric" value is stored as-is; generate_sample_data.py emits it as a
        mapping (skill / dimension / max / criteria) rather than a string.

        If data_path does not exist, a warning is printed and the dataset is
        left empty instead of raising.
        """
        self.data_path = data_path
        self.data = []

        if os.path.exists(data_path):
            # Explicit encoding: JSON is UTF-8, but open() would otherwise use
            # the platform locale (e.g. cp1252 on Windows).
            with open(data_path, 'r', encoding="utf-8") as f:
                self.data = json.load(f)
        else:
            print(f"Warning: Dataset file {data_path} not found. Returning empty dataset.")
            print("Please create this file or generate a sample dataset.")

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": <RGB PIL image or None>, "metadata": <record>}``.

        Relative image paths are resolved against the directory containing the
        dataset JSON. Any failure to load the image is reported on stdout and
        yields ``image=None`` so one bad record does not abort a whole run.
        """
        item = self.data[idx]
        image_path = item.get("image_path")
        image = None

        if not image_path:
            # Previously this case only worked by accident (a TypeError from
            # os.path.isabs(None) swallowed by the broad except below).
            print(f"Error loading image {image_path}: record has no image_path")
        else:
            try:
                # Handle absolute or relative paths gracefully based on the json directory
                base_dir = os.path.dirname(self.data_path)
                if os.path.isabs(image_path):
                    full_image_path = image_path
                else:
                    full_image_path = os.path.join(base_dir, image_path)
                # convert() returns an independent in-memory copy, so the file
                # handle can be released as soon as the block exits.
                with Image.open(full_image_path) as source:
                    image = source.convert("RGB")
            except Exception as e:  # deliberate best-effort: skip unreadable images
                print(f"Error loading image {image_path}: {e}")
                image = None

        return {
            "image": image,
            "metadata": item
        }
189 changes: 189 additions & 0 deletions vlm_evaluation/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import argparse
import json
import os
import re
import torch
from tqdm import tqdm
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn
from transformers import (
LlavaForConditionalGeneration,
AutoProcessor,
BitsAndBytesConfig,
)
from dataset import ArtifactDataset
from prompts import SYSTEM_PROMPT, generate_evaluation_prompt


# Shape of the JSON object the model is constrained to emit: main() turns this
# class into a JSON schema and hands it to JsonSchemaParser for guided decoding.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a class docstring into the generated schema as "description".
class EvaluationOutput(BaseModel):
    # presumably mirrors the rubric's "skill" key from the dataset -- confirm
    skill: str
    # presumably mirrors the rubric's "dimension" key from the dataset -- confirm
    dimension: str
    # the predicted score; extract_score() reads this key from the decoded JSON
    score: int
    # presumably the rubric's maximum score ("max" key) -- confirm
    max: int


def parse_args():
    """Parse the evaluation script's command-line options.

    Returns:
        argparse.Namespace with ``data_path`` (required), ``model_name``,
        ``quantize`` (on unless ``--no_quantize`` is given), ``output_path``
        and ``max_new_tokens``.
    """
    cli = argparse.ArgumentParser(description="VLM Evaluation Pipeline")

    # Registered in this exact order so the generated --help text is unchanged.
    option_table = (
        ("--data_path", {"type": str, "required": True, "help": "Path to dataset JSON"}),
        ("--model_name", {"type": str, "default": "llava-hf/llava-1.5-7b-hf"}),
        ("--quantize", {"action": "store_true", "default": True}),
        ("--no_quantize", {"action": "store_false", "dest": "quantize"}),
        ("--output_path", {"type": str, "default": "results.json"}),
        ("--max_new_tokens", {"type": int, "default": 256}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    return cli.parse_args()


def load_model(model_name, quantize=True):
    """Load a LLaVA checkpoint and its processor.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the processor.
        quantize: When true, a 4-bit NF4 BitsAndBytes configuration (double
            quantization, float16 compute) is passed to the model loader;
            otherwise no quantization config is passed.

    Returns:
        A ``(model, processor)`` tuple.
    """
    bnb_config = None
    if quantize:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    loader_options = {
        "quantization_config": bnb_config,
        "device_map": "auto",
        "torch_dtype": torch.float16,
    }
    vlm = LlavaForConditionalGeneration.from_pretrained(model_name, **loader_options)
    vlm_processor = AutoProcessor.from_pretrained(model_name)
    return vlm, vlm_processor


def extract_score(text, min_score=1, max_score=5):
    """Extract a valid integer score from the model's JSON response.

    Args:
        text: Raw decoded model output, expected to be a JSON object with a
            ``"score"`` key.
        min_score: Lowest acceptable score (inclusive). Defaults to 1.
        max_score: Highest acceptable score (inclusive). Defaults to 5.

    Returns:
        The score as an ``int`` when the response is a JSON object whose
        ``"score"`` is an integer within ``[min_score, max_score]``;
        otherwise ``None``. Never raises on malformed input.
    """
    try:
        # Parse the JSON directly instead of using Regex
        parsed = json.loads(text)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError (truncated / invalid JSON);
        # TypeError covers a non-string input such as None.
        return None

    # Valid JSON is not necessarily an object: "[1]" or "3" have no .get().
    if not isinstance(parsed, dict):
        return None

    score = parsed.get("score")
    # bool is a subclass of int; JSON `true` must not be read as a score of 1.
    if isinstance(score, bool) or not isinstance(score, int):
        return None

    return score if min_score <= score <= max_score else None


def compute_metrics(predictions, ground_truths):
    """Summarize how well predicted scores match the ground truth.

    ``predictions`` may contain ``None`` for samples whose response could not
    be parsed. Such samples count as misses for the accuracy figures, are
    excluded from the mean absolute error (there is no error magnitude to
    average), and lower ``parse_rate``.

    Args:
        predictions: Predicted scores (numbers or ``None``), aligned with
            ``ground_truths``.
        ground_truths: Reference scores.

    Returns:
        ``{}`` when ``ground_truths`` is empty; otherwise a dict with
        ``total_samples``, ``exact_accuracy`` (%), ``within_1_accuracy`` (%),
        ``mean_absolute_error`` (``None`` if nothing was parsed) and
        ``parse_rate`` (%).
    """
    total = len(ground_truths)
    if total == 0:
        return {}

    # Only pairs with a usable prediction can be compared numerically.
    scored_pairs = [
        (p, g) for p, g in zip(predictions, ground_truths) if p is not None
    ]
    abs_errors = [abs(p - g) for p, g in scored_pairs]

    exact = sum(1 for p, g in scored_pairs if p == g)
    within_1 = sum(1 for err in abs_errors if err <= 1)
    mae = sum(abs_errors) / len(abs_errors) if abs_errors else None
    parsed = sum(1 for p in predictions if p is not None)

    return {
        "total_samples": total,
        "exact_accuracy": round(exact / total * 100, 2),
        "within_1_accuracy": round(within_1 / total * 100, 2),
        "mean_absolute_error": None if mae is None else round(mae, 4),
        "parse_rate": round(parsed / total * 100, 2),
    }


def main():
    """Run the evaluation pipeline end to end.

    Loads the dataset and model named on the command line, asks the model for
    a schema-constrained JSON verdict on every sample whose image could be
    loaded, writes per-sample results plus aggregate metrics to the output
    path, and prints the metrics. Returns early if the dataset is empty.
    """
    cli = parse_args()

    if not torch.cuda.is_available():
        print("Warning: CUDA not available. Inference will be slow on CPU.")

    print(f"Loading dataset from {cli.data_path}...")
    artifacts = ArtifactDataset(cli.data_path)
    if not len(artifacts):
        print("Dataset is empty. Exiting.")
        return

    print(f"Loading model {cli.model_name} (quantize={cli.quantize})...")
    model, processor = load_model(cli.model_name, quantize=cli.quantize)

    records, scored_predictions, scored_truths = [], [], []

    for index in tqdm(range(len(artifacts)), desc="Evaluating"):
        entry = artifacts[index]
        info = entry["metadata"]
        picture = entry["image"]

        # Samples whose image failed to load are skipped entirely.
        if picture is None:
            continue

        student = info.get("student_id", "unknown")
        category = info.get("artifact_type", "unknown")

        prompt = generate_evaluation_prompt(
            student_id=student,
            artifact_type=category,
            rubric=info.get("rubric", {}),
        )

        target_device = "cuda" if torch.cuda.is_available() else "cpu"
        model_inputs = processor(
            text=prompt, images=picture, return_tensors="pt"
        ).to(target_device)

        # model_json_schema() is tried first; .schema() is the fallback when
        # the attribute does not exist on this pydantic version.
        try:
            output_schema = EvaluationOutput.model_json_schema()
        except AttributeError:
            output_schema = EvaluationOutput.schema()

        allowed_tokens_fn = build_transformers_prefix_allowed_tokens_fn(
            processor.tokenizer, JsonSchemaParser(output_schema)
        )

        with torch.no_grad():
            generated = model.generate(
                **model_inputs,
                max_new_tokens=cli.max_new_tokens,
                do_sample=False,
                prefix_allowed_tokens_fn=allowed_tokens_fn,
            )

        # Drop the echoed prompt tokens; only the continuation is the answer.
        prompt_length = model_inputs['input_ids'].shape[1]
        reply = processor.decode(
            generated[0][prompt_length:], skip_special_tokens=True
        ).strip()

        prediction = extract_score(reply)
        truth = info.get("ground_truth_score")

        records.append(
            {
                "student_id": student,
                "predicted_score": prediction,
                "ground_truth_score": truth,
                "raw_response": reply,
                "artifact_type": category,
            }
        )

        # Only fully scored pairs feed the aggregate metrics.
        if prediction is None or truth is None:
            continue
        scored_predictions.append(prediction)
        scored_truths.append(truth)

    metrics = compute_metrics(scored_predictions, scored_truths)

    report = {
        "config": {
            "model_name": cli.model_name,
            "quantize": cli.quantize,
            "dataset": cli.data_path,
        },
        "metrics": metrics,
        "results": records,
    }

    with open(cli.output_path, "w") as sink:
        json.dump(report, sink, indent=2)

    rule = "=" * 50
    print("\n" + rule)
    print("EVALUATION METRICS")
    print(rule)
    for metric_name, metric_value in metrics.items():
        print(f" {metric_name}: {metric_value}")
    print(rule)
    print(f"Results saved to {cli.output_path}")


if __name__ == "__main__":
main()
72 changes: 72 additions & 0 deletions vlm_evaluation/generate_sample_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import json
import os
from PIL import Image, ImageDraw

# Seed records for the sample dataset. Each "image_path" here is a bare file
# name; main() creates a placeholder image under sample_data/ for it and
# writes the records to sample_dataset.json with the path prefixed by
# "sample_data". "rubric" is a mapping (skill / dimension / max / criteria).
SAMPLE_DATA = [
    {
        "image_path": "sample_origami.jpg",
        "student_id": "S001",
        "artifact_type": "Origami",
        "rubric": {
            "skill": "creativity",
            "dimension": "originality",
            "max": 5,
            "criteria": "1: No recognizable shape, 5: Perfect folds with clean edges and symmetry"
        },
        "ground_truth_score": 4,
    },
    {
        "image_path": "sample_drawing.jpg",
        "student_id": "S002",
        "artifact_type": "Drawing",
        "rubric": {
            "skill": "creativity",
            "dimension": "composition",
            "max": 5,
            "criteria": "1: No effort, 5: Detailed and creative composition"
        },
        "ground_truth_score": 3,
    },
    {
        "image_path": "sample_model.jpg",
        "student_id": "S003",
        "artifact_type": "Clay Model",
        "rubric": {
            "skill": "problem_solving",
            "dimension": "execution",
            "max": 5,
            "criteria": "1: Unrecognizable, 5: Realistic and well-finished model"
        },
        "ground_truth_score": 5,
    },
]


def create_dummy_image(path, size=(224, 224), color=(200, 100, 50)):
    """Write a placeholder RGB image to ``path`` and report it on stdout.

    The image is a solid ``color`` canvas of the given ``size`` with a white
    rectangle outline and a filled green ellipse drawn at fixed pixel
    coordinates.
    """
    white = (255, 255, 255)
    green = (100, 200, 100)

    canvas = Image.new("RGB", size, color)
    pen = ImageDraw.Draw(canvas)
    pen.rectangle([50, 50, 174, 174], outline=white, width=3)
    pen.ellipse([80, 80, 144, 144], fill=green)

    canvas.save(path)
    print(f"Created {path}")


def main():
    """Generate placeholder images and the matching ``sample_dataset.json``.

    Images are written to ``sample_data/`` next to this script, and the JSON
    file is written alongside the script with each record's ``image_path``
    pointing into that directory (relative to the JSON file).

    ``SAMPLE_DATA`` itself is left untouched, so calling this function more
    than once (or importing the constant elsewhere) does not accumulate a
    ``sample_data/sample_data/...`` prefix.
    """
    output_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(output_dir, "sample_data")
    os.makedirs(data_dir, exist_ok=True)

    records = []
    for item in SAMPLE_DATA:
        file_name = item["image_path"]
        create_dummy_image(os.path.join(data_dir, file_name))
        # Copy instead of mutating the module-level template. A forward slash
        # keeps the generated JSON identical and usable on every OS.
        records.append({**item, "image_path": f"sample_data/{file_name}"})

    json_path = os.path.join(output_dir, "sample_dataset.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    print(f"Sample dataset saved to {json_path}")


if __name__ == "__main__":
main()
17 changes: 17 additions & 0 deletions vlm_evaluation/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import json
SYSTEM_PROMPT = """You are an expert evaluator assessing student artifacts for The Apprentice Project.
You must output your evaluation STRICTLY as a valid JSON object. Do not include any other conversational text."""


def generate_evaluation_prompt(student_id: str, artifact_type: str, rubric: "str | dict") -> str:
    """Build the LLaVA-1.5 chat prompt for grading one artifact.

    Args:
        student_id: Identifier shown to the model as the artifact ID.
        artifact_type: Category of the artifact (e.g. "Origami").
        rubric: Either a plain-text rubric, inserted verbatim, or a
            JSON-serializable mapping, rendered as indented JSON so the model
            sees the same notation it is asked to answer in.

    Returns:
        A single-turn ``USER: ... ASSISTANT:`` prompt. It contains the
        ``<image>`` placeholder that the LLaVA processor/model need in order
        to splice in the image features, and requests the same keys that
        evaluate.py enforces through its ``EvaluationOutput`` schema.
    """
    if isinstance(rubric, str):
        rubric_text = rubric
    else:
        rubric_text = json.dumps(rubric, indent=2, ensure_ascii=False)

    return f"""USER: <image>
{SYSTEM_PROMPT}

Artifact ID: {student_id}
Category: {artifact_type}
Rubric Schema:
{rubric_text}

Please evaluate the artifact based on the rubric.
Output strictly in this JSON format:
{{"skill": "<skill>", "dimension": "<dimension>", "score": <int>, "max": <int>}}
ASSISTANT:"""
9 changes: 9 additions & 0 deletions vlm_evaluation/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
transformers>=4.38.2
torch
peft
bitsandbytes
Pillow
accelerate
datasets
lm-format-enforcer
pydantic
29 changes: 29 additions & 0 deletions vlm_evaluation/run_benchmark.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<#
.SYNOPSIS
Runs evaluate.py with the given dataset, model and output settings.

.PARAMETER DataPath
Path to the dataset JSON passed as --data_path.

.PARAMETER ModelName
Model identifier passed as --model_name.

.PARAMETER NoQuantize
When set, --no_quantize is forwarded to evaluate.py.

.PARAMETER OutputPath
Path passed as --output_path.

.PARAMETER MaxNewTokens
Value passed as --max_new_tokens.
#>
param(
    [string]$DataPath = "sample_dataset.json",
    [string]$ModelName = "llava-hf/llava-1.5-7b-hf",
    [switch]$NoQuantize = $false,
    [string]$OutputPath = "results.json",
    [int]$MaxNewTokens = 256
)

# Build the argument list explicitly and splat it. Interpolating an empty
# "$QuantizeFlag" string into the command line is passed to python as a
# literal empty argument on PowerShell 7.3+, which argparse rejects.
$EvalArgs = @(
    "--data_path", $DataPath,
    "--model_name", $ModelName,
    "--output_path", $OutputPath,
    "--max_new_tokens", $MaxNewTokens
)
if ($NoQuantize) {
    $EvalArgs += "--no_quantize"
}

Write-Host "=== VLM Evaluation Benchmark ===" -ForegroundColor Cyan
Write-Host "Dataset : $DataPath"
Write-Host "Model : $ModelName"
Write-Host "Quantize: $(-not $NoQuantize)"
Write-Host "Output : $OutputPath"
Write-Host ""

python evaluate.py @EvalArgs

if ($LASTEXITCODE -eq 0) {
    Write-Host "Benchmark completed successfully." -ForegroundColor Green
} else {
    Write-Host "Benchmark failed with exit code $LASTEXITCODE." -ForegroundColor Red
}
Binary file added vlm_evaluation/sample_data/sample_drawing.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added vlm_evaluation/sample_data/sample_model.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added vlm_evaluation/sample_data/sample_origami.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading