Skip to content
This repository was archived by the owner on Feb 22, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions cairo-corpus-starter/src/build_jsonl.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import json, pathlib, orjson
import json
import pathlib

import orjson
from jsonschema import validate

from .schema import schema

PROC = pathlib.Path("data/processed")


def main(out_path: str = "data/processed/dataset.jsonl"):
idx = json.loads((PROC / "index.json").read_text())
with open(out_path, "wb") as f:
Expand All @@ -12,18 +17,24 @@ def main(out_path: str = "data/processed/dataset.jsonl"):
code = pathlib.Path(rec["code_path"]).read_text()
# assign type via naive filename heuristics
name = rec["contract_name"].lower()
if "erc20" in name: rec["type"] = "ERC20"
elif "erc721" in name: rec["type"] = "ERC721"
elif "amm" in name or "dex" in name or "pool" in name: rec["type"] = "DeFi"
elif "util" in name: rec["type"] = "Utility"
if "erc20" in name:
rec["type"] = "ERC20"
elif "erc721" in name:
rec["type"] = "ERC721"
elif "amm" in name or "dex" in name or "pool" in name:
rec["type"] = "DeFi"
elif "util" in name:
rec["type"] = "Utility"
rec["code"] = code
# minimal validation
validate(instance=rec, schema=schema)
f.write(orjson.dumps(rec) + b"\n")
print("Wrote", out_path)


if __name__ == "__main__":
import argparse

ap = argparse.ArgumentParser()
ap.add_argument("--out", default="data/processed/dataset.jsonl")
args = ap.parse_args()
Expand Down
79 changes: 48 additions & 31 deletions cairo-corpus-starter/src/clean_standardize.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,43 @@
import json, pathlib, datetime
from typing import Dict, Any, List
from .detect_cairo_version import detect_cairo_version
from .utils.text import strip_trailing_ws, normalize_indentation, is_duplicate
import datetime
import json
import pathlib
from typing import Any, Dict, List

from src.utils.github_api import *

from .detect_cairo_version import detect_cairo_version
from .utils.text import is_duplicate, normalize_indentation, strip_trailing_ws

# Pipeline directories, all relative to the current working directory.
RAW = pathlib.Path("data/raw")
PROC = pathlib.Path("data/processed")
V1 = PROC.joinpath("cairo_v1")
V2 = PROC.joinpath("cairo_v2")

# Import-time side effect: make sure the output tree exists before any
# function in this module tries to write into it.
for p in (PROC, V1, V2):
    p.mkdir(parents=True, exist_ok=True)


def _iter_raw():
    """Yield ``(path, data)`` for every ``*.json`` file found under ``RAW``.

    ``RAW`` is searched recursively. Files are visited in sorted path order
    rather than raw directory-listing order, so repeated runs over the same
    tree hand records to the caller in the same sequence.

    Yields:
        Tuple of the file's ``pathlib.Path`` and its parsed JSON content.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # Directory traversal order is filesystem-dependent; sorting pins it.
    for path in sorted(RAW.rglob("*.json")):
        print(path)  # progress trace: one line per raw file
        data = json.loads(path.read_text())
        yield path, data


def _write_split(code: str, meta: Dict[str, Any], name_hint: str):
    """Save ``code`` under the directory for its detected Cairo version.

    Args:
        code: Source text to write.
        meta: Accepted from callers but not read here.
        name_hint: Basis for the file name. ``/`` becomes ``__``, spaces
            become ``_``, the result is cut to 100 characters and
            ``.cairo`` is appended.

    Returns:
        ``(version, path)``: the value returned by ``detect_cairo_version``
        and the POSIX-style path of the file that was written.
    """
    version = detect_cairo_version(code)
    target_dir = V1 if version != "2" else V2
    stem = name_hint.replace("/", "__").replace(" ", "_")
    # NOTE(review): hints that share their first 100 characters resolve to
    # the same file, and the later write replaces the earlier one.
    target = target_dir / f"{stem[:100]}.cairo"
    target.write_text(code)
    return version, target.as_posix()


def main():
seen: List[str] = []
index: List[Dict[str, Any]] = []

for path, data in _iter_raw():
src = data.get("source") or data.get("meta",{}).get("source")

src = data.get("source") or data.get("meta", {}).get("source")
# github shape
if "files" in data and "meta" in data:
meta = data["meta"]
Expand All @@ -42,41 +50,50 @@ def main():
if any(is_duplicate(code, s) for s in seen):
continue
seen.append(code)
v, saved = _write_split(code, meta, meta["repo"]["full_name"] + "__" + rec["path"])
index.append({
"contract_name": rec["path"].split("/")[-1].replace(".cairo",""),
"source": "github",
"type": "Other",
"cairo_version": v,
"last_updated": meta["repo"].get("last_commit",""),
"quality": {"category": "unknown"},
"repo": meta["repo"],
"code_path": saved
})
v, saved = _write_split(
code, meta, meta["repo"]["full_name"] + "__" + rec["path"]
)
index.append(
{
"contract_name": rec["path"]
.split("/")[-1]
.replace(".cairo", ""),
"source": "github",
"type": "Other",
"cairo_version": v,
"last_updated": meta["repo"].get("last_commit", ""),
"quality": {"category": "unknown"},
"repo": meta["repo"],
"code_path": saved,
}
)

# docs/blog shape
elif "blocks" in data and "source" in data:
for i, code in enumerate(data["blocks"]):
code = normalize_indentation(strip_trailing_ws(code))
if len(code) < 40:
if len(code) < 40:
continue
if any(is_duplicate(code, s) for s in seen):
continue
seen.append(code)
v, saved = _write_split(code, data, f"{data.get('source')}_{i}")
index.append({
"contract_name": f"{data.get('source')}_{i}",
"source": data["source"],
"type": "Other",
"cairo_version": v,
"last_updated": "",
"quality": {"category": "tutorial"},
"repo": {},
"code_path": saved
})
index.append(
{
"contract_name": f"{data.get('source')}_{i}",
"source": data["source"],
"type": "Other",
"cairo_version": v,
"last_updated": "",
"quality": {"category": "tutorial"},
"repo": {},
"code_path": saved,
}
)
(PROC / "index.json").write_text(json.dumps(index, indent=2))


if __name__ == "__main__":
main()
# print(get_repo_tree("kkrt-labs", 'kakarot'))
# print(len(search_repos("language: Cairo starknet", max_repos=2)))
# print(len(search_repos("language: Cairo starknet", max_repos=2)))
3 changes: 2 additions & 1 deletion cairo-corpus-starter/src/detect_cairo_version.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import re
from typing import Literal

def detect_cairo_version(code: str) -> Literal["1","2"]:

def detect_cairo_version(code: str) -> Literal["1", "2"]:
# Heuristics: Cairo 1/2 typically include "use" statements and modern syntax.
# Cairo 0.x will be filtered earlier; this function distinguishes 1 vs 2 if needed.
# (Adjust heuristics as needed as Cairo evolves.)
Expand Down
17 changes: 11 additions & 6 deletions cairo-corpus-starter/src/quality.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Dict, Any
from typing import Any, Dict


def quality_tag(meta: Dict[str, Any]) -> Dict[str, Any]:
"""Heuristic quality category + score.
Expand All @@ -16,11 +17,15 @@ def quality_tag(meta: Dict[str, Any]) -> Dict[str, Any]:

base = 0.2
if src == "github":
base += min(stars/2000, 0.4) + min(forks/500, 0.2)
if has_tests: base += 0.08
if has_ci: base += 0.06
if has_audit: base += 0.1
if archived: base -= 0.1
base += min(stars / 2000, 0.4) + min(forks / 500, 0.2)
if has_tests:
base += 0.08
if has_ci:
base += 0.06
if has_audit:
base += 0.1
if archived:
base -= 0.1
category = "production" if base >= 0.55 else "unknown"
else:
category = "tutorial"
Expand Down
12 changes: 7 additions & 5 deletions cairo-corpus-starter/src/refresh.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from .scrape_github import main as scrape_github
from .scrape_docs import main as scrape_docs
from .scrape_blogs import main as scrape_blogs
from .clean_standardize import main as clean_std
from .build_jsonl import main as build_jsonl
from .clean_standardize import main as clean_std
from .scrape_blogs import main as scrape_blogs
from .scrape_docs import main as scrape_docs
from .scrape_github import main as scrape_github


def refresh():
    """Run the whole corpus pipeline once: scrape, clean, then build JSONL."""
    per_source_limit = 30
    dataset_path = "data/processed/dataset.jsonl"

    # Collection: pull raw material from each source.
    scrape_github(query="language:Cairo starknet", max_repos=per_source_limit)
    scrape_docs(max_items=per_source_limit)
    scrape_blogs(max_items=per_source_limit)

    # Cleaning must precede the build: it writes the index.json that the
    # JSONL builder reads.
    clean_std()
    build_jsonl(out_path=dataset_path)


if __name__ == "__main__":
refresh()
43 changes: 28 additions & 15 deletions cairo-corpus-starter/src/schema.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,39 @@
from typing import TypedDict, Literal, Dict, Any, Optional
from typing import Any, Dict, Literal, Optional, TypedDict

RecordType = Literal["ERC20","ERC721","DeFi","Utility","Other"]
SourceType = Literal["github","docs","blog"]
CairoVersion = Literal["1","2"]
RecordType = Literal["ERC20", "ERC721", "DeFi", "Utility", "Other"]
SourceType = Literal["github", "docs", "blog"]
CairoVersion = Literal["1", "2"]

schema: Dict[str, Any] = {
"type": "object",
"required": ["contract_name","source","type","cairo_version","last_updated","code"],
"required": [
"contract_name",
"source",
"type",
"cairo_version",
"last_updated",
"code",
],
"properties": {
"contract_name": {"type": "string", "minLength": 1},
"source": {"type": "string", "enum": ["github","docs","blog"]},
"type": {"type": "string", "enum": ["ERC20","ERC721","DeFi","Utility","Other"]},
"cairo_version": {"type": "string", "enum": ["1","2"]},
"source": {"type": "string", "enum": ["github", "docs", "blog"]},
"type": {
"type": "string",
"enum": ["ERC20", "ERC721", "DeFi", "Utility", "Other"],
},
"cairo_version": {"type": "string", "enum": ["1", "2"]},
"last_updated": {"type": "string"},
"quality": {
"type": "object",
"properties": {
"category": {"type": "string", "enum": ["production","tutorial","example","unknown"]},
"score": {"type": "number", "minimum": 0, "maximum": 1}
"category": {
"type": "string",
"enum": ["production", "tutorial", "example", "unknown"],
},
"score": {"type": "number", "minimum": 0, "maximum": 1},
},
"required": ["category"],
"additionalProperties": True
"additionalProperties": True,
},
"repo": {
"type": "object",
Expand All @@ -29,11 +42,11 @@
"stars": {"type": "integer", "minimum": 0},
"forks": {"type": "integer", "minimum": 0},
"last_commit": {"type": "string"},
"archived": {"type": "boolean"}
"archived": {"type": "boolean"},
},
"additionalProperties": True
"additionalProperties": True,
},
"code": {"type": "string", "minLength": 1}
"code": {"type": "string", "minLength": 1},
},
"additionalProperties": True
"additionalProperties": True,
}
41 changes: 32 additions & 9 deletions cairo-corpus-starter/src/scrape_blogs.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,64 @@
import json, pathlib, re, time, requests, os
import json
import os
import pathlib
import re
import time
from datetime import datetime
from bs4 import BeautifulSoup

import requests
import tests.test_schema
from bs4 import BeautifulSoup

# Landing directory for scraped blog output; created at import time so the
# writer can assume it exists.
RAW_DIR = pathlib.Path("data", "raw", "blogs")
RAW_DIR.mkdir(parents=True, exist_ok=True)


def fetch(url: str) -> str:
    """Download ``url`` and return the response body as text.

    The request is abandoned after a 30-second timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    body = response.text
    return body


def extract_code(html: str):
    """Collect the text of ``<pre>``/``<code>`` elements that look like code.

    An element is kept when its text contains at least one of the marker
    substrings listed below.

    Args:
        html: Markup to parse.

    Returns:
        List of the matching elements' texts, in the order ``find_all``
        returned them.
    """
    # NOTE(review): "use::core" looks like a typo for "use core::" — confirm.
    markers = ("use::core", "starknet::", "pub", "mod", "fn ")
    document = BeautifulSoup(html, "html.parser")
    texts = (tag.get_text("\n") for tag in document.find_all(["pre", "code"]))
    return [text for text in texts if any(m in text for m in markers)]


def main(feeds_file: str = "feeds.txt", max_items: int = 50):
feeds = [ln.strip() for ln in pathlib.Path(feeds_file).read_text().splitlines() if ln.strip() and not ln.startswith("#")]
feeds = [
ln.strip()
for ln in pathlib.Path(feeds_file).read_text().splitlines()
if ln.strip() and not ln.startswith("#")
]
print(feeds)
n = 0
for feed in feeds:
try:
xml = fetch(feed)
code = extract_code(xml)

file_name = os.path.join(RAW_DIR, f'cairo_blog_{datetime.now().strftime("%d-%m-%Y %H-%M-%S")}.txt')
file_name = os.path.join(
RAW_DIR,
f'cairo_blog_{datetime.now().strftime("%d-%m-%Y %H-%M-%S")}.txt',
)
print(file_name)

with open(file_name, 'a') as file:
with open(file_name, "a") as file:
for c in code:
try:
file.write(c + '\n')
file.write(c + "\n")
except Exception as e:
print(e)

Expand Down Expand Up @@ -67,8 +89,9 @@ def main(feeds_file: str = "feeds.txt", max_items: int = 50):

if __name__ == "__main__":
    # Command-line entry point: forward --feeds / --max_items to main().
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--feeds", default="feeds.txt")
    parser.add_argument("--max_items", type=int, default=50)
    cli = parser.parse_args()
    main(feeds_file=cli.feeds, max_items=cli.max_items)
Loading
Loading