Shonen-Labs · Akashbuilds · Aug 29, 2025 · Aug 29, 2025 · Aug 29, 2025
@@ -1,9 +1,14 @@
-import json, pathlib, orjson
+import json
+import pathlib
+
+import orjson
 from jsonschema import validate
+
 from .schema import schema
 
 PROC = pathlib.Path("data/processed")
 
+
 def main(out_path: str = "data/processed/dataset.jsonl"):
     idx = json.loads((PROC / "index.json").read_text())
     with open(out_path, "wb") as f:
@@ -12,18 +17,24 @@ def main(out_path: str = "data/processed/dataset.jsonl"):
             code = pathlib.Path(rec["code_path"]).read_text()
             # assign type via naive filename heuristics
             name = rec["contract_name"].lower()
-            if "erc20" in name: rec["type"] = "ERC20"
-            elif "erc721" in name: rec["type"] = "ERC721"
-            elif "amm" in name or "dex" in name or "pool" in name: rec["type"] = "DeFi"
-            elif "util" in name: rec["type"] = "Utility"
+            if "erc20" in name:
+                rec["type"] = "ERC20"
+            elif "erc721" in name:
+                rec["type"] = "ERC721"
+            elif "amm" in name or "dex" in name or "pool" in name:
+                rec["type"] = "DeFi"
+            elif "util" in name:
+                rec["type"] = "Utility"
             rec["code"] = code
             # minimal validation
             validate(instance=rec, schema=schema)
             f.write(orjson.dumps(rec) + b"\n")
     print("Wrote", out_path)
 
+
 if __name__ == "__main__":
     import argparse
+
     ap = argparse.ArgumentParser()
     ap.add_argument("--out", default="data/processed/dataset.jsonl")
     args = ap.parse_args()

@@ -1,35 +1,43 @@
-import json, pathlib, datetime
-from typing import Dict, Any, List
-from .detect_cairo_version import detect_cairo_version
-from .utils.text import strip_trailing_ws, normalize_indentation, is_duplicate
+import datetime
+import json
+import pathlib
+from typing import Any, Dict, List
+
 from src.utils.github_api import *
 
+from .detect_cairo_version import detect_cairo_version
+from .utils.text import is_duplicate, normalize_indentation, strip_trailing_ws
+
 RAW = pathlib.Path("data/raw")
 PROC = pathlib.Path("data/processed")
 V1 = PROC / "cairo_v1"
 V2 = PROC / "cairo_v2"
-for p in [PROC, V1, V2]: p.mkdir(parents=True, exist_ok=True)
+for p in [PROC, V1, V2]:
+    p.mkdir(parents=True, exist_ok=True)
+
 
 def _iter_raw():
     for path in RAW.rglob("*.json"):
         print(path)
         data = json.loads(path.read_text())
         yield path, data
 
+
 def _write_split(code: str, meta: Dict[str, Any], name_hint: str):
     v = detect_cairo_version(code)
     sub = V2 if v == "2" else V1
-    fname = (name_hint.replace("/","__").replace(" ","_"))[:100] + ".cairo"
+    fname = (name_hint.replace("/", "__").replace(" ", "_"))[:100] + ".cairo"
     (sub / fname).write_text(code)
     return v, str((sub / fname).as_posix())
 
+
 def main():
     seen: List[str] = []
     index: List[Dict[str, Any]] = []
 
     for path, data in _iter_raw():
-    
-        src = data.get("source") or data.get("meta",{}).get("source")
+
+        src = data.get("source") or data.get("meta", {}).get("source")
         # github shape
         if "files" in data and "meta" in data:
             meta = data["meta"]
@@ -42,41 +50,50 @@ def main():
                 if any(is_duplicate(code, s) for s in seen):
                     continue
                 seen.append(code)
-                v, saved = _write_split(code, meta, meta["repo"]["full_name"] + "__" + rec["path"])
-                index.append({
-                    "contract_name": rec["path"].split("/")[-1].replace(".cairo",""),
-                    "source": "github",
-                    "type": "Other",
-                    "cairo_version": v,
-                    "last_updated": meta["repo"].get("last_commit",""),
-                    "quality": {"category": "unknown"},
-                    "repo": meta["repo"],
-                    "code_path": saved
-                })
+                v, saved = _write_split(
+                    code, meta, meta["repo"]["full_name"] + "__" + rec["path"]
+                )
+                index.append(
+                    {
+                        "contract_name": rec["path"]
+                        .split("/")[-1]
+                        .replace(".cairo", ""),
+                        "source": "github",
+                        "type": "Other",
+                        "cairo_version": v,
+                        "last_updated": meta["repo"].get("last_commit", ""),
+                        "quality": {"category": "unknown"},
+                        "repo": meta["repo"],
+                        "code_path": saved,
+                    }
+                )
 
         # docs/blog shape
         elif "blocks" in data and "source" in data:
             for i, code in enumerate(data["blocks"]):
                 code = normalize_indentation(strip_trailing_ws(code))
-                if len(code) < 40: 
+                if len(code) < 40:
                     continue
                 if any(is_duplicate(code, s) for s in seen):
                     continue
                 seen.append(code)
                 v, saved = _write_split(code, data, f"{data.get('source')}_{i}")
-                index.append({
-                    "contract_name": f"{data.get('source')}_{i}",
-                    "source": data["source"],
-                    "type": "Other",
-                    "cairo_version": v,
-                    "last_updated": "",
-                    "quality": {"category": "tutorial"},
-                    "repo": {},
-                    "code_path": saved
-                })
+                index.append(
+                    {
+                        "contract_name": f"{data.get('source')}_{i}",
+                        "source": data["source"],
+                        "type": "Other",
+                        "cairo_version": v,
+                        "last_updated": "",
+                        "quality": {"category": "tutorial"},
+                        "repo": {},
+                        "code_path": saved,
+                    }
+                )
     (PROC / "index.json").write_text(json.dumps(index, indent=2))
 
+
 if __name__ == "__main__":
     main()
     # print(get_repo_tree("kkrt-labs", 'kakarot'))
-    # print(len(search_repos("language: Cairo starknet", max_repos=2)))
+    # print(len(search_repos("language: Cairo starknet", max_repos=2)))
@@ -1,7 +1,8 @@
 import re
 from typing import Literal
 
-def detect_cairo_version(code: str) -> Literal["1","2"]:
+
+def detect_cairo_version(code: str) -> Literal["1", "2"]:
     # Heuristics: Cairo 1/2 typically include "use" statements and modern syntax.
     # Cairo 0.x will be filtered earlier; this function distinguishes 1 vs 2 if needed.
     # (Adjust heuristics as needed as Cairo evolves.)

@@ -1,4 +1,5 @@
-from typing import Dict, Any
+from typing import Any, Dict
+
 
 def quality_tag(meta: Dict[str, Any]) -> Dict[str, Any]:
     """Heuristic quality category + score.
@@ -16,11 +17,15 @@ def quality_tag(meta: Dict[str, Any]) -> Dict[str, Any]:
 
     base = 0.2
     if src == "github":
-        base += min(stars/2000, 0.4) + min(forks/500, 0.2)
-        if has_tests: base += 0.08
-        if has_ci: base += 0.06
-        if has_audit: base += 0.1
-        if archived: base -= 0.1
+        base += min(stars / 2000, 0.4) + min(forks / 500, 0.2)
+        if has_tests:
+            base += 0.08
+        if has_ci:
+            base += 0.06
+        if has_audit:
+            base += 0.1
+        if archived:
+            base -= 0.1
         category = "production" if base >= 0.55 else "unknown"
     else:
         category = "tutorial"

@@ -1,16 +1,18 @@
-from .scrape_github import main as scrape_github
-from .scrape_docs import main as scrape_docs
-from .scrape_blogs import main as scrape_blogs
-from .clean_standardize import main as clean_std
 from .build_jsonl import main as build_jsonl
+from .clean_standardize import main as clean_std
+from .scrape_blogs import main as scrape_blogs
+from .scrape_docs import main as scrape_docs
+from .scrape_github import main as scrape_github
+
 
 def refresh():
     # Example orchestrator
-    scrape_github(query='language:Cairo starknet', max_repos=30)
+    scrape_github(query="language:Cairo starknet", max_repos=30)
     scrape_docs(max_items=30)
     scrape_blogs(max_items=30)
     clean_std()
     build_jsonl(out_path="data/processed/dataset.jsonl")
 
+
 if __name__ == "__main__":
     refresh()
@@ -1,26 +1,39 @@
-from typing import TypedDict, Literal, Dict, Any, Optional
+from typing import Any, Dict, Literal, Optional, TypedDict
 
-RecordType = Literal["ERC20","ERC721","DeFi","Utility","Other"]
-SourceType = Literal["github","docs","blog"]
-CairoVersion = Literal["1","2"]
+RecordType = Literal["ERC20", "ERC721", "DeFi", "Utility", "Other"]
+SourceType = Literal["github", "docs", "blog"]
+CairoVersion = Literal["1", "2"]
 
 schema: Dict[str, Any] = {
     "type": "object",
-    "required": ["contract_name","source","type","cairo_version","last_updated","code"],
+    "required": [
+        "contract_name",
+        "source",
+        "type",
+        "cairo_version",
+        "last_updated",
+        "code",
+    ],
     "properties": {
         "contract_name": {"type": "string", "minLength": 1},
-        "source": {"type": "string", "enum": ["github","docs","blog"]},
-        "type": {"type": "string", "enum": ["ERC20","ERC721","DeFi","Utility","Other"]},
-        "cairo_version": {"type": "string", "enum": ["1","2"]},
+        "source": {"type": "string", "enum": ["github", "docs", "blog"]},
+        "type": {
+            "type": "string",
+            "enum": ["ERC20", "ERC721", "DeFi", "Utility", "Other"],
+        },
+        "cairo_version": {"type": "string", "enum": ["1", "2"]},
         "last_updated": {"type": "string"},
         "quality": {
             "type": "object",
             "properties": {
-                "category": {"type": "string", "enum": ["production","tutorial","example","unknown"]},
-                "score": {"type": "number", "minimum": 0, "maximum": 1}
+                "category": {
+                    "type": "string",
+                    "enum": ["production", "tutorial", "example", "unknown"],
+                },
+                "score": {"type": "number", "minimum": 0, "maximum": 1},
             },
             "required": ["category"],
-            "additionalProperties": True
+            "additionalProperties": True,
         },
         "repo": {
             "type": "object",
@@ -29,11 +42,11 @@
                 "stars": {"type": "integer", "minimum": 0},
                 "forks": {"type": "integer", "minimum": 0},
                 "last_commit": {"type": "string"},
-                "archived": {"type": "boolean"}
+                "archived": {"type": "boolean"},
             },
-            "additionalProperties": True
+            "additionalProperties": True,
         },
-        "code": {"type": "string", "minLength": 1}
+        "code": {"type": "string", "minLength": 1},
     },
-    "additionalProperties": True
+    "additionalProperties": True,
 }
@@ -1,42 +1,64 @@
-import json, pathlib, re, time, requests, os
+import json
+import os
+import pathlib
+import re
+import time
 from datetime import datetime
-from bs4 import BeautifulSoup
+
+import requests
 import tests.test_schema
+from bs4 import BeautifulSoup
 
 RAW_DIR = pathlib.Path("data/raw/blogs")
 RAW_DIR.mkdir(parents=True, exist_ok=True)
 
+
 def fetch(url: str) -> str:
     r = requests.get(url, timeout=30)
     r.raise_for_status()
     return r.text
 
+
 def extract_code(html: str):
     soup = BeautifulSoup(html, "html.parser")
     out = []
-    for pre in soup.find_all(["pre","code"]):
+    for pre in soup.find_all(["pre", "code"]):
         txt = pre.get_text("\n")
-        if "use::core" in txt or "starknet::" in txt or "pub" in txt or "mod" in txt or "fn " in txt:
+        if (
+            "use::core" in txt
+            or "starknet::" in txt
+            or "pub" in txt
+            or "mod" in txt
+            or "fn " in txt
+        ):
             out.append(txt)
 
     return out
 
+
 def main(feeds_file: str = "feeds.txt", max_items: int = 50):
-    feeds = [ln.strip() for ln in pathlib.Path(feeds_file).read_text().splitlines() if ln.strip() and not ln.startswith("#")]
+    feeds = [
+        ln.strip()
+        for ln in pathlib.Path(feeds_file).read_text().splitlines()
+        if ln.strip() and not ln.startswith("#")
+    ]
     print(feeds)
     n = 0
     for feed in feeds:
         try:
             xml = fetch(feed)
             code = extract_code(xml)
 
-            file_name = os.path.join(RAW_DIR, f'cairo_blog_{datetime.now().strftime("%d-%m-%Y %H-%M-%S")}.txt')
+            file_name = os.path.join(
+                RAW_DIR,
+                f'cairo_blog_{datetime.now().strftime("%d-%m-%Y %H-%M-%S")}.txt',
+            )
             print(file_name)
 
-            with open(file_name, 'a') as file:
+            with open(file_name, "a") as file:
                 for c in code:
                     try:
-                        file.write(c + '\n')
+                        file.write(c + "\n")
                     except Exception as e:
                         print(e)
 
@@ -67,8 +89,9 @@ def main(feeds_file: str = "feeds.txt", max_items: int = 50):
 
 if __name__ == "__main__":
     import argparse
+
     ap = argparse.ArgumentParser()
     ap.add_argument("--feeds", default="feeds.txt")
     ap.add_argument("--max_items", type=int, default=50)
     args = ap.parse_args()
-    main(args.feeds, args.max_items)
+    main(args.feeds, args.max_items)