From 72042ecbcad384ada7b72ab2d1c78ef342b59596 Mon Sep 17 00:00:00 2001 From: gustavo-grieco Date: Wed, 3 Sep 2025 09:21:19 +0200 Subject: [PATCH] added support to filter by language discarding test related code --- dataset-generator/curate_dataset.py | 37 +++++++++++++++++++++++++---- dataset-generator/models.py | 5 +++- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/dataset-generator/curate_dataset.py b/dataset-generator/curate_dataset.py index 19ef5f0..9b1753a 100755 --- a/dataset-generator/curate_dataset.py +++ b/dataset-generator/curate_dataset.py @@ -168,7 +168,7 @@ def run_cloc_on_repo(repo_url: str) -> Dict[str, Any]: # Run cloc result = subprocess.run( - ["cloc", "--json", "--quiet", str(clone_path)], + ["cloc", "--json", "--quiet", "--exclude-dir=test,tests", str(clone_path)], capture_output=True, text=True, timeout=30 @@ -401,6 +401,12 @@ def main(): default=1, help="Minimum number of high or critical vulnerabilities required (default: 1)" ) + parser.add_argument( + "--language", + type=str, + default=None, + help="Filter projects by a specific language (e.g., Solidity, Rust)" + ) args = parser.parse_args() @@ -497,8 +503,31 @@ def main(): cloc_stats=cloc_stats ) - curated_entries.append(entry) - project_stats_list.append(project_stat) + # Add language to the entry + primary_language = None + # Languages skipped when picking the primary language (markup/config formats plus web and shell tooling); despite its name, this set is an exclusion list + language_whitelist = {"json", "html", "markdown", "css", "yaml", "toml", "xml", "shell", "text", "typescript", "javascript", "svg"} + + if "languages" in cloc_stats and cloc_stats["languages"]: + sorted_langs = sorted(cloc_stats["languages"].items(), key=lambda x: x[1]["lines"], reverse=True) + + # Find the first language (by descending line count) that is not in the exclusion set + for lang, _ in sorted_langs: + if lang.lower() not in language_whitelist: + primary_language = lang + entry["language"] = primary_language + break + + # Filter by language if specified + if args.language: + if not primary_language or primary_language.lower() 
!= args.language.lower(): + print(f" Language Mismatch: Skipping project with language '{primary_language}'") + else: + curated_entries.append(entry) + project_stats_list.append(project_stat) + else: + curated_entries.append(entry) + project_stats_list.append(project_stat) else: print(f"✗ {reason}") @@ -536,4 +565,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/dataset-generator/models.py b/dataset-generator/models.py index b334f64..e26405a 100644 --- a/dataset-generator/models.py +++ b/dataset-generator/models.py @@ -37,7 +37,8 @@ class Project: vulnerabilities: List[Vulnerability] = field(default_factory=list) contest_date: Optional[datetime] = None report_url: Optional[str] = None - + language: Optional[str] = None + def to_dict(self): result = { "project_id": self.project_id, @@ -48,6 +49,8 @@ def to_dict(self): } if self.report_url: result["report_url"] = self.report_url + if self.language: + result["language"] = self.language return result