Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions contrib/multilingual/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# SkillSpector Contrib Batch Scanner — Environment Configuration
#
# Copy to the repository root as .env:
# cp contrib/multilingual/.env.example .env
#
# The scanner also respects the upstream .env.example keys
# (OPENAI_API_KEY, SKILLSPECTOR_PROVIDER, SKILLSPECTOR_MODEL).

# Provider configuration
SKILLSPECTOR_PROVIDER=openai
SKILLSPECTOR_MODEL=deepseek-v4-flash

# Single-key mode (any standard OpenAI-compatible endpoint)
OPENAI_API_KEY=sk-or-xxxxxxxxxxxxxxxxxxxxxxxx
OPENAI_BASE_URL=https://api.deepseek.com/v1

# Multi-key pool (recommended for batch scans).
# Each entry is pipe-delimited: key|base_url|model. Separate entries
# with newlines or semicolons. Up to 10 keys are supported. Leave unset
# to use the single-key mode above.
# SKILLSPECTOR_API_KEYS="
# sk-or-xxx1|https://api.deepseek.com/v1|deepseek-v4-flash
# sk-or-xxx2|https://api.deepseek.com/v1|deepseek-v4-flash
# "

# Logging (DEBUG | INFO | WARNING | ERROR)
SKILLSPECTOR_LOG_LEVEL=WARNING
69 changes: 69 additions & 0 deletions contrib/multilingual/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Multilingual batch scan for SkillSpector.

Community-contributed tool for scanning directories of AI agent skills
in non-English languages. Extends SkillSpector's built-in analyzers
with targeted LLM gap-fill for vulnerability categories that static
English-keyword regex rules cannot detect.

Public API
----------
- :func:`~.discovery.discover_skills`
- :func:`~.detection.detect_language`
- :func:`~.detection.detect_skill_language`
- :func:`~.annotation.is_language_compatible`
- :func:`~.annotation.annotate_findings`
- :func:`~.gap_fill.run_gap_fill`
- :func:`~.runner.run_one`
"""

from __future__ import annotations

# -- .env MUST load before any skillspector import. Python imports
# this __init__.py before executing the batch_scan module body;
# without this early load, constants.py resolves the provider
# with stale env vars.
#
# python-dotenv is treated as optional: when it is not installed the
# ImportError is deliberately ignored and the process environment is
# used as-is.
try:
    import dotenv as _dotenv
except ImportError:
    pass
else:
    # Per python-dotenv: usecwd=True starts the .env search from the
    # current working directory, and override=True lets values from the
    # file replace variables that are already set in the environment.
    _dotenv.load_dotenv(_dotenv.find_dotenv(usecwd=True), override=True)

from .annotation import annotate_findings, is_language_compatible
from .api_pool import ApiKey, ApiKeyPool, PooledChatModel, create_api_key_pool_from_env
from .detection import detect_language, detect_skill_language
from .discovery import discover_skills
from .gap_fill import GapFillAnalyzer, GapFillFinding, GapFillResult, run_gap_fill
from .runner import run_one

# Names re-exported from the submodules imported above; this list is what
# ``from <package> import *`` exposes. Entries are kept in
# case-insensitive alphabetical order.
__all__ = [
    "annotate_findings",
    "ApiKey",
    "ApiKeyPool",
    "create_api_key_pool_from_env",
    "detect_language",
    "detect_skill_language",
    "discover_skills",
    "GapFillAnalyzer",
    "GapFillFinding",
    "GapFillResult",
    "is_language_compatible",
    "PooledChatModel",
    "run_gap_fill",
    "run_one",
]
100 changes: 100 additions & 0 deletions contrib/multilingual/annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Finding language-compatibility annotation.

Classifies each finding's ``rule_id`` against known buckets so downstream
reports can flag which findings are reliable for non-English skills.
"""

from __future__ import annotations

# ---------------------------------------------------------------------------
# Rule classification
# ---------------------------------------------------------------------------

# Rule IDs produced by the LLM-based semantic analyzers. They are
# inherently multilingual, so they count as reliable for every language.
# One rule family per line.
_SEMANTIC_RULES: frozenset[str] = frozenset(
    """
    SSD1 SSD2 SSD3 SSD4
    SDI1 SDI2 SDI3 SDI4
    SQP1 SQP2 SQP3
    TP4
    """.split()
)

# Rule IDs produced by the gap-fill pass (P5 / P6-P8 / MP1-MP3 / RA1-RA2).
# These findings are LLM-generated for non-English skills.
_GAP_FILL_RULES: frozenset[str] = frozenset(
    "P5 P6 P7 P8 MP1 MP2 MP3 RA1 RA2".split()
)

# Rule IDs produced by the code-level analyzers. These are
# language-independent by design. One rule family per line.
_CODE_RULES: frozenset[str] = frozenset(
    """
    AST1 AST2 AST3 AST4 AST5 AST6 AST7 AST8
    TT1 TT2 TT3 TT4 TT5
    YR1 YR2 YR3 YR4
    SC1 SC2 SC3 SC4 SC5 SC6
    LP1 LP2 LP3 LP4
    TP1 TP2 TP3
    TM1 TM2 TM3
    """.split()
)

# Static rules that match on English keywords. For non-English skills the
# same ground is covered semantically by the SSD / SDI / SQP rules.
# The set is kept for documentation and is not read elsewhere in this
# module: is_language_compatible() already reports these IDs (like any ID
# missing from the other three sets) as not compatible when the detected
# language is not English. One rule family per line.
_ENGLISH_KEYWORD_RULES: frozenset[str] = frozenset(
    """
    P1 P2 P3 P4
    E1 E2 E3 E4
    PE1 PE2 PE3
    EA1 EA2 EA3 EA4
    OH1 OH2 OH3
    TR1 TR2 TR3
    """.split()
)


def is_language_compatible(rule_id: str, detected_language: str) -> bool:
    """Tell whether a finding from *rule_id* is reliable for *detected_language*.

    When *detected_language* is ``"en"`` every rule is reported as
    compatible. For any other language, only rule IDs found in the
    semantic, code-level, or gap-fill buckets are compatible.
    English-keyword rules, and any rule ID that is in none of those three
    buckets, yield ``False``.
    """
    if detected_language != "en":
        # Test each bucket in turn instead of building a merged set.
        reliable_buckets = (_SEMANTIC_RULES, _CODE_RULES, _GAP_FILL_RULES)
        return any(rule_id in bucket for bucket in reliable_buckets)
    return True


def annotate_findings(
    issues: list[dict[str, object]],
    detected_language: str,
) -> list[dict[str, object]]:
    """Return copies of *issues* tagged with a ``language_compatible`` flag.

    Each issue dict is shallow-copied, and the copy gains a
    ``language_compatible`` entry computed by
    :func:`is_language_compatible` from the issue's ``"id"`` value. The
    value is converted with ``str``; a missing ``"id"`` is treated as the
    empty string.

    Neither the *issues* list nor the dicts inside it are mutated.
    """
    return [
        {
            **issue,
            "language_compatible": is_language_compatible(
                str(issue.get("id", "")), detected_language
            ),
        }
        for issue in issues
    ]
Loading