Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Removes the license column from public.repositories.
-- NOTE(review): presumably the rollback counterpart of the migration that adds the column; confirm.
ALTER TABLE public.repositories DROP COLUMN license;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Adds a license column (VARCHAR(255), no NOT NULL constraint) to public.repositories.
-- The git integration's update_repository_license() writes the detected SPDX identifier here,
-- or NULL when no license was detected.
ALTER TABLE public.repositories ADD COLUMN license VARCHAR(255);
13 changes: 10 additions & 3 deletions scripts/services/docker/Dockerfile.git_integration
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,17 @@ RUN apt-get update && apt-get install -y \
ca-certificates \
git \
ripgrep \
ruby \
libgit2-1.1 \
ruby-dev \
build-essential \
libgit2-dev \
cmake \
pkg-config \
--no-install-recommends \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& apt-get autoremove -y
&& gem install licensee -v '9.15.3' --no-document \
&& apt-get remove --autoremove -y ruby-dev build-essential libgit2-dev cmake pkg-config \
&& rm -rf /var/lib/apt/lists/*

ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
Expand Down
11 changes: 11 additions & 0 deletions services/apps/git_integration/src/crowdgit/database/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,17 @@ async def update_last_processed_commit(repo_id: str, commit_hash: str, branch: s
return str(result)


async def update_repository_license(repository_id: str, license_spdx: str | None) -> None:
    """Store the detected license for a single repository.

    The row is rewritten (and its ``updatedAt`` bumped) only when the stored
    value differs from ``license_spdx``; ``IS DISTINCT FROM`` makes that
    comparison NULL-aware, so passing ``None`` over an existing value clears
    the column, and passing the value already stored is a no-op.

    Args:
        repository_id: Value matched against ``public.repositories.id``.
        license_spdx: SPDX identifier to store, or ``None`` to store NULL.
    """
    # Bound positionally: $1 is the license, $2 is the repository id.
    params = (license_spdx, repository_id)
    await execute(
        """
        UPDATE public.repositories
        SET license = $1::varchar,
            "updatedAt" = NOW()
        WHERE id = $2
          AND license IS DISTINCT FROM $1::varchar
        """,
        params,
    )
Comment thread
cursor[bot] marked this conversation as resolved.


async def mark_repo_as_processed(repo_id: str, repo_state: RepositoryState):
sql_query = """
UPDATE git."repositoryProcessing"
Expand Down
3 changes: 3 additions & 0 deletions services/apps/git_integration/src/crowdgit/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from crowdgit.services import (
CloneService,
CommitService,
LicenseService,
MaintainerService,
QueueService,
SoftwareValueService,
Expand All @@ -28,6 +29,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
software_value_service = SoftwareValueService()
vulnerability_scanner_service = VulnerabilityScannerService()
maintainer_service = MaintainerService()
license_service = LicenseService()

worker_task = None
worker = RepositoryWorker(
Expand All @@ -36,6 +38,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
software_value_service=software_value_service,
vulnerability_scanner_service=vulnerability_scanner_service,
maintainer_service=maintainer_service,
license_service=license_service,
queue_service=queue_service,
)
logger.info("Repo worker initialized")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from crowdgit.services.base.base_service import BaseService
from crowdgit.services.clone.clone_service import CloneService
from crowdgit.services.commit.commit_service import CommitService
from crowdgit.services.license.license_service import LicenseService
from crowdgit.services.maintainer.maintainer_service import MaintainerService
from crowdgit.services.queue.queue_service import QueueService
from crowdgit.services.software_value.software_value_service import SoftwareValueService
Expand All @@ -12,6 +13,7 @@
"BaseService",
"CloneService",
"CommitService",
"LicenseService",
"SoftwareValueService",
"VulnerabilityScannerService",
"MaintainerService",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Re-exports LicenseService from the license_service module so it can be
# imported from this package directly; __all__ limits the public API to it.
from crowdgit.services.license.license_service import LicenseService

__all__ = ["LicenseService"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import json

from crowdgit.errors import CommandExecutionError, CommandTimeoutError
from crowdgit.services.base.base_service import BaseService
from crowdgit.services.utils import run_shell_command


class LicenseService(BaseService):
    """Detect a repository's SPDX license by invoking the ``licensee`` gem.

    NOTE: every failure path in ``detect`` (non-zero exit, timeout, missing
    binary, unparseable output) returns ``None``, exactly like "no license
    found". A caller that persists the result cannot tell the two apart from
    the return value alone, so a failed run can overwrite a stored license
    with NULL.
    """

    async def detect(self, repo_path: str) -> str | None:
        """Run ``licensee detect --json`` against ``repo_path``.

        Args:
            repo_path: Filesystem path of the cloned repository to inspect.

        Returns:
            The ``spdx_id`` of the first entry in licensee's ``licenses``
            list, or ``None`` when the command fails, times out, is not
            installed, emits output that cannot be parsed, or reports no
            license.
        """
        command = ["licensee", "detect", "--json", repo_path]
        try:
            raw_output = await run_shell_command(command, timeout=60)
        except CommandExecutionError:
            # A failed command is treated as "nothing detected".
            self.logger.info(f"licensee found no license in {repo_path}")
            return None
        except CommandTimeoutError as e:
            self.logger.warning(f"licensee timed out: {e!r}")
            return None
        except FileNotFoundError as e:
            self.logger.warning(f"licensee binary not found in PATH: {e!r}")
            return None
        except Exception as e:
            self.logger.warning(f"licensee failed: {e!r}")
            return None

        try:
            report = json.loads(raw_output)
            detected = report.get("licenses") or []
            matches = report.get("matched_files") or []

            # Only the first reported license is considered.
            spdx_id = None
            if detected:
                spdx_id = detected[0].get("spdx_id")

            # Confidence is read from the first matched file and used for logging only.
            confidence = None
            if matches:
                matcher = matches[0].get("matcher") or {}
                confidence = matcher.get("confidence")

            if not spdx_id:
                self.logger.info(f"No SPDX license matched in {repo_path}")
                return spdx_id

            self.logger.info(
                f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}"
            )
            return spdx_id
        except Exception as e:
            self.logger.warning(f"Failed to parse licensee output: {e!r}")
            return None
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
mark_repo_as_processed,
release_repo,
update_last_processed_commit,
update_repository_license,
)
from crowdgit.enums import RepositoryState
from crowdgit.errors import (
Expand All @@ -22,6 +23,7 @@
from crowdgit.services import (
CloneService,
CommitService,
LicenseService,
MaintainerService,
QueueService,
SoftwareValueService,
Expand All @@ -46,13 +48,15 @@ def __init__(
software_value_service: SoftwareValueService,
vulnerability_scanner_service: VulnerabilityScannerService,
maintainer_service: MaintainerService,
license_service: LicenseService,
queue_service: QueueService,
Comment thread
gaspergrom marked this conversation as resolved.
):
self.clone_service = clone_service
self.commit_service = commit_service
self.software_value_service = software_value_service
self.vulnerability_scanner_service = vulnerability_scanner_service
self.maintainer_service = maintainer_service
self.license_service = license_service
self.queue_service = queue_service
self._shutdown = False

Expand Down Expand Up @@ -159,6 +163,7 @@ def _bind_repository_context(self, repository: Repository, repo_name: str) -> No
(self.maintainer_service, "maintainer_processing"),
(self.software_value_service, "software_value_processing"),
(self.vulnerability_scanner_service, "vulnerability_scan_processing"),
(self.license_service, "license_detection"),
(self.queue_service, "queue_service"),
]

Expand All @@ -174,6 +179,7 @@ def _reset_all_contexts(self) -> None:
self.maintainer_service,
self.software_value_service,
self.vulnerability_scanner_service,
self.license_service,
self.queue_service,
]

Expand Down Expand Up @@ -236,6 +242,8 @@ async def _process_single_repository(self, repository: Repository):
repository.id, batch_info.repo_path, repository.url
)
await self.maintainer_service.process_maintainers(repository, batch_info)
license_spdx = await self.license_service.detect(batch_info.repo_path)
Comment thread
cursor[bot] marked this conversation as resolved.
await update_repository_license(repository.id, license_spdx)
Comment thread
gaspergrom marked this conversation as resolved.
Comment thread
gaspergrom marked this conversation as resolved.
await self.commit_service.process_single_batch_commits(
repository,
batch_info,
Expand Down
7 changes: 5 additions & 2 deletions services/libs/data-access-layer/src/repositories/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export interface IRepository {
updatedAt: string
deletedAt: string | null
lastArchivedCheckAt: string | null
license: string | null
Comment thread
cursor[bot] marked this conversation as resolved.
}

export interface ICreateRepository {
Expand Down Expand Up @@ -148,7 +149,8 @@ export async function getRepositoriesBySourceIntegrationId(
"createdAt",
"updatedAt",
"deletedAt",
"lastArchivedCheckAt"
"lastArchivedCheckAt",
license
FROM public.repositories
WHERE "sourceIntegrationId" = $(sourceIntegrationId)
AND "deletedAt" IS NULL
Expand Down Expand Up @@ -190,7 +192,8 @@ export async function getRepositoriesByUrl(
"createdAt",
"updatedAt",
"deletedAt",
"lastArchivedCheckAt"
"lastArchivedCheckAt",
license
FROM public.repositories
WHERE url IN ($(repoUrls:csv))
${deletedFilter}
Expand Down
Loading