Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# =============================================================================
# Data Sanitizer Environment Configuration
# =============================================================================

# Environment
# Deployment-mode label and debug switch. This template ships development
# defaults; copy it to `.env` and adjust the values per environment.
# NOTE(review): the accepted ENVIRONMENT values and how DEBUG is parsed are not
# visible in this file — confirm against the settings loader.
ENVIRONMENT=development
DEBUG=False

# =============================================================================
# Database Configuration
# =============================================================================

# PostgreSQL
# Connection target and credentials. The defaults point at a local server on
# PostgreSQL's standard port (5432) with the stock `postgres` account; replace
# the password before using this anywhere other than local development.
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
POSTGRES_DB=data_sanitizer
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
# Connection-pool sizing. Presumably a base pool size plus the number of extra
# connections allowed beyond it — TODO confirm against the database client setup.
DB_POOL_SIZE=10
DB_MAX_OVERFLOW=20

# =============================================================================
# Vector Database (Milvus)
# =============================================================================

# Address of the Milvus server (19530 is Milvus's default service port) and the
# name of the collection the application works with.
# NOTE(review): the collection name suggests it holds LSH sample data — confirm
# against the code that creates the collection.
MILVUS_HOST=localhost
MILVUS_PORT=19530
MILVUS_COLLECTION=lsh_samples

# =============================================================================
# Cache (Redis)
# =============================================================================

# Redis connection settings. The defaults target a local server on Redis's
# standard port (6379), logical database 0.
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_DB=0
# Left empty in this template; set it when the Redis server requires a password.
REDIS_PASSWORD=
# Cache entry lifetime. Presumably in seconds (3600 = one hour) — TODO confirm
# the unit against the cache client.
REDIS_TTL=3600

# =============================================================================
# Cloud Storage
# =============================================================================

# Storage provider: local, s3, gcs, azure
# The bucket name is left empty for the default `local` provider.
# NOTE(review): `azure` is listed as an option, but this template defines no
# Azure-specific settings — confirm which variables that provider needs.
STORAGE_PROVIDER=local
STORAGE_BUCKET=

# AWS S3
# Credentials are intentionally blank in the template; never commit real keys.
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION=us-east-1

# Google Cloud Storage
# Project identifier and a filesystem path to a credentials file (both blank here).
GCS_PROJECT_ID=
GCS_CREDENTIALS_PATH=

# =============================================================================
# API Server
# =============================================================================

# Listen address, port and worker count for the API process.
# 0.0.0.0 is the "all interfaces" address; narrow it if the service should not
# be reachable from other hosts.
API_HOST=0.0.0.0
API_PORT=8000
API_WORKERS=4
API_RELOAD=False
# NOTE(review): `*` allows every origin. Acceptable for local development, but
# list explicit origins before deploying, especially with AUTH_ENABLED=True.
CORS_ORIGINS=*
# Request throttle, per minute as the name states; the scope (per client, per
# key, global) is not visible here — TODO confirm.
RATE_LIMIT_PER_MIN=100
# Upload size cap in megabytes, per the variable name.
MAX_UPLOAD_SIZE_MB=1000
AUTH_ENABLED=True

# =============================================================================
# Data Processing
# =============================================================================

# Chunk sizes
# Default and upper-bound chunk sizes. Presumably counted in rows per chunk —
# TODO confirm the unit against the chunked reader.
DEFAULT_CHUNKSIZE=50000
MAX_CHUNKSIZE=200000

# Sampling
# Sample sizes for numeric, categorical and LSH processing respectively.
# NOTE(review): whether these are per column, per chunk or per dataset is not
# visible in this file.
NUMERIC_SAMPLE_SIZE=1000
CATEGORICAL_SAMPLE_SIZE=500
LSH_SAMPLE_SIZE=200

# MinHash/LSH parameters
# Signature length, number of LSH bands, and shingle length k.
# With 64 hashes and 16 bands a conventional banding scheme would use 4 rows
# per band — TODO confirm the implementation divides them evenly.
MINHASH_NUM_HASHES=64
LSH_BANDS=16
LSH_SHINGLE_K=5

# Quality thresholds
# Both values are fractions between 0 and 1. Presumably similarity at or above
# 0.85 marks duplicates and confidence at or above 0.7 accepts an imputation —
# TODO confirm the comparison direction in the consuming code.
DUPLICATE_THRESHOLD=0.85
IMPUTATION_CONFIDENCE_THRESHOLD=0.7

# PII Detection
# Toggle plus the default handling strategy.
# NOTE(review): `hash` is the only strategy value shown here; the full set of
# accepted strategies must be taken from the PII module.
PII_DETECTION_ENABLED=True
PII_DEFAULT_STRATEGY=hash

# =============================================================================
# Monitoring & Logging
# =============================================================================

# Metrics
# Toggle and port for the metrics endpoint.
METRICS_ENABLED=True
METRICS_PORT=9090

# Logging
# LOG_FORMAT uses Python `logging` %-style record attributes
# (asctime, name, levelname, message).
# NOTE(review): the value is unquoted and contains spaces and parentheses. That
# works with dotenv-style parsers, but `source .env` in a POSIX shell would
# fail on this line — quote it if the file is ever sourced by a shell.
LOG_LEVEL=INFO
LOG_FORMAT=%(asctime)s - %(name)s - %(levelname)s - %(message)s

# Tracing
TRACING_ENABLED=False

# Sentry (Error tracking)
# Blank in the template; presumably error reporting stays off while it is
# empty — TODO confirm.
SENTRY_DSN=

# =============================================================================
# Security
# =============================================================================

# JWT
# The secret below is a placeholder, as its own text says: replace it with a
# long random value in every real deployment and keep it out of version control.
# JWT_EXPIRY_HOURS is the token lifetime in hours, per the variable name.
JWT_SECRET=change-me-in-production
JWT_EXPIRY_HOURS=24

# Encryption
# Blank in the template.
# NOTE(review): the expected key format and length are not visible here —
# confirm against the encryption helper before generating one.
ENCRYPTION_KEY=

# SSL
SSL_ENABLED=False

# =============================================================================
# LLM Integration (Optional)
# =============================================================================

# LLM Provider: gemini, openai
# NOTE(review): a generic LLM_API_KEY sits alongside the provider-specific keys
# below; which one takes precedence is not visible here — confirm in the loader.
LLM_PROVIDER=gemini
LLM_API_KEY=

# Gemini
# Blank in the template; never commit a real key.
GEMINI_API_KEY=

# OpenAI
# Blank in the template; never commit a real key.
OPENAI_API_KEY=
182 changes: 182 additions & 0 deletions .github/workflows/ci-cd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# Workflow identity, triggers and default token permissions.
name: CI/CD Pipeline

# Runs on pushes to the listed branches (including any branch under copilot/)
# and on pull requests that target main or develop.
on:
  push:
    branches:
      - main
      - develop
      - 'copilot/**'
  pull_request:
    branches:
      - main
      - develop

# Workflow-wide default: the GITHUB_TOKEN may only read repository contents.
permissions:
  contents: read

jobs:
# Lint and unit-test job, executed once per Python version in the matrix.
# (Job entry: belongs under the workflow's top-level `jobs:` mapping.)
test:
  name: Test Python ${{ matrix.python-version }}
  runs-on: ubuntu-latest
  permissions:
    contents: read
  strategy:
    matrix:
      # Quoted so YAML keeps the versions as strings instead of floats.
      python-version: ['3.11', '3.12']

  steps:
    # checkout v4 and setup-python v5 run on the Node 20 runtime; the previously
    # pinned v3/v4 releases target Node 16, which GitHub has retired.
    - uses: actions/checkout@v4

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        cache: 'pip'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install pytest pytest-cov black flake8 isort mypy autoflake

    # The `*.py` glob is expanded by the shell, so the three lint steps below
    # only inspect Python files in the repository root.
    - name: Code Quality - Black
      run: |
        black --check --line-length 120 *.py

    - name: Code Quality - isort
      run: |
        isort --check --profile black --line-length 120 *.py

    - name: Code Quality - Flake8
      run: |
        flake8 --select=E,W,F --ignore=E501,W503,E203,E402,E226,F541,W291 --max-line-length=120 *.py

    # Writes coverage.xml (consumed by the upload step) plus a terminal summary.
    - name: Run Tests
      run: |
        pytest tests.py -v --cov=. --cov-report=xml --cov-report=term

    # Uploads once, from the 3.12 leg of the matrix only.
    # NOTE(review): left on v3 deliberately — newer codecov-action majors expect
    # a CODECOV_TOKEN secret, so bumping needs repository configuration first.
    - name: Upload Coverage
      uses: codecov/codecov-action@v3
      if: matrix.python-version == '3.12'
      with:
        file: ./coverage.xml
        flags: unittests
        name: codecov-umbrella

# Static-analysis and dependency-vulnerability scan.
# (Job entry: belongs under the workflow's top-level `jobs:` mapping.)
security:
  name: Security Scan
  runs-on: ubuntu-latest
  permissions:
    contents: read

  steps:
    # checkout v4 and setup-python v5 run on the Node 20 runtime; the previously
    # pinned v3/v4 releases target Node 16, which GitHub has retired.
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.12'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install bandit safety

    # The first invocation writes a JSON report and never fails the step
    # (`|| true`); the second prints to the log and its exit status decides
    # whether the step passes.
    - name: Run Bandit (Security Linter)
      run: |
        bandit -r . -f json -o bandit-report.json || true
        bandit -r . -f screen

    # `|| true` keeps this check advisory: findings are printed but do not
    # fail the job.
    - name: Check Dependencies for Vulnerabilities
      run: |
        pip install -r requirements.txt
        safety check --json || true

# Builds the three service images once the test and security jobs succeed.
# Runs only for pushes to main or develop; images are pushed to the registry
# only from main (see each build step's `push:` expression).
# (Job entry: belongs under the workflow's top-level `jobs:` mapping.)
build-docker:
  name: Build Docker Images
  runs-on: ubuntu-latest
  needs: [test, security]
  if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop')
  permissions:
    contents: read
    # NOTE(review): images are pushed to Docker Hub with the DOCKER_* secrets,
    # which does not rely on the GITHUB_TOKEN packages scope — confirm whether
    # this grant is still required.
    packages: write

  steps:
    # All actions below are pinned to majors that run on the Node 20 runtime;
    # the previous pins (checkout v3, buildx/login v2, build-push v4) target
    # Node 16, which GitHub has retired. Inputs are unchanged.
    - uses: actions/checkout@v4

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    # Registry credentials are only needed when the images are actually pushed.
    - name: Login to Docker Hub
      if: github.ref == 'refs/heads/main'
      uses: docker/login-action@v3
      with:
        username: ${{ secrets.DOCKER_USERNAME }}
        password: ${{ secrets.DOCKER_PASSWORD }}

    # Each image is tagged both `latest` and with the commit SHA, and uses the
    # GitHub Actions cache backend for layer caching.
    - name: Build and Push API Image
      uses: docker/build-push-action@v5
      with:
        context: .
        file: ./Dockerfile.api
        push: ${{ github.ref == 'refs/heads/main' }}
        tags: |
          ${{ secrets.DOCKER_USERNAME }}/data-sanitizer-api:latest
          ${{ secrets.DOCKER_USERNAME }}/data-sanitizer-api:${{ github.sha }}
        cache-from: type=gha
        cache-to: type=gha,mode=max

    - name: Build and Push Worker Pass1 Image
      uses: docker/build-push-action@v5
      with:
        context: .
        file: ./Dockerfile.worker-pass1
        push: ${{ github.ref == 'refs/heads/main' }}
        tags: |
          ${{ secrets.DOCKER_USERNAME }}/data-sanitizer-worker-pass1:latest
          ${{ secrets.DOCKER_USERNAME }}/data-sanitizer-worker-pass1:${{ github.sha }}
        cache-from: type=gha
        cache-to: type=gha,mode=max

    - name: Build and Push Worker Pass2 Image
      uses: docker/build-push-action@v5
      with:
        context: .
        file: ./Dockerfile.worker-pass2
        push: ${{ github.ref == 'refs/heads/main' }}
        tags: |
          ${{ secrets.DOCKER_USERNAME }}/data-sanitizer-worker-pass2:latest
          ${{ secrets.DOCKER_USERNAME }}/data-sanitizer-worker-pass2:${{ github.sha }}
        cache-from: type=gha
        cache-to: type=gha,mode=max

# Staging deployment, run after the image build for the develop branch.
# The deploy step is currently a placeholder that only echoes a message.
# (Job entry: belongs under the workflow's top-level `jobs:` mapping.)
# NOTE(review): the build job pushes images only for refs/heads/main, so a
# develop-triggered run has no freshly pushed image to deploy — confirm this is
# intended before wiring in real deployment commands.
deploy-staging:
  name: Deploy to Staging
  runs-on: ubuntu-latest
  needs: build-docker
  if: github.ref == 'refs/heads/develop'
  environment: staging
  permissions:
    contents: read

  steps:
    # checkout v4 runs on the Node 20 runtime; v3 targets the retired Node 16.
    - uses: actions/checkout@v4

    - name: Deploy to Staging
      run: |
        echo "Deploying to staging environment"
        # Add your deployment commands here
        # Example: kubectl apply -k k8s/overlays/staging

# Production deployment, run after the image build for the main branch and
# bound to the `production` environment.
# The deploy step is currently a placeholder that only echoes a message.
# (Job entry: belongs under the workflow's top-level `jobs:` mapping.)
deploy-production:
  name: Deploy to Production
  runs-on: ubuntu-latest
  needs: build-docker
  if: github.ref == 'refs/heads/main'
  environment: production
  permissions:
    contents: read

  steps:
    # checkout v4 runs on the Node 20 runtime; v3 targets the retired Node 16.
    - uses: actions/checkout@v4

    - name: Deploy to Production
      run: |
        echo "Deploying to production environment"
        # Add your deployment commands here
        # Example: kubectl apply -k k8s/overlays/prod
64 changes: 64 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Python
# Bytecode, native extensions, and packaging/build output.
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environments
venv/
env/
ENV/
env.bak/
venv.bak/

# Testing
# Caches and coverage output. coverage.xml is what `pytest --cov-report=xml`
# writes, the same command the CI test job runs.
.pytest_cache/
.hypothesis/
.coverage
.coverage.*
coverage.xml
htmlcov/
.tox/
.nox/

# Security scan output
# Written by `bandit -o bandit-report.json`, as run in the CI security job.
bandit-report.json

# IDEs
# Editor and OS metadata.
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# Project specific
# Local pipeline state, generated output, test fixtures and logs.
# (`*.db` already covers pipeline_state.db; the explicit entry is kept for clarity.)
pipeline_state.db
*.db
*.db-journal
pipeline_output/
output/
test_data/
*.log

# Docker
docker-compose.override.yml

# Environment variables
# Real settings and secrets live here and stay untracked; the committed
# template is .env.example.
.env
.env.local
Loading
Loading