diff --git a/Procfile b/Procfile new file mode 100644 index 0000000..c8de277 --- /dev/null +++ b/Procfile @@ -0,0 +1 @@ +web: gunicorn api:app --worker-class uvicorn.workers.UvicornWorker --bind 0.0.0.0:$PORT --workers ${WEB_CONCURRENCY:-1} --timeout 120 --access-logfile - --error-logfile - diff --git a/RENDER_DEPLOY.md b/RENDER_DEPLOY.md new file mode 100644 index 0000000..9bb0d2f --- /dev/null +++ b/RENDER_DEPLOY.md @@ -0,0 +1,87 @@ +# Deploying the PR Sentiment API to Render + +This is the FastAPI backend (`api.py` + `prediction_service.py`) that serves the +trained LinkedIn PR sentiment classifier. The trained artifacts live in +`output/` and are committed to the repo, so no external storage is needed. + +## What ships + +| File | Purpose | +|------|---------| +| `render.yaml` | Render Blueprint — defines the web service, build/start commands, health check, env vars. | +| `Procfile` | Same start command, for non-Blueprint / generic buildpack deploys. | +| `runtime.txt` | Pins Python 3.12.3. | +| `requirements_api.txt` | Python deps (scikit-learn pinned to **1.6.1** to match the pickled model). | +| `output/*.pkl`, `output/*.npy` | Trained model + scaler + encoders. | + +## One-time setup + +1. Push this branch to GitHub. +2. In Render: **New +** → **Blueprint** → select the repo. Render reads `render.yaml`. +3. Set the **`GEMINI_API_KEY`** secret in the dashboard (it's `sync: false`, so it is + never stored in the repo). Get a key at https://aistudio.google.com/app/apikey. +4. (Recommended) Set **`ALLOWED_ORIGINS`** to your frontend origin(s), + comma-separated, instead of `*`. +5. Deploy. Render runs the health check against `/health`; the service only + reports healthy once the model has loaded. + +## Environment variables + +| Var | Required | Default | Notes | +|-----|----------|---------|-------| +| `GEMINI_API_KEY` | ✅ | — | App refuses to start without it (fail-fast). | +| `MODEL_DIR` | | `output` | Directory holding the `.pkl`/`.npy` artifacts. 
| +| `ALLOWED_ORIGINS` | | `*` | Comma-separated origins. With `*`, credentials are disabled (CORS spec). | +| `WEB_CONCURRENCY` | | `1` | gunicorn workers. Each worker loads the model — raise only after checking memory. | +| `LOG_LEVEL` | | `INFO` | | +| `PORT` | (Render-injected) | `8000` | Bound automatically by the start command. | + +## Verify after deploy + +```bash +curl https://YOUR-SERVICE.onrender.com/health +curl -X POST https://YOUR-SERVICE.onrender.com/predict \ + -H 'Content-Type: application/json' \ + -d '{"text":"Excited to announce our new platform! #AI","has_media":1,"media_count":1}' +``` + +Interactive docs: `https://YOUR-SERVICE.onrender.com/docs` + +## Run locally + +```bash +python -m venv venv && source venv/bin/activate +pip install -r requirements_api.txt +export GEMINI_API_KEY=your-key +python api.py # dev server on :8000 (set PORT/RELOAD to override) +# or, mirror production: +gunicorn api:app -k uvicorn.workers.UvicornWorker -b 0.0.0.0:8000 +``` + +## Wiring the Next.js frontend + +The frontend never calls the FastAPI service directly. Instead: + +- `app/sentiment-analyzer/page.tsx` POSTs to the same-origin route `/api/predict`. +- `app/api/predict/route.ts` forwards the request to `${ML_API_URL}/predict`. + +So you only set **one** env var on the Next.js host (e.g. Vercel): + +``` +ML_API_URL=https://YOUR-SERVICE.onrender.com +``` + +Locally, `ML_API_URL` defaults to `http://localhost:8000`. Run both together +with `npm run dev` (starts `next dev` + `uvicorn api:app` via `concurrently`), +which also needs `GEMINI_API_KEY` exported for the Python side. + +## Model caveat (read before demoing) + +The model currently in `output/` is the **full-embedding (768-dim) classifier** +— the PCA/regularization fixes described in `FIXES_APPLIED.md` were *documented +but never saved* (`pca_reducer.pkl` is absent, and the saved model reports 784 +input features = 768 embeddings + 16 metadata). It therefore still carries the +documented overfitting (~84% train / ~45% test).
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger("pr_api")


# Global service instance. Populated by the lifespan handler at startup and
# reset to None at shutdown; endpoints treat None as "model not loaded".
prediction_service: Optional[PRClassifierService] = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the prediction service once, before the app serves traffic.

    Reads MODEL_DIR (default "output") and GEMINI_API_KEY from the
    environment, builds the global ``prediction_service`` and clears it again
    when the application shuts down.

    Raises:
        RuntimeError: if GEMINI_API_KEY is not set.
        Exception: whatever ``PRClassifierService`` raises while loading; the
            failure is logged with its traceback and re-raised so startup
            aborts instead of serving without a model.
    """
    global prediction_service

    model_dir = os.getenv("MODEL_DIR", "output")
    api_key = os.getenv("GEMINI_API_KEY")

    if not api_key:
        # Fail fast: without the key the service can never produce a prediction,
        # so we don't want the platform to report a "healthy" deploy.
        raise RuntimeError(
            "GEMINI_API_KEY environment variable not set. "
            "Set it in the Render dashboard (or your local environment) "
            "before starting the server."
        )

    try:
        prediction_service = PRClassifierService(model_dir=model_dir, api_key=api_key)
        logger.info("Prediction service initialized with model from: %s", model_dir)
    except Exception:
        logger.exception("Failed to initialize prediction service")
        raise

    try:
        yield
    finally:
        # Always drop the reference, even if shutdown is triggered by an error
        # raised into the context manager.
        prediction_service = None


# Initialize FastAPI app
app = FastAPI(
    title="LinkedIn PR Sentiment Classifier API",
    description="Predict PR sentiment (positive/negative) for LinkedIn posts using AI",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)

# CORS: configurable via ALLOWED_ORIGINS (comma-separated). Defaults to "*".
# The CORS spec forbids combining a wildcard origin with credentials, so we only
# enable credentials when explicit origins are listed. A "*" anywhere in the
# list (e.g. "*,https://example.com") is treated as the wildcard; previously
# only the exact string "*" was, so a mixed list enabled credentials together
# with a wildcard origin.
_origins_env = os.getenv("ALLOWED_ORIGINS", "*").strip()
_configured_origins = [o.strip() for o in _origins_env.split(",") if o.strip()]
if "*" in _configured_origins:
    _allow_origins = ["*"]
    _allow_credentials = False
else:
    _allow_origins = _configured_origins
    _allow_credentials = True
    if not _allow_origins:
        logger.warning(
            "ALLOWED_ORIGINS is set but lists no origins; no origin will be allowed by CORS"
        )

app.add_middleware(
    CORSMiddleware,
    allow_origins=_allow_origins,
    allow_credentials=_allow_credentials,
    allow_methods=["*"],
    allow_headers=["*"],
)


def _utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with a "Z" suffix."""
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


# Pydantic models for request/response
class PredictionRequest(BaseModel):
    """Request model for PR prediction.

    Only ``text`` is required. The metadata fields are Optional, so a client
    may omit them (the declared default is used) or send an explicit null.
    """
    text: str = Field(..., description="LinkedIn post text content", min_length=1, max_length=10000)

    # Optional metadata fields
    post_hour: Optional[int] = Field(12, ge=0, le=23, description="Hour of posting (0-23)")
    post_day_of_week: Optional[int] = Field(2, ge=0, le=6, description="Day of week (0=Monday, 6=Sunday)")
    post_month: Optional[int] = Field(1, ge=1, le=12, description="Month (1-12)")
    has_media: Optional[int] = Field(0, ge=0, le=1, description="Has media (0 or 1)")
    media_count: Optional[int] = Field(0, ge=0, description="Number of media items")
    media_type: Optional[str] = Field("none", description="Media type (none/image/video)")
    post_type: Optional[str] = Field("regular", description="Post type (regular/article)")
    author_follower_count: Optional[int] = Field(1000, ge=0, description="Author follower count")
    avg_sentiment: Optional[float] = Field(0.0, ge=-1.0, le=1.0, description="Average comment sentiment")
    median_sentiment: Optional[float] = Field(0.0, ge=-1.0, le=1.0, description="Median comment sentiment")
    num_comments_analyzed: Optional[int] = Field(0, ge=0, description="Number of comments analyzed")

    @field_validator("text")
    @classmethod
    def text_not_empty(cls, v: str) -> str:
        """Reject whitespace-only text and return the stripped value."""
        if not v or not v.strip():
            raise ValueError("Text cannot be empty or whitespace only")
        return v.strip()

    model_config = {
        "json_schema_extra": {
            "example": {
                "text": "Excited to announce our new AI-powered analytics platform! This will transform how businesses understand their customers. #AI #Innovation",
                "post_hour": 14,
                "post_day_of_week": 2,
                "has_media": 1,
                "media_count": 1,
            }
        }
    }


class PredictionResponse(BaseModel):
    """Response model for PR prediction"""
    prediction: str = Field(..., description="Predicted sentiment: 'positive' or 'negative'")
    confidence: float = Field(..., description="Confidence score (0-1)")
    probabilities: Dict[str, float] = Field(..., description="Probability for each class")
    features_extracted: Dict = Field(..., description="Extracted features from input")
    timestamp: str = Field(..., description="Prediction timestamp")

    model_config = {
        "json_schema_extra": {
            "example": {
                "prediction": "positive",
                "confidence": 0.85,
                "probabilities": {"negative": 0.15, "positive": 0.85},
                "features_extracted": {
                    "text_length": 152,
                    "emoji_count": 0,
                    "url_count": 0,
                    "hashtag_count": 2,
                    "mention_count": 0,
                    "embedding_dimension": 768,
                },
                "timestamp": "2025-12-15T10:30:00Z",
            }
        }
    }


class HealthResponse(BaseModel):
    """Health check response"""
    status: str
    message: str
    model_loaded: bool
    timestamp: str


@app.get("/", tags=["General"])
async def root():
    """Root endpoint with API information"""
    return {
        "name": "LinkedIn PR Sentiment Classifier API",
        "version": "1.0.0",
        "description": "Predict PR sentiment for LinkedIn posts",
        "endpoints": {
            "health": "/health",
            "predict": "/predict (POST)",
            "docs": "/docs",
        },
    }


@app.get("/health", response_model=HealthResponse, tags=["General"])
async def health_check():
    """Health check endpoint (used by Render's health checks).

    Reports "healthy" only while the global prediction service is set.
    """
    model_loaded = prediction_service is not None

    return HealthResponse(
        status="healthy" if model_loaded else "unhealthy",
        message="Service is running" if model_loaded else "Model not loaded",
        model_loaded=model_loaded,
        timestamp=_utc_now(),
    )


@app.post("/predict", response_model=PredictionResponse, tags=["Prediction"])
async def predict_pr_sentiment(request: PredictionRequest):
    """
    Predict PR sentiment for a LinkedIn post.

    Analyzes the provided post text and returns a prediction of whether it will
    generate positive or negative PR, along with confidence scores and extracted
    features.

    Raises:
        HTTPException: 503 when the service is not initialized, 400 when the
            service raises ValueError, 500 for RuntimeError or any other
            failure.
    """
    if prediction_service is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Prediction service not initialized",
        )

    # The metadata fields are Optional, so a client can send an explicit JSON
    # null. Drop those keys so the service's own defaults apply; forwarding
    # None would turn the value into NaN in the float feature array.
    payload = request.model_dump(exclude_none=True)

    try:
        # The prediction makes a synchronous network call to the Gemini
        # embedding API; run it off the event loop so concurrent requests
        # aren't blocked.
        result = await run_in_threadpool(prediction_service.predict, **payload)

        result["timestamp"] = _utc_now()
        return PredictionResponse(**result)

    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid input: {str(e)}",
        ) from e
    except RuntimeError as e:
        logger.error("Prediction failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Prediction failed: {str(e)}",
        ) from e
    except Exception as e:
        logger.exception("Unexpected error during prediction")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Unexpected error: {str(e)}",
        ) from e


@app.get("/model-info", tags=["General"])
async def model_info():
    """Get information about the loaded model.

    Raises:
        HTTPException: 503 when the prediction service is not initialized.
    """
    if prediction_service is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Prediction service not initialized",
        )

    pca = prediction_service.pca
    if pca is not None:
        # Report the reducer's real output width instead of a hard-coded 30.
        # NOTE(review): presumably a fitted scikit-learn PCA, which exposes
        # n_components_; 30 (the old constant) remains the fallback otherwise.
        embedding_dim = int(getattr(pca, "n_components_", 30))
    else:
        embedding_dim = 768

    metadata_count = len(prediction_service.metadata_features)
    return {
        "model_directory": prediction_service.model_dir,
        "pca_enabled": pca is not None,
        "embedding_model": "Gemini models/embedding-001",
        "classifier_model": "XGBoost",
        "features": {
            "embedding_dimension": embedding_dim,
            "metadata_features": metadata_count,
            "total_features": embedding_dim + metadata_count,
        },
        "encoders": {
            "post_types": list(prediction_service.post_type_encoder.classes_),
            "media_types": list(prediction_service.media_type_encoder.classes_),
        },
    }


# Local development entrypoint.
# In production (Render) the app is served by gunicorn/uvicorn via the start
# command, which binds to $PORT — see render.yaml.
if __name__ == "__main__":
    if not os.getenv("GEMINI_API_KEY"):
        print("❌ Error: GEMINI_API_KEY environment variable not set")
        print("   Please set it with: export GEMINI_API_KEY='your-api-key'")
        raise SystemExit(1)

    uvicorn.run(
        "api:app",
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "8000")),
        reload=os.getenv("RELOAD", "false").lower() == "true",
        log_level=os.getenv("LOG_LEVEL", "info").lower(),
    )
logger = logging.getLogger("pr_api.prediction_service")


class PRClassifierService:
    """Service for predicting PR sentiment of LinkedIn posts.

    Loads the pickled classifier, scaler and label encoders from ``model_dir``
    (plus an optional PCA reducer) and turns a post's text and metadata into a
    prediction via a Gemini embedding followed by the classifier.
    """

    # Patterns are compiled once at class creation instead of on every call.
    # They are reproduced exactly from the original implementation.
    # NOTE(review): presumably they mirror the training-time feature
    # extraction, in which case changing them would shift the features the
    # model sees — confirm against the training notebook before "fixing" them.
    # Known quirks: the last emoji range (U+24C2..U+1F251) also covers many
    # non-emoji code points such as CJK ideographs, and the trailing "+" means
    # a run of adjacent emoji counts as one match.
    _EMOJI_PATTERN = re.compile("["
                                "\U0001F600-\U0001F64F"  # emoticons
                                "\U0001F300-\U0001F5FF"  # symbols & pictographs
                                "\U0001F680-\U0001F6FF"  # transport & map symbols
                                "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                "\U00002702-\U000027B0"
                                "\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
    _URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    _HASHTAG_PATTERN = re.compile(r'#\w+')
    _MENTION_PATTERN = re.compile(r'@\w+')

    def __init__(self, model_dir: str = "output", api_key: Optional[str] = None):
        """
        Initialize the PR classifier service

        Args:
            model_dir: Directory containing the model files
            api_key: Gemini API key (if not provided, uses GEMINI_API_KEY env var)

        Raises:
            ValueError: if no API key is passed and GEMINI_API_KEY is unset
            RuntimeError: if any model artifact fails to load
        """
        self.model_dir = model_dir
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")

        if not self.api_key:
            raise ValueError("GEMINI_API_KEY must be provided or set as environment variable")

        # Configure Gemini API
        genai.configure(api_key=self.api_key)

        # Load model and preprocessors
        self._load_models()

        # Define metadata features (must match training order)
        self.metadata_features = [
            'text_length', 'emoji_count', 'url_count', 'hashtag_count', 'mention_count',
            'post_hour', 'post_day_of_week', 'post_month',
            'has_media', 'media_count', 'media_type_encoded', 'post_type_encoded',
            'author_follower_count',
            'avg_sentiment', 'median_sentiment', 'num_comments_analyzed'
        ]

    def _load_models(self):
        """Load trained model and preprocessing objects.

        Sets ``model``, ``scaler``, ``post_type_encoder``,
        ``media_type_encoder`` and ``pca`` (None when ``pca_reducer.pkl`` is
        absent from the model directory).

        Raises:
            RuntimeError: if any artifact cannot be loaded; the original
                exception is chained as the cause.
        """
        try:
            self.model = joblib.load(os.path.join(self.model_dir, "pr_classifier_model.pkl"))
            self.scaler = joblib.load(os.path.join(self.model_dir, "feature_scaler.pkl"))
            self.post_type_encoder = joblib.load(os.path.join(self.model_dir, "post_type_encoder.pkl"))
            self.media_type_encoder = joblib.load(os.path.join(self.model_dir, "media_type_encoder.pkl"))

            # Try to load PCA if it exists (optional)
            pca_path = os.path.join(self.model_dir, "pca_reducer.pkl")
            if os.path.exists(pca_path):
                self.pca = joblib.load(pca_path)
                logger.info("PCA reducer loaded")
            else:
                self.pca = None
                logger.info("No PCA reducer found (using full embeddings)")

            logger.info("Model and preprocessors loaded successfully")

        except Exception as e:
            # Chain the cause so the underlying failure stays in the traceback.
            raise RuntimeError(f"Failed to load model files from {self.model_dir}: {e}") from e

    def get_gemini_embedding(self, text: str, task_type: str = "RETRIEVAL_DOCUMENT") -> Optional[List[float]]:
        """
        Generate embedding for text using Gemini API

        Args:
            text: Input text to embed
            task_type: Type of embedding task

        Returns:
            The embedding vector returned by the API, or None when the text is
            empty/NaN or the API call fails (the error is logged, not raised)
        """
        try:
            if not text or pd.isna(text):
                return None

            result = genai.embed_content(
                model="models/embedding-001",
                content=str(text),
                task_type=task_type
            )
            return result['embedding']

        except Exception as e:
            logger.error("Error generating embedding: %s", e)
            return None

    def count_emojis(self, text: str) -> int:
        """Count emoji matches in text (a run of adjacent emoji is one match)"""
        if pd.isna(text) or not text:
            return 0

        return len(self._EMOJI_PATTERN.findall(text))

    def count_urls(self, text: str) -> int:
        """Count URLs in text"""
        if pd.isna(text) or not text:
            return 0

        return len(self._URL_PATTERN.findall(str(text)))

    def count_hashtags(self, text: str) -> int:
        """Count hashtags in text"""
        if pd.isna(text) or not text:
            return 0

        return len(self._HASHTAG_PATTERN.findall(str(text)))

    def count_mentions(self, text: str) -> int:
        """Count @ mentions in text"""
        if pd.isna(text) or not text:
            return 0

        return len(self._MENTION_PATTERN.findall(str(text)))

    def _encode_category(self, encoder, value, fallback: str, field: str):
        """Encode a categorical value, degrading gracefully on unknown labels.

        A value missing from ``encoder.classes_`` is replaced by ``fallback``.
        If encoding still fails, the failure is logged and 0 is returned, as
        before. Only ``Exception`` is caught, so KeyboardInterrupt/SystemExit
        propagate instead of being swallowed by a bare ``except``.
        """
        try:
            if value not in encoder.classes_:
                value = fallback
            return encoder.transform([value])[0]
        except Exception:
            logger.warning("Could not encode %s=%r; using 0", field, value, exc_info=True)
            return 0

    def extract_metadata_features(
        self,
        text: str,
        post_hour: int = 12,
        post_day_of_week: int = 2,
        post_month: int = 1,
        has_media: int = 0,
        media_count: int = 0,
        media_type: str = "none",
        post_type: str = "regular",
        author_follower_count: int = 1000,
        avg_sentiment: float = 0.0,
        median_sentiment: float = 0.0,
        num_comments_analyzed: int = 0
    ) -> np.ndarray:
        """
        Extract metadata features from post

        Args:
            text: Post text content
            post_hour: Hour of posting (0-23), default 12
            post_day_of_week: Day of week (0=Mon, 6=Sun), default 2 (Wednesday)
            post_month: Month (1-12), default 1
            has_media: Binary flag for media presence, default 0
            media_count: Number of media items, default 0
            media_type: Type of media (none/image/video/etc), default "none"
            post_type: Type of post (regular/article/etc), default "regular"
            author_follower_count: Follower count, default 1000
            avg_sentiment: Average comment sentiment, default 0
            median_sentiment: Median comment sentiment, default 0
            num_comments_analyzed: Number of comments, default 0

        Returns:
            Float array of 16 metadata features, in ``metadata_features`` order
        """
        # Text-based features
        text_length = len(str(text)) if text else 0
        emoji_count = self.count_emojis(text)
        url_count = self.count_urls(text)
        hashtag_count = self.count_hashtags(text)
        mention_count = self.count_mentions(text)

        # Encode categorical features (unknown categories fall back to the
        # neutral label, then to 0 if even that cannot be encoded)
        post_type_encoded = self._encode_category(
            self.post_type_encoder, post_type, "regular", "post_type"
        )
        media_type_encoded = self._encode_category(
            self.media_type_encoder, media_type, "none", "media_type"
        )

        # Combine all features in correct order
        features = np.array([
            text_length,
            emoji_count,
            url_count,
            hashtag_count,
            mention_count,
            post_hour,
            post_day_of_week,
            post_month,
            has_media,
            media_count,
            media_type_encoded,
            post_type_encoded,
            author_follower_count,
            avg_sentiment,
            median_sentiment,
            num_comments_analyzed
        ], dtype=float)

        return features

    def predict(
        self,
        text: str,
        post_hour: int = 12,
        post_day_of_week: int = 2,
        post_month: int = 1,
        has_media: int = 0,
        media_count: int = 0,
        media_type: str = "none",
        post_type: str = "regular",
        author_follower_count: int = 1000,
        avg_sentiment: float = 0.0,
        median_sentiment: float = 0.0,
        num_comments_analyzed: int = 0
    ) -> Dict:
        """
        Predict PR sentiment for a LinkedIn post

        Args:
            text: Post text content (required)
            Other args: Optional metadata features (see extract_metadata_features)

        Returns:
            Dictionary containing:
                - prediction: "positive" or "negative"
                - confidence: Confidence score (0-1)
                - probabilities: Dict with positive/negative probabilities
                - features_extracted: Dict with extracted feature counts

        Raises:
            ValueError: if text is empty
            RuntimeError: if no embedding could be obtained
        """
        if not text:
            raise ValueError("Text cannot be empty")

        # 1. Generate embedding
        embedding = self.get_gemini_embedding(text)
        if embedding is None:
            raise RuntimeError("Failed to generate embedding from Gemini API")

        embedding_array = np.array(embedding).reshape(1, -1)

        # 2. Apply PCA if available
        if self.pca is not None:
            embedding_array = self.pca.transform(embedding_array)

        # 3. Extract metadata features
        metadata_features = self.extract_metadata_features(
            text=text,
            post_hour=post_hour,
            post_day_of_week=post_day_of_week,
            post_month=post_month,
            has_media=has_media,
            media_count=media_count,
            media_type=media_type,
            post_type=post_type,
            author_follower_count=author_follower_count,
            avg_sentiment=avg_sentiment,
            median_sentiment=median_sentiment,
            num_comments_analyzed=num_comments_analyzed
        ).reshape(1, -1)

        # 4. Combine features (embeddings first, then metadata)
        combined_features = np.concatenate([embedding_array, metadata_features], axis=1)

        # 5. Scale features
        scaled_features = self.scaler.transform(combined_features)

        # 6. Make prediction
        prediction = self.model.predict(scaled_features)[0]
        probabilities = self.model.predict_proba(scaled_features)[0]

        # 7. Format response (index 0 is reported as negative, 1 as positive)
        result = {
            "prediction": "positive" if prediction == 1 else "negative",
            "confidence": float(max(probabilities)),
            "probabilities": {
                "negative": float(probabilities[0]),
                "positive": float(probabilities[1])
            },
            "features_extracted": {
                "text_length": int(metadata_features[0, 0]),
                "emoji_count": int(metadata_features[0, 1]),
                "url_count": int(metadata_features[0, 2]),
                "hashtag_count": int(metadata_features[0, 3]),
                "mention_count": int(metadata_features[0, 4]),
                "embedding_dimension": int(embedding_array.shape[1])
            }
        }

        return result


# Example usage
if __name__ == "__main__":
    # Test the service
    service = PRClassifierService()

    test_text = """
    Exciting news! We're launching our new product that will revolutionize
    the industry. Join us for the launch event! #Innovation #TechNews
    """

    result = service.predict(test_text)
    print("\nPrediction Result:")
    print(f"  Prediction: {result['prediction']}")
    print(f"  Confidence: {result['confidence']:.2%}")
    print(f"  Probabilities: {result['probabilities']}")
    print(f"  Features: {result['features_extracted']}")
+ +services: + - type: web + name: pr-sentiment-api + runtime: python + region: oregon + plan: starter # 'free' works too, but sleeps after inactivity + branch: main + buildCommand: pip install --no-cache-dir -r requirements_api.txt + # gunicorn manages the process; UvicornWorker runs the ASGI app. + # --timeout 120 leaves headroom for the synchronous Gemini embedding call. + startCommand: >- + gunicorn api:app + --worker-class uvicorn.workers.UvicornWorker + --bind 0.0.0.0:$PORT + --workers ${WEB_CONCURRENCY:-1} + --timeout 120 + --access-logfile - + --error-logfile - + healthCheckPath: /health + autoDeploy: true + envVars: + # Required — set the value in the Render dashboard (sync: false keeps the + # secret out of this file and out of git). + - key: GEMINI_API_KEY + sync: false + - key: MODEL_DIR + value: output + # Lock to a single worker: the model + ML libs are memory-heavy and one + # worker fits comfortably on small instances. Raise once you size memory. + - key: WEB_CONCURRENCY + value: "1" + # CORS: replace "*" with your frontend origin(s), comma-separated, in prod. + - key: ALLOWED_ORIGINS + value: "*" + - key: LOG_LEVEL + value: INFO + - key: PYTHON_VERSION + value: "3.12.3" diff --git a/requirements_api.txt b/requirements_api.txt new file mode 100644 index 0000000..426d294 --- /dev/null +++ b/requirements_api.txt @@ -0,0 +1,26 @@ +# FastAPI PR Sentiment Classifier - Dependencies + +# Web Framework +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +pydantic==2.5.3 + +# Machine Learning +# NOTE: scikit-learn must match the version used to pickle the artifacts in +# output/ (saved with 1.6.1). A mismatch makes sklearn silently risk +# "invalid results" on unpickle, so this pin is load-bearing. 
+xgboost==2.0.3 +scikit-learn==1.6.1 +numpy==1.26.3 +pandas==2.2.0 + +# Google Gemini API +google-generativeai==0.3.2 + +# Utilities +python-dotenv==1.0.0 +joblib==1.3.2 + +# Production server (Render) +gunicorn==21.2.0 +python-multipart==0.0.6 diff --git a/runtime.txt b/runtime.txt new file mode 100644 index 0000000..4ddc7cd --- /dev/null +++ b/runtime.txt @@ -0,0 +1 @@ +python-3.12.3