Code-for-Sydney · rafaeldolfe · Apr 11, 2026
diff --git a/README.md b/README.md
@@ -1 +1,64 @@
-# astroturf
+# astroturf
+
+Python service for ingesting Reddit thread data and preparing it for astroturfing analysis.
+
+## First slice
+
+The repository now includes a minimal ingestion API that:
+
+- accepts a Reddit post plus comment thread as JSON
+- validates the payload with Pydantic schemas
+- normalizes post, comment, and author data into a canonical format
+- returns summary metrics that later analyzers can build on
+
+## Run locally
+
+```bash
+python -m venv .venv
+.venv\Scripts\activate
+pip install -r requirements.txt
+uvicorn app.main:app --reload
+```
+
+Then open `http://127.0.0.1:8000/docs` for the interactive API docs.
+
+## Endpoints
+
+- `GET /health`
+- `POST /ingest/reddit`
+
+## Test
+
+```bash
+pytest
+```
+
+## Example payload shape
+
+```json
+{
+  "post": {
+    "post_id": "abc123",
+    "subreddit": "technology",
+    "title": "Example title",
+    "body": "Thread body text",
+    "created_utc": "2026-04-11T01:15:00Z"
+  },
+  "comments": [
+    {
+      "comment_id": "c1",
+      "parent_id": "abc123",
+      "body": "Example comment",
+      "created_utc": "2026-04-11T01:20:00Z",
+      "depth": 0,
+      "author": {
+        "username": "user_a"
+      }
+    }
+  ],
+  "context": {
+    "source": "reddit",
+    "collection_method": "manual_export"
+  }
+}
+```
diff --git a/app/__init__.py b/app/__init__.py
@@ -0,0 +1 @@
+"""Astroturf detection application package."""
diff --git a/app/api/__init__.py b/app/api/__init__.py
@@ -0,0 +1 @@
+"""API router package."""
diff --git a/app/api/routes.py b/app/api/routes.py
@@ -0,0 +1,16 @@
+from fastapi import APIRouter
+
+from app.models.reddit import IngestionResponse, RedditThreadPayload
+from app.services.ingestion import normalize_thread_payload
+
+router = APIRouter()
+
+
+@router.get("/health")
+def healthcheck() -> dict[str, str]:
+    return {"status": "ok"}
+
+
+@router.post("/ingest/reddit", response_model=IngestionResponse)
+def ingest_reddit_thread(payload: RedditThreadPayload) -> IngestionResponse:
+    return normalize_thread_payload(payload)
diff --git a/app/main.py b/app/main.py
@@ -0,0 +1,18 @@
+from fastapi import FastAPI
+
+from app.api.routes import router
+
+
+def create_app() -> FastAPI:
+    app = FastAPI(
+        title="Astroturf Detection API",
+        version="0.1.0",
+        description=(
+            "Ingestion and analysis API for detecting astroturfing in Reddit threads."
+        ),
+    )
+    app.include_router(router)
+    return app
+
+
+app = create_app()
diff --git a/app/models/__init__.py b/app/models/__init__.py
@@ -0,0 +1 @@
+"""Pydantic models used by the application."""
diff --git a/app/models/reddit.py b/app/models/reddit.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field, HttpUrl, field_validator
+
+
+class RedditAuthor(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    username: str = Field(min_length=1)
+    account_id: str | None = None
+    account_created_utc: datetime | None = None
+    comment_karma: int | None = Field(default=None, ge=0)
+    post_karma: int | None = Field(default=None, ge=0)
+    is_mod: bool = False
+    is_gold: bool = False
+    is_verified: bool = False
+    metadata: dict[str, object] = Field(default_factory=dict)
+
+
+class RedditComment(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    comment_id: str = Field(min_length=1)
+    parent_id: str | None = None
+    author: RedditAuthor | None = None
+    body: str = ""
+    created_utc: datetime
+    score: int | None = None
+    depth: int = Field(default=0, ge=0)
+    permalink: HttpUrl | None = None
+    edited: bool = False
+    is_submitter: bool = False
+    metadata: dict[str, object] = Field(default_factory=dict)
+
+
+class RedditPost(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    post_id: str = Field(min_length=1)
+    subreddit: str = Field(min_length=1)
+    title: str = Field(min_length=1)
+    body: str = ""
+    author: RedditAuthor | None = None
+    created_utc: datetime
+    score: int | None = None
+    upvote_ratio: float | None = Field(default=None, ge=0.0, le=1.0)
+    num_comments: int | None = Field(default=None, ge=0)
+    permalink: HttpUrl | None = None
+    url: HttpUrl | None = None
+    is_nsfw: bool = False
+    is_spoiler: bool = False
+    flair_text: str | None = None
+    metadata: dict[str, object] = Field(default_factory=dict)
+
+
+class RedditThreadContext(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    source: Literal["reddit"] = "reddit"
+    collected_at_utc: datetime | None = None
+    collection_method: str | None = None
+    likes_available: bool = False
+    awards_available: bool = False
+    metadata: dict[str, object] = Field(default_factory=dict)
+
+
+class RedditThreadPayload(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    post: RedditPost
+    comments: list[RedditComment] = Field(default_factory=list)
+    context: RedditThreadContext = Field(default_factory=RedditThreadContext)
+
+    @field_validator("comments")
+    @classmethod
+    def ensure_unique_comment_ids(
+        cls, comments: list[RedditComment]
+    ) -> list[RedditComment]:
+        seen: set[str] = set()
+        duplicates: set[str] = set()
+        for comment in comments:
+            if comment.comment_id in seen:
+                duplicates.add(comment.comment_id)
+            seen.add(comment.comment_id)
+        if duplicates:
+            duplicate_text = ", ".join(sorted(duplicates))
+            raise ValueError(f"Duplicate comment_id values found: {duplicate_text}")
+        return comments
+
+
+class NormalizedAuthorSummary(BaseModel):
+    username: str
+    account_age_days: int | None = None
+    comment_karma: int | None = None
+    post_karma: int | None = None
+    is_mod: bool = False
+    is_verified: bool = False
+
+
+class NormalizedComment(BaseModel):
+    comment_id: str
+    parent_id: str | None = None
+    author_username: str | None = None
+    body: str
+    body_length: int
+    created_utc: datetime
+    score: int | None = None
+    depth: int
+    is_submitter: bool = False
+
+
+class NormalizedPost(BaseModel):
+    post_id: str
+    subreddit: str
+    title: str
+    body: str
+    created_utc: datetime
+    score: int | None = None
+    upvote_ratio: float | None = None
+    num_comments_reported: int | None = None
+    title_length: int
+    body_length: int
+    author_username: str | None = None
+
+
+class ThreadMetrics(BaseModel):
+    total_comments: int
+    unique_authors: int
+    deleted_or_missing_authors: int
+    top_level_comments: int
+    reply_comments: int
+    average_comment_length: float
+
+
+class IngestionResponse(BaseModel):
+    source: Literal["reddit"]
+    ingested_at_utc: datetime
+    post: NormalizedPost
+    comments: list[NormalizedComment]
+    authors: list[NormalizedAuthorSummary]
+    metrics: ThreadMetrics
diff --git a/app/services/__init__.py b/app/services/__init__.py
@@ -0,0 +1 @@
+"""Application service layer."""
diff --git a/app/services/ingestion.py b/app/services/ingestion.py
@@ -0,0 +1,116 @@
+from __future__ import annotations
+
+from datetime import UTC, datetime
+
+from app.models.reddit import (
+    IngestionResponse,
+    NormalizedAuthorSummary,
+    NormalizedComment,
+    NormalizedPost,
+    RedditAuthor,
+    RedditThreadPayload,
+    ThreadMetrics,
+)
+
+
+def normalize_thread_payload(payload: RedditThreadPayload) -> IngestionResponse:
+    ingested_at_utc = datetime.now(UTC)
+    authors = _collect_authors(payload, ingested_at_utc)
+    normalized_comments = [_normalize_comment(comment) for comment in payload.comments]
+
+    total_comments = len(normalized_comments)
+    top_level_comments = sum(1 for comment in normalized_comments if comment.depth == 0)
+    reply_comments = total_comments - top_level_comments
+    body_lengths = [comment.body_length for comment in normalized_comments]
+    average_comment_length = (
+        round(sum(body_lengths) / total_comments, 2) if total_comments else 0.0
+    )
+    deleted_or_missing_authors = sum(
+        1 for comment in normalized_comments if comment.author_username is None
+    )
+
+    metrics = ThreadMetrics(
+        total_comments=total_comments,
+        unique_authors=len(authors),
+        deleted_or_missing_authors=deleted_or_missing_authors,
+        top_level_comments=top_level_comments,
+        reply_comments=reply_comments,
+        average_comment_length=average_comment_length,
+    )
+
+    post = NormalizedPost(
+        post_id=payload.post.post_id,
+        subreddit=payload.post.subreddit,
+        title=payload.post.title.strip(),
+        body=payload.post.body.strip(),
+        created_utc=payload.post.created_utc,
+        score=payload.post.score,
+        upvote_ratio=payload.post.upvote_ratio,
+        num_comments_reported=payload.post.num_comments,
+        title_length=len(payload.post.title.strip()),
+        body_length=len(payload.post.body.strip()),
+        author_username=payload.post.author.username if payload.post.author else None,
+    )
+
+    return IngestionResponse(
+        source=payload.context.source,
+        ingested_at_utc=ingested_at_utc,
+        post=post,
+        comments=normalized_comments,
+        authors=sorted(authors.values(), key=lambda author: author.username.lower()),
+        metrics=metrics,
+    )
+
+
+def _collect_authors(
+    payload: RedditThreadPayload, ingested_at_utc: datetime
+) -> dict[str, NormalizedAuthorSummary]:
+    author_map: dict[str, NormalizedAuthorSummary] = {}
+
+    if payload.post.author:
+        author_map[payload.post.author.username] = _normalize_author(
+            payload.post.author, ingested_at_utc
+        )
+
+    for comment in payload.comments:
+        if comment.author is None:
+            continue
+        author_map.setdefault(
+            comment.author.username, _normalize_author(comment.author, ingested_at_utc)
+        )
+
+    return author_map
+
+
+def _normalize_author(
+    author: RedditAuthor, ingested_at_utc: datetime
+) -> NormalizedAuthorSummary:
+    account_age_days = None
+    if author.account_created_utc is not None:
+        delta = ingested_at_utc - author.account_created_utc.astimezone(UTC)
+        account_age_days = max(delta.days, 0)
+
+    return NormalizedAuthorSummary(
+        username=author.username,
+        account_age_days=account_age_days,
+        comment_karma=author.comment_karma,
+        post_karma=author.post_karma,
+        is_mod=author.is_mod,
+        is_verified=author.is_verified,
+    )
+
+
+def _normalize_comment(comment) -> NormalizedComment:
+    body = comment.body.strip()
+    author_username = comment.author.username if comment.author else None
+    return NormalizedComment(
+        comment_id=comment.comment_id,
+        parent_id=comment.parent_id,
+        author_username=author_username,
+        body=body,
+        body_length=len(body),
+        created_utc=comment.created_utc,
+        score=comment.score,
+        depth=comment.depth,
+        is_submitter=comment.is_submitter,
+    )
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,5 @@
+fastapi>=0.115,<1.0
+pydantic>=2.9,<3.0
+uvicorn[standard]>=0.32,<1.0
+pytest>=8.3,<9.0
+httpx>=0.28,<1.0