Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,64 @@
# astroturf
# astroturf

Python service for ingesting Reddit thread data and preparing it for astroturfing analysis.

## First slice

The repository now includes a minimal ingestion API that:

- accepts a Reddit post plus comment thread as JSON
- validates the payload with Pydantic schemas
- normalizes post, comment, and author data into a canonical format
- returns summary metrics that later analyzers can build on

## Run locally

```bash
python -m venv .venv
.venv\Scripts\activate
pip install -r requirements.txt
uvicorn app.main:app --reload
```

Then open `http://127.0.0.1:8000/docs` for the interactive API docs.

## Endpoints

- `GET /health`
- `POST /ingest/reddit`

## Test

```bash
pytest
```

## Example payload shape

```json
{
"post": {
"post_id": "abc123",
"subreddit": "technology",
"title": "Example title",
"body": "Thread body text",
"created_utc": "2026-04-11T01:15:00Z"
},
"comments": [
{
"comment_id": "c1",
"parent_id": "abc123",
"body": "Example comment",
"created_utc": "2026-04-11T01:20:00Z",
"depth": 0,
"author": {
"username": "user_a"
}
}
],
"context": {
"source": "reddit",
"collection_method": "manual_export"
}
}
```
1 change: 1 addition & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Astroturf detection application package."""
1 change: 1 addition & 0 deletions app/api/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""API router package."""
16 changes: 16 additions & 0 deletions app/api/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from fastapi import APIRouter

from app.models.reddit import IngestionResponse, RedditThreadPayload
from app.services.ingestion import normalize_thread_payload

router = APIRouter()


@router.get("/health")
def healthcheck() -> dict[str, str]:
return {"status": "ok"}


@router.post("/ingest/reddit", response_model=IngestionResponse)
def ingest_reddit_thread(payload: RedditThreadPayload) -> IngestionResponse:
return normalize_thread_payload(payload)
18 changes: 18 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from fastapi import FastAPI

from app.api.routes import router


def create_app() -> FastAPI:
app = FastAPI(
title="Astroturf Detection API",
version="0.1.0",
description=(
"Ingestion and analysis API for detecting astroturfing in Reddit threads."
),
)
app.include_router(router)
return app


app = create_app()
1 change: 1 addition & 0 deletions app/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Pydantic models used by the application."""
144 changes: 144 additions & 0 deletions app/models/reddit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from __future__ import annotations

from datetime import datetime
from typing import Literal

from pydantic import BaseModel, ConfigDict, Field, HttpUrl, field_validator


class RedditAuthor(BaseModel):
model_config = ConfigDict(extra="allow")

username: str = Field(min_length=1)
account_id: str | None = None
account_created_utc: datetime | None = None
comment_karma: int | None = Field(default=None, ge=0)
post_karma: int | None = Field(default=None, ge=0)
is_mod: bool = False
is_gold: bool = False
is_verified: bool = False
metadata: dict[str, object] = Field(default_factory=dict)


class RedditComment(BaseModel):
model_config = ConfigDict(extra="allow")

comment_id: str = Field(min_length=1)
parent_id: str | None = None
author: RedditAuthor | None = None
body: str = ""
created_utc: datetime
score: int | None = None
depth: int = Field(default=0, ge=0)
permalink: HttpUrl | None = None
edited: bool = False
is_submitter: bool = False
metadata: dict[str, object] = Field(default_factory=dict)


class RedditPost(BaseModel):
model_config = ConfigDict(extra="allow")

post_id: str = Field(min_length=1)
subreddit: str = Field(min_length=1)
title: str = Field(min_length=1)
body: str = ""
author: RedditAuthor | None = None
created_utc: datetime
score: int | None = None
upvote_ratio: float | None = Field(default=None, ge=0.0, le=1.0)
num_comments: int | None = Field(default=None, ge=0)
permalink: HttpUrl | None = None
url: HttpUrl | None = None
is_nsfw: bool = False
is_spoiler: bool = False
flair_text: str | None = None
metadata: dict[str, object] = Field(default_factory=dict)


class RedditThreadContext(BaseModel):
model_config = ConfigDict(extra="allow")

source: Literal["reddit"] = "reddit"
collected_at_utc: datetime | None = None
collection_method: str | None = None
likes_available: bool = False
awards_available: bool = False
metadata: dict[str, object] = Field(default_factory=dict)


class RedditThreadPayload(BaseModel):
model_config = ConfigDict(extra="allow")

post: RedditPost
comments: list[RedditComment] = Field(default_factory=list)
context: RedditThreadContext = Field(default_factory=RedditThreadContext)

@field_validator("comments")
@classmethod
def ensure_unique_comment_ids(
cls, comments: list[RedditComment]
) -> list[RedditComment]:
seen: set[str] = set()
duplicates: set[str] = set()
for comment in comments:
if comment.comment_id in seen:
duplicates.add(comment.comment_id)
seen.add(comment.comment_id)
if duplicates:
duplicate_text = ", ".join(sorted(duplicates))
raise ValueError(f"Duplicate comment_id values found: {duplicate_text}")
return comments


class NormalizedAuthorSummary(BaseModel):
username: str
account_age_days: int | None = None
comment_karma: int | None = None
post_karma: int | None = None
is_mod: bool = False
is_verified: bool = False


class NormalizedComment(BaseModel):
comment_id: str
parent_id: str | None = None
author_username: str | None = None
body: str
body_length: int
created_utc: datetime
score: int | None = None
depth: int
is_submitter: bool = False


class NormalizedPost(BaseModel):
post_id: str
subreddit: str
title: str
body: str
created_utc: datetime
score: int | None = None
upvote_ratio: float | None = None
num_comments_reported: int | None = None
title_length: int
body_length: int
author_username: str | None = None


class ThreadMetrics(BaseModel):
total_comments: int
unique_authors: int
deleted_or_missing_authors: int
top_level_comments: int
reply_comments: int
average_comment_length: float


class IngestionResponse(BaseModel):
source: Literal["reddit"]
ingested_at_utc: datetime
post: NormalizedPost
comments: list[NormalizedComment]
authors: list[NormalizedAuthorSummary]
metrics: ThreadMetrics
1 change: 1 addition & 0 deletions app/services/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Application service layer."""
116 changes: 116 additions & 0 deletions app/services/ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from __future__ import annotations

from datetime import UTC, datetime

from app.models.reddit import (
IngestionResponse,
NormalizedAuthorSummary,
NormalizedComment,
NormalizedPost,
RedditAuthor,
RedditThreadPayload,
ThreadMetrics,
)


def normalize_thread_payload(payload: RedditThreadPayload) -> IngestionResponse:
ingested_at_utc = datetime.now(UTC)
authors = _collect_authors(payload, ingested_at_utc)
normalized_comments = [_normalize_comment(comment) for comment in payload.comments]

total_comments = len(normalized_comments)
top_level_comments = sum(1 for comment in normalized_comments if comment.depth == 0)
reply_comments = total_comments - top_level_comments
body_lengths = [comment.body_length for comment in normalized_comments]
average_comment_length = (
round(sum(body_lengths) / total_comments, 2) if total_comments else 0.0
)
deleted_or_missing_authors = sum(
1 for comment in normalized_comments if comment.author_username is None
)

metrics = ThreadMetrics(
total_comments=total_comments,
unique_authors=len(authors),
deleted_or_missing_authors=deleted_or_missing_authors,
top_level_comments=top_level_comments,
reply_comments=reply_comments,
average_comment_length=average_comment_length,
)

post = NormalizedPost(
post_id=payload.post.post_id,
subreddit=payload.post.subreddit,
title=payload.post.title.strip(),
body=payload.post.body.strip(),
created_utc=payload.post.created_utc,
score=payload.post.score,
upvote_ratio=payload.post.upvote_ratio,
num_comments_reported=payload.post.num_comments,
title_length=len(payload.post.title.strip()),
body_length=len(payload.post.body.strip()),
author_username=payload.post.author.username if payload.post.author else None,
)

return IngestionResponse(
source=payload.context.source,
ingested_at_utc=ingested_at_utc,
post=post,
comments=normalized_comments,
authors=sorted(authors.values(), key=lambda author: author.username.lower()),
metrics=metrics,
)


def _collect_authors(
payload: RedditThreadPayload, ingested_at_utc: datetime
) -> dict[str, NormalizedAuthorSummary]:
author_map: dict[str, NormalizedAuthorSummary] = {}

if payload.post.author:
author_map[payload.post.author.username] = _normalize_author(
payload.post.author, ingested_at_utc
)

for comment in payload.comments:
if comment.author is None:
continue
author_map.setdefault(
comment.author.username, _normalize_author(comment.author, ingested_at_utc)
)

return author_map


def _normalize_author(
author: RedditAuthor, ingested_at_utc: datetime
) -> NormalizedAuthorSummary:
account_age_days = None
if author.account_created_utc is not None:
delta = ingested_at_utc - author.account_created_utc.astimezone(UTC)
account_age_days = max(delta.days, 0)

return NormalizedAuthorSummary(
username=author.username,
account_age_days=account_age_days,
comment_karma=author.comment_karma,
post_karma=author.post_karma,
is_mod=author.is_mod,
is_verified=author.is_verified,
)


def _normalize_comment(comment) -> NormalizedComment:
body = comment.body.strip()
author_username = comment.author.username if comment.author else None
return NormalizedComment(
comment_id=comment.comment_id,
parent_id=comment.parent_id,
author_username=author_username,
body=body,
body_length=len(body),
created_utc=comment.created_utc,
score=comment.score,
depth=comment.depth,
is_submitter=comment.is_submitter,
)
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
fastapi>=0.115,<1.0
pydantic>=2.9,<3.0
uvicorn[standard]>=0.32,<1.0
pytest>=8.3,<9.0
httpx>=0.28,<1.0
Loading