Skip to content

Commit 0d8643e

Browse files
committed
fix: resolve linting, type errors, and optional dependencies
1 parent f1a7280 commit 0d8643e

11 files changed

Lines changed: 478 additions & 263 deletions

File tree

stylometry/ai_interface.py

Lines changed: 10 additions & 13 deletions
Original file line number · Diff line number · Diff line change
@@ -1,17 +1,14 @@
11
import json
2-
import urllib.request
32
import urllib.error
4-
from typing import Dict, List, Optional
3+
import urllib.request
4+
from typing import Dict, List, Optional, Any
5+
56

6-
def analyze_stats_with_ai(
7-
summary: Dict,
8-
api_base: str = "http://localhost:1234/v1",
9-
model: str = "local-model"
10-
) -> str:
7+
def analyze_stats_with_ai(summary: Dict[str, Any], api_base: str = "http://localhost:1234/v1", model: str = "local-model") -> str:
118
"""
129
Sends stylometric summary to a local LLM (like LM Studio) for interpretation.
1310
"""
14-
11+
1512
# Construct the prompt
1613
stats_json = json.dumps(summary, indent=2)
1714
prompt = f"""
@@ -35,19 +32,19 @@ def analyze_stats_with_ai(
3532
"model": model,
3633
"messages": [
3734
{"role": "system", "content": "You are a professional linguistic analyst specializing in stylometry."},
38-
{"role": "user", "content": prompt}
35+
{"role": "user", "content": prompt},
3936
],
40-
"temperature": 0.7
37+
"temperature": 0.7,
4138
}
42-
39+
4340
data = json.dumps(payload).encode("utf-8")
4441
req = urllib.request.Request(f"{api_base.rstrip('/')}/chat/completions", data=data)
4542
req.add_header("Content-Type", "application/json")
46-
43+
4744
try:
4845
with urllib.request.urlopen(req, timeout=120) as response:
4946
res_data = json.loads(response.read().decode("utf-8"))
50-
return res_data["choices"][0]["message"]["content"]
47+
return str(res_data["choices"][0]["message"]["content"])
5148
except urllib.error.URLError as e:
5249
return f"AI Analysis failed: Could not connect to LM Studio at {api_base}. Ensure the server is running. (Error: {e})"
5350
except Exception as e:

stylometry/boilerplate.py

Lines changed: 9 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -1,34 +1,37 @@
11
import collections
22
from typing import List, Set
3+
34
from .models import DocRecord
45

6+
57
def find_boilerplate(docs: List[DocRecord], threshold: float = 0.5, min_len: int = 50) -> Set[int]:
68
"""
79
Identifies paragraphs that appear in more than 'threshold' fraction of documents.
810
Returns a set of hashes of boilerplate paragraphs.
911
"""
1012
if len(docs) < 3:
1113
return set()
12-
13-
counts = collections.Counter()
14+
15+
counts: collections.Counter[str] = collections.Counter()
1416
for d in docs:
1517
paragraphs = {p.strip() for p in d.text.split("\n") if len(p.strip()) >= min_len}
1618
for p in paragraphs:
1719
counts[p] += 1
18-
20+
1921
boilerplate_hashes = set()
2022
n_docs = len(docs)
2123
for p, count in counts.items():
2224
if count / n_docs >= threshold:
2325
boilerplate_hashes.add(hash(p))
24-
26+
2527
return boilerplate_hashes
2628

29+
2730
def strip_boilerplate(docs: List[DocRecord], boilerplate_hashes: Set[int]):
2831
"""Removes identified boilerplate paragraphs from documents in-place."""
2932
if not boilerplate_hashes:
3033
return
31-
34+
3235
for d in docs:
3336
lines = d.text.split("\n")
3437
new_lines = []
@@ -39,7 +42,7 @@ def strip_boilerplate(docs: List[DocRecord], boilerplate_hashes: Set[int]):
3942
if hash(line.strip()) in boilerplate_hashes:
4043
continue
4144
new_lines.append(line)
42-
45+
4346
# Re-join and re-tokenize if changed
4447
new_text = "\n".join(new_lines).strip()
4548
if new_text != d.text.strip():

0 commit comments

Comments (0)