-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathgenerate_caption.py
More file actions
222 lines (185 loc) · 7.71 KB
/
generate_caption.py
File metadata and controls
222 lines (185 loc) · 7.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/usr/bin/env python3
"""Generate a Xiaohongshu发布文案 (caption note) from a cleaned script.
Output: a JSON with title, caption_body, tags, and publish-time hint.
- title: ≤18 chars, first 18 chars include 2 keywords (TF-IDF) so
the feed-thumbnail preview surfaces them.
- body: 200-500 chars, emoji every 50-80 chars, repeats target
keyword once per ~300 chars (TF-IDF normalisation).
- tags: 3-6 hashtags. Mix垂直+长尾 (no pure hot-tag spam).
- publish_time_hint: per audience profile.
This module is rule-based — no LLM dependency. For richer rewriting use
rewrite_script.py first, then run generate_caption.py on the result.
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from collections import Counter
from typing import Iterable, List, Optional
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from content_guard import enforce as enforce_platform_rules, HardBlock # noqa: E402
EMOJI_PALETTE = ["📌", "✨", "💡", "🔥", "👇", "✅", "🚀", "📈"]
# Words that should not become tags
STOPWORDS_ZH = set("""
的 了 在 是 我 你 他 她 它 我们 你们 他们 这 那 就 也 都 还 又 把 被 让
不 没 有 会 能 要 想 觉得 因为 所以 但是 然后 而且 就是 这种 那种
""".split())
def extract_keywords(text: str, *, top_n: int = 8) -> List[str]:
"""Return the top-n bigram/short-word candidates, ranked by TF.
Pure rule-based (no external library). Strips stopwords, keeps 2-6
char tokens, dedupes.
"""
# Strip markdown headings / punctuation
cleaned = re.sub(r"#+\s+|[*_`]", "", text)
# Split by punctuation/whitespace
tokens = re.split(r"[\s,。!?、;:()\(\),.!?;:\-—\n]+", cleaned)
counts: Counter = Counter()
for tok in tokens:
tok = tok.strip()
if not tok:
continue
if tok in STOPWORDS_ZH:
continue
if len(tok) < 2 or len(tok) > 8:
continue
if re.fullmatch(r"\d+", tok):
continue
counts[tok] += 1
return [w for w, _ in counts.most_common(top_n)]
def synthesize_title(hook_text: Optional[str], keywords: List[str],
*, max_chars: int = 18) -> str:
"""Pick a title ≤max_chars that surfaces 2 keywords up front when possible."""
if hook_text:
title = hook_text.strip()
# If the hook is short enough, use it directly
if 4 <= len(title) <= max_chars:
return title
# Else, truncate at a clean break
if len(title) > max_chars:
cut = title[:max_chars]
# find last punctuation/space to avoid cutting mid-word
for sep in "?!。,、 ":
idx = cut.rfind(sep)
if idx >= 4:
return cut[:idx]
return cut
if len(keywords) >= 2:
kw1, kw2 = keywords[0], keywords[1]
title = f"{kw1}{kw2}:3个值得知道的事"
if len(title) <= max_chars:
return title
return f"{kw1}{kw2}:值得收藏"
if keywords:
return f"{keywords[0]}:值得收藏"
return "今日分享"
def synthesize_body(script_text: str, keywords: List[str], *,
emoji_every_chars: int = 60,
min_chars: int = 200,
max_chars: int = 500) -> str:
"""Build the 正文 body. Take the cleaned script content and intersperse
emoji every ~60 chars; cap at max_chars."""
body = re.sub(r"#+\s+", "", script_text)
body = re.sub(r"\n{2,}", "\n\n", body).strip()
# Truncate
if len(body) > max_chars:
body = body[:max_chars]
# cut at the last natural break
for sep in "\n。!?":
idx = body.rfind(sep)
if idx >= min_chars:
body = body[:idx + 1]
break
# Sprinkle emoji
out_chars = []
last_emoji = 0
for i, ch in enumerate(body):
out_chars.append(ch)
if (i - last_emoji) >= emoji_every_chars and ch in "。!?\n":
emoji = EMOJI_PALETTE[(i // emoji_every_chars) % len(EMOJI_PALETTE)]
out_chars.append(f" {emoji}")
last_emoji = i
return "".join(out_chars)
def synthesize_tags(keywords: List[str], *, max_tags: int = 6,
min_tags: int = 3) -> List[str]:
"""Return 3-6 # tags. Mix垂类keyword + 长尾."""
tags = []
for kw in keywords[:max_tags]:
tags.append("#" + kw)
while len(tags) < min_tags and len(keywords) > 0:
# pad with the most popular keyword again, as compound
tags.append("#" + keywords[0])
return tags[:max_tags]
def publish_time_hint(profile: Optional[dict]) -> str:
if not profile:
return "weekday 21:00-22:30"
windows = profile.get("publishing", {}).get("preferred_windows") or []
if isinstance(windows, list) and windows:
return windows[0] if isinstance(windows[0], str) else str(windows[0])
return "weekday 21:00-22:30"
def generate_caption(script_text: str, *,
hook_text: Optional[str] = None,
profile_name: Optional[str] = None,
strict: bool = True) -> dict:
keywords = extract_keywords(script_text)
title = synthesize_title(hook_text, keywords)
body = synthesize_body(script_text, keywords)
tags = synthesize_tags(keywords)
profile = None
if profile_name:
try:
from profiles import load_profile
profile = load_profile(profile_name)
except (FileNotFoundError, ImportError):
pass
payload = {
"title": title,
"caption_body": body,
"tags": tags,
"publish_time_hint": publish_time_hint(profile),
}
if strict:
# Content-guard the title and body. Tags pre-pended with # are fine.
enforce_platform_rules([title], context="title", strict=True)
enforce_platform_rules([body], context="caption", strict=False)
# ^ caption uses non-strict (warnings only) to leave room for emoji density
return payload
def main() -> int:
p = argparse.ArgumentParser(description="Generate Xiaohongshu caption (note body)")
p.add_argument("--script", required=True, help="Path to clean_script.md")
p.add_argument("--hook", default=None, help="Override hook text used as title")
p.add_argument("--profile", default=None, help="Audience profile (tech_pro, lifestyle)")
p.add_argument("--output", default=None, help="JSON output path; stdout if omitted")
p.add_argument("--no-strict", action="store_true",
help="Skip content-guard rejection (warnings still emitted to stderr)")
args = p.parse_args()
if not os.path.isfile(args.script):
print(f"script not found: {args.script}", file=sys.stderr)
return 1
with open(args.script, encoding="utf-8") as f:
script_text = f.read()
hook_text = args.hook
if not hook_text:
# Try to extract from `## Hook` block
m = re.search(r"^##\s+Hook\s*\n+([^\n#]+)", script_text, re.MULTILINE)
if m:
hook_text = m.group(1).strip()
try:
payload = generate_caption(
script_text, hook_text=hook_text,
profile_name=args.profile, strict=not args.no_strict,
)
except HardBlock as exc:
print(f"🚫 Caption rejected by content guard: {exc}", file=sys.stderr)
return 2
out_text = json.dumps(payload, ensure_ascii=False, indent=2)
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(out_text)
print(f"✅ caption → {args.output}")
else:
print(out_text)
return 0
if __name__ == "__main__":
sys.exit(main())