Skip to content

Commit 8b63e6b

Browse files
Add optimized hint regeneration scripts (50% API cost savings)
Analysis showed forward and reverse hints are semantically identical (70.5% similarity), so we can derive reverse hints from forward hints without calling the API.

New scripts:
- regenerate_hints_optimized.py: Generate forward via API, derive reverse automatically
- test_hints_optimized.py: Test on 10 samples before full run
- analyze_hint_similarity.py: Analyze forward vs reverse hint similarity

Savings: 2,246 API calls (50% reduction from 4,492 to 2,246)

Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 7f40ab4 commit 8b63e6b

3 files changed

Lines changed: 565 additions & 0 deletions

File tree

Lines changed: 135 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Analyze similarity between forward and reverse hints to determine if we can
4+
optimize future regenerations by deriving one from the other.
5+
"""
6+
7+
import json
8+
from pathlib import Path
9+
from difflib import SequenceMatcher
10+
11+
def load_hints():
    """Read the forward and reverse hint files and return their hint tables.

    Both files are read as UTF-8 JSON from paths relative to the current
    working directory.

    Returns:
        A ``(forward, reverse)`` tuple: the value stored under the
        ``'hints'`` key of the forward file, then of the reverse file.
    """
    def read_json(path):
        # Decode the whole file as UTF-8 text, then parse it as JSON.
        return json.loads(path.read_text(encoding='utf-8'))

    forward_doc = read_json(Path("public/data/collocation_hints.json"))
    reverse_doc = read_json(Path("public/data/reverse_hints.json"))

    return forward_doc['hints'], reverse_doc['hints']
23+
24+
def calculate_similarity(str1, str2):
    """Return the case-insensitive ``SequenceMatcher`` ratio of two strings.

    Both inputs are lower-cased before comparison, so strings that differ
    only in letter case score 1.0.
    """
    matcher = SequenceMatcher(isjunk=None, a=str1.lower(), b=str2.lower())
    return matcher.ratio()
27+
28+
def _percent_of(count, total):
    """Return ``count`` as a percentage of ``total``, or 0.0 if ``total`` is 0."""
    return count / total * 100 if total else 0.0


def _print_examples(title, examples, limit=5):
    """Print a banner with ``title`` followed by up to ``limit`` example pairs."""
    print(f"\n{'='*80}")
    print(title)
    print("="*80)
    for i, example in enumerate(examples[:limit], 1):
        print(f"\n{i}. Similarity: {example['similarity']:.1%}")
        print(f" Forward: {example['forward_hint']}")
        print(f" Reverse: {example['reverse_hint']}")


def analyze_hints():
    """Compare forward and reverse hints for the same word pairs.

    Loads both hint tables via ``load_hints()``, scores every (verb, noun)
    pair that appears in both tables with ``calculate_similarity()``, prints
    a summary report with examples and a recommendation, and writes the
    details to ``data-preparation/hint_similarity_analysis.json``.

    When no pair is present in both tables, the report shows 0 pairs and
    0.0% for every bucket instead of raising ``ZeroDivisionError``.

    Returns:
        None. Results go to stdout and to the JSON file.
    """
    forward_hints, reverse_hints = load_hints()

    # Collect one record per pair that has both a forward and a reverse hint.
    comparisons = []
    for verb_japanese, noun_hints in forward_hints.items():
        for noun_japanese, forward_hint in noun_hints.items():
            if noun_japanese not in reverse_hints:
                continue
            reverse_noun_hints = reverse_hints[noun_japanese]
            if verb_japanese not in reverse_noun_hints:
                continue
            reverse_hint = reverse_noun_hints[verb_japanese]

            comparisons.append({
                'verb': verb_japanese,
                'noun': noun_japanese,
                'forward_hint': forward_hint,
                'reverse_hint': reverse_hint,
                'similarity': calculate_similarity(forward_hint, reverse_hint),
            })

    total_pairs = len(comparisons)

    # Calculate statistics
    similarities = [c['similarity'] for c in comparisons]
    avg_similarity = sum(similarities) / len(similarities) if similarities else 0

    # Bucket the pairs by similarity level
    very_similar = [c for c in comparisons if c['similarity'] >= 0.8]
    somewhat_similar = [c for c in comparisons if 0.5 <= c['similarity'] < 0.8]
    different = [c for c in comparisons if c['similarity'] < 0.5]

    # Print analysis
    print("="*80)
    print("HINT SIMILARITY ANALYSIS")
    print("="*80)
    print(f"\nTotal matching pairs analyzed: {total_pairs}")
    print(f"Average similarity: {avg_similarity:.2%}")
    print("\nSimilarity distribution:")
    # _percent_of guards the division so an empty comparison set cannot crash.
    print(f" Very similar (>=80%): {len(very_similar)} ({_percent_of(len(very_similar), total_pairs):.1f}%)")
    print(f" Somewhat similar (50-80%): {len(somewhat_similar)} ({_percent_of(len(somewhat_similar), total_pairs):.1f}%)")
    print(f" Different (<50%): {len(different)} ({_percent_of(len(different), total_pairs):.1f}%)")

    # Show examples (English only to avoid Unicode errors)
    _print_examples("EXAMPLES OF VERY SIMILAR PAIRS (>=80% similarity)", very_similar)
    _print_examples("EXAMPLES OF SOMEWHAT SIMILAR PAIRS (50-80%)", somewhat_similar)
    _print_examples("EXAMPLES OF DIFFERENT PAIRS (<50%)", different)

    # Recommendation
    print(f"\n{'='*80}")
    print("RECOMMENDATION")
    print("="*80)

    if avg_similarity >= 0.8:
        print(f"\nHINTS ARE VERY SIMILAR ({avg_similarity:.1%} average)")
        print("Recommendation: Future regenerations can derive reverse hints from forward hints")
        print(" with simple transformations, saving ~50% of API calls.")
    elif avg_similarity >= 0.6:
        print(f"\nHINTS ARE SOMEWHAT SIMILAR ({avg_similarity:.1%} average)")
        print("Recommendation: Some optimization possible, but may require careful review.")
        print(" Consider hybrid approach: derive simple cases, regenerate complex ones.")
    else:
        print(f"\nHINTS ARE QUITE DIFFERENT ({avg_similarity:.1%} average)")
        print("Recommendation: Continue regenerating both forward and reverse hints separately")
        print(" to maintain quality and clarity.")

    # Save detailed analysis
    output_path = Path("data-preparation/hint_similarity_analysis.json")
    # Create the target directory if needed so the report is not lost at the
    # very last step.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            'total_pairs': total_pairs,
            'average_similarity': avg_similarity,
            'very_similar_count': len(very_similar),
            'somewhat_similar_count': len(somewhat_similar),
            'different_count': len(different),
            'very_similar_examples': very_similar[:10],
            'somewhat_similar_examples': somewhat_similar[:10],
            'different_examples': different[:10]
        }, f, ensure_ascii=False, indent=2)

    print(f"\nDetailed analysis saved to: {output_path}")
133+
134+
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_hints()

0 commit comments

Comments
 (0)