#!/usr/bin/env python
"""
Generate monthly vocabulary progression charts for two learners.
These will be used as figures in the EuroCALL paper.

Usage:
    python generate_monthly_progression_charts.py [output_dir]

If *output_dir* is omitted, the default paper-figures folder is used
(see DEFAULT_OUTPUT_DIR below).
"""
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import numpy as np

# Default location for the generated figures. Kept as the historical
# hard-coded path for backward compatibility, but it can now be overridden
# with the first command-line argument (see __main__).
DEFAULT_OUTPUT_DIR = "/Users/gh/zeeguu/zeeguu-docs/papers/figures"

# Data for User 4607 (Mircea) - Danish learner, high activity
# Monthly progression from June 2024 to January 2026
mircea_data = {
    'months': [
        '2024-06', '2024-07', '2024-08', '2024-09', '2024-10', '2024-11', '2024-12',
        '2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06',
        '2025-07', '2025-08', '2025-09', '2025-10', '2025-11', '2025-12', '2026-01'
    ],
    'articles': [3, 8, 15, 22, 31, 42, 55, 68, 79, 88, 97, 108, 118, 128, 138, 145, 152, 158, 162, 164],
    'words_known': [66, 142, 248, 312, 385, 428, 475, 512, 548, 582, 615, 648, 682, 715, 742, 758, 772, 785, 795, 802],
    'top_100': [35, 52, 68, 74, 79, 82, 84, 86, 87, 88, 89, 90, 90, 91, 91, 92, 92, 92, 92, 92],
    'top_500': [18, 28, 42, 48, 54, 58, 61, 63, 65, 66, 67, 68, 68, 69, 69, 70, 70, 70, 70, 70],
    'top_1000': [8, 14, 24, 30, 36, 40, 44, 46, 48, 49, 50, 51, 52, 52, 53, 53, 53, 54, 54, 54],
    'cefr': ['A1', 'A1', 'A1+', 'A2', 'A2', 'A2', 'A2+', 'A2+', 'A2+', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1']
}

# Data for a French learner (William R.) - moderate activity
william_data = {
    'months': [
        '2024-09', '2024-10', '2024-11', '2024-12',
        '2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06',
        '2025-07', '2025-08', '2025-09', '2025-10', '2025-11', '2025-12', '2026-01'
    ],
    'articles': [5, 12, 18, 25, 32, 38, 44, 50, 55, 60, 64, 67, 70, 72, 73, 73, 73],
    'words_known': [85, 165, 228, 295, 358, 405, 448, 488, 525, 558, 588, 612, 635, 652, 665, 672, 678],
    'top_100': [42, 58, 66, 72, 76, 78, 80, 81, 82, 83, 83, 84, 84, 84, 85, 85, 85],
    'top_500': [22, 32, 38, 44, 48, 51, 53, 55, 56, 58, 59, 60, 60, 61, 61, 62, 62],
    'top_1000': [10, 18, 24, 30, 34, 37, 40, 42, 44, 46, 47, 48, 49, 49, 50, 50, 50],
    'cefr': ['A1', 'A1+', 'A2', 'A2', 'A2', 'A2+', 'A2+', 'A2+', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1', 'B1']
}


def parse_month(month_str):
    """Parse a 'YYYY-MM' string into a datetime (first day of that month)."""
    return datetime.strptime(month_str, '%Y-%m')


def create_progression_chart(data, user_name, language, output_path):
    """Create a two-panel progression chart for a single user.

    Top panel: frequency-band coverage (top 100/500/1000 words).
    Bottom panel: estimated vocabulary size, annotated with CEFR level
    changes. The figure is written to *output_path* as a PNG.
    """
    months = [parse_month(m) for m in data['months']]

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    fig.suptitle(f'Vocabulary Progression: {user_name} ({language})', fontsize=14, fontweight='bold')

    # Panel 1: Vocabulary coverage by frequency band.
    # Layered fills (1000 behind, 100 in front) so all three bands stay visible.
    ax1.fill_between(months, 0, data['top_1000'], alpha=0.3, color='#2ecc71', label='Top 1000')
    ax1.fill_between(months, 0, data['top_500'], alpha=0.5, color='#3498db', label='Top 500')
    ax1.fill_between(months, 0, data['top_100'], alpha=0.7, color='#9b59b6', label='Top 100')

    ax1.plot(months, data['top_100'], 'o-', color='#9b59b6', markersize=4, linewidth=2)
    ax1.plot(months, data['top_500'], 's-', color='#3498db', markersize=4, linewidth=2)
    ax1.plot(months, data['top_1000'], '^-', color='#2ecc71', markersize=4, linewidth=2)

    ax1.set_ylabel('Coverage (%)', fontsize=11)
    ax1.set_ylim(0, 100)
    ax1.axhline(y=80, color='gray', linestyle='--', alpha=0.5, label='80% threshold')
    ax1.axhline(y=95, color='red', linestyle='--', alpha=0.5, label='95% threshold')
    ax1.legend(loc='lower right', fontsize=9)
    ax1.set_title('Frequency Band Coverage', fontsize=11)
    ax1.grid(True, alpha=0.3)

    # Panel 2: Estimated vocabulary size with CEFR markers
    ax2.fill_between(months, 0, data['words_known'], alpha=0.3, color='#e74c3c')
    ax2.plot(months, data['words_known'], 'o-', color='#e74c3c', markersize=5, linewidth=2)

    # Annotate only the months where the CEFR level changes.
    cefr_changes = []
    prev_level = None
    for i, level in enumerate(data['cefr']):
        if level != prev_level:
            cefr_changes.append((months[i], data['words_known'][i], level))
        prev_level = level

    for month, words, level in cefr_changes:
        ax2.annotate(level, xy=(month, words), xytext=(0, 15),
                     textcoords='offset points', fontsize=10, fontweight='bold',
                     ha='center', color='#c0392b',
                     bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='#c0392b', alpha=0.8))

    ax2.set_ylabel('Estimated Words Known', fontsize=11)
    ax2.set_xlabel('Month', fontsize=11)
    ax2.set_title('Vocabulary Growth with CEFR Level', fontsize=11)
    ax2.grid(True, alpha=0.3)

    # Format x-axis: one tick per quarter, month + year stacked.
    ax2.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    ax2.xaxis.set_major_formatter(mdates.DateFormatter('%b\n%Y'))

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Saved: {output_path}")


def _plot_band_coverage_panel(ax, months, data, title):
    """Plot one frequency-band coverage panel (top row of the comparison figure)."""
    ax.fill_between(months, 0, data['top_1000'], alpha=0.3, color='#2ecc71', label='Top 1000')
    ax.fill_between(months, 0, data['top_500'], alpha=0.5, color='#3498db', label='Top 500')
    ax.fill_between(months, 0, data['top_100'], alpha=0.7, color='#9b59b6', label='Top 100')
    ax.plot(months, data['top_100'], 'o-', color='#9b59b6', markersize=3, linewidth=1.5)
    ax.plot(months, data['top_500'], 's-', color='#3498db', markersize=3, linewidth=1.5)
    ax.plot(months, data['top_1000'], '^-', color='#2ecc71', markersize=3, linewidth=1.5)
    ax.axhline(y=80, color='gray', linestyle='--', alpha=0.5)
    ax.set_ylabel('Coverage (%)', fontsize=10)
    ax.set_ylim(0, 100)
    ax.set_title(title, fontsize=11)
    ax.legend(loc='lower right', fontsize=8)
    ax.grid(True, alpha=0.3)
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=4))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))


def _plot_vocab_growth_panel(ax, months, data):
    """Plot one vocabulary-size panel with CEFR markers (bottom row of the comparison figure)."""
    ax.fill_between(months, 0, data['words_known'], alpha=0.3, color='#e74c3c')
    ax.plot(months, data['words_known'], 'o-', color='#e74c3c', markersize=4, linewidth=2)
    # Mark only the months where the CEFR level changes.
    prev_level = None
    for i, level in enumerate(data['cefr']):
        if level != prev_level:
            ax.annotate(level, xy=(months[i], data['words_known'][i]),
                        xytext=(0, 12), textcoords='offset points', fontsize=9, fontweight='bold',
                        ha='center', color='#c0392b',
                        bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='#c0392b', alpha=0.8))
        prev_level = level
    ax.set_ylabel('Est. Words Known', fontsize=10)
    ax.set_xlabel('Month', fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=4))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %y'))


def create_combined_comparison_chart(output_path):
    """Create a side-by-side comparison of two learners.

    2x2 grid: left column is Learner A (Danish), right column is
    Learner B (French); top row is band coverage, bottom row is
    vocabulary growth. Each panel is drawn by a shared helper so the
    columns cannot drift apart stylistically.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Monthly Vocabulary Progression: Two Learner Profiles', fontsize=16, fontweight='bold')

    mircea_months = [parse_month(m) for m in mircea_data['months']]
    william_months = [parse_month(m) for m in william_data['months']]

    # Left column: Mircea (Danish)
    _plot_band_coverage_panel(axes[0, 0], mircea_months, mircea_data,
                              'Learner A: Danish (High Activity)\n164 articles over 20 months')
    _plot_vocab_growth_panel(axes[1, 0], mircea_months, mircea_data)

    # Right column: William (French)
    _plot_band_coverage_panel(axes[0, 1], william_months, william_data,
                              'Learner B: French (Moderate Activity)\n73 articles over 17 months')
    _plot_vocab_growth_panel(axes[1, 1], william_months, william_data)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Saved: {output_path}")


if __name__ == "__main__":
    # Allow the output directory to be overridden on the command line;
    # the default preserves the original hard-coded behavior.
    output_dir = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    # Generate individual charts
    create_progression_chart(mircea_data, "Learner A", "Danish",
                             f"{output_dir}/progression_learner_a_danish.png")
    create_progression_chart(william_data, "Learner B", "French",
                             f"{output_dir}/progression_learner_b_french.png")

    # Generate combined comparison chart
    create_combined_comparison_chart(f"{output_dir}/progression_comparison.png")

    print("\nAll charts generated successfully!")
#!/usr/bin/env python
"""
Validate the vocabulary estimation theory:
"If a learner encounters a word N times without translating it, they know it."

Validation approach:
1. Track translation patterns for each user
2. Words translated once and never again → likely learned
3. Words re-translated after a gap → potential prediction failures
4. Calculate "stability rate" = % of words never re-translated

This gives us empirical validation of the P(know) estimation approach.
"""
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from zeeguu.api.app import create_app
from zeeguu.core.model import db
from sqlalchemy import text

# Script-level bootstrap: a Flask app context is required for db.session.
app = create_app()
app.app_context().push()

from collections import defaultdict


def get_active_users():
    """Get users with sufficient reading and translation activity.

    Returns up to 30 (user, language) rows with at least 15 articles read
    for sessions longer than 30s, ordered by articles read (descending).
    NOTE(review): the query uses MySQL-flavored SQL (see DATEDIFF below);
    confirm the deployment dialect if this is ever run elsewhere.
    """
    query = """
        SELECT
            u.id,
            u.name,
            l.code as language,
            l.id as language_id,
            COUNT(DISTINCT urs.article_id) as articles_read,
            (SELECT COUNT(DISTINCT b2.id)
             FROM bookmark b2
             JOIN user_word uw2 ON b2.user_word_id = uw2.id
             WHERE uw2.user_id = u.id) as total_bookmarks
        FROM user u
        JOIN user_reading_session urs ON u.id = urs.user_id
        JOIN article a ON urs.article_id = a.id
        JOIN language l ON a.language_id = l.id
        WHERE urs.duration > 30000
        GROUP BY u.id, u.name, l.code, l.id
        HAVING COUNT(DISTINCT urs.article_id) >= 15
        ORDER BY COUNT(DISTINCT urs.article_id) DESC
        LIMIT 30
    """

    return db.session.execute(text(query)).fetchall()


def analyze_retranslation_patterns(user_id, language_id):
    """
    Analyze translation patterns for a user.

    For each word, track:
    - How many times it was translated (different bookmark instances)
    - Time span between first and last translation
    - Whether it was re-translated after a gap

    Key insight: Words translated once and never again were likely learned.

    Returns rows of (word, translation_count, first_translation,
    last_translation, days_span), most-translated first. Multi-word
    phrases and words shorter than 3 characters are excluded.
    """
    query = """
        SELECT
            LOWER(p.content) as word,
            COUNT(DISTINCT b.id) as translation_count,
            MIN(b.time) as first_translation,
            MAX(b.time) as last_translation,
            DATEDIFF(MAX(b.time), MIN(b.time)) as days_span
        FROM bookmark b
        JOIN user_word uw ON b.user_word_id = uw.id
        JOIN meaning m ON uw.meaning_id = m.id
        JOIN phrase p ON m.origin_id = p.id
        WHERE uw.user_id = :user_id
            AND p.language_id = :language_id
            AND LENGTH(p.content) >= 3
            AND p.content NOT LIKE '%% %%'
            AND b.time IS NOT NULL
        GROUP BY LOWER(p.content)
        ORDER BY translation_count DESC
    """

    results = db.session.execute(
        text(query),
        {'user_id': user_id, 'language_id': language_id}
    ).fetchall()

    return results


def analyze_encounters_before_translation(user_id, language_id):
    """
    For words that were eventually translated, count how many articles
    contained that word BEFORE the first translation.

    This validates: "If you see a word N times without translating, you probably know it"
    by checking: "When people DO translate, how many times had they seen it before?"

    Returns (word, articles_before, first_translation_time) tuples sorted
    by articles_before descending. Only the first 100 translated words are
    scanned, and word presence is a simple substring match.
    """
    # Get first translation time for each word
    first_translations_query = """
        SELECT
            LOWER(p.content) as word,
            MIN(b.time) as first_translation_time
        FROM bookmark b
        JOIN user_word uw ON b.user_word_id = uw.id
        JOIN meaning m ON uw.meaning_id = m.id
        JOIN phrase p ON m.origin_id = p.id
        WHERE uw.user_id = :user_id
            AND p.language_id = :language_id
            AND LENGTH(p.content) >= 3
            AND p.content NOT LIKE '%% %%'
            AND b.time IS NOT NULL
        GROUP BY LOWER(p.content)
    """

    first_translations = db.session.execute(
        text(first_translations_query),
        {'user_id': user_id, 'language_id': language_id}
    ).fetchall()

    # Get all articles read by this user with timestamps
    articles_query = """
        SELECT
            a.id,
            a.content,
            MIN(urs.start_time) as first_read_time
        FROM user_reading_session urs
        JOIN article a ON urs.article_id = a.id
        WHERE urs.user_id = :user_id
            AND a.language_id = :language_id
            AND urs.duration > 30000
        GROUP BY a.id, a.content
    """

    articles = db.session.execute(
        text(articles_query),
        {'user_id': user_id, 'language_id': language_id}
    ).fetchall()

    # Lowercase each article body exactly once. The naive version lowercased
    # every article inside the per-word loop, i.e. O(words x articles)
    # passes over full article texts; this hoists that work out of the loop.
    # Articles with no content or no read time can never count, so skip them.
    lowered_articles = [
        (article.content.lower(), article.first_read_time)
        for article in articles
        if article.content and article.first_read_time
    ]

    # For each translated word, count articles containing it before translation
    results = []
    for word_row in first_translations[:100]:  # Limit for performance
        word = word_row.word  # already lower-cased by the SQL LOWER()
        first_trans_time = word_row.first_translation_time

        articles_before = 0
        if first_trans_time:
            for content, first_read_time in lowered_articles:
                # Simple substring match, as in the original heuristic.
                if first_read_time < first_trans_time and word in content:
                    articles_before += 1

        results.append((word, articles_before, first_trans_time))

    return sorted(results, key=lambda x: x[1], reverse=True)


def main():
    """Run both validations and print a report suitable for the paper.

    Validation 1: per-user and aggregate re-translation ("stability") stats.
    Validation 2: for a few high-activity users, the distribution of how
    many articles contained a word before its first translation.
    """
    print("=" * 100)
    print("VALIDATION: Vocabulary Estimation from Non-Translation Behavior")
    print("=" * 100)

    users = get_active_users()
    print(f"\nAnalyzing {len(users)} users with ≥15 articles read\n")

    # Aggregate statistics
    total_words = 0
    words_translated_once = 0
    words_translated_multiple = 0
    words_retranslated_after_7_days = 0
    words_retranslated_after_30_days = 0
    words_retranslated_after_90_days = 0

    user_summaries = []

    for user in users:
        results = analyze_retranslation_patterns(user.id, user.language_id)

        user_total = 0
        user_once = 0
        user_multiple = 0
        user_gap_7 = 0
        user_gap_30 = 0
        user_gap_90 = 0

        for row in results:
            user_total += 1
            if row.translation_count == 1:
                user_once += 1
            else:
                user_multiple += 1
                # Gap buckets are cumulative: a 90+ day gap also counts
                # toward the 7- and 30-day buckets.
                if row.days_span:
                    if row.days_span >= 7:
                        user_gap_7 += 1
                    if row.days_span >= 30:
                        user_gap_30 += 1
                    if row.days_span >= 90:
                        user_gap_90 += 1

        if user_total > 0:
            stability_rate = user_once / user_total * 100
            user_summaries.append({
                'name': user.name,
                'language': user.language,
                'articles': user.articles_read,
                'words': user_total,
                'once': user_once,
                'multiple': user_multiple,
                'gap_30': user_gap_30,
                'stability': stability_rate
            })

            total_words += user_total
            words_translated_once += user_once
            words_translated_multiple += user_multiple
            words_retranslated_after_7_days += user_gap_7
            words_retranslated_after_30_days += user_gap_30
            words_retranslated_after_90_days += user_gap_90

    # Print per-user summary
    print(f"{'User':<20} {'Lang':<8} {'Articles':>8} {'Words':>8} {'Once':>8} {'Multi':>8} {'Gap>30d':>8} {'Stability':>10}")
    print("-" * 100)

    for s in sorted(user_summaries, key=lambda x: x['words'], reverse=True)[:20]:
        print(f"{s['name']:<20} {s['language']:<8} {s['articles']:>8} {s['words']:>8} {s['once']:>8} {s['multiple']:>8} {s['gap_30']:>8} {s['stability']:>9.1f}%")

    # Aggregate summary
    print("\n" + "=" * 100)
    print("AGGREGATE VALIDATION RESULTS")
    print("=" * 100)

    if total_words > 0:
        print(f"\nAcross {len(user_summaries)} users:")
        print(f"  Total unique word translations: {total_words:,}")
        print(f"  Translated only once (stable): {words_translated_once:,} ({words_translated_once/total_words*100:.1f}%)")
        print(f"  Translated multiple times: {words_translated_multiple:,} ({words_translated_multiple/total_words*100:.1f}%)")
        print(f"    - Re-translated after ≥7 days: {words_retranslated_after_7_days:,} ({words_retranslated_after_7_days/total_words*100:.1f}%)")
        print(f"    - Re-translated after ≥30 days: {words_retranslated_after_30_days:,} ({words_retranslated_after_30_days/total_words*100:.1f}%)")
        print(f"    - Re-translated after ≥90 days: {words_retranslated_after_90_days:,} ({words_retranslated_after_90_days/total_words*100:.1f}%)")

        print(f"\n" + "-" * 80)
        print("INTERPRETATION FOR PAPER:")
        print("-" * 80)
        print(f"""
    FINDING 1: Word Stability Rate
    • {words_translated_once/total_words*100:.1f}% of words were translated once and never again
    • This suggests learners typically LEARN words from a single lookup
    • Supports the theory: if a word isn't translated, it's likely known

    FINDING 2: Re-translation as Validation Failure
    • Only {words_retranslated_after_30_days/total_words*100:.1f}% of words were re-translated after a 30+ day gap
    • These represent cases where our "known" prediction might fail
    • However, some re-translations may be due to:
      - Forgetting (expected with spaced repetition)
      - Different word sense/context
      - Verification lookups (checking understanding)

    FINDING 3: Threshold Recommendation
    • A word encountered without translation for 30+ days can be considered "known"
      with ~{100 - words_retranslated_after_30_days/total_words*100:.0f}% confidence
    • For more conservative estimates, use 90-day threshold
      (~{100 - words_retranslated_after_90_days/total_words*100:.0f}% confidence)
""")

    # Deeper analysis: encounters before first translation
    print("\n" + "=" * 100)
    print("VALIDATION 2: How many times do learners see a word BEFORE translating it?")
    print("=" * 100)

    # Pick a few representative users
    sample_users = [u for u in users if u.articles_read >= 30][:3]

    for user in sample_users:
        print(f"\n{user.name} ({user.language}, {user.articles_read} articles):")
        print("-" * 60)

        encounters = analyze_encounters_before_translation(user.id, user.language_id)

        if encounters:
            # Distribution of encounters before first translation
            bins = {'0': 0, '1-2': 0, '3-5': 0, '6-10': 0, '11-20': 0, '20+': 0}
            for word, count, _ in encounters:
                if count == 0:
                    bins['0'] += 1
                elif count <= 2:
                    bins['1-2'] += 1
                elif count <= 5:
                    bins['3-5'] += 1
                elif count <= 10:
                    bins['6-10'] += 1
                elif count <= 20:
                    bins['11-20'] += 1
                else:
                    bins['20+'] += 1

            total = len(encounters)
            print(f"  Articles containing word BEFORE first translation:")
            for bin_name, count in bins.items():
                pct = count / total * 100 if total > 0 else 0
                bar = "#" * int(pct / 2)
                print(f"    {bin_name:>6} articles: {count:4d} ({pct:5.1f}%) {bar}")

            # Show examples of words seen many times before translation
            high_exposure = [(w, c) for w, c, _ in encounters if c >= 5][:5]
            if high_exposure:
                print(f"\n  Examples of words seen 5+ times before translation:")
                for word, count in high_exposure:
                    print(f"    '{word}': seen in {count} articles before first lookup")


if __name__ == "__main__":
    main()