diff --git a/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md b/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md
index 91ca6e37..9d1c0d6c 100644
--- a/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md
+++ b/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md
@@ -67,7 +67,7 @@ VoiceStyleControl should therefore not be understood simply as a TTS dataset. Th
 
 ### VoiceStyleControl.3: Sample Schema: Separate Modeling of the Semantic Channel and Style Channel
 
-![Figure 42-1: Dual-channel schema for semantic response and style control](../../images/part12/ch42_fig02_dual_channel_schema.svg)
+![Figure 42-1: Dual-channel schema for semantic response and style control](../../images/part12/ch42_fig02_dual_channel_schema_en.svg)
 
 *Figure 42-1: Dual-channel schema for semantic response and style control. The semantic channel answers "what to say," the style channel answers "with which voice and emotion to say it," and the acoustic supervision channel binds both to audio files, speech tokens, and sampling configuration.*
 
@@ -268,7 +268,7 @@ Once training samples enter the dataloader, they are projected from the standard
 
 ### VoiceStyleControl.4: Construction Pipeline: From Text Conversation to Controllable Voice Records
 
-![Figure 42-2: VoiceStyleControl data construction pipeline](../../images/part12/ch42_fig01_data_pipeline.svg)
+![Figure 42-2: VoiceStyleControl data construction pipeline](../../images/part12/ch42_fig01_data_pipeline_en.svg)
 
 *Figure 42-2: VoiceStyleControl data construction pipeline. Text conversation or style content is first assigned speaker and emotion conditions, then audio is generated or collected through the authorized reference voice pool, and finally the samples are tokenized, quality-checked, balanced, and packaged.*
 
@@ -362,7 +362,7 @@ The packaging artifacts include not only JSONL, Parquet, or Hugging Face Dataset
 
 ### VoiceStyleControl.5: Quality Assessment and Closed-Loop Remediation
 
-![Figure 42-3: Quality assessment and data flywheel closed loop](../../images/part12/ch42_fig03_quality_loop.svg)
+![Figure 42-3: Quality assessment and data flywheel closed loop](../../images/part12/ch42_fig03_quality_loop_en.svg)
 
 *Figure 42-3: Quality assessment and data flywheel closed loop. Automated validation, reverse ASR, style assessment, and manual sampling together form a defective-sample queue that feeds back into re-synthesis, re-annotation, downweighting, or removal.*
 
diff --git a/docs/images/part12/ch42_fig01_data_pipeline_en.svg b/docs/images/part12/ch42_fig01_data_pipeline_en.svg
new file mode 100644
index 00000000..006cbd9e
--- /dev/null
+++ b/docs/images/part12/ch42_fig01_data_pipeline_en.svg
@@ -0,0 +1,69 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1320" height="380" viewBox="0 0 1320 380">
+<defs>
+  <marker id="arrow" markerWidth="12" markerHeight="12" refX="10" refY="6" orient="auto"><path d="M2,2 L10,6 L2,10 Z" fill="#475569"/></marker>
+  <style>
+  .title{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:17px;font-weight:700;fill:#0f172a}
+  .sub{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:13px;fill:#475569}
+  .note{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:14px;fill:#334155;font-weight:600}
+  </style>
+</defs>
+<rect x="1" y="1" width="1318" height="378" rx="18" fill="white" stroke="#cbd5e1"/>
+<g>
+  <rect x="35" y="155" width="180" height="78" rx="12" ry="12" fill="#eff6ff" stroke="#334155" stroke-width="1.4"/>
+  <text x="125" y="187" text-anchor="middle" class="title">Text / style content</text>
+  <text x="125" y="211" text-anchor="middle" class="sub">Qwen3-8B or human</text>
+</g>
+<g>
+  <rect x="245" y="155" width="180" height="78" rx="12" ry="12" fill="#f0fdf4" stroke="#334155" stroke-width="1.4"/>
+  <text x="335" y="187" text-anchor="middle" class="title">Style attributes</text>
+  <text x="335" y="211" text-anchor="middle" class="sub">gender / mood / language</text>
+</g>
+<g>
+  <rect x="475" y="35" width="210" height="78" rx="12" ry="12" fill="#fff7ed" stroke="#334155" stroke-width="1.4"/>
+  <text x="580" y="67" text-anchor="middle" class="title">Authorized voice pool</text>
+  <text x="580" y="91" text-anchor="middle" class="sub">multi-speaker × emotion</text>
+</g>
+<g>
+  <rect x="475" y="155" width="210" height="78" rx="12" ry="12" fill="#fff7ed" stroke="#334155" stroke-width="1.4"/>
+  <text x="580" y="187" text-anchor="middle" class="title">Speech synth / collect</text>
+  <text x="580" y="211" text-anchor="middle" class="sub">CosyVoice2 zero-shot</text>
+</g>
+<g>
+  <rect x="730" y="155" width="170" height="78" rx="12" ry="12" fill="#fefce8" stroke="#334155" stroke-width="1.4"/>
+  <text x="815" y="187" text-anchor="middle" class="title">Speech tokens</text>
+  <text x="815" y="211" text-anchor="middle" class="sub">S3Tokenizer 25 Hz</text>
+</g>
+<g>
+  <rect x="935" y="155" width="170" height="78" rx="12" ry="12" fill="#fdf2f8" stroke="#334155" stroke-width="1.4"/>
+  <text x="1020" y="187" text-anchor="middle" class="title">QC and balance</text>
+  <text x="1020" y="211" text-anchor="middle" class="sub">ASR / voice / emotion</text>
+</g>
+<g>
+  <rect x="1130" y="155" width="160" height="78" rx="12" ry="12" fill="#f8fafc" stroke="#334155" stroke-width="1.4"/>
+  <text x="1210" y="187" text-anchor="middle" class="title">Package / release</text>
+  <text x="1210" y="211" text-anchor="middle" class="sub">JSONL / audio / token</text>
+</g>
+<g>
+  <line x1="215" y1="194" x2="245" y2="194" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/>
+</g>
+<g>
+  <line x1="425" y1="194" x2="475" y2="194" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/>
+  <text x="450" y="165" text-anchor="middle" class="sub">target</text>
+  <text x="450" y="179" text-anchor="middle" class="sub">style</text>
+</g>
+<g>
+  <line x1="580" y1="113" x2="580" y2="155" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/>
+  <text x="648" y="139" text-anchor="middle" class="sub">reference choice</text>
+</g>
+<g>
+  <line x1="685" y1="194" x2="730" y2="194" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/>
+</g>
+<g>
+  <line x1="900" y1="194" x2="935" y2="194" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/>
+</g>
+<g>
+  <line x1="1105" y1="194" x2="1130" y2="194" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/>
+</g>
+<text x="660" y="314" text-anchor="middle" class="note">S2SEmoControl: synthesize query and answer separately</text>
+<text x="660" y="337" text-anchor="middle" class="note">TTSSpeakerControl: synthesize style description and answer</text>
+</svg>
diff --git a/docs/images/part12/ch42_fig02_dual_channel_schema_en.svg b/docs/images/part12/ch42_fig02_dual_channel_schema_en.svg
new file mode 100644
index 00000000..8e6d9c74
--- /dev/null
+++ b/docs/images/part12/ch42_fig02_dual_channel_schema_en.svg
@@ -0,0 +1,16 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1280" height="560" viewBox="0 0 1280 560">
+<defs>
+  <marker id="arrow" markerWidth="12" markerHeight="12" refX="10" refY="6" orient="auto"><path d="M2,2 L10,6 L2,10 Z" fill="#475569"/></marker>
+  <style>.title{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:20px;font-weight:700;fill:#0f172a}.item{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:16px;fill:#334155}.caption{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:18px;font-weight:700;fill:#0f172a}</style>
+</defs>
+<rect x="1" y="1" width="1278" height="558" rx="18" fill="white" stroke="#cbd5e1"/>
+<text x="640" y="38" text-anchor="middle" class="caption">Dual-channel schema for semantic response and style control</text>
+<g><rect x="60" y="90" width="330" height="170" rx="14" fill="#eff6ff" stroke="#334155" stroke-width="1.3"/><text x="78" y="120" class="title">Semantic Channel</text><text x="82" y="148" class="item">• query / spoken user query</text><text x="82" y="174" class="item">• answer / assistant response</text><text x="82" y="200" class="item">• task: S2S or TTS</text><text x="82" y="226" class="item">• language: zh</text></g>
+<g><rect x="60" y="310" width="330" height="170" rx="14" fill="#f0fdf4" stroke="#334155" stroke-width="1.3"/><text x="78" y="340" class="title">Style Channel</text><text x="82" y="368" class="item">• query_gender / answer_gender</text><text x="82" y="394" class="item">• query_mood / answer_mood</text><text x="82" y="420" class="item">• gender / mood</text><text x="82" y="446" class="item">• query_id / answer_id</text></g>
+<g><rect x="480" y="120" width="320" height="230" rx="14" fill="#fff7ed" stroke="#334155" stroke-width="1.3"/><text x="498" y="150" class="title">Acoustic Target</text><text x="502" y="178" class="item">• query_audio_path / answer_audio_path</text><text x="502" y="204" class="item">• query_token_25hz</text><text x="502" y="230" class="item">• answer_token_25hz</text><text x="502" y="256" class="item">• speech_token_25hz</text><text x="502" y="282" class="item">• sample_rate: 16000</text></g>
+<g><rect x="900" y="165" width="310" height="200" rx="14" fill="#fdf2f8" stroke="#334155" stroke-width="1.3"/><text x="918" y="195" class="title">Training Record</text><text x="922" y="223" class="item">• Text loss: semantic match</text><text x="922" y="249" class="item">• Speech-token loss: pronounceable</text><text x="922" y="275" class="item">• Speaker constraint: identity</text><text x="922" y="301" class="item">• Emotion constraint: style</text></g>
+<line x1="390" y1="175" x2="480" y2="200" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><text x="435.0" y="166" text-anchor="middle" class="item">text</text><text x="435.0" y="184" text-anchor="middle" class="item">alignment</text>
+<line x1="390" y1="395" x2="480" y2="280" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><text x="435.0" y="318" text-anchor="middle" class="item">control</text><text x="435.0" y="336" text-anchor="middle" class="item">conditions</text>
+<line x1="800" y1="235" x2="900" y2="255" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><text x="850.0" y="226" text-anchor="middle" class="item">unified</text><text x="850.0" y="244" text-anchor="middle" class="item">package</text>
+<text x="640" y="515" text-anchor="middle" class="item">Core principle: model semantic correctness and vocal expression separately, then merge them into one trainable record.</text>
+</svg>
diff --git a/docs/images/part12/ch42_fig03_quality_loop_en.svg b/docs/images/part12/ch42_fig03_quality_loop_en.svg
new file mode 100644
index 00000000..8c2189bb
--- /dev/null
+++ b/docs/images/part12/ch42_fig03_quality_loop_en.svg
@@ -0,0 +1,12 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1120" height="430" viewBox="0 0 1120 430">
+<defs><marker id="arrow" markerWidth="12" markerHeight="12" refX="10" refY="6" orient="auto"><path d="M2,2 L10,6 L2,10 Z" fill="#475569"/></marker><style>.title{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:18px;font-weight:700;fill:#0f172a}.item{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:14px;fill:#334155}.caption{font-family:-apple-system,BlinkMacSystemFont,'Noto Sans CJK SC','Microsoft YaHei',sans-serif;font-size:18px;font-weight:700;fill:#0f172a}</style></defs>
+<rect x="1" y="1" width="1118" height="428" rx="18" fill="white" stroke="#cbd5e1"/>
+<text x="560" y="38" text-anchor="middle" class="caption">Quality assessment and data flywheel closed loop</text>
+<g><rect x="60" y="110" width="190" height="82" rx="13" fill="#eff6ff" stroke="#334155" stroke-width="1.3"/><text x="155.0" y="140" text-anchor="middle" class="title">Automatic checks</text><text x="155.0" y="168" text-anchor="middle" class="item">schema / duration / rate</text></g>
+<g><rect x="300" y="110" width="190" height="82" rx="13" fill="#f0fdf4" stroke="#334155" stroke-width="1.3"/><text x="395.0" y="140" text-anchor="middle" class="title">Reverse ASR</text><text x="395.0" y="168" text-anchor="middle" class="item">text consistency / CER</text></g>
+<g><rect x="540" y="110" width="190" height="82" rx="13" fill="#fff7ed" stroke="#334155" stroke-width="1.3"/><text x="635.0" y="140" text-anchor="middle" class="title">Style assessment</text><text x="635.0" y="168" text-anchor="middle" class="item">speaker / emotion</text></g>
+<g><rect x="780" y="110" width="190" height="82" rx="13" fill="#fdf2f8" stroke="#334155" stroke-width="1.3"/><text x="875.0" y="140" text-anchor="middle" class="title">Manual sampling</text><text x="875.0" y="168" text-anchor="middle" class="item">naturalness / misuse</text></g>
+<g><rect x="415" y="270" width="290" height="82" rx="13" fill="#f8fafc" stroke="#334155" stroke-width="1.3"/><text x="560.0" y="300" text-anchor="middle" class="title">Repair and versioning</text><text x="560.0" y="328" text-anchor="middle" class="item">resynth / relabel / downweight / remove</text></g>
+<line x1="250" y1="151" x2="300" y2="151" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><line x1="490" y1="151" x2="540" y2="151" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><line x1="730" y1="151" x2="780" y2="151" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><line x1="875" y1="192" x2="705" y2="270" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><text x="790.0" y="223.0" text-anchor="middle" class="item">issue samples</text><line x1="415" y1="311" x2="250" y2="160" stroke="#475569" stroke-width="1.8" marker-end="url(#arrow)"/><text x="332.5" y="227.5" text-anchor="middle" class="item">rule updates</text>
+<text x="560" y="395" text-anchor="middle" class="item">Online feedback enters review; only samples passing semantic, style, audio, and safety gates enter the next training set.</text>
+</svg>