From b48cf5270f3a0ff1e27873c850ae55ff746a9fd0 Mon Sep 17 00:00:00 2001 From: kero Date: Mon, 22 Jun 2026 12:59:29 +0800 Subject: [PATCH] update ref --- ..._a_tools_and_frameworks_quick_reference.md | 12 +- ...ndix_b_compliance_and_release_checklist.md | 6 +- ..._cost_estimation_and_resource_templates.md | 10 +- ...ppendix_d_paper_to_implementation_guide.md | 14 +- .../appendix_e_common_bug_debugging_manual.md | 6 +- ...terminology_and_chinese_english_mapping.md | 12 +- docs/en/front_matter_guide.md | 2 +- docs/en/part1/ch01_data_change.md | 16 +- docs/en/part1/ch02_quality_framework.md | 14 +- docs/en/part1/ch03_data_stack.md | 4 +- docs/en/part10/ch31_agent_architecture.md | 18 +- .../ch32_auto_collection_parsing_cleaning.md | 14 +- .../ch33_labeling_synthesis_evaluation.md | 24 +- docs/en/part10/ch34_dataops_agent.md | 18 +- .../ch35_security_permission_collaboration.md | 8 +- ...h36_compliance_framework_and_governance.md | 14 +- ...ing_and_privacy_preserving_technologies.md | 18 +- .../ch38_text_corpora_transparent_ledger.md | 33 +- ...ge_text_candidate_pool_data_engineering.md | 17 +- ..._visual_document_table_data_engineering.md | 14 +- ..._visual_reasoning_tool_data_engineering.md | 14 +- ...eech_audio_interaction_data_engineering.md | 2 +- ...ning_trace_compression_data_engineering.md | 6 +- docs/en/part13/ch44_pretrain_recipes.md | 8 +- docs/en/part13/ch45_posttrain_recipes.md | 28 +- docs/en/part13/ch46_rl_reasoning_data.md | 34 +- docs/en/part13/ch47_vlm_data_recipes.md | 10 +- docs/en/part13/ch48_t2i_t2v.md | 10 +- docs/en/part14/p03_llava_instruct.md | 8 +- docs/en/part14/p05_mm_rag.md | 8 +- docs/en/part14/p06_prm.md | 6 +- docs/en/part14/p07_agent_tooluse.md | 2 +- docs/en/part14/p09_privacy_pipeline.md | 10 +- docs/en/part14/p11_mini_deepseek.md | 6 +- docs/en/part14/p12_r1_reasoning_flywheel.md | 4 +- .../p13_multimodal_instruction_factory.md | 8 +- docs/en/part14/p14_video_generation.md | 4 +- .../p15_dataagent_semantic_nl2sql_agent.md | 2 +- 
docs/en/part2/ch04_data_sources.md | 2 +- docs/en/part2/ch05_cleaning_dedup.md | 6 +- docs/en/part2/ch06_tokenization_loading.md | 4 +- docs/en/part2/ch07_data_operations.md | 6 +- docs/en/part3/ch08_multimodal_image.md | 14 +- docs/en/part3/ch09_recaptioning_ocr.md | 14 +- docs/en/part3/ch10_video_audio.md | 2 +- docs/en/part3/ch11_cross_modal_alignment.md | 4 +- docs/en/part4/ch12_sft.md | 8 +- docs/en/part4/ch13_preference.md | 26 +- docs/en/part4/ch14_qa.md | 2 +- docs/en/part5/ch15_data_synthesis.md | 24 +- docs/en/part5/ch16_distillation.md | 24 +- docs/en/part5/ch17_quality.md | 18 +- docs/en/part6/ch19_tool.md | 16 +- docs/en/part6/ch20_agent.md | 16 +- docs/en/part7/ch21_rag_pipeline.md | 26 +- .../ch22_multimodal_rag_visual_retrieval.md | 18 +- .../ch23_online_feedback_knowledge_update.md | 20 +- docs/en/part8/ch24_dataops_flywheel_team.md | 10 +- ...h25_data_versioning_experiment_tracking.md | 18 +- .../part8/ch26_data_platform_observability.md | 12 +- ...27_data_catalog_and_metadata_governance.md | 16 +- ..._data_productization_and_data_contracts.md | 16 +- .../en/part9/ch29_data_valuation_and_reuse.md | 14 +- ...rnal_data_market_and_sharing_governance.md | 16 +- ..._a_tools_and_frameworks_quick_reference.md | 4 +- ...ndix_b_compliance_and_release_checklist.md | 2 +- ..._cost_estimation_and_resource_templates.md | 4 +- ...ppendix_d_paper_to_implementation_guide.md | 12 +- .../appendix_e_common_bug_debugging_manual.md | 6 +- ...terminology_and_chinese_english_mapping.md | 12 +- docs/zh/part1/ch02_quality_framework.md | 12 +- docs/zh/part1/ch03_data_stack.md | 4 +- docs/zh/part10/ch31_agent_architecture.md | 14 +- .../ch32_auto_collection_parsing_cleaning.md | 10 +- .../ch33_labeling_synthesis_evaluation.md | 24 +- docs/zh/part10/ch34_dataops_agent.md | 16 +- .../ch35_security_permission_collaboration.md | 8 +- ...h36_compliance_framework_and_governance.md | 12 +- ...ing_and_privacy_preserving_technologies.md | 16 +- 
.../ch38_text_corpora_transparent_ledger.md | 33 +- ...ge_text_candidate_pool_data_engineering.md | 17 +- ..._visual_document_table_data_engineering.md | 10 +- ..._visual_reasoning_tool_data_engineering.md | 12 +- ...ning_trace_compression_data_engineering.md | 6 +- docs/zh/part13/ch44_pretrain_recipes.md | 4 +- docs/zh/part13/ch45_posttrain_recipes.md | 6 +- docs/zh/part13/ch46_rl_reasoning_data.md | 6 +- docs/zh/part13/ch47_vlm_data_recipes.md | 10 +- docs/zh/part13/ch48_t2i_t2v.md | 10 +- docs/zh/part14/p03_llava_instruct.md | 8 +- docs/zh/part14/p05_mm_rag.md | 8 +- docs/zh/part14/p06_prm.md | 6 +- docs/zh/part14/p09_privacy_pipeline.md | 2 +- docs/zh/part14/p11_mini_deepseek.md | 4 +- docs/zh/part14/p12_r1_reasoning_flywheel.md | 2 +- .../p13_multimodal_instruction_factory.md | 8 +- .../p15_dataagent_semantic_nl2sql_agent.md | 2 +- docs/zh/part2/ch05_cleaning_dedup.md | 6 +- docs/zh/part2/ch06_tokenization_loading.md | 2 +- docs/zh/part2/ch07_data_operations.md | 6 +- docs/zh/part3/ch08_multimodal_image.md | 8 +- docs/zh/part3/ch09_recaptioning_ocr.md | 10 +- docs/zh/part3/ch10_video_audio.md | 2 +- docs/zh/part3/ch11_cross_modal_alignment.md | 4 +- docs/zh/part4/ch12_sft.md | 8 +- docs/zh/part4/ch13_preference.md | 26 +- docs/zh/part4/ch14_qa.md | 2 +- docs/zh/part6/ch19_tool.md | 16 +- docs/zh/part6/ch20_agent.md | 14 +- docs/zh/part7/ch21_rag_pipeline.md | 26 +- .../ch22_multimodal_rag_visual_retrieval.md | 18 +- .../ch23_online_feedback_knowledge_update.md | 20 +- ...h25_data_versioning_experiment_tracking.md | 18 +- .../part8/ch26_data_platform_observability.md | 12 +- ...27_data_catalog_and_metadata_governance.md | 16 +- ..._data_productization_and_data_contracts.md | 2 +- .../zh/part9/ch29_data_valuation_and_reuse.md | 4 +- ...rnal_data_market_and_sharing_governance.md | 12 +- .../springer_alt_text_inventory.csv | 72 +- .../springer_alt_text_inventory.json | 194 +- .../springer_alt_text_inventory.xlsx | Bin 77017 -> 74909 bytes 
publishing/final_review/README.md | 2 +- .../final_review/figure_rights_report.md | 12 +- .../final_review/final_publication_audit.json | 129 +- .../final_review/reference_audit_report.md | 2 +- .../reference_integrity_audit.json | 4166 ++++++++--------- .../final_review/reference_integrity_audit.md | 196 +- publishing/final_review/style_report.md | 28 +- scripts/reference_integrity_audit.py | 8 +- 129 files changed, 2971 insertions(+), 3176 deletions(-) diff --git a/docs/en/appendix_a_tools_and_frameworks_quick_reference.md b/docs/en/appendix_a_tools_and_frameworks_quick_reference.md index c960321a..8eae7633 100644 --- a/docs/en/appendix_a_tools_and_frameworks_quick_reference.md +++ b/docs/en/appendix_a_tools_and_frameworks_quick_reference.md @@ -66,7 +66,7 @@ Object storage stores files but does not automatically provide version semantics | lakeFS | Branches and commits over object storage | Lakehouse-style data governance and collaboration | | Delta Lake / Apache Iceberg | Large tabular data governance | Large-scale structured samples and metadata | -For cross-institution dataset construction, public evaluation, and teaching reproduction, a minimal combination is often enough: **Git for scripts and specifications, DVC or an equivalent for data versions, object storage for large files, and release pages for external documentation**. +For cross-institution dataset construction, public evaluation, and teaching reproduction, a minimal combination is often enough: **Git for scripts and specifications, DVC or an equivalent for data versions, object storage for large files, and release pages for external documentation**. This combination is easy to hand off, easy to reproduce in courses, and consistent with the governance language used in Part VIII and Part XII. Concrete data-versioning commands, remote configuration, and pipeline syntax should follow the official DVC documentation (DVC Contributors 2026). 
## A.4 Cleaning, Validation, and Training Preparation Tools @@ -124,7 +124,7 @@ If a project will become an open benchmark or course experiment, preserve annota ### A.5.2 Experiment Tracking Must Bind Data Versions -Tools such as `MLflow` and `Weights & Biases` are often misused by recording only model parameters and metrics while omitting data versions, slice results, and evaluation-script versions. Logs then look rich but cannot explain where improvement came from. +Tools such as `MLflow` and `Weights & Biases` are often misused by recording only model parameters and metrics while omitting data versions, slice results, and evaluation-script versions. Logs then look rich but cannot explain where improvement came from. If MLflow is used as the experiment-tracking entry point, run records, artifact management, and model registry details should follow the official MLflow documentation (MLflow Authors 2026). Track at least: @@ -194,7 +194,7 @@ Without these capabilities, a team may get good final accuracy but still be unab This is suitable for cross-institution specialized datasets, course reproduction, and medium-scale research projects. It is lightweight and relatively easy to hand off. -If a dataset is organized and distributed through the Hugging Face Datasets ecosystem, the loading script, dataset card, and split configuration should follow the Hugging Face Datasets Documentation. +If a dataset is organized and distributed through the Hugging Face Datasets ecosystem, the loading script, dataset card, and split configuration should follow the official Hugging Face Datasets documentation (Hugging Face 2026). ### A.7.2 Enterprise Data Platform Combination @@ -286,11 +286,11 @@ Third, for university collaboration, open benchmarks, and teaching reproduction, ## References -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12): 86-92. 
+Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12): 86-92. https://doi.org/10.1145/3458723. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596. -Pushkarna M, Zaldivar A, Kjartansson O, Cicconi P, Chen V, Efrat A, Zou Y, Mueller J, Taly A, Ehyaei A, Karkkainen K, Marathe A, Han X, Mittal A, Schuster T, Yarmand M, Sohn H, Dwarakanath N C, McCann B (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. +Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231. DVC Contributors (2026) Data Version Control Documentation. Available at: https://dvc.org/doc. 
diff --git a/docs/en/appendix_b_compliance_and_release_checklist.md b/docs/en/appendix_b_compliance_and_release_checklist.md index 7e45f24a..ee8d875e 100644 --- a/docs/en/appendix_b_compliance_and_release_checklist.md +++ b/docs/en/appendix_b_compliance_and_release_checklist.md @@ -8,13 +8,13 @@ In large-model data engineering, the most dangerous situation is often not that This appendix therefore does not provide legal advice, medical advice, financial or investment advice, nor does it constitute regulatory approval, ethics review, or release permission. It is a checklist framework better suited to engineering-team execution and traceability. Its goal is to let technical leads, project managers, course owners, and compliance contacts use the same vocabulary and reduce cross-role communication cost. -In scenarios involving law, medicine, finance, minors, cross-border data, sensitive personal information, or industry regulation, readers should rely on their institution's formal policies, the current laws of the relevant jurisdiction, data-provider contracts, ethics-review requirements, and professional compliance opinions. In the mainland China context, cybersecurity, data security, and personal-information protection should be understood in relation to the Cybersecurity Law of the People's Republic of China, the Data Security Law of the People's Republic of China, and the Personal Information Protection Law of the People's Republic of China. The checklists in this appendix can only help teams identify issues that need escalated review in advance; they cannot replace the professional judgment of lawyers, physicians, financial compliance personnel, security leads, or ethics committees. 
+In scenarios involving law, medicine, finance, minors, cross-border data, sensitive personal information, or industry regulation, readers should rely on their institution's formal policies, the current laws of the relevant jurisdiction, data-provider contracts, ethics-review requirements, and professional compliance opinions. In the mainland China context, cybersecurity, data security, and personal-information protection should be understood in relation to the Cybersecurity Law of the People's Republic of China, the Data Security Law of the People's Republic of China, and the Personal Information Protection Law of the People's Republic of China (National People's Congress of the People's Republic of China 2016, 2021a, 2021b). The checklists in this appendix can only help teams identify issues that need escalated review in advance; they cannot replace the professional judgment of lawyers, physicians, financial compliance personnel, security leads, or ethics committees. ## B.2 Why Compliance Checks Must Shift Left If compliance is checked only before release, teams usually encounter three expensive forms of rework. First, **source rework**: the data has already been collected and cleaned before the team discovers that the original authorization does not allow model training or redistribution. Second, **annotation rework**: annotation is complete before the team realizes that sensitive fields were not properly anonymized. Third, **release rework**: a benchmark is ready to publish before the team discovers unstable train/test boundaries or conflicts between external licenses and leaderboard rules. -A more stable approach is to split compliance into four gates: +A more stable approach is to split compliance into four gates. 
This split can also align with risk-management frameworks: the NIST AI RMF emphasizes organizing AI risk through governance, mapping, measurement, and management, while the EU Artificial Intelligence Act further reflects a regulatory approach that assigns obligations and boundaries by risk level (National Institute of Standards and Technology 2023; European Parliament and Council of the European Union 2024). 1. Source and authorization checks before data ingestion. 2. Sensitivity and delegation-boundary checks before annotation and processing. @@ -321,4 +321,4 @@ National Institute of Standards and Technology (2023) AI Risk Management Framewo European Parliament and Council of the European Union (2024) Regulation (EU) 2024/1689 laying down harmonised rules on artificial intelligence (Artificial Intelligence Act). Available at: https://eur-lex.europa.eu/eli/reg/2024/1689/oj. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596. diff --git a/docs/en/appendix_c_cost_estimation_and_resource_templates.md b/docs/en/appendix_c_cost_estimation_and_resource_templates.md index 9b6952d3..1ef93d8c 100644 --- a/docs/en/appendix_c_cost_estimation_and_resource_templates.md +++ b/docs/en/appendix_c_cost_estimation_and_resource_templates.md @@ -114,7 +114,7 @@ The safest budget is not "the API bill for generating 100,000 samples." It is th ### C.6.1 Training Estimates Should Not Look Only at GPU Count -Training budgets are often simplified to "how many cards for how many days." 
The real cost also depends on effective throughput, probability of failed reruns, and number of tuning rounds. +Training budgets are often simplified to "how many cards for how many days." The real cost also depends on effective throughput, probability of failed reruns, and number of tuning rounds. Large-scale training systems such as Megatron-LM show that model parallelism, data parallelism, and pipeline parallelism can significantly affect throughput, memory footprint, and failure-recovery cost (Narayanan et al. 2021). | Item | Unit | Quantity | Unit Cost / Hours | Subtotal | Notes | | :-- | :-- | :-- | :-- | :-- | :-- | @@ -128,7 +128,7 @@ If a team does not reserve resources for failed reruns, the budget usually becom ### C.6.2 Split Inference Cost by Scenario -Inference cost should be split into at least three scenarios. +Inference cost should be split into at least three scenarios. For long-context and high-concurrency serving, memory-management mechanisms such as PagedAttention have become important references for serving-cost estimation, and vLLM's engineering documentation provides a practical entry point for deployment and tuning (Kwon et al. 2023; vLLM Project 2026). | Scenario | Characteristics | Estimation Focus | | :-- | :-- | :-- | @@ -186,7 +186,7 @@ Text projects can sometimes survive rough disk estimates. Document, image, audio | Release images | External release and course reproduction versions | | Archive layer | Cold storage and long-term preservation | -Without this layering, teams often discover late that training was not the expensive part; permanently retaining every intermediate artifact was. +Without this layering, teams often discover late that training was not the expensive part; permanently retaining every intermediate artifact was. 
If Kubernetes is used to host training, evaluation, or teaching environments, resource quotas, storage volumes, namespaces, and job lifecycle should also be included in the budget sheet, with resource objects and scheduling semantics following the official Kubernetes documentation (Kubernetes Authors 2026). ### C.8.2 Archival Strategy Determines Maintainability Over the Next Three Years @@ -315,9 +315,9 @@ Third, mature cost management is not only about saving money. It makes the relat Patterson D, Gonzalez J, Le Q, Liang C, Munguia L, Rothchild D, So D, Texier M, Dean J (2021) Carbon Emissions and Large Neural Network Training. arXiv preprint arXiv:2104.10350. -Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Catanzaro B (2021) Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. +Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Catanzaro B (2021) Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. arXiv:2104.04473. -Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, pp 611-626. +Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, pp 611-626. https://doi.org/10.1145/3600006.3613165. Kubernetes Authors (2026) Kubernetes Documentation. Available at: https://kubernetes.io/docs/. 
diff --git a/docs/en/appendix_d_paper_to_implementation_guide.md b/docs/en/appendix_d_paper_to_implementation_guide.md index 1c7ca634..b5ee42c1 100644 --- a/docs/en/appendix_d_paper_to_implementation_guide.md +++ b/docs/en/appendix_d_paper_to_implementation_guide.md @@ -393,12 +393,20 @@ These materials let later readers know not only what was built, but why this des ## References -Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. +Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. https://doi.org/10.1016/j.patter.2023.100804. -Kreuzberger D, Kuhl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. + +Kreuzberger D, Kuhl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302. Longpre S, Mahari R, Lee A, et al. (2023) The Data Provenance Initiative: A Large Scale Audit of Dataset Licensing and Attribution in AI. arXiv preprint arXiv:2310.16787. -Mazumder M, Banbury C, Yao X, et al. (2023) DataPerf: Benchmarks for Data-Centric AI Development. In: Advances in Neural Information Processing Systems 36, Datasets and Benchmarks Track. +Mazumder M, Banbury C, Yao X, et al. (2023) DataPerf: Benchmarks for Data-Centric AI Development. In: Advances in Neural Information Processing Systems 36, Datasets and Benchmarks Track. https://doi.org/10.52202/075280-0235. + +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. 
https://doi.org/10.1145/3287560.3287596. + +Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231. + +Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28. Zha D, Bhat Z P, Lai K-H, Yang F, Jiang Z, Zhong S, Hu X (2023) Data-centric Artificial Intelligence: A Survey. arXiv preprint arXiv:2303.10158. diff --git a/docs/en/appendix_e_common_bug_debugging_manual.md b/docs/en/appendix_e_common_bug_debugging_manual.md index c76917b0..e2ba7379 100644 --- a/docs/en/appendix_e_common_bug_debugging_manual.md +++ b/docs/en/appendix_e_common_bug_debugging_manual.md @@ -446,10 +446,12 @@ If these three actions continue, this manual becomes part of daily engineering r Blecher L, Cucurull G, Scialom T, Stojnic R (2023) Nougat: Neural Optical Understanding for Academic Documents. arXiv preprint arXiv:2308.13418. +Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction. In: IEEE International Conference on Big Data, pp 1123-1132. + Pfitzmann B, Auer C, Dolfi M, Nassar A S, Staar P (2022) DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3743-3751. -Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Companion of the 2024 International Conference on Management of Data, pp 120-134. 
+Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Companion of the 2024 International Conference on Management of Data, pp 120-134. https://doi.org/10.1145/3626246.3653385. Chen Y, Shetty M, Somashekar G, Ma M, Simmhan Y, Mace J, Bansal C, Wang R, Rajmohan S (2025) AIOpsLab: A Holistic Framework to Evaluate AI Agents for Enabling Autonomous Clouds. arXiv preprint arXiv:2501.06706. -Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. +Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. https://doi.org/10.1016/j.patter.2023.100804. diff --git a/docs/en/appendix_f_terminology_and_chinese_english_mapping.md b/docs/en/appendix_f_terminology_and_chinese_english_mapping.md index 68204b15..b861943f 100644 --- a/docs/en/appendix_f_terminology_and_chinese_english_mapping.md +++ b/docs/en/appendix_f_terminology_and_chinese_english_mapping.md @@ -404,8 +404,14 @@ This is especially risky around privacy, compliance, and release boundaries. Ter Bommasani R, Klyman K, Zhang D, Liang P (2023) The Foundation Model Transparency Index. arXiv preprint arXiv:2310.12941. -Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. -Wang B, Chen W, Pei H, et al. (2023) DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. In: Advances in Neural Information Processing Systems 36. +Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. arXiv:2211.09110. 
-Weidinger L, Uesato J, Rauh M, Griffin C, Huang P-S, Mellor J, Glaese A, Cheng M, Balle B, Kasirzadeh A, Kenton Z, Brown S, Hawkins W, Stepleton T, Birhane A, Haas J, Rimell L, Hendricks L A, Isaac W, Legassick S, Irving G, Gabriel I (2022) Taxonomy of Risks posed by Language Models. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 214-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596. + +Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231. + +Wang B, Chen W, Pei H, et al. (2023) DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-1361. + +Weidinger L, Uesato J, Rauh M, Griffin C, Huang P-S, Mellor J, Glaese A, Cheng M, Balle B, Kasirzadeh A, Kenton Z, Brown S, Hawkins W, Stepleton T, Birhane A, Haas J, Rimell L, Hendricks L A, Isaac W, Legassick S, Irving G, Gabriel I (2022) Taxonomy of Risks posed by Language Models. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 214-229. https://doi.org/10.1145/3531146.3533088. diff --git a/docs/en/front_matter_guide.md b/docs/en/front_matter_guide.md index 80a96e4e..ebeb08ae 100644 --- a/docs/en/front_matter_guide.md +++ b/docs/en/front_matter_guide.md @@ -6,7 +6,7 @@ This book addresses data engineering problems in the era of large models. 
It dis ![Book architecture of Data Engineering for Large Foundation Models, showing the data lifecycle, foundation layer, data collection and processing layer, cross-cutting capabilities, model alignment and capability enhancement layer, application governance layer, security and specialized practice layer, project case layer, DataOps flywheel, and platform support capabilities](../images/book_structure_en.png) -*Book architecture. Source: original illustration by the authors. The figure organizes the fourteen parts, forty-eight chapters, fifteen projects, and eight appendices around the data lifecycle, with layers for foundations, collection and processing, cross-cutting capabilities, model alignment, application governance, security and compliance, specialized practice, and project delivery; Alt text: book architecture diagram showing the data lifecycle, layered manuscript structure, DataOps flywheel, platform support capabilities, and engineering principles that run through the book.* +*Book architecture. Source: original illustration by the authors. The figure organizes the fourteen parts, forty-eight chapters, fifteen projects, and eight appendices around the data lifecycle, with layers for foundations, collection and processing, cross-cutting capabilities, model alignment, application governance, security and compliance, specialized practice, and project delivery.* The book's core contributions appear in four areas. 
diff --git a/docs/en/part1/ch01_data_change.md b/docs/en/part1/ch01_data_change.md index 122bcac1..7a7abd17 100644 --- a/docs/en/part1/ch01_data_change.md +++ b/docs/en/part1/ch01_data_change.md @@ -308,7 +308,7 @@ Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Cas Rae J W, Borgeaud S, Cai T, Millican K, Hoffmann J, Song F, Aslanides J, Henderson S, Ring R, Young S, Rutherford E, Hennigan T, Menick J, Cassirer A, Powell R, van den Driessche G, Hendricks L A, Rauh M, Huang P S, Glaese A, Welbl J, Dathathri S, Huang S, Uesato J, Mellor J, Higgins I, Creswell A, McAleese N, Wu A, Elsen E, Jayakumar S, Buchatskaya E, Budden D, Sutherland E, Simonyan K, Paganini M, Sifre L, Martens L, Li X L, Kuncoro A, Nematzadeh A, Gribovskaya E, Donato D, Lazaridou A, Mensch A, Lespiau J B, Tsimpoukelli M, Grigorev N, Fritz D, Sottiaux T, Pajarskas M, Pohlen T, Gong Z, Toyama D, de Masson d'Autume C, Li Y, Terzi T, Mikulik V, Babuschkin I, Clark A, de Las Casas D, Guy A, Jones C, Bradbury J, Johnson M, Hechtman B, Weidinger L, Gabriel I, Isaac W, Lockhart W, Osindero S, Rimell L, Dyer C, Vinyals O, Ayoub K, Stanway J, Bennett L, Hassabis D, Kavukcuoglu K, Irving G (2021) Scaling Language Models: Methods, Analysis & Insights from Training Gopher. arXiv preprint arXiv:2112.11446. -Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J D, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. Advances in Neural Information Processing Systems 33:1877–1901. 
+Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J D, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. Advances in Neural Information Processing Systems 33:1877–1901. arXiv:2005.14165. Dubey A, Jauhri A, Pandey A, Khandelwal A, Al-Dahle A, Letman A, Mathur A, Schelten A, Yang A, Fan A, others (2024) The Llama 3 Herd of Models. arXiv preprint arXiv:2407.21783. @@ -328,23 +328,23 @@ Li J, Zhang Y, Yu H, Ma X, Chen Y, Jiang H, Dang K, Goyal T, Keh S, Sherborn M, Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187–197. -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21–29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21–29. https://doi.org/10.1109/sequen.1997.666900. -Chen J, Yan X, Lin D, Qu X, Wang Y, Huang X, Zhao Z, Yu T, Zhang Z, Li H, Zheng Y, Xu R, Zhu J, Qiu X (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Proceedings of the ACM SIGMOD International Conference on Management of Data, pp 4436–4449. +Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Companion of the 2024 International Conference on Management of Data, pp 120–134. https://doi.org/10.1145/3626246.3653385. Penedo G, Kydlíček H, Anthony L, Hajos M, Sutawika L, Fourmague H, Nguyen H, de Werra L, Wolf T (2024) datatrove: large scale data processing. Hugging Face Open Source Library.
https://github.com/huggingface/datatrove. -Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems 35:27730–27744. +Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems 35:27730–27744. arXiv:2203.02155. -Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728–53741. +Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728–53741. arXiv:2305.18290. -Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W T, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. Advances in Neural Information Processing Systems 33:9459–9474. +Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W T, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. Advances in Neural Information Processing Systems 33:9459–9474. arXiv:2005.11401. Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez A N, Kaiser L, Polosukhin I (2017) Attention Is All You Need. Advances in Neural Information Processing Systems 30. 
Krizhevsky A, Sutskever I, Hinton G E (2012) ImageNet Classification with Deep Convolutional Neural Networks. Advances in Neural Information Processing Systems 25:1097–1105. -He K, Zhang X, Ren S, Sun J (2016) Deep Residual Learning for Image Recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770–778. +He K, Zhang X, Ren S, Sun J (2016) Deep Residual Learning for Image Recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770–778. https://doi.org/10.1109/cvpr.2016.90. Cobbe K, Kosaraju V, Bavarian M, Chen M, Jun H, Kaiser L, Plappert M, Tworek J, Hilton J, Nakano R, Hesse C, Schulman J (2021) Training Verifiers to Solve Math Word Problems (GSM8K). arXiv preprint arXiv:2110.14168. @@ -352,4 +352,4 @@ Hendrycks D, Burns C, Basart S, Zou A, Mazeika M, Song D, Steinhardt J (2021) Me Chen M, Tworek J, Jun H, Yuan Q, Pinto H P d O, Kaplan J, Edwards H, Burda Y, Joseph N, Brockman G, others (2021) Evaluating Large Language Models Trained on Code (HumanEval). arXiv preprint arXiv:2107.03374. -Bloom B H (1970) Space/time Trade-offs in Hash Coding with Allowable Errors. Communications of the ACM 13(7):422–426. +Bloom B H (1970) Space/time Trade-offs in Hash Coding with Allowable Errors. Communications of the ACM 13(7):422–426. https://doi.org/10.1145/362686.362692. diff --git a/docs/en/part1/ch02_quality_framework.md b/docs/en/part1/ch02_quality_framework.md index ca82289d..e12c2fee 100644 --- a/docs/en/part1/ch02_quality_framework.md +++ b/docs/en/part1/ch02_quality_framework.md @@ -522,17 +522,17 @@ Within the structure of the whole book, this chapter sits at the foundational-fr ## References -Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. +Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. https://doi.org/10.1177/001316446002000104. 
-Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A New Generation of Perspective API: Efficient Multilingual Character-level Transformers. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3197-3207. +Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A New Generation of Perspective API: Efficient Multilingual Character-level Transformers. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3197-3207. https://doi.org/10.1145/3534678.3539147. -Nadeem M, Bethke A, Reddy S (2021) StereoSet: Measuring Stereotypical Bias in Pretrained Language Models. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 5356-5371. +Nadeem M, Bethke A, Reddy S (2021) StereoSet: Measuring Stereotypical Bias in Pretrained Language Models. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 5356-5371. https://doi.org/10.18653/v1/2021.acl-long.416. Zhao J, Wang T, Yatskar M, Ordonez V, Chang K W (2018) Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods (WinoBias). In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics, pp 15-20. -Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems 35:27730-27744. +Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. 
Advances in Neural Information Processing Systems 35:27730-27744. arXiv:2203.02155. -Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728-53741. +Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728-53741. arXiv:2305.18290. Bai Y, Jones A, Ndousse K, Askell A, Chen A, DasSarma N, Drain D, Fort S, Ganguli D, Henighan T, Joseph N, Kadavath S, Kernion J, Conerly T, El-Showk S, Elhage N, Hatfield-Dodds Z, Hernandez D, Hume T, Johnston S, Kravec S, Lovitt L, Nanda N, Olsson C, Amodei D, Brown T, Clark J, McCandlish S, Olah C, Mann B, Kaplan J (2022) Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073. @@ -542,7 +542,7 @@ Cobbe K, Kosaraju V, Bavarian M, Chen M, Jun H, Kaiser L, Plappert M, Tworek J, Hendrycks D, Burns C, Basart S, Zou A, Mazeika M, Song D, Steinhardt J (2021) Measuring Massive Multitask Language Understanding (MMLU). In: International Conference on Learning Representations. -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900. Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. @@ -554,6 +554,6 @@ Shi W, Ajith A, Xia M, Huang Y, Liu D, Blevins T, Chen D, Zettlemoyer L (2023) D Golchin S, Surdeanu M (2023) Time Travel in LLMs: Tracing Data Contamination in Large Language Models. arXiv preprint arXiv:2308.14802. 
-Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. Nait Saada T, Bethune L, Klein M, Grangier D, Cuturi M, Ablin P (2025) The Data-Quality Illusion: Rethinking Classifier-Based Quality Filtering for LLM Pretraining. arXiv preprint arXiv:2510.00866. diff --git a/docs/en/part1/ch03_data_stack.md b/docs/en/part1/ch03_data_stack.md index a12c84ec..975e1c6f 100644 --- a/docs/en/part1/ch03_data_stack.md +++ b/docs/en/part1/ch03_data_stack.md @@ -345,13 +345,13 @@ Zaharia M, Xin R S, Wendell P, Das T, Armbrust M, Dave A, Meng X, Rosen J, Venka Moritz P, Nishihara R, Wang S, Tumanov A, Liaw R, Liang E, Elibol M, Yang Z, Paul W, Jordan M I, Stoica I (2018) Ray: A Distributed Framework for Emerging AI Applications. In: Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation, pp 561-577. -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900. Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. Robertson S, Zaragoza H (2009) The Probabilistic Relevance Framework: BM25 and Beyond. Foundations and Trends in Information Retrieval 3(4):333-389. -Malkov Y A, Yashunin D A (2020) Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs (HNSW). IEEE Transactions on Pattern Analysis and Machine Intelligence 42(4):824-836. 
+Malkov Y A, Yashunin D A (2020) Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs (HNSW). IEEE Transactions on Pattern Analysis and Machine Intelligence 42(4):824-836. https://doi.org/10.1109/tpami.2018.2889473. Apache Software Foundation (2024) Apache Iceberg: Table Specification and Documentation. (accessed 2024-11). diff --git a/docs/en/part10/ch31_agent_architecture.md b/docs/en/part10/ch31_agent_architecture.md index cc2d158d..6df9e32b 100644 --- a/docs/en/part10/ch31_agent_architecture.md +++ b/docs/en/part10/ch31_agent_architecture.md @@ -528,29 +528,29 @@ For boundary design, the chapter proposed four automation levels: recommendation ## References -Besta M, Blach N, Kubicek A, Gerstenberger R, Podstawski M, Gianinazzi L, Gajda J, Lehmann T, Niewiadomski H, Nyczyk P, Hoefler T (2024) Graph of Thoughts: Solving Elaborate Problems with Large Language Models. In: Proceedings of the AAAI Conference on Artificial Intelligence 38(16):17682-17690. +Besta M, Blach N, Kubicek A, Gerstenberger R, Podstawski M, Gianinazzi L, Gajda J, Lehmann T, Niewiadomski H, Nyczyk P, Hoefler T (2024) Graph of Thoughts: Solving Elaborate Problems with Large Language Models. In: Proceedings of the AAAI Conference on Artificial Intelligence 38(16):17682-17690. https://doi.org/10.1609/aaai.v38i16.29720. -Gao L, Madaan A, Zhou S, Alon U, Liu P, Yang Y, Callan J, Neubig G (2023) PAL: Program-aided Language Models. In: Proceedings of the 40th International Conference on Machine Learning, pp 10764-10799. +Gao L, Madaan A, Zhou S, Alon U, Liu P, Yang Y, Callan J, Neubig G (2023) PAL: Program-aided Language Models. In: Proceedings of the 40th International Conference on Machine Learning, pp 10764-10799. arXiv:2211.10435. 
Karpas E, Abend O, Belinkov Y, Lenz B, Lieber O, Ratner N, Shoham Y, Bata H, Levine Y, Leyton-Brown K, Muhlgay D, Rozen N, Schwartz E, Shashua A, Shuster K, Tenenbaum J, Wolf L, Zettlemoyer L, Riedel S (2022) MRKL Systems: A Modular, Neuro-Symbolic Architecture That Combines Large Language Models, External Knowledge Sources and Discrete Reasoning. arXiv preprint arXiv:2205.00445. -Kreuzberger D, Kuhl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. +Kreuzberger D, Kuhl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302. -Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. In: Advances in Neural Information Processing Systems 36. +Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. In: Advances in Neural Information Processing Systems 36. arXiv:2303.17651. Mialon G, Dessi R, Lomeli M, Nalmpantis C, Pasunuru R, Raileanu R, Roziere B, Schick T, Dwivedi-Yu J, Celikyilmaz A, Grave E, LeCun Y, Scialom T (2023) Augmented Language Models: A Survey. Transactions on Machine Learning Research. Nakano R, Hilton J, Balaji S, Wu J, Ouyang L, Kim C, Hesse C, Jain S, Kosaraju V, Saunders W, Jiang X, Cobbe K, Eloundou T, Krueger G, Button K, Knight M, Chess B, Schulman J (2021) WebGPT: Browser-assisted question-answering with human feedback. arXiv preprint arXiv:2112.09332. -Park J S, O'Brien J C, Cai C J, Morris M R, Liang P, Bernstein M S (2023) Generative Agents: Interactive Simulacra of Human Behavior. 
In: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, Article 2. +Park J S, O'Brien J C, Cai C J, Morris M R, Liang P, Bernstein M S (2023) Generative Agents: Interactive Simulacra of Human Behavior. In: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, Article 2. https://doi.org/10.1145/3586183.3606763. Patil S G, Zhang T, Wang X, Gonzalez J E (2023) Gorilla: Large Language Model Connected with Massive APIs. arXiv preprint arXiv:2305.15334. Qin Y, Liang S, Ye Y, Zhu K, Yan L, Lu Y, Lin Y, Cong X, Tang X, Qian B, Zhao S, Tian R, Xie R, Zhou J, Gerstein M, Li D, Liu Z, Sun M (2024) ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs. In: International Conference on Learning Representations. -Schick T, Dwivedi-Yu J, Dessi R, Raileanu R, Lomeli M, Hambro E, Zettlemoyer L, Cancedda N, Scialom T (2023) Toolformer: Language Models Can Teach Themselves to Use Tools. In: Advances in Neural Information Processing Systems 36. +Schick T, Dwivedi-Yu J, Dessi R, Raileanu R, Lomeli M, Hambro E, Zettlemoyer L, Cancedda N, Scialom T (2023) Toolformer: Language Models Can Teach Themselves to Use Tools. In: Advances in Neural Information Processing Systems 36. arXiv:2302.04761. -Shinn N, Cassano F, Gopinath A, Narasimhan K, Yao S (2023) Reflexion: Language Agents with Verbal Reinforcement Learning. In: Advances in Neural Information Processing Systems 36. +Shinn N, Cassano F, Gopinath A, Narasimhan K, Yao S (2023) Reflexion: Language Agents with Verbal Reinforcement Learning. In: Advances in Neural Information Processing Systems 36. arXiv:2303.11366. Wang L, Ma C, Feng X, Zhang Z, Yang H, Zhang J, Chen Z, Tang J, Chen X, Lin Y, Zhao W X, Wei Z, Wen J-R (2023) A Survey on Large Language Model based Autonomous Agents. arXiv preprint arXiv:2308.11432. 
@@ -558,6 +558,6 @@ Wu Q, Bansal G, Zhang J, Wu Y, Li B, Zhu E, Jiang L, Zhang X, Zhang S, Liu J, Aw Xi Z, Chen W, Guo X, He W, Ding Y, Hong B, Zhang M, Wang J, Jin S, Zhou E, Zheng R, Fan X, Wang X, Xiong L, Zhou Y, Wang W, Jiang C, Zou Y, Liu X, Yin Z, Dou S, Weng R, Cheng W, Zhang Q, Qin W, Zheng Y, Qiu X, Huang X, Gui T (2023) The Rise and Potential of Large Language Model Based Agents: A Survey. arXiv preprint arXiv:2309.07864. -Yao S, Zhao J, Yu D, Du N, Shafran I, Narasimhan K, Cao Y (2023) ReAct: Synergizing Reasoning and Acting in Language Models. In: International Conference on Learning Representations. +Yao S, Zhao J, Yu D, Du N, Shafran I, Narasimhan K, Cao Y (2023) ReAct: Synergizing Reasoning and Acting in Language Models. In: International Conference on Learning Representations. arXiv:2210.03629. -Yao S, Yu D, Zhao J, Shafran I, Griffiths T L, Cao Y, Narasimhan K (2023) Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In: Advances in Neural Information Processing Systems 36. +Yao S, Yu D, Zhao J, Shafran I, Griffiths T L, Cao Y, Narasimhan K (2023) Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In: Advances in Neural Information Processing Systems 36. arXiv:2305.10601. diff --git a/docs/en/part10/ch32_auto_collection_parsing_cleaning.md b/docs/en/part10/ch32_auto_collection_parsing_cleaning.md index 5523c256..fe31f71a 100644 --- a/docs/en/part10/ch32_auto_collection_parsing_cleaning.md +++ b/docs/en/part10/ch32_auto_collection_parsing_cleaning.md @@ -363,13 +363,13 @@ The cleaning stage emphasized generating rule candidates from sampled defects, v ## References -Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. +Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. 
In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. https://doi.org/10.18653/v1/2021.acl-demo.15. Blecher N, Cresci G, Ballas N, Bautista M (2023) Nougat: Neural Optical Understanding for Academic Documents. arXiv preprint arXiv:2308.13418. Carlini N, Tramer F, Wallace E, Jagielski M, Herbert-Voss A, Lee K, Roberts A, Brown T, Song D, Erlingsson U, Oprea A, Raffel C (2021) Extracting Training Data from Large Language Models. In: Proceedings of the 30th USENIX Security Symposium, pp 2633-2650. -Chen J, Yan X, Lin D, Qu X, Wang Y, Huang X, Zhao Z, Yu T, Zhang Z, Li H, Zheng Y, Xu R, Zhu J, Qiu X (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Proceedings of the ACM SIGMOD International Conference on Management of Data, pp 4436-4449. +Chen J, Yan X, Lin D, Qu X, Wang Y, Huang X, Zhao Z, Yu T, Zhang Z, Li H, Zheng Y, Xu R, Zhu J, Qiu X (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Proceedings of the ACM SIGMOD International Conference on Management of Data, pp 4436-4449. https://doi.org/10.1145/3626246.3653385. Chowdhery A, Narang S, Devlin J, Bosma M, Mishra G, Roberts A, Barham P, Chung H W, Sutton C, Gehrmann S, Schuh P, Shi K, Tsvyashchenko S, Maynez J, Rao A, Barnes P, Tay Y, Shazeer N, Prabhakaran V, Reif E, Du N, Hutchinson B, Pope R, Bradbury J, Austin J, Isard M, Gur-Ari G, Yin P, Duke T, Levskaya A, Ghemawat S, Dev S, Michalewski H, Garcia X, Misra V, Robinson K, Fedus L, Zhou D, Ippolito D, Luan D, Lim H, Zoph B, Spiridonov A, Sepassi R, Dohan D, Agrawal S, Omernick M, Dai A M, Pillai T S, Pellat M, Lewkowycz A, Moreira E, Child R, Polozov O, Lee K, Zhou Z, Wang X, Saeta B, Diaz M, Firat O, Catasta M, Wei J, Meier-Hellstern K, Eck D, Dean J, Petrov S, Fiedel N (2022) PaLM: Scaling Language Modeling with Pathways. Journal of Machine Learning Research 24(240):1-113. 
@@ -377,23 +377,23 @@ Dodge J, Sap M, Marasovic A, Agnew W, Ilharco G, Groeneveld D, Mitchell M, Gardn Gao L, Biderman S, Black S, Golding L, Hoppe T, Foster C, Phang J, He H, Thite A, Nabeshima N, Presser S, Leahy C (2020) The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv preprint arXiv:2101.00027. -Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. +Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. https://doi.org/10.1145/3503161.3548112. -Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision, pp 498-517. +Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision, pp 498-517. https://doi.org/10.1007/978-3-031-19815-1_29. -Laurencon H, Saulnier L, Wang T, Akiki C, del Moral A V, Le Scao T, Von Werra L, Mou C, Gonzalez Ponferrada E, Nguyen H, Frohberg J, Sasko M, Lhoest Q, McMillan-Major A, Dupont G, Biderman S, Rogers A, Allal L B, De Toni F, Pistilli G, Nguyen O, Nikpoor S, Masoud M, Labbe S, Vial T, Reusch A, Yogatama D, Raffel C, Wolf T, BigScience Workshop (2022) The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset. In: Advances in Neural Information Processing Systems 35, Datasets and Benchmarks Track. 
+Laurencon H, Saulnier L, Wang T, Akiki C, del Moral A V, Le Scao T, Von Werra L, Mou C, Gonzalez Ponferrada E, Nguyen H, Frohberg J, Sasko M, Lhoest Q, McMillan-Major A, Dupont G, Biderman S, Rogers A, Allal L B, De Toni F, Pistilli G, Nguyen O, Nikpoor S, Masoud M, Labbe S, Vial T, Reusch A, Yogatama D, Raffel C, Wolf T, BigScience Workshop (2022) The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset. In: Advances in Neural Information Processing Systems 35, Datasets and Benchmarks Track. https://doi.org/10.52202/068431-2306. Lee K, Ippolito D, Nystrom A, Zhang C, Eck D, Callison-Burch C, Carlini N (2022) Deduplicating Training Data Makes Language Models Better. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, pp 8424-8445. Longpre S, Mahari R, Lee A, et al. (2023) The Data Provenance Initiative: A Large Scale Audit of Dataset Licensing and Attribution in AI. arXiv preprint arXiv:2310.16787. -Nguyen T, et al. (2024) CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation. +Nguyen T, et al. (2024) CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation. https://doi.org/10.63317/5iz6z5g7eit3. Ortiz Suarez P J, Sagot B, Romary L (2020) A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages. In: Proceedings of the 12th Language Resources and Evaluation Conference, pp 1703-1714. Pfitzmann B, Auer C, Dolfi M, Nassar A S, Staar P (2022) DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3743-3751. 
-Penedo G, Kydlicek H, Allal L B, Lozhkov A, Mitchell M, Raffel C, von Werra L, Wolf T (2024) The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. In: Advances in Neural Information Processing Systems 37, Datasets and Benchmarks Track. +Penedo G, Kydlicek H, Allal L B, Lozhkov A, Mitchell M, Raffel C, von Werra L, Wolf T (2024) The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. In: Advances in Neural Information Processing Systems 37, Datasets and Benchmarks Track. arXiv:2406.17557. Penedo G, Malartic Q, Hesslow D, Cojocaru R, Cappelli A, Alobeidli H, Pannier B, Almazrouei E, Launay J (2023) The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data Only. In: Advances in Neural Information Processing Systems 36. diff --git a/docs/en/part10/ch33_labeling_synthesis_evaluation.md b/docs/en/part10/ch33_labeling_synthesis_evaluation.md index 89687602..f175b5a0 100644 --- a/docs/en/part10/ch33_labeling_synthesis_evaluation.md +++ b/docs/en/part10/ch33_labeling_synthesis_evaluation.md @@ -386,13 +386,13 @@ Evaluation and red-team agents generate challenge sets, red-team samples, and ev ## References -Alemohammad S, Casco-Rodriguez J, Luzi L, et al. (2024) Self-Consuming Generative Models Go MAD. In: International Conference on Learning Representations. +Alemohammad S, Casco-Rodriguez J, Luzi L, et al. (2024) Self-Consuming Generative Models Go MAD. In: International Conference on Learning Representations. arXiv:2307.01850. Bai Y, Kadavath S, Kundu S, et al. (2022) Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073. Cui G, Yuan L, Ding N, Yao G, Zhu W, Ni Y, Xie G, Liu Z, Sun M (2023) UltraFeedback: Boosting Language Models with Scaled AI Feedback. arXiv preprint arXiv:2310.01377. -Dubois Y, Li X, Taori R, Zhang T, Gulrajani I, Ba J, Guestrin C, Liang P, Hashimoto T B (2023) AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback. 
In: Advances in Neural Information Processing Systems 36. +Dubois Y, Li X, Taori R, Zhang T, Gulrajani I, Ba J, Guestrin C, Liang P, Hashimoto T B (2023) AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-1308. Gerstgrasser M, Schaeffer R, Dey A, et al. (2024) Is Model Collapse Inevitable? Breaking the Curse of Recursion by Accumulating Real and Synthetic Data. arXiv preprint arXiv:2404.01413. @@ -404,26 +404,26 @@ Koh P W, Sagawa S, Marklund H, et al. (2021) WILDS: A Benchmark of in-the-Wild D Lambert N, Pyatkin V, Morrison J, Miranda L, Lin B Y, Chandu K, Dziri N, Kumar S, Zick T, Choi Y, Smith N A, Hajishirzi H (2024) RewardBench: Evaluating Reward Models for Language Modeling. arXiv preprint arXiv:2403.13787. -Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. +Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. arXiv:2211.09110. -Liu Y, Iter D, Xu Y, et al. (2023) G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp 2511-2522. +Liu Y, Iter D, Xu Y, et al. (2023) G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp 2511-2522. arXiv:2303.16634. Lin B Y, et al. (2024) WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild. arXiv preprint arXiv:2406.04770. -Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P, Leike J, Lowe R (2022) Training language models to follow instructions with human feedback. 
In: Advances in Neural Information Processing Systems 35, pp 27730-27744. +Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P, Leike J, Lowe R (2022) Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems 35, pp 27730-27744. arXiv:2203.02155. -Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. +Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. arXiv:2202.03286. -Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In: Advances in Neural Information Processing Systems 36. +Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In: Advances in Neural Information Processing Systems 36. arXiv:2305.18290. -Ribeiro M T, Wu T, Guestrin C, Singh S (2020) Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp 4902-4912. +Ribeiro M T, Wu T, Guestrin C, Singh S (2020) Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp 4902-4912. https://doi.org/10.18653/v1/2020.acl-main.442. -Shumailov I, Shumaylov Z, Zhao Y, et al. (2024) AI models collapse when trained on recursively generated data. 
Nature 631:755-759. +Shumailov I, Shumaylov Z, Zhao Y, et al. (2024) AI models collapse when trained on recursively generated data. Nature 631:755-759. https://doi.org/10.1038/s41586-024-07566-y. -Wang Y, Kordi Y, Mishra S, et al. (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484-13508. +Wang Y, Kordi Y, Mishra S, et al. (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484-13508. https://doi.org/10.18653/v1/2023.acl-long.754. -Zheng L, Chiang W-L, Sheng Y, et al. (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. +Zheng L, Chiang W-L, Sheng Y, et al. (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. arXiv:2306.05685. Zhu L, Wang X, Wang Y, et al. (2023) JudgeLM: Fine-tuned Large Language Models are Scalable Judges. arXiv preprint arXiv:2310.17631. -Zhou C, Liu P, Xu P, et al. (2023) LIMA: Less Is More for Alignment. In: Advances in Neural Information Processing Systems 36. +Zhou C, Liu P, Xu P, et al. (2023) LIMA: Less Is More for Alignment. In: Advances in Neural Information Processing Systems 36. arXiv:2305.11206. diff --git a/docs/en/part10/ch34_dataops_agent.md b/docs/en/part10/ch34_dataops_agent.md index d677afdf..146ccb0c 100644 --- a/docs/en/part10/ch34_dataops_agent.md +++ b/docs/en/part10/ch34_dataops_agent.md @@ -431,36 +431,36 @@ In the book's structure, this chapter sits in the agent-automation layer. It con ## References -Amershi S, Begel A, Bird C, Devanbu P, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. 
In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291-300. +Amershi S, Begel A, Bird C, Devanbu P, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291-300. https://doi.org/10.1109/icse-seip.2019.00042. Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of Machine Learning and Systems 1, pp 334-347. -Dang Y, Lin Q, Huang P (2019) AIOps: Real-World Challenges and Research Innovations. In: Proceedings of the 41st International Conference on Software Engineering: Companion Proceedings, pp 4-5. +Dang Y, Lin Q, Huang P (2019) AIOps: Real-World Challenges and Research Innovations. In: Proceedings of the 41st International Conference on Software Engineering: Companion Proceedings, pp 4-5. https://doi.org/10.1109/icse-companion.2019.00023. He S, He P, Chen Z, Yang T, Su Y, Lyu M R (2021) A Survey on Automated Log Analysis for Reliability Engineering. ACM Computing Surveys 54(6):1-37. Huyen C (2022) Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications. O'Reilly Media. -Kreuzberger D, Kuhl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. +Kreuzberger D, Kuhl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302. Makinen S, Skogstrom H, Laaksonen E, Mikkonen T (2021) Who Needs MLOps: What Data Scientists Seek to Accomplish and How Can MLOps Help? In: Proceedings of the 2021 IEEE/ACM 1st Workshop on AI Engineering - Software Engineering for AI, pp 109-112. 
-Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale Machine Learning Systems in Real-world Industrial Settings: A Review of Challenges and Solutions. Information and Software Technology 127:106368. +Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale Machine Learning Systems in Real-world Industrial Settings: A Review of Challenges and Solutions. Information and Software Technology 127:106368. https://doi.org/10.1016/j.infsof.2020.106368. NIST (2023) Artificial Intelligence Risk Management Framework (AI RMF 1.0). National Institute of Standards and Technology. NIST (2024) Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile. NIST AI 600-1. -Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29. +Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29. https://doi.org/10.1145/3533378. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. -Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. +Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. 
https://doi.org/10.1109/synasc51798.2020.00015. -Testi M, Ballabio M, Frontoni E, Iannello G, Moccia S, Soda P, Vessio G (2022) MLOps: A Taxonomy and a Methodology. IEEE Access 10:63606-63618. +Testi M, Ballabio M, Frontoni E, Iannello G, Moccia S, Soda P, Vessio G (2022) MLOps: A Taxonomy and a Methodology. IEEE Access 10:63606-63618. https://doi.org/10.1109/access.2022.3181730. Treveil M, Omont N, Stenac C, Lefevre K, Phan D, Zentici J, Lavoillotte A, Miyazaki M, Heidmann L (2020) Introducing MLOps: How to Scale Machine Learning in the Enterprise. O'Reilly Media. -Vela D, Sharp A, Zhang R, Nguyen T, Hoang A, Pianykh O S (2022) Temporal quality degradation in AI models. Scientific Reports 12:11654. +Vela D, Sharp A, Zhang R, Nguyen T, Hoang A, Pianykh O S (2022) Temporal quality degradation in AI models. Scientific Reports 12:11654. https://doi.org/10.1038/s41598-022-15245-z. Zhu J, He S, Liu J, He P, Xie Q, Zheng Z, Lyu M R (2019) Tools and Benchmarks for Automated Log Parsing. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 121-130. diff --git a/docs/en/part10/ch35_security_permission_collaboration.md b/docs/en/part10/ch35_security_permission_collaboration.md index 4b302434..907de5be 100644 --- a/docs/en/part10/ch35_security_permission_collaboration.md +++ b/docs/en/part10/ch35_security_permission_collaboration.md @@ -442,9 +442,9 @@ Debenedetti E, Zhang J, Balunovic M, et al. (2024) AgentDojo: A Dynamic Environm Ganguli D, Lovitt L, Kernion J, Askell A, Bai Y, Kadavath S, Mann B, Perez E, Schiefer N, Ndousse K, Jones A, Bowman S R, Chen A, Conerly T, DasSarma N, Drain D, Elhage N, El-Showk S, Fort S, Hatfield-Dodds Z, Henighan T, Hernandez D, Hume T, Johnston S, Joseph N, Kravec S, Nanda N, Olsson C, Olah C, Amodei D, Brown T, Clark J, Kaplan J, McCandlish S, Olsson C, Olah C, Amodei D (2022) Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned. 
arXiv preprint arXiv:2209.07858. -Greshake K, Abdelnabi S, Mishra S, et al. (2023) Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. In: Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, pp 79-90. +Greshake K, Abdelnabi S, Mishra S, et al. (2023) Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. In: Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, pp 79-90. https://doi.org/10.1145/3605764.3623985. -Hendrycks D, Mazeika M, Zou A, Patel S, Zhu C, Navarro J, Mu J, Song D, Li B, Steinhardt J (2021) The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 8340-8349. +Hendrycks D, Mazeika M, Zou A, Patel S, Zhu C, Navarro J, Mu J, Song D, Li B, Steinhardt J (2021) The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 8340-8349. https://doi.org/10.1109/iccv48922.2021.00823. Huang Y, Gupta S, Xia M, Li K, Chen D (2024) Catastrophic Jailbreak of Open-source LLMs via Exploiting Generation. In: International Conference on Learning Representations. @@ -452,7 +452,7 @@ Lapid R, Langberg R, Sipper M (2023) Open Sesame! Universal Black Box Jailbreaki Liu Y, Deng G, Li Y, et al. (2023) Prompt Injection Attack against LLM-Integrated Applications. arXiv preprint arXiv:2306.05499. -Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. +Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. 
In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. arXiv:2202.03286. Ruan Y, Dong H, Wang A, Pitis S, Zhou Y, Ba J, Dubois Y, Maddison C J, Hashimoto T B (2024) Identifying the Risks of LM Agents with an LM-Emulated Sandbox. In: International Conference on Learning Representations. @@ -466,6 +466,6 @@ Wei A, Haghtalab N, Steinhardt J (2023) Jailbroken: How Does LLM Safety Training Yi J, Xie Y, Zhu B, Hines K, Kiciman E, Sun G, Xie X, Wu F (2023) Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models. arXiv preprint arXiv:2312.14197. -Zhan Q, Liang Z, Ying Z, Kang D (2024) InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. In: Findings of the Association for Computational Linguistics: ACL 2024, pp 10471-10506. +Zhan Q, Liang Z, Ying Z, Kang D (2024) InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. In: Findings of the Association for Computational Linguistics: ACL 2024, pp 10471-10506. https://doi.org/10.18653/v1/2024.findings-acl.624. Zou A, Wang Z, Carlini N, Nasr M, Kolter J Z, Fredrikson M (2023) Universal and Transferable Adversarial Attacks on Aligned Language Models. arXiv preprint arXiv:2307.15043. diff --git a/docs/en/part11/ch36_compliance_framework_and_governance.md b/docs/en/part11/ch36_compliance_framework_and_governance.md index 352a8062..3e0d5f46 100644 --- a/docs/en/part11/ch36_compliance_framework_and_governance.md +++ b/docs/en/part11/ch36_compliance_framework_and_governance.md @@ -1090,25 +1090,25 @@ Cavoukian A, others (2009) Privacy by Design: The 7 Foundational Principles. Inf Gurses S F, Troncoso C, Diaz C (2011) Engineering Privacy. Technical report. -Spiekermann S, Cranor L F (2009) Engineering Privacy. IEEE Transactions on Software Engineering, 35(1), 67-82. +Spiekermann S, Cranor L F (2009) Engineering Privacy. 
IEEE Transactions on Software Engineering, 35(1), 67-82. https://doi.org/10.1109/tse.2008.88. European Union Agency for Cybersecurity (ENISA) (2022) Data Protection Engineering. ENISA Report. Zieni B, Spagnuelo D, Heckel R (2021) Transparency by Default: GDPR Patterns for Agile Development. In Electronic Government and the Information Systems Perspective, Springer International Publishing, pp 89-102. -Kosenkov O, Zabardast E, Fucci D, Mendez D, Unterkalmsteiner M (2026) Privacy by Design: Aligning GDPR and Software Engineering Specifications with a Requirements Engineering Approach. Information and Software Technology, 190, 107946. +Kosenkov O, Zabardast E, Fucci D, Mendez D, Unterkalmsteiner M (2026) Privacy by Design: Aligning GDPR and Software Engineering Specifications with a Requirements Engineering Approach. Information and Software Technology, 190, 107946. https://doi.org/10.1016/j.infsof.2025.107946. -Hoepman J-H (2014) Privacy Design Strategies. In IFIP International Information Security Conference, pp 446-459. +Hoepman J-H (2014) Privacy Design Strategies. In IFIP International Information Security Conference, pp 446-459. https://doi.org/10.1007/978-3-642-55415-5_38. Perera C, Liu C, Ranjan R, Wang L, Zomaya A Y (2016) Privacy-Knowledge Modeling for the Internet of Things: A Look Back. Computer, 49(12), 60-68. -Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. +Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. https://doi.org/10.1007/978-3-540-79228-4_1. Shokri R, Shmatikov V (2015) Privacy-Preserving Deep Learning. In 2015 53rd Annual Allerton Conference on Communication, Control, and Computing (Allerton), pp 909-910. -Anthonysamy P, Rashid A, Chitchyan R (2017) Privacy Requirements: Present & Future. 
In 2017 IEEE/ACM 39th International Conference on Software Engineering: Software Engineering in Society Track (ICSE-SEIS), pp 13-22. +Anthonysamy P, Rashid A, Chitchyan R (2017) Privacy Requirements: Present & Future. In 2017 IEEE/ACM 39th International Conference on Software Engineering: Software Engineering in Society Track (ICSE-SEIS), pp 13-22. https://doi.org/10.1109/icse-seis.2017.3. -Oetzel M C, Spiekermann S (2014) A Systematic Methodology for Privacy Impact Assessments: A Design Science Approach. European Journal of Information Systems, 23(2), 126-150. +Oetzel M C, Spiekermann S (2014) A Systematic Methodology for Privacy Impact Assessments: A Design Science Approach. European Journal of Information Systems, 23(2), 126-150. https://doi.org/10.1057/ejis.2013.18. Notario N, Crespo A, Martin Y-S, del Alamo J M, Le Metayer D, Antignac T, Kung A, Kroener I, Wright D (2015) PRIPARE: Integrating Privacy Best Practices into a Privacy Engineering Methodology. In 2015 IEEE Security and Privacy Workshops, pp 151-158. @@ -1118,7 +1118,7 @@ Agarwal V, Butler C, Degenaro L, Kumar A, Sailer A, Steinder G (2022) Compliance Carlini N, Tramer F, Wallace E, Jagielski M, Herbert-Voss A, Lee K, Roberts A, Brown T, Song D, Erlingsson U, Oprea A, Raffel C (2021) Extracting Training Data from Large Language Models. In 30th USENIX Security Symposium (USENIX Security 21), pp 2633-2650. -Lukas N, Salem A, Sim R, Tople S, Wutschitz L, Zanella-Beguelin S (2023) Analyzing Leakage of Personally Identifiable Information in Language Models. In 2023 IEEE Symposium on Security and Privacy (SP), pp 346-363. +Lukas N, Salem A, Sim R, Tople S, Wutschitz L, Zanella-Beguelin S (2023) Analyzing Leakage of Personally Identifiable Information in Language Models. In 2023 IEEE Symposium on Security and Privacy (SP), pp 346-363. https://doi.org/10.1109/sp46215.2023.10179300. Plant R, Giuffrida V, Gkatzia D (2022) You Are What You Write: Preserving Privacy in the Era of Large Language Models. 
arXiv preprint arXiv:2204.09391. diff --git a/docs/en/part11/ch37_federated_learning_and_privacy_preserving_technologies.md b/docs/en/part11/ch37_federated_learning_and_privacy_preserving_technologies.md index 64309bb0..e684255c 100644 --- a/docs/en/part11/ch37_federated_learning_and_privacy_preserving_technologies.md +++ b/docs/en/part11/ch37_federated_learning_and_privacy_preserving_technologies.md @@ -444,7 +444,7 @@ Combined with P09 and the medical and financial cases, a mature AI privacy gover Shokri R, Stronati M, Song C, Shmatikov V (2017) Membership Inference Attacks against Machine Learning Models. In 2017 IEEE Symposium on Security and Privacy (SP), pp 3-18. -Zhu L, Liu Z, Han S (2019) Deep Leakage from Gradients. Advances in Neural Information Processing Systems, 32. +Zhu L, Liu Z, Han S (2019) Deep Leakage from Gradients. Advances in Neural Information Processing Systems, 32. arXiv:1906.08935. Geiping J, Bauermeister H, Droge H, Moeller M (2020) Inverting Gradients: How Easy Is It to Break Privacy in Federated Learning? Advances in Neural Information Processing Systems, 33, 16937-16947. @@ -452,17 +452,17 @@ Dwork C (2011) Differential Privacy. In Encyclopedia of Cryptography and Securit Abadi M, Chu A, Goodfellow I, McMahan H B, Mironov I, Talwar K, Zhang L (2016) Deep Learning with Differential Privacy. In Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications Security, pp 308-318. -Erlingsson U, Pihur V, Korolova A (2014) RAPPOR: Randomized Aggregatable Privacy-Preserving Ordinal Response. In Proceedings of the 2014 ACM SIGSAC Conference on Computer and Communications Security, pp 1054-1067. +Erlingsson U, Pihur V, Korolova A (2014) RAPPOR: Randomized Aggregatable Privacy-Preserving Ordinal Response. In Proceedings of the 2014 ACM SIGSAC Conference on Computer and Communications Security, pp 1054-1067. arXiv:1407.6981. -McMahan H B, Ramage D, Talwar K, Zhang L (2018) Learning Differentially Private Recurrent Language Models. 
International Conference on Learning Representations. +McMahan H B, Ramage D, Talwar K, Zhang L (2018) Learning Differentially Private Recurrent Language Models. International Conference on Learning Representations. arXiv:1710.06963. -Kairouz P, McMahan H B (2021) Advances and Open Problems in Federated Learning. Foundations and Trends in Machine Learning, 14(1-2), 1-210. +Kairouz P, McMahan H B (2021) Advances and Open Problems in Federated Learning. Foundations and Trends in Machine Learning, 14(1-2), 1-210. https://doi.org/10.1561/2200000083. Bagdasaryan E, Veit A, Hua Y, Estrin D, Shmatikov V (2020) How To Backdoor Federated Learning. In Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics, pp 2938-2948. Yang Q, Liu Y, Chen T, Tong Y (2019) Federated Machine Learning: Concept and Applications. ACM Transactions on Intelligent Systems and Technology, 10(2), 1-19. -Bonawitz K, Ivanov V, Kreuter B, Marcedone A, McMahan H B, Patel S, Ramage D, Segal A, Seth K (2017) Practical Secure Aggregation for Privacy-Preserving Machine Learning. In Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, pp 1175-1191. +Bonawitz K, Ivanov V, Kreuter B, Marcedone A, McMahan H B, Patel S, Ramage D, Segal A, Seth K (2017) Practical Secure Aggregation for Privacy-Preserving Machine Learning. In Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, pp 1175-1191. https://doi.org/10.1145/3133956.3133982. McMahan B, Moore E, Ramage D, Hampson S, y Arcas B A (2017) Communication-Efficient Learning of Deep Networks from Decentralized Data. In Artificial Intelligence and Statistics, pp 1273-1282. @@ -470,16 +470,16 @@ Zhao Y, Li M, Lai L, Suda N, Civin D, Chandra V (2018) Federated Learning with N Li T, Sahu A K, Zaheer M, Sanjabi M, Talwalkar A, Smith V (2020) Federated Optimization in Heterogeneous Networks. Proceedings of Machine Learning and Systems, 2, 429-450. 
-Mohassel P, Zhang Y (2017) SecureML: A System for Scalable Privacy-Preserving Machine Learning. In 2017 IEEE Symposium on Security and Privacy (SP), pp 19-38. +Mohassel P, Zhang Y (2017) SecureML: A System for Scalable Privacy-Preserving Machine Learning. In 2017 IEEE Symposium on Security and Privacy (SP), pp 19-38. https://doi.org/10.1109/sp.2017.12. Gilad-Bachrach R, Dowlin N, Laine K, Lauter K, Naehrig M, Wernsing J (2016) CryptoNets: Applying Neural Networks to Encrypted Data with High Throughput and Accuracy. In International Conference on Machine Learning, pp 201-210. Tramer F, Boneh D (2019) Slalom: Fast, Verifiable and Private Execution of Neural Networks in Trusted Hardware. International Conference on Learning Representations. -Hu E J, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W, others (2022) LoRA: Low-Rank Adaptation of Large Language Models. International Conference on Learning Representations. +Hu E J, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W, others (2022) LoRA: Low-Rank Adaptation of Large Language Models. International Conference on Learning Representations. arXiv:2106.09685. -Kuang W, Qian B, Li Z, Chen D, Gao D, Pan X, Xie Y, Li Y, Ding B, Zhou J (2024) FederatedScope-LLM: A Comprehensive Package for Fine-Tuning Large Language Models in Federated Learning. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 5260-5271. +Kuang W, Qian B, Li Z, Chen D, Gao D, Pan X, Xie Y, Li Y, Ding B, Zhou J (2024) FederatedScope-LLM: A Comprehensive Package for Fine-Tuning Large Language Models in Federated Learning. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 5260-5271. https://doi.org/10.1145/3637528.3671573. Blanchard P, El Mhamdi E M, Guerraoui R, Stainer J (2017) Machine Learning with Adversaries: Byzantine Tolerant Gradient Descent. Advances in Neural Information Processing Systems, 30. 
-Sheller M J, Edwards B, Reina G A, others (2020) Federated Learning in Medicine: Facilitating Multi-Institutional Collaborations without Sharing Patient Data. Scientific Reports, 10(1), 12598. +Sheller M J, Edwards B, Reina G A, others (2020) Federated Learning in Medicine: Facilitating Multi-Institutional Collaborations without Sharing Patient Data. Scientific Reports, 10(1), 12598. https://doi.org/10.1038/s41598-020-69250-1. diff --git a/docs/en/part12/ch38_text_corpora_transparent_ledger.md b/docs/en/part12/ch38_text_corpora_transparent_ledger.md index 87ae2533..f2c2f4eb 100644 --- a/docs/en/part12/ch38_text_corpora_transparent_ledger.md +++ b/docs/en/part12/ch38_text_corpora_transparent_ledger.md @@ -594,15 +594,24 @@ FineWeb shows how open Web text is transformed from snapshots into training corp ## References -1. - Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Benchmarks Track. https://arxiv.org/abs/2406.17557 -- Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb -- Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py -- Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove -- Luccioni, S., & Viviano, J. (2021). What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732 - -2. - Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. https://arxiv.org/abs/2402.00159 -- Allen Institute for AI. (2023). 
Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64 -- AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma -- AllenAI. (2026). Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma -- AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md -- Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). OLMo: Accelerating the Science of Language Models. https://arxiv.org/abs/2402.00838 +Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Benchmarks Track. https://arxiv.org/abs/2406.17557. + +Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb. + +Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py. + +Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove. + +Luccioni, S., & Viviano, J. (2021). What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732. + +Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. https://arxiv.org/abs/2402.00159. + +Allen Institute for AI. (2023). Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64. + +AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma. + +AllenAI. (2026). 
Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma. + +AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md. + +Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). OLMo: Accelerating the Science of Language Models. https://arxiv.org/abs/2402.00838. diff --git a/docs/en/part12/ch39_image_text_candidate_pool_data_engineering.md b/docs/en/part12/ch39_image_text_candidate_pool_data_engineering.md index c655bd80..fb1a2499 100644 --- a/docs/en/part12/ch39_image_text_candidate_pool_data_engineering.md +++ b/docs/en/part12/ch39_image_text_candidate_pool_data_engineering.md @@ -261,9 +261,14 @@ For readers of this book, what is most worth learning from LAION-5B is not downl ## References -- Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., et al. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. NeurIPS 2022 Datasets and Benchmarks Track. https://arxiv.org/abs/2210.08402 -- LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. https://laion.ai/blog/laion-5b/ -- LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec -- Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org/abs/2304.14108 -- DataComp Team. (2026). DataComp Benchmark Documentation. https://www.datacomp.ai/dcclip/ -- ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp +Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., et al. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. NeurIPS 2022 Datasets and Benchmarks Track. https://arxiv.org/abs/2210.08402. + +LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. 
https://laion.ai/blog/laion-5b/. + +LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec. + +Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org/abs/2304.14108. + +DataComp Team. (2026). DataComp Benchmark Documentation. https://www.datacomp.ai/dcclip/. + +ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp. diff --git a/docs/en/part12/ch40_visual_document_table_data_engineering.md b/docs/en/part12/ch40_visual_document_table_data_engineering.md index 9f47c716..8c986a3c 100644 --- a/docs/en/part12/ch40_visual_document_table_data_engineering.md +++ b/docs/en/part12/ch40_visual_document_table_data_engineering.md @@ -711,19 +711,19 @@ Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., and C Jaume, G., Ekenel, H.K., and Thiran, J.-P. (2019). FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents. *ICDAR Workshop*. -Kuhn, H.W. (1955). The Hungarian Method for the Assignment Problem. *Naval Research Logistics Quarterly*, 2(1-2), pp. 83-97. +Kuhn, H.W. (1955). The Hungarian Method for the Assignment Problem. *Naval Research Logistics Quarterly*, 2(1-2), pp. 83-97. https://doi.org/10.1002/nav.3800020109. Levenshtein, V.I. (1965). Binary Codes Capable of Correcting Deletions, Insertions and Reversals. *Soviet Physics Doklady*, 10, pp. 707-710. Liu, H., Xue, W., Chen, Y., et al. (2024). A Survey on Hallucination in Large Vision-Language Models. *arXiv preprint arXiv:2402.00253*. -Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. +Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. https://doi.org/10.1109/wacv48630.2021.00225. Niu, J., Liu, Z., Gu, Z., et al. (2025). 
MinerU 2.5: A Decoupled Vision-Language Model for Efficient High-Resolution Document Parsing. *arXiv preprint*. Park, S., Shin, S., Lee, B., et al. (2019). CORD: A Consolidated Receipt Dataset for Post-OCR Parsing. *NeurIPS Workshop on Document Intelligence*. -Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. +Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. arXiv:2305.18290. Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O. (2017). Proximal Policy Optimization Algorithms. *arXiv preprint arXiv:1707.06347*. @@ -737,7 +737,7 @@ Xue, W., Yu, B., Wang, W., Tao, D., and Li, Q. (2021). TGRNet: A Table Graph Rec Yang, Z., Long, R., Wang, P., et al. (2023). Modeling Entities as Semantic Points for Visual Information Extraction in the Wild. *Proc. CVPR*. -Zhang, N., Chen, M., Bi, Z., et al. (2022). CBLUE: A Chinese Biomedical Language Understanding Evaluation Benchmark. *Proc. ACL*, pp. 7888-7915. +Zhang, N., Chen, M., Bi, Z., et al. (2022). CBLUE: A Chinese Biomedical Language Understanding Evaluation Benchmark. *Proc. ACL*, pp. 7888-7915. https://doi.org/10.18653/v1/2022.acl-long.544. Zhong, X., ShafieiBavani, E., and Jimeno Yepes, A. (2020). Image-based Table Recognition: Data, Model, and Evaluation. *arXiv preprint arXiv:1911.10683*. @@ -749,7 +749,7 @@ Cui, C., Sun, T., Liang, S., et al. (2025). PaddleOCR-VL: Boosting Multilingual Guo, D., Yang, D., Zhang, H., et al. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. *arXiv preprint arXiv:2501.12948*. -Hunyuan Vision Team, Lyu, P., Wan, X., et al. (2025). HunyuanOCR Technical Report. *arXiv preprint*. +Hunyuan Vision Team (2025). HunyuanOCR Technical Report. *arXiv preprint*. 
Li, Y., Yang, G., Liu, H., Wang, B., and Zhang, C. (2025a). Dots.OCR: Multilingual Document Layout Parsing in a Single Vision-Language Model. *arXiv preprint*. @@ -761,9 +761,9 @@ Wang, W., Gao, Z., Gu, L., et al. (2025). InternVL3.5: Advancing Open-Source Mul Zhang, J., Liu, Y., Wu, Z., et al. (2025). MonkeyOCR v1.5 Technical Report: Unlocking Robust Document Parsing for Complex Patterns. *arXiv preprint*. -Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. +Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. https://doi.org/10.1109/cvpr52688.2022.00459. -Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. +Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. https://doi.org/10.18653/v1/2021.acl-long.254. Pandas Development Team. (2026). pandas Documentation. https://pandas.pydata.org/docs/. diff --git a/docs/en/part12/ch41_visual_reasoning_tool_data_engineering.md b/docs/en/part12/ch41_visual_reasoning_tool_data_engineering.md index a5045ded..59a0f4d5 100644 --- a/docs/en/part12/ch41_visual_reasoning_tool_data_engineering.md +++ b/docs/en/part12/ch41_visual_reasoning_tool_data_engineering.md @@ -818,17 +818,17 @@ The main method in this chapter is to decompose visual reasoning tasks into anno ## References -Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. +Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). 
ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. Findings of ACL 2022. https://doi.org/10.18653/v1/2022.findings-acl.177. -Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. WACV 2020. +Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. WACV 2020. https://doi.org/10.1109/wacv45572.2020.9093523. Kahou, S. E., Michalski, V., Atkinson, A., Kádár, Á., Trischler, A., & Bengio, Y. (2017). FigureQA: An Annotated Figure Dataset for Visual Reasoning. arXiv:1710.07300. -Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018. +Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018. https://doi.org/10.1109/cvpr.2018.00592. -Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. +Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225. -Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). Chartqapro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 19123-19151). +Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). ChartQAPro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 19123-19151). https://doi.org/10.18653/v1/2025.findings-acl.978. Xie, T., Lin, M., Liu, M., Ye, Y., Chen, C., & Liu, S. (2026). Infochartqa: A benchmark for multimodal question answering on infographic charts. Advances in Neural Information Processing Systems, 38.
@@ -844,9 +844,9 @@ He, X., Zhang, Y., Mou, L., Xing, E., & Xie, P. (2020). PathVQA: 30000+ Question Liu, B., Zhan, L.-M., Xu, L., Ma, L., Yang, Y., & Wu, X.-M. (2021). SLAKE: A Semantically-Labeled Knowledge-Enhanced Dataset for Medical Visual Question Answering. IEEE 18th International Symposium on Biomedical Imaging. https://doi.org/10.1109/ISBI48211.2021.9434010. -Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. +Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. arXiv:2210.03629. -Schick, T., Dwivedi-Yu, J., Dessi, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. +Schick, T., Dwivedi-Yu, J., Dessi, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. arXiv:2302.04761. Kirillov, A., Mintun, E., Ravi, N., et al. (2023). Segment Anything. Proceedings of the IEEE/CVF International Conference on Computer Vision, 4015-4026. diff --git a/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md b/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md index 5bf720cb..91ca6e37 100644 --- a/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md +++ b/docs/en/part12/ch42_speech_audio_interaction_data_engineering.md @@ -466,7 +466,7 @@ Du Z, Chen Q, Zhang S, Hu K, Lu H, Yang Y, Hu H, Zheng S, Gu Y, Ma Z, Gao Z, Yan Du Z, Wang Y, Chen Q, Shi X, Lv X, Zhao T, Gao Z, Yang Y, Gao C, Wang H, others (2024) CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models. arXiv preprint arXiv:2412.10117. -Mittag G, Naderi B, Chehadi A, Möller S (2021) NISQA: A Deep CNN-Self-Attention Model for Multidimensional Speech Quality Prediction with Crowdsourced Datasets. 
In: Interspeech 2021, pp 2127–2131. +Mittag G, Naderi B, Chehadi A, Möller S (2021) NISQA: A Deep CNN-Self-Attention Model for Multidimensional Speech Quality Prediction with Crowdsourced Datasets. In: Interspeech 2021, pp 2127–2131. https://doi.org/10.21437/interspeech.2021-299. Song X (2026) S3Tokenizer: Reverse Engineering of Supervised Semantic Speech Tokenizer proposed in CosyVoice. GitHub repository. https://github.com/xingchensong/S3Tokenizer. diff --git a/docs/en/part12/ch43_reasoning_trace_compression_data_engineering.md b/docs/en/part12/ch43_reasoning_trace_compression_data_engineering.md index ff8659e7..d26983fd 100644 --- a/docs/en/part12/ch43_reasoning_trace_compression_data_engineering.md +++ b/docs/en/part12/ch43_reasoning_trace_compression_data_engineering.md @@ -453,8 +453,8 @@ Latent-Switch-69K illustrates the core problem of reasoning-trace data engineeri ## References -1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. +1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. arXiv:2201.11903. 2. Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., & Cobbe, K. (2023). Let's Verify Step by Step. arXiv:2305.20050. 3. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. -4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. -5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. +4. DeepSeek-AI. (2025). 
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948. +5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874. diff --git a/docs/en/part13/ch44_pretrain_recipes.md b/docs/en/part13/ch44_pretrain_recipes.md index 2ebfb543..124213be 100644 --- a/docs/en/part13/ch44_pretrain_recipes.md +++ b/docs/en/part13/ch44_pretrain_recipes.md @@ -287,11 +287,11 @@ Bavarian M, Jun H, Tezak N, Schulman J, McLeavey C, Tworek J, Chen M (2022) Effi Bengio Y, Louradour J, Collobert R, Weston J (2009) Curriculum Learning. In: Proceedings of the 26th Annual International Conference on Machine Learning, pp 41–48. -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21–29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21–29. https://doi.org/10.1109/sequen.1997.666900. Grattafiori A, Dubey A, Jauhri A, Pandey A, Kadian A, Al-Dahle A, Letman A, Mathur A, Schelten A, Vaughan A, others (2024) The Llama 3 Herd of Models. arXiv preprint arXiv:2407.21783. -Groeneveld D, Magnusson I, Bhagia A, Schwenk D, Soldaini L, Tafjord O, Sherborne M, Kinney R, Authur C, Atkinson D, others (2024) OLMo: Accelerating the Science of Language Models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, pp 15789–15809. +Groeneveld D, Magnusson I, Bhagia A, Schwenk D, Soldaini L, Tafjord O, Sherborne M, Kinney R, Authur C, Atkinson D, others (2024) OLMo: Accelerating the Science of Language Models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, pp 15789–15809. Available at: https://arxiv.org/abs/2402.00838. 
Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, others (2022) Training Compute-Optimal Large Language Models (Chinchilla). arXiv preprint arXiv:2203.15556. @@ -307,6 +307,6 @@ Sennrich R, Haddow B, Birch A (2016) Neural Machine Translation of Rare Words wi Shao Z, Wang P, Zhu Q, Xu R, Song J, Zhang M, Li Y, Wu Y, Guo D (2024) DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv preprint arXiv:2402.03300. -Su J, Lu Y, Pan S, Murtadha A, Wen B, Liu Y (2024) RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE). Neurocomputing 568:127063. +Su J, Lu Y, Pan S, Murtadha A, Wen B, Liu Y (2024) RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE). Neurocomputing 568:127063. https://doi.org/10.1016/j.neucom.2023.127063. -Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. +Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. arXiv:2203.11171. diff --git a/docs/en/part13/ch45_posttrain_recipes.md b/docs/en/part13/ch45_posttrain_recipes.md index eaddc246..74cfff2c 100644 --- a/docs/en/part13/ch45_posttrain_recipes.md +++ b/docs/en/part13/ch45_posttrain_recipes.md @@ -52,6 +52,8 @@ In modern engineering practice, the common scale for SFT data is typically in th If SFT teaches the model "how to answer," then the preference alignment layer teaches the model "which of two qualified responses is better." This layer establishes the model's reward surface. 
Preference data can serve multiple different training paradigms: training an RM to support RLHF, or directly serving direct preference optimization methods such as DPO, IPO, KTO, GRPO, or RLVR. Its typical scale spans a wide range, from $10^5$ to $10^7$ preference pairs, depending on whether the data construction process includes large-scale automatic generation, multi-round sampling, and online feedback. +Within these methods, preference learning should not be understood only as binary ranking between two answers. Methods such as KTO interpret human feedback more like gain/loss signals under prospect theory, and theoretical work has also attempted to unify RLHF, DPO, and more general learning-from-human-preference paradigms (Ethayarajh et al. 2024; Gheshlaghi Azar et al. 2024). This reminds data engineers not to preserve only the final label, but also the feedback source, review rationale, and candidate-group structure. + **Layer Three: Online Continuous Optimization Data** Model deployment is not the end of post-training, but the beginning. The third layer determines whether the model can self-correct as the real business evolves. After deployment, user upvotes and downvotes, system refusal logs, difficult samples, manually reviewed red-line data, A/B experiment results, and emergent safety incidents all enter the continuous optimization pipeline through streaming or batch processing. Without this layer, the model remains at the static level of a one-time release, ultimately rendered obsolete by the ever-shifting distribution of user inputs. @@ -97,8 +99,8 @@ In Table 45-1, `[D]` denotes information directly disclosed in public materials, * **Tülu-3** is one of the most suitable projects in this chapter to serve as a reproducible baseline. 
It not only open-sources the model weights, but also discloses the post-training data mixture, training code, and evaluation methodology, enabling teams to translate the recipe from the paper into an inspectable engineering process. * **Llama-3** (Grattafiori et al. 2024) represents a heavy-asset industrial approach. Its report discloses key mechanisms such as multi-round post-training, preference annotation, reward model retraining, and rejection sampling, but many data details are not fully disclosed. It is therefore more suitable as a reference for understanding industrial closed-loop systems than as a direct template for replication. -* **Qwen2.5** provides important reference value for Chinese, multilingual, multi-task, and synthetic data approaches. A distinction must be carefully drawn: the synthetic data approach in the Qwen2.5 report and seed-free synthesis methods such as Magpie (Xu et al. 2024) can be discussed in parallel, but should not be conflated as "officially adopting Magpie" in the absence of an explicit source. -* **Nemotron-4** and HelpSteer2 derive their value from the granularity of preference annotation. HelpSteer2 (Wang et al. 2024b) does not merely record overall preference—it establishes scoring signals along dimensions such as helpfulness, correctness, coherence, complexity, and verbosity, providing a referenceable example for reward model data design. +* **Qwen2.5 / Qwen3** provide important reference value for Chinese, multilingual, multi-task, and synthetic data approaches. A distinction must be carefully drawn: the synthetic data approaches in the Qwen-series reports and seed-free synthesis methods such as Magpie (Xu et al. 2025) can be discussed in parallel; the Qwen3 technical report can also serve as a public reference for later-version recipes (Yang et al. 2025), but none of this should be written as "officially adopting Magpie" in the absence of an explicit source. 
+* **Nemotron-4** and HelpSteer2 derive their value from the granularity of preference annotation. HelpSteer2 (Wang et al. 2024b) does not merely record overall preference; it establishes scoring signals along dimensions such as helpfulness, correctness, coherence, complexity, and verbosity, providing a referenceable example for reward model data design. Reward-model engineering techniques can also refer to Skywork-Reward's summary of data mixing, training stability, and evaluation protocols (Liu et al. 2024b). --- @@ -146,6 +148,8 @@ These four gates are best implemented as a combination of automated filtering an SFT data also requires stratified mixing rather than simple aggregation. It is recommended to break the data into at least six categories: general Q&A, knowledge explanation, complex instruction-following, code and tool use, mathematics and reasoning, and safety and refusals. Each category should be individually tracked for count, average length, source, filter rate, and manual review pass rate. For reproducing open-source model recipes, the most valuable practice is not to exactly replicate a given mixing ratio, but to maintain a "mixing change log." When the model shows changes in code capability, safety refusals, or linguistic quality, the team can trace back to determine whether a change in a specific data category was responsible, rather than relying on intuition alone. +Large open-source technical reports also show that post-training data mixtures usually co-evolve with pretraining data, synthetic data, reasoning data, and safety data, rather than optimizing a single sample category in isolation. The mixture and post-training descriptions in the DeepSeek-V3 report can serve as an additional reference for understanding this kind of multi-stage recipe (Liu et al. 2024a). + It is also important to note that good SFT data from one stage is not necessarily suitable for all training rounds. 
The first round of SFT is better suited to data that is structurally clear, response-stable, and broadly covering, helping the model establish basic assistant behavior. Subsequent incremental SFT rounds are better suited to incorporating hard cases, domain-specific tasks, tool invocation, and safety boundary repair data. Loading all data at once makes it easy for high-value boundary samples to be overwhelmed by a large volume of ordinary samples. A better approach is to organize SFT data into curriculum-style versions: `sft_base_mix`, `sft_complex_mix`, `sft_safety_patch`, `sft_domain_patch`, each with its own manifest and evaluation report. --- @@ -214,7 +218,7 @@ Among the many models claiming to be open-source, some projects only release fin ### 45.5.2 SFT-Mix: The Behavioral Template Layer -The SFT stage of Tülu-3 (Lambert et al. 2024) does not pursue unbounded expansion of sample volume. Its SFT-Mix scale is approximately 939K [D]—a figure disclosed in the publicly available training data documentation—and should be used in alignment with the corresponding dataset card or paper table. +The SFT stage of Tülu-3 (Lambert et al. 2025) does not pursue unbounded expansion of sample volume. Its SFT-Mix scale is approximately 939K [D]—a figure disclosed in the publicly available training data documentation—and should be used in alignment with the corresponding dataset card or paper table. **Source structure and composition:** The SFT-Mix reflects a deliberate manual curation strategy. It combines basic dialogue, multi-task instruction-following, multi-turn interaction, API tool use, code generation, mathematical reasoning, and core safety tasks according to stage objectives. **Why not simply pursue higher sample count?** Blindly stacking millions of simple function-completion samples for code tasks would cause "catastrophic forgetting," causing the model to lose the ability to hold a normal natural-language conversation. 
Tülu-3's experience demonstrates that by downsampling and balancing high-quality sources—such as carefully annotated domain-specific sets—the model's comprehensive behavioral patterns can be rapidly consolidated within a few tens of thousands of fine-tuning steps. @@ -301,6 +305,8 @@ Reward Hacking is one of the most common and most underestimated risks in prefer Ch05 already examined general methods for benchmark contamination detection. Here we focus exclusively on the particularly insidious contamination problems unique to the post-training phase. +In post-training, contamination means not only repeated problem statements, but also cases where the model indirectly learns benchmark answer patterns through instruction synthesis, preference filtering, or judge feedback. Prior work has explicitly warned that LLMs should not become "evaluation benchmark cheaters" during training and feedback loops (Zhou et al. 2023). + 1. **SFT mixing in evaluation set answers:** Developers inadvertently use test set ground truth directly as model input when constructing synthetic instructions. 2. **Preference hardening the evaluation style:** During annotation, annotators unconsciously assign high scores to responses that match the format of specific benchmark multiple-choice templates, hardening the evaluation set style into reward preferences. 3. **Implicit filtering contamination:** During rejection sampling, using the pass rate on external evaluation sets as a filtering signal is equivalent to leaking test set metrics into the model. @@ -323,6 +329,8 @@ Another commonly overlooked issue is "review model contamination." Many teams us To address the problem of a correct final result but an unreliable reasoning process, industry has proposed the Process Reward Model (PRM). 
This chapter does not elaborate on the full implementation of PRM, but notes its data value: **reasoning models require not only rewards at the final answer dimension, but also fine-grained data management of intermediate reasoning steps, verifier states, and full rejection sampling trajectories**. This naturally leads to the topic of the next chapter: how to construct an intermediate-state data flywheel for reasoning. +Representative work on process supervision shows that step-by-step verification can decompose "whether the answer is correct" into finer-grained training signals, improving error localization and data filtering in mathematical reasoning (Lightman et al. 2024). + The core unit of process reward data is not a complete response, but a step. A usable process reward sample must at minimum preserve the problem, the complete trajectory, step segmentation, a local judgment for each step, the final answer, the final verification result, and the error type. If only the entire chain-of-thought is stored, a subsequent PRM cannot learn at which step the reasoning begins to deviate. For math tasks, annotations can mark "algebraic transformation error," "omitted condition," or "final numerical error"; for code tasks, annotations can mark "correct algorithmic approach but boundary condition error," "insufficient computational complexity," or "unit test not covered." These fine-grained labels will give the Ch46 reasoning data flywheel more stable training signals. Process reward data also requires distinguishing between "correct process but wrong result" and "correct result but wrong process." The former may indicate an error in the final computation step or format extraction; the latter may indicate that the model obtained the answer through guessing or pattern memorization. 
If the training system only checks the final answer, the second type of sample will be erroneously rewarded, and the model will learn inexplicable and potentially non-reproducible reasoning shortcuts. The combined value of PRM, RLVR, and rejection sampling lies precisely in simultaneously preserving both process signals and result signals, so that the model pursues not only answer correctness but also stable reasoning paths. @@ -336,7 +344,7 @@ Post-training involves not only technical route selection, but also cost, organi * **Limitations of SFT:** If development stops at SFT without any preference alignment, the model typically only learns a superficially professional response format. Its behavior under adversarial stress tests such as jailbreak attacks may still be unstable. * **The quantity trap:** Pursuing sample count alone in instruction tuning not only increases compute costs, but can also amplify the data's tendency toward templatization and distributional skew. * **Blunting of DPO signals:** When constructing DPO data, if the quality difference between chosen and rejected responses is too weak (or both are of poor quality), the contrastive loss signal in DPO becomes blunted and may even damage the model's existing linguistic capabilities. -* **Narrow RM coverage:** If the reward model's training set covers too narrow a range of prompt types, the model's behavior in production will be strongly driven by unknown reward model vulnerabilities, producing absurd outputs when faced with complex user inputs. +* **Narrow RM coverage:** If the reward model's training set covers too narrow a range of prompt types, the model's behavior in production will be strongly driven by unknown reward model vulnerabilities, producing absurd outputs when faced with complex user inputs. Length-related bias deserves a separate audit, because both RLHF data and RM scores may mistake "longer" for "better" (Singhal et al. 2024). 
* **Prerequisites for using Magpie:** Seed-free self-generation methods such as Magpie should not be used directly. Since they amplify the base model's inherent distribution, they require accompanying pipelines for diversity deduplication, difficulty filtering, factuality verification, and safety cleaning. * **RLVR boundaries:** Rule-based RLVR is suitable for verifiable tasks such as mathematical derivation and code compilation, but is not appropriate for open-ended chat, literary creation, or emotional support tasks that lack standard answers. * **Reset cost of context migration:** When migrating post-training recipes validated on open-source models to domain-specific contexts such as healthcare or finance, the original preference data should not be reused directly. Safety boundary alignment must be redone according to domain requirements, and rigorous evaluation set isolation mechanisms must be established. @@ -374,9 +382,9 @@ This chapter, through a systematic deconstruction of the core data recipes in th Wang Y, Kordi Y, Mishra S, Liu A, Smith N A, Khashabi D, Hajishirzi H (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484–13508. https://doi.org/10.18653/v1/2023.acl-long.754. -Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems, 35, 27730–27744. +Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. 
Advances in Neural Information Processing Systems, 35, 27730–27744. arXiv:2203.02155. -Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems, 36, 53728–53741. +Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems, 36, 53728–53741. arXiv:2305.18290. Ethayarajh K, Xu W, Muennighoff N, Jurafsky D, Kiela D (2024) Model Alignment as Prospect Theoretic Optimization. Proceedings of the 41st International Conference on Machine Learning, pp 12634–12651. @@ -384,11 +392,11 @@ Gheshlaghi Azar M, Guo Z D, Piot B, Munos R, Rowland M, Valko M, Calandriello D Grattafiori A, Dubey A, Jauhri A, Pandey A, Kadian A, Al-Dahle A, Letman A, Mathur A, Schelten A, Vaughan A, others (2024) The Llama 3 Herd of Models. arXiv preprint arXiv:2407.21783. -Lambert N, Morrison J, Pyatkin V, Huang S, Ivison H, Brahman F, Miranda L J V, Liu A, Dziri N, Lyu X, Gu Y, Malik S, Graf V, Hwang J D, Yang J, Le Bras R, Tafjord O, Wilhelm C, Soldaini L, Smith N A, Wang Y, Dasigi P, Hajishirzi H (2025) Tülu 3: Pushing Frontiers in Open Language Model Post-Training. Second Conference on Language Modeling. +Lambert N, Morrison J, Pyatkin V, Huang S, Ivison H, Brahman F, Miranda L J V, Liu A, Dziri N, Lyu X, Gu Y, Malik S, Graf V, Hwang J D, Yang J, Le Bras R, Tafjord O, Wilhelm C, Soldaini L, Smith N A, Wang Y, Dasigi P, Hajishirzi H (2025) Tülu 3: Pushing Frontiers in Open Language Model Post-Training. Second Conference on Language Modeling. arXiv preprint arXiv:2411.15124. Yang A, Li A, Yang B, Zhang B, Hui B, Zheng B, Yu B, Gao C, Huang C, Lv C, others (2025) Qwen3 Technical Report. arXiv preprint arXiv:2505.09388. 
-Wang Z, Dong Y, Delalleau O, Zeng J, Shen G, Egert D, Zhang J J, Sreedhar M N, Kuchaiev O (2024) HelpSteer 2: Open-Source Dataset for Training Top-Performing Reward Models. Advances in Neural Information Processing Systems, 37, 1474–1501. +Wang Z, Dong Y, Delalleau O, Zeng J, Shen G, Egert D, Zhang J J, Sreedhar M N, Kuchaiev O (2024) HelpSteer 2: Open-Source Dataset for Training Top-Performing Reward Models. Advances in Neural Information Processing Systems, 37, 1474–1501. https://doi.org/10.52202/079017-0047. Xu C, Sun Q, Zheng K, Geng X, Zhao P, Feng J, Tao C, Lin Q, Jiang D (2024) WizardLM: Empowering Large Pre-Trained Language Models to Follow Complex Instructions. International Conference on Learning Representations. arXiv:2304.12244. @@ -404,6 +412,6 @@ Singhal P, Goyal T, Xu J, Durrett G (2024) A Long Way to Go: Investigating Lengt Zhou K, Zhu Y, Chen Z, Chen W, Zhao W X, Chen X, Lin Y, Wen J-R, Han J (2023) Don't Make Your LLM an Evaluation Benchmark Cheater. arXiv preprint arXiv:2311.01964. -Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623. +Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623. arXiv:2306.05685. -Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. +Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. arXiv:2305.20050. 
diff --git a/docs/en/part13/ch46_rl_reasoning_data.md b/docs/en/part13/ch46_rl_reasoning_data.md index 275730ed..206e3ef9 100644 --- a/docs/en/part13/ch46_rl_reasoning_data.md +++ b/docs/en/part13/ch46_rl_reasoning_data.md @@ -38,6 +38,8 @@ Reasoning models; data recipes; open-source large language models; training data In the early instruction fine-tuning era, teams often understood reasoning capability as "giving the model more step-by-step answers." For example, writing out detailed solutions for math problems, step-by-step analyses for coding problems, and inference chains for logic problems. Such data can indeed teach a model to "respond as if reasoning," but it has a fundamental limitation: the model is merely imitating pre-written trajectories and has not genuinely learned to explore the error space. +Several representative technical lines sit behind this traditional route: STaR emphasizes bootstrapping new supervision signals from the model's own reasoning, Self-Refine emphasizes iterative answer repair through feedback, and LIMA reminds us that a small number of high-quality alignment samples can still cause significant behavioral changes (Zelikman et al. 2022; Madaan et al. 2023; Zhou et al. 2023). Together, these works show that the value of reasoning data comes not only from chain length, but also from whether the process can be filtered and fed back. + This limitation quickly manifests on complex tasks. A model can fluently write "first, second, therefore," but each step may not have been verified; it can also perform well on common problem types in the training set, but when faced with slightly varied mathematical conditions, boundary inputs, or code test cases, the reasoning chain breaks. More importantly, manually writing CoT is expensive and cannot cover problem types at scale. 
A team can write 10,000 or 100,000 high-quality CoT examples, but it is very difficult to manually produce enough failure trajectories, correction trajectories, and boundary-case trajectories. The insight that R1-Zero experiments brought to data engineering is this: on verifiable tasks, models do not necessarily need to see large quantities of human-written CoT before producing usable reasoning behavior. As long as a task can be programmatically verified, the model can gradually discover more effective reasoning paths through sampling and reward signals. The final answer to a math problem, unit tests for a code problem, schema validation for structured output, and the return status of a tool call can all serve as part of the training signal. @@ -87,6 +89,8 @@ Cold-start data must satisfy four conditions. The scale of cold-start SFT need not be large. For educational or small-team projects, a few thousand to tens of thousands of high-quality Long-CoT examples are often more valuable than hundreds of thousands of low-quality CoT examples. Community projects such as Sky-T1 also demonstrate that small-scale, carefully constructed reasoning data can significantly improve the mathematical and coding performance of 32B-class open-source models [D]. +If cold-start samples come from existing open-source chat or base models, the team should also pay attention to the license, data disclosure level, and alignment-stage differences across model families. Reports on open foundation models such as Llama 2 provide early references for understanding model cards, license boundaries, and fine-tuning data transparency (Touvron et al. 2023). + The most common pitfall in the cold-start phase is writing samples that are too "perfect." 
Reasoning trajectories produced after real RL training typically include probing, checking, revisiting conditions, and correction, whereas manually written cold-start samples that present only linear derivation teach the model an overly tidy explanation style. While such a style has good readability on simple problems, it may lack self-checking capability on complex ones. Therefore, cold-start data may retain moderate intermediate checks, such as "we need to verify the boundary condition here," "this expression holds only when the denominator is nonzero," or "let us first validate with a small example." These expressions are not intended to create verbosity but to provide the model with extensible reasoning behavior templates for subsequent RL. Cold-start data must also control "answer leakage." In many synthetic datasets, the generator already knows the gold answer and reverse-engineers the reasoning process, leading to a tight coupling between steps and conclusion. Models trained on such samples may guess answers directly by pattern matching without actually reasoning. A more reliable approach is to retain check fields between the problem, the gold answer, and the reasoning process, such as `answer_source`, `verified_by`, `trace_quality`, and `leakage_risk`. If samples come from strong-model distillation, the teacher model, sampling temperature, and filtering rules should also be recorded to prevent data bias from becoming untraceable later. @@ -182,6 +186,8 @@ The core of R1-paradigm data engineering lies in reward signal and verifier desi **Model-based rewards** refer to rewards produced by reward models or LLM-as-Judge. They can cover open-ended Q&A, explanation quality, style, and safety boundaries, offering broader applicability, but they are also more susceptible to bias, length bias, and prompt sensitivity. +The engineering use of LLM-as-Judge needs separate calibration. 
Experience with MT-Bench and Chatbot Arena shows that model-based judges can increase the throughput of open-ended evaluation, but they also introduce position bias, length bias, and model-family preference; research on reward-model overoptimization also shows that continuously optimizing a single reward can push the policy away from true quality objectives (Zheng et al. 2023; Gao et al. 2023). + These two reward types should not replace each other but should be used in layers. For tasks with verifiable answers, rule-based rewards take priority; for open-ended tasks, model-based rewards can be used but should be paired with human spot-checks and audit sets; for high-risk domains, reliance on model review alone is insufficient. In training systems, rewards should ideally not be stored as a single floating-point number. A single score facilitates algorithmic consumption but complicates engineering debugging. A more practical structure is to simultaneously save `reward_score`, `reward_source`, `pass_flag`, `failure_reason`, and `audit_notes`. For example, a math problem that passes symbolic comparison receives 1.0, a code problem that passes all tests receives 1.0, a format error receives 0.0, and a correct but excessively long answer might receive 0.8. Scores participate in training, but the reason fields determine how data is repaired afterward. @@ -345,6 +351,8 @@ This section dissects three components: OpenThoughts-114K, the rule-based reward OpenThoughts-114K offers a window into the structure of open-source reasoning data. It is no longer a simple `{"instruction": "...", "response": "..."}` format but organizes samples around problems, reasoning, answers, and metadata. +The earlier ThoughtSource project also organized reasoning data into a reusable central hub, demonstrating the need to register different tasks, reasoning formats, and sources under a unified scheme. 
Such resources are useful as references for reasoning-sample schemas and source governance, rather than as material to be blindly concatenated into a training set (Ott et al. 2023). + A reasoning sample typically needs to answer three questions: * What is the problem; @@ -359,6 +367,8 @@ When ingesting data such as OpenThoughts, a four-step check can be applied. The After ingestion, the data should not all directly enter training. A more reliable approach is to establish a `curated` subset retaining only samples with complete fields, stable language, verifiable answers, and clear task sources. Samples with incomplete fields but valuable problems can enter `needs_repair`; samples with unverifiable answers or potential evaluation-set contamination should enter `excluded`. This stratification adds upfront workload but reduces training anomalies later. +Distilled reasoning data also needs to record the relationship among the teacher, rationale, and final answer. Distilling Step-by-Step shows that intermediate explanations can help smaller models learn, but only when explanation quality, answer correctness, and training objectives remain aligned (Hsieh et al. 2023). + An actionable data catalog is as follows: | Subset | Entry Condition | Use | @@ -413,6 +423,10 @@ The verification pool also needs to define a unified set of failure reason enume Verifier pool quality must also be evaluated. A simple approach is to maintain a golden validation set containing human-confirmed correct and incorrect answers; after every verifier update, a regression run is performed. If the new version causes large numbers of previously correct answers to fail, the parser or normalization has become too strict; if it allows large numbers of previously wrong answers to pass, the rules have become too loose or a loophole has appeared. Training systems may only use verifier versions that pass regression. +Training the verifier itself can also become a data engineering task. 
Work such as V-STaR demonstrates a route for training verifiers from self-generated trajectories, indicating that verifier pools need not only rule regression tests but also a continuously accumulated set of discriminative positive and negative samples (Hosseini et al. 2024). + +Process-level verification can also borrow from the "Let's Verify Step by Step" approach, decomposing whole-problem correctness into intermediate steps so that the data pipeline can locate where reasoning first deviates instead of assigning only a binary pass/fail label to the final answer (Lightman et al. 2024). + In production environments, verifiers must also consider cost. Symbolic simplification of math expressions can be time-consuming for complex expressions; code sandboxes consume CPU and memory; LLM judges incur additional inference costs. The data pipeline can apply cheap rules in a first-pass filter and then forward a small number of disputed samples to more expensive verifiers. This controls cost while preserving sufficient quality signal. ### Case C: Rejection Sampling in Practice @@ -486,6 +500,8 @@ RL may cause the model to favor generating longer reasoning chains. Length is no **Third, language mixing.** Chinese-English mixing degrades user-facing output experience and affects parsers. The cold-start phase should standardize language requirements, and the rejection sampling phase should filter trajectories with frequent language switching. +Multilingual reasoning data requires separate statistics for language and task type. Prior work on multilingual chain-of-thought suggests that cross-lingual reasoning capability does not always transfer automatically from English CoT, so language mixing is both an output-quality issue and a data-distribution issue (Shi et al. 2022). + **Fourth, reasoning that appears plausible but is unverifiable.** Many Long-CoT samples are written to look like reasoning, but no step can be programmatically checked. 
Such samples are suitable for SFT but not for RLVR. Before entering the RL stage, the task pool must be stratified by "degree of verifiability." @@ -504,6 +520,8 @@ Reasoning data often comes from public problem banks, community synthetic data, **Ninth, treating teacher model outputs as ground truth.** When using a strong model to generate Long-CoT, it is tempting to assume the outputs are inherently high-quality. In reality, teacher models can also produce incorrect reasoning, overly long explanations, answer leakage, and hallucinated citations. All teacher-generated data should undergo verifier checks or human spot-checks; quality control must not be skipped on the grounds that the teacher is strong. +Closed models or reasoning-model system cards can help teams understand capability boundaries, but they should not be treated as directly reproducible data sources. The o1 system card, for example, is mainly valuable for safety boundaries, evaluation dimensions, and system behavior descriptions, rather than for reusable training trajectories (Jaech et al. 2024). + **Tenth, neglecting the organization of negative samples.** Many teams organize only positive samples, making it impossible to later train a process reward model or analyze model failure boundaries. Negative samples need not enter SFT but should be saved by error type. A structured negative sample library helps the team identify task-type weaknesses, verifier loopholes, and sampling configuration issues. @@ -540,6 +558,8 @@ The focus of this path is to first validate the data production closed loop, the Applicability boundaries also need to be specified clearly. The R1-style flywheel is best suited for math, code, structured output, tool calling, and some long-context tasks; it is not suitable for all open-ended conversational tasks. For safety, medical, financial, and legal tasks, rule-based verification can only cover a portion of facts or format requirements and cannot replace expert review. 
+Tool-calling tasks can also draw from experience in API-connected models such as Gorilla: when the task objective is to select a tool, fill in arguments, and explain returned results, the data should record not only natural-language answers, but also API documentation versions, call arguments, execution results, and error-recovery paths (Patil et al. 2024). + When estimating cost, do not only estimate training GPU hours. The primary expense in a reasoning data flywheel often occurs before and after training: constructing the task pool, generating candidates, running verifiers, storing trajectories, human audits, and repeated evaluation. Especially in the multi-path sampling stage, token costs grow rapidly with the number of candidates, maximum length, and problem scale. Without budget controls, the team may spend most of its resources in the first sampling round, only to find that verifier quality is insufficient and the data is unusable. Therefore, small teams can adopt a staged budget approach. Stage one covers only 100 to 500 tasks, with the goal of validating whether the data structure and verifiers are functional. Stage two expands to 3,000 to 10,000 tasks, with the goal of producing trainable rejection-sampled data. Stage three then considers larger-scale RL or second-round SFT. Each stage should have a stopping condition—for example, if the format pass rate falls below a threshold, fix the prompt first; if the verifier error rate is too high, fix the verifier first, rather than continuing to scale sampling. @@ -588,15 +608,15 @@ Meurer A, Smith C P, Paprocki M, Certik O, Kirpichev S B, Rocklin M, Kumar A, Iv Guha E, Marten R, Keh S, Raoof N, Smyrnis G, Bansal H, Nezhurina M, Mercat J, Vu T, Sprague Z, others (2025) OpenThoughts: Data Recipes for Reasoning Models. arXiv preprint arXiv:2506.04178. -Zhou C, Liu P, Xu P, Iyer S, Sun J, Mao Y, Ma X, Efrat A, Yu P, Yu L, Zhang S, Ghosh G, Lewis M, Zettlemoyer L, Levy O (2023) LIMA: Less Is More for Alignment. 
Advances in Neural Information Processing Systems, 36, 55006–55021. +Zhou C, Liu P, Xu P, Iyer S, Sun J, Mao Y, Ma X, Efrat A, Yu P, Yu L, Zhang S, Ghosh G, Lewis M, Zettlemoyer L, Levy O (2023) LIMA: Less Is More for Alignment. Advances in Neural Information Processing Systems, 36, 55006–55021. arXiv:2305.11206. -Zelikman E, Wu Y, Mu J, Goodman N (2022) STaR: Bootstrapping Reasoning with Reasoning. Advances in Neural Information Processing Systems, 35, 15476–15488. +Zelikman E, Wu Y, Mu J, Goodman N (2022) STaR: Bootstrapping Reasoning with Reasoning. Advances in Neural Information Processing Systems, 35, 15476–15488. arXiv:2203.14465. -Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. Advances in Neural Information Processing Systems, 36, 46534–46594. +Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. Advances in Neural Information Processing Systems, 36, 46534–46594. arXiv:2303.17651. -Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. +Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. arXiv:2305.20050. -Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595–46623. 
+Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595–46623. arXiv:2306.05685. Gao L, Schulman J, Hilton J (2023) Scaling Laws for Reward Model Overoptimization. Proceedings of the 40th International Conference on Machine Learning, pp 10835–10866. @@ -606,8 +626,8 @@ Shi F, Suzgun M, Freitag M, Wang X, Srivats S, Vosoughi S, Chung H W, Tay Y, Rud Jaech A, Kalai A, Lerer A, Richardson A, El-Kishky A, Low A, Helyar A, Madry A, Beutel A, Carney A, others (2024) OpenAI o1 System Card. arXiv preprint arXiv:2412.16720. -Ott S, Hebenstreit K, Liévin V, others (2023) ThoughtSource: A Central Hub for Large Language Model Reasoning Data. Scientific Data, 10(1), 528. +Ott S, Hebenstreit K, Liévin V, others (2023) ThoughtSource: A Central Hub for Large Language Model Reasoning Data. Scientific Data, 10(1), 528. https://doi.org/10.1038/s41597-023-02433-3. Hsieh C-Y, Li C-L, Yeh C-K, Nakhost H, Fujii Y, Ratner A, Krishna R, Lee C-Y, Pfister T (2023) Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes. Findings of the Association for Computational Linguistics: ACL 2023, pp 8003–8017. -Patil S G, Zhang T, Wang X, Gonzalez J E (2024) Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 38. +Patil S G, Zhang T, Wang X, Gonzalez J E (2024) Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 37. arXiv:2305.15334. 
diff --git a/docs/en/part13/ch47_vlm_data_recipes.md b/docs/en/part13/ch47_vlm_data_recipes.md index 3a7473df..beb667e4 100644 --- a/docs/en/part13/ch47_vlm_data_recipes.md +++ b/docs/en/part13/ch47_vlm_data_recipes.md @@ -334,9 +334,9 @@ Chen Z, Wu J, Wang W, Su W, Chen G, Xing S, Zhong M, Liu Q, Lu Y, Li B, others ( Chen Z, Wang W, Tian H, Ye S, Gao Z, Cui E, Tong X, Hu J, Luo J, Ma S, others (2024) InternVL3: Exploring Advanced Training and Test-Time Scaling for Vision-Language Models. arXiv preprint arXiv:2504.10479. -Dao T, Fu D Y, Ermon S, Rudra A, Ré C (2022) FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In: Advances in Neural Information Processing Systems 35:16344-16359. +Dao T, Fu D Y, Ermon S, Rudra A, Ré C (2022) FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In: Advances in Neural Information Processing Systems 35:16344-16359. https://doi.org/10.52202/068431-1189. -Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. +Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. Available at: https://arxiv.org/abs/2304.14108. Laurençon A, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, others (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. arXiv preprint arXiv:2306.16527. @@ -348,14 +348,14 @@ Liu S, Zeng Z, Ren T, Li F, Zhang H, Yang J, Li C, Yang J, Su H, Zhu J, others ( Lu P, Bansal H, Xia T, Liu J, Li C, Hajishirzi H, Cheng H, Chang K W, Galley M, Gao J (2023) MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts. 
arXiv preprint arXiv:2310.02255. -Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pp 2200-2209. +Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pp 2200-2209. https://doi.org/10.1109/wacv48630.2021.00225. Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, others (2021) Learning Transferable Visual Models from Natural Language Supervision (CLIP). In: Proceedings of the 38th International Conference on Machine Learning, pp 8748-8763. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. Available at: https://arxiv.org/abs/2210.08402. Wang P, Bai S, Tan S, Wang S, Fan Z, Bai J, Chen K, Liu X, Wang J, Ge W, others (2024) Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191. -Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567. 
+Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567. https://doi.org/10.1109/cvpr52733.2024.00913. Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. arXiv preprint arXiv:2304.06939. diff --git a/docs/en/part13/ch48_t2i_t2v.md b/docs/en/part13/ch48_t2i_t2v.md index 114733b9..dd8d38e4 100644 --- a/docs/en/part13/ch48_t2i_t2v.md +++ b/docs/en/part13/ch48_t2i_t2v.md @@ -404,17 +404,17 @@ This chapter also concludes Part XI. The preceding chapters addressed the collec PySceneDetect Contributors (2026) PySceneDetect Documentation. Available at: https://www.scenedetect.com/docs/latest/. -Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. +Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. Available at: https://arxiv.org/abs/2304.14108. -Ghosh S, Bhatt U, Bhattacharya R, Parmar P, Patel S, Islam M, Reddy K K, others (2023) GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment. In: Advances in Neural Information Processing Systems 36. +Ghosh S, Bhatt U, Bhattacharya R, Parmar P, Patel S, Islam M, Reddy K K, others (2023) GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-2270. 
-Kirstain Y, Polyak A, Singer U, Matiana S, Penna J, Levy O (2023) Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation (PickScore). In: Advances in Neural Information Processing Systems 36. +Kirstain Y, Polyak A, Singer U, Matiana S, Penna J, Levy O (2023) Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation (PickScore). In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-1594. Open-Sora Team (2024) Open-Sora: Democratizing Efficient Video Production for All. arXiv preprint arXiv:2412.20404. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. Available at: https://arxiv.org/abs/2210.08402. -Wang W, Lv Q, Yu W, Hong W, Qi J, Wang Y, Ji J, Yang Z, Zhao L, Song X, others (2023) CogVLM: Visual Expert for Pretrained Language Models. In: Advances in Neural Information Processing Systems 36. +Wang W, Lv Q, Yu W, Hong W, Qi J, Wang Y, Ji J, Yang Z, Zhao L, Song X, others (2023) CogVLM: Visual Expert for Pretrained Language Models. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/079017-3860. Wu X, Sun K, Zhu F, Zhao R, Li H (2023) Human Preference Score v2: A Solid Benchmark for Evaluating Human Preferences of Text-to-Image Synthesis (HPSv2). arXiv preprint arXiv:2306.09341. 
diff --git a/docs/en/part14/p03_llava_instruct.md b/docs/en/part14/p03_llava_instruct.md index 2e4b7580..75d4e3c0 100644 --- a/docs/en/part14/p03_llava_instruct.md +++ b/docs/en/part14/p03_llava_instruct.md @@ -1136,8 +1136,8 @@ As part of Part 14, this chapter corresponds to the project-level validation of ## References -1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. -2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. +1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. https://doi.org/10.52202/075280-1516. +2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. https://doi.org/10.1007/978-3-319-10602-1_48. 3. Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., et al. (2021). Learning Transferable Visual Models From Natural Language Supervision. ICML 2021. -4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. -5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. +4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225. +5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. https://doi.org/10.18653/v1/2022.findings-acl.177. 
diff --git a/docs/en/part14/p05_mm_rag.md b/docs/en/part14/p05_mm_rag.md index 1b3568fc..6087f1cc 100644 --- a/docs/en/part14/p05_mm_rag.md +++ b/docs/en/part14/p05_mm_rag.md @@ -1209,8 +1209,8 @@ As part of Part Fourteen, this chapter corresponds to the project-level deployme ## References -1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. -2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. +1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. https://doi.org/10.52202/075280-1516. +2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. https://doi.org/10.1007/978-3-319-10602-1_48. 3. Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., et al. (2021). Learning Transferable Visual Models From Natural Language Supervision. ICML 2021. -4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. -5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. +4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225. +5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. https://doi.org/10.18653/v1/2022.findings-acl.177. diff --git a/docs/en/part14/p06_prm.md b/docs/en/part14/p06_prm.md index 999687f6..5692d4ed 100644 --- a/docs/en/part14/p06_prm.md +++ b/docs/en/part14/p06_prm.md @@ -1138,8 +1138,8 @@ As part of Part Fourteen, this chapter corresponds to the project-level validati ## References -1. 
Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. +1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. arXiv:2201.11903. 2. Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., & Cobbe, K. (2023). Let's Verify Step by Step. arXiv:2305.20050. 3. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. -4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. -5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. +4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948. +5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874. diff --git a/docs/en/part14/p07_agent_tooluse.md b/docs/en/part14/p07_agent_tooluse.md index 99d7c07c..f037cb86 100644 --- a/docs/en/part14/p07_agent_tooluse.md +++ b/docs/en/part14/p07_agent_tooluse.md @@ -1166,5 +1166,5 @@ As part of Part Fourteen, this chapter corresponds to the project-level validati 1. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. 2. Schick, T., Dwivedi-Yu, J., Dessì, R., Raileanu, R., Lomeli, M., Hambro, E., Zettlemoyer, L., Cancedda, N., & Scialom, T. (2023). 
Toolformer: Language Models Can Teach Themselves to Use Tools. arXiv:2302.04761. 3. NIST. (2023). Artificial Intelligence Risk Management Framework (AI RMF 1.0). National Institute of Standards and Technology. -4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. +4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/. 5. OpenTelemetry Authors. (2026). OpenTelemetry Documentation. https://opentelemetry.io/docs/. diff --git a/docs/en/part14/p09_privacy_pipeline.md b/docs/en/part14/p09_privacy_pipeline.md index 082326d0..78967458 100644 --- a/docs/en/part14/p09_privacy_pipeline.md +++ b/docs/en/part14/p09_privacy_pipeline.md @@ -1126,8 +1126,8 @@ As part of Part XIV, this chapter corresponds to the project-level validation of ## References -1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation. -2. NIST. (2020). NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. -3. Dwork, C., & Roth, A. (2014). The Algorithmic Foundations of Differential Privacy. *Foundations and Trends in Theoretical Computer Science*. -4. Kairouz, P., McMahan, H. B., Avent, B., Bellet, A., Bennis, M., Bhagoji, A. N., et al. (2021). Advances and Open Problems in Federated Learning. *Foundations and Trends in Machine Learning*. -5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. +1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation. https://eur-lex.europa.eu/eli/reg/2016/679/oj. +2. NIST. (2020). NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. https://doi.org/10.6028/nist.cswp.10. +3. Dwork, C., & Roth, A. (2014). The Algorithmic Foundations of Differential Privacy. *Foundations and Trends in Theoretical Computer Science*. https://doi.org/10.1561/9781601988195. +4. Kairouz, P., McMahan, H. 
B., Avent, B., Bellet, A., Bennis, M., Bhagoji, A. N., et al. (2021). Advances and Open Problems in Federated Learning. *Foundations and Trends in Machine Learning*. https://doi.org/10.1561/2200000083. +5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/. diff --git a/docs/en/part14/p11_mini_deepseek.md b/docs/en/part14/p11_mini_deepseek.md index 433b351d..c2585a5b 100644 --- a/docs/en/part14/p11_mini_deepseek.md +++ b/docs/en/part14/p11_mini_deepseek.md @@ -62,7 +62,7 @@ Reproduction materials should include data source descriptions, minimal samples, ## Background and Objectives -In pre-training data engineering, "Scaling Laws" (Kaplan et al. 2020) apply not only to model parameters but equally to the experimentation and validation of data recipes. In the earlier Project 1 (Mini-C4), we completed an end-to-end cleaning pipeline for a single-source corpus. However, real industrial-scale large models—such as DeepSeek-V3 (Liu et al. 2024)—are never trained on a single corpus; they are trained on a precise mixture of web pages, code, mathematics, academic papers, and other data sources. +In pre-training data engineering, "Scaling Laws" (Kaplan et al. 2020) apply not only to model parameters but equally to the experimentation and validation of data recipes. In the earlier Project 1 (Mini-C4), we completed an end-to-end cleaning pipeline for a single-source corpus. However, real industrial-scale large models—such as DeepSeek-V3 (DeepSeek-AI et al. 2024)—are never trained on a single corpus; they are trained on a precise mixture of web pages, code, mathematics, academic papers, and other data sources. Why do we need a Mini pre-training pipeline? @@ -171,7 +171,7 @@ unique.save_to_disk("./data/mixed_1b_dedup") ### Step 3: Training a 150K Super-Vocabulary Tokenizer -DeepSeek-V3 (Liu et al. 
2024) employs a super-vocabulary of approximately 150K entries (a substantial increase over Llama-2's 32K), which makes it highly efficient at processing Chinese text and code. In this step, we train a BPE tokenizer on the mixed and deduplicated data. +DeepSeek-V3 (DeepSeek-AI et al. 2024) employs a super-vocabulary of approximately 150K entries (a substantial increase over Llama-2's 32K), which makes it highly efficient at processing Chinese text and code. In this step, we train a BPE tokenizer on the mixed and deduplicated data. ```python from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers @@ -525,7 +525,7 @@ As part of Part 14, this chapter corresponds to the project-level validation of ## References -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21–29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21–29. https://doi.org/10.1109/sequen.1997.666900. Kaplan J, McCandlish S, Henighan T, Brown T B, Chess B, Child R, Gray S, Radford A, Wu J, Amodei D (2020) Scaling Laws for Neural Language Models. arXiv preprint arXiv:2001.08361. diff --git a/docs/en/part14/p12_r1_reasoning_flywheel.md b/docs/en/part14/p12_r1_reasoning_flywheel.md index cfa15ceb..e91fdbc5 100644 --- a/docs/en/part14/p12_r1_reasoning_flywheel.md +++ b/docs/en/part14/p12_r1_reasoning_flywheel.md @@ -504,8 +504,8 @@ Guo D, Yang D, Zhang H, Song J, Zhang R, Xu R, Zhu Q, Ma S, Wang P, Bi X, others Guha E, Marten R, Keh S, Raoof N, Smyrnis G, Bansal H, Nezhurina M, Mercat J, Vu T, Sprague Z, others (2025) OpenThoughts: Data Recipes for Reasoning Models. arXiv preprint arXiv:2506.04178. -Hendrycks D, Burns C, Kadavath S, Arora A, Basart S, Tang E, Song D, Steinhardt J (2021) Measuring Mathematical Problem Solving with the MATH Dataset. In: Advances in Neural Information Processing Systems 34:24262-24273. 
+Hendrycks D, Burns C, Kadavath S, Arora A, Basart S, Tang E, Song D, Steinhardt J (2021) Measuring Mathematical Problem Solving with the MATH Dataset. In: Advances in Neural Information Processing Systems 34:24262-24273. arXiv:2103.03874. Hui B, Yang J, Cui Z, Yang J, Liu D, Zhang L, Liu B, Yu B, Lu K, Chi K, others (2024) Qwen2.5 Technical Report. arXiv preprint arXiv:2412.15115. -Qwen Team (2025) QwQ-32B: Embracing the Power of Reinforcement Learning for Reasoning Models. Qwen Blog. +Qwen Team (2025) QwQ-32B: Embracing the Power of Reinforcement Learning for Reasoning Models. Qwen Blog. https://qwenlm.github.io/blog/qwq-32b/. diff --git a/docs/en/part14/p13_multimodal_instruction_factory.md b/docs/en/part14/p13_multimodal_instruction_factory.md index 58697142..f916864a 100644 --- a/docs/en/part14/p13_multimodal_instruction_factory.md +++ b/docs/en/part14/p13_multimodal_instruction_factory.md @@ -599,10 +599,10 @@ Bai S, Chen K, Liu X, Wang J, Ge W, Song S, Dang K, Wang P, Wang S, Tang J, et a Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, et al. (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479. -Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the 29th ACM Symposium on Operating Systems Principles, pp 611-626. +Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the 29th ACM Symposium on Operating Systems Principles, pp 611-626. https://doi.org/10.1145/3600006.3613165. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, et al. (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. 
In: Advances in Neural Information Processing Systems 35, pp 25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, et al. (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35, pp 25278-25294. Available at: https://arxiv.org/abs/2210.08402. -Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. +Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. arXiv:2203.11171. -Zheng L, Chiang W L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E P, Zhang H, Gonzalez J E, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. +Zheng L, Chiang W L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E P, Zhang H, Gonzalez J E, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. arXiv:2306.05685. diff --git a/docs/en/part14/p14_video_generation.md b/docs/en/part14/p14_video_generation.md index 81c11c77..8c4f504c 100644 --- a/docs/en/part14/p14_video_generation.md +++ b/docs/en/part14/p14_video_generation.md @@ -545,10 +545,10 @@ Bai S, Chen K, Liu X, Wang J, Ge W, Song S, Dang K, Wang P, Wang S, Tang J, othe Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, others (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479. -Farnebäck G (2003) Two-Frame Motion Estimation Based on Polynomial Expansion. 
In: Proceedings of the 13th Scandinavian Conference on Image Analysis, pp 363–370. +Farnebäck G (2003) Two-Frame Motion Estimation Based on Polynomial Expansion. In: Proceedings of the 13th Scandinavian Conference on Image Analysis, pp 363–370. https://doi.org/10.1007/3-540-45103-x_50. Pexels (2014) Pexels: Free Stock Photos, Royalty Free Images & Videos. Available at: https://www.pexels.com. Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, others (2021) Learning Transferable Visual Models from Natural Language Supervision (CLIP). In: Proceedings of the 38th International Conference on Machine Learning, pp 8748–8763. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278–25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278–25294. Available at: https://arxiv.org/abs/2210.08402. diff --git a/docs/en/part14/p15_dataagent_semantic_nl2sql_agent.md b/docs/en/part14/p15_dataagent_semantic_nl2sql_agent.md index d5d26c20..7a6a0e46 100644 --- a/docs/en/part14/p15_dataagent_semantic_nl2sql_agent.md +++ b/docs/en/part14/p15_dataagent_semantic_nl2sql_agent.md @@ -934,7 +934,7 @@ As part of Part 14, this chapter validates earlier methods at the project level. ## References 1. Yu, T., Zhang, R., Yang, K., Yasunaga, M., Wang, D., Li, Z., et al. (2018). Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task. EMNLP 2018. -2. Wang, B., Shin, R., Liu, X., Polozov, O., & Richardson, M. (2020). 
RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers. ACL 2020. +2. Wang, B., Shin, R., Liu, X., Polozov, O., & Richardson, M. (2020). RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers. ACL 2020. https://doi.org/10.18653/v1/2020.acl-main.677. 3. Schick, T., Dwivedi-Yu, J., Dessi, R., Raileanu, R., Lomeli, M., Hambro, E., Zettlemoyer, L., Cancedda, N., & Scialom, T. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. arXiv:2302.04761. 4. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. 5. dbt Labs. (2026). dbt Documentation. https://docs.getdbt.com/. diff --git a/docs/en/part2/ch04_data_sources.md b/docs/en/part2/ch04_data_sources.md index 25315b47..f4cbc818 100644 --- a/docs/en/part2/ch04_data_sources.md +++ b/docs/en/part2/ch04_data_sources.md @@ -394,7 +394,7 @@ In the next chapter, we will build on the raw data collected in this chapter and ## References -Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. +Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. https://doi.org/10.18653/v1/2021.acl-demo.15. Blecher L, Cucurull G, Scialom T, Stojnic R (2023) Nougat: Neural Optical Understanding for Academic Documents. arXiv preprint arXiv:2308.13418. 
diff --git a/docs/en/part2/ch05_cleaning_dedup.md b/docs/en/part2/ch05_cleaning_dedup.md index 6272c61b..d2072619 100644 --- a/docs/en/part2/ch05_cleaning_dedup.md +++ b/docs/en/part2/ch05_cleaning_dedup.md @@ -322,7 +322,7 @@ def detect_and_redact_pii(text: str) -> tuple[str, list]: return text, found ``` -**Named Entity Recognition (NER) models** cover PII types that are difficult to enumerate with rules, such as real personal names, addresses, and organization names. It is recommended to use spaCy (Honnibal et al. 2020) with its Chinese model (`zh_core_web_trf`) or open-source Chinese NER models available on HuggingFace to identify named entities such as persons (PER), locations (LOC), and organizations (ORG), then determine based on context whether redaction is necessary. +**Named Entity Recognition (NER) models** cover PII types that are difficult to enumerate with rules, such as real personal names, addresses, and organization names. It is recommended to use spaCy (Honnibal et al. 2023) with its Chinese model (`zh_core_web_trf`) or open-source Chinese NER models available on HuggingFace to identify named entities such as persons (PER), locations (LOC), and organizations (ORG), then determine based on context whether redaction is necessary. --- @@ -519,7 +519,7 @@ After cleaning, deduplication, and decontamination are complete, the raw corpus ## References -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900. Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. 
@@ -529,6 +529,8 @@ Indyk P, Motwani R (1998) Approximate Nearest Neighbors: Towards Removing the Cu Joulin A, Grave E, Bojanowski P, Douze M, Jegou H, Mikolov T (2017) FastText.zip: Compressing Text Classification Models. arXiv preprint arXiv:1612.03651. +Nait Saada T, Bethune L, Klein M, Grangier D, Cuturi M, Ablin P (2025) The Data-Quality Illusion: Rethinking Classifier-Based Quality Filtering for LLM Pretraining. arXiv preprint arXiv:2510.00866. + Penedo G, Kydlíček H, Ben Allal L, Lozhkov A, Mitchell M, Raffel C, von Werra L, Wolf T (2024) The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. arXiv preprint arXiv:2406.17557. Penedo G, Malartic Q, Hesslow D, Cojocaru R, Cappelli A, Alobeidli H, Pannier B, Almazrouei E, Launay J (2023) The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data Only. In: Advances in Neural Information Processing Systems 36. diff --git a/docs/en/part2/ch06_tokenization_loading.md b/docs/en/part2/ch06_tokenization_loading.md index e8d08f12..bc10c7da 100644 --- a/docs/en/part2/ch06_tokenization_loading.md +++ b/docs/en/part2/ch06_tokenization_loading.md @@ -250,7 +250,7 @@ When $T = 1$, weights are proportional to data volume and large sources complete Curriculum learning is a strategy that **dynamically adjusts the data recipe** during training (Bengio et al. 2009): in the early stages of model training, "simpler" data (shorter sentences, more fluent language, more general domains) is used, with progressively longer and more complex samples introduced as training proceeds. This mimics the cognitive principle that humans learn "easy before hard." -In engineering implementation, difficulty metrics for curriculum learning can come from multiple dimensions: token sequence length (short → long), perplexity score (low perplexity → high perplexity), and quality tier (High → Medium → Low). The LLaMA-3 (Dubey et al. 
2024) technical report explicitly mentions substantially increasing the weight of high-quality curated data (code, mathematical reasoning, books) during the pretraining cooldown phase — this is essentially a **data quality curriculum**: first using massive general data to establish broad world knowledge, then using high-quality curated data in the final phase to strengthen specific capabilities. +In engineering implementation, difficulty metrics for curriculum learning can come from multiple dimensions: token sequence length (short → long), perplexity score (low perplexity → high perplexity), and quality tier (High → Medium → Low). The Llama 3 technical report (Grattafiori et al. 2024) explicitly mentions substantially increasing the weight of high-quality curated data (code, mathematical reasoning, books) during the pretraining cooldown phase — this is essentially a **data quality curriculum**: first using massive general data to establish broad world knowledge, then using high-quality curated data in the final phase to strengthen specific capabilities. --- @@ -462,7 +462,7 @@ This chapter echoes the cost governance perspective of Chapter 3: in pretraining Bengio Y, Louradour J, Collobert R, Weston J (2009) Curriculum Learning. In: Proceedings of the 26th Annual International Conference on Machine Learning, pp 41-48. -Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. In: Advances in Neural Information Processing Systems 33, pp 1877-1901. 
+Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. In: Advances in Neural Information Processing Systems 33, pp 1877-1901. arXiv:2005.14165. Grattafiori A, Dubey A, Jauhri A, Pandey A, Kadian A, Al-Dahle A, Letman A, Mathur A, Schelten A, Vaughan A, others (2024) The Llama 3 Herd of Models. arXiv preprint arXiv:2407.21783. diff --git a/docs/en/part2/ch07_data_operations.md b/docs/en/part2/ch07_data_operations.md index 6520ede4..a69c6847 100644 --- a/docs/en/part2/ch07_data_operations.md +++ b/docs/en/part2/ch07_data_operations.md @@ -389,7 +389,7 @@ Chen M, Tworek J, Jun H, Yuan Q, Pinto H P d O, Kaplan J, Edwards H, Burda Y, Jo Cobbe K, Kosaraju V, Bavarian M, Chen M, Jun H, Kaiser L, Plappert M, Tworek J, Hilton J, Nakano R, Hesse C, Schulman J (2021) Training Verifiers to Solve Math Word Problems (GSM8K). arXiv preprint arXiv:2110.14168. -Covington M A, McFall J D (2010) Cutting the Gordian Knot: The Moving-Average Type–Token Ratio (MATTR). Journal of Quantitative Linguistics 17(2):94-100. +Covington M A, McFall J D (2010) Cutting the Gordian Knot: The Moving-Average Type–Token Ratio (MATTR). Journal of Quantitative Linguistics 17(2):94-100. https://doi.org/10.1080/09296171003643098. Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. @@ -399,9 +399,9 @@ Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A Ne DVC Team and Contributors (2024) DVC: Data Version Control - Git for Data & Models. Documentation: https://dvc.org/doc. Source repository: https://github.com/iterative/dvc.
-Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28. +Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28. https://doi.org/10.1145/3299887.3299891. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the ACM CHI Conference on Human Factors in Computing Systems, pp 1-15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the ACM CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. Whang S E, Roh Y, Song H, Lee J G (2023) Data Collection and Quality Challenges in Deep Learning: A Data-Centric AI Perspective. The VLDB Journal 32(4):791-813. diff --git a/docs/en/part3/ch08_multimodal_image.md b/docs/en/part3/ch08_multimodal_image.md index c0c534a7..d09be1c1 100644 --- a/docs/en/part3/ch08_multimodal_image.md +++ b/docs/en/part3/ch08_multimodal_image.md @@ -329,7 +329,7 @@ Although interleaved image-text data is an important modern multimodal training ## References -Alayrac J B, Donahue J, Luc P, Miech A, Barr I, Hasson Y, Lenc K, Mensch A, Millican K, Reynolds M, others (2022) Flamingo: A Visual Language Model for Few-Shot Learning. Advances in Neural Information Processing Systems 35:23716-23736. +Alayrac J B, Donahue J, Luc P, Miech A, Barr I, Hasson Y, Lenc K, Mensch A, Millican K, Reynolds M, others (2022) Flamingo: A Visual Language Model for Few-Shot Learning. Advances in Neural Information Processing Systems 35:23716-23736. https://doi.org/10.52202/068431-1723. 
Bai J, Bai S, Yang S, Wang S, Tan S, Wang P, Lin J, Zhou C, Zhou J (2023) Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966. @@ -337,23 +337,23 @@ Bai S, Chen K, Liu X, Wang J, Ge W, Song S, Dang K, Wang P, Wang S, Tang J, Zhon Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2020) An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale (ViT). In: International Conference on Learning Representations 2021. -Gadre S Y, Ilharco G, Fang A, Hayase J, Smyrnis G, Nguyen T, Marten R, Wortsman M, Ghosh S, Zhang G, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. Advances in Neural Information Processing Systems 36. +Gadre S Y, Ilharco G, Fang A, Hayase J, Smyrnis G, Nguyen T, Marten R, Wortsman M, Ghosh S, Zhang G, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. Advances in Neural Information Processing Systems 36. Available at: https://arxiv.org/abs/2304.14108. -Laurençon H, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, Cord M, Wolf T (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. Advances in Neural Information Processing Systems 36. +Laurençon H, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, Cord M, Wolf T (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. Advances in Neural Information Processing Systems 36. arXiv:2306.16527. Liu H, Li C, Wu Q, Lee Y J (2023) Visual Instruction Tuning (LLaVA). Advances in Neural Information Processing Systems 36:34892-34916. -Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5).
In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 26296-26306. +Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 26296-26306. https://doi.org/10.1109/cvpr52733.2024.02484. NVIDIA (2023) NVIDIA Data Loading Library (DALI). https://github.com/NVIDIA/DALI. Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, others (2021) Learning Transferable Visual Models From Natural Language Supervision (CLIP). In: Proceedings of the 38th International Conference on Machine Learning, pp 8748-8763. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. Advances in Neural Information Processing Systems 35:25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. Advances in Neural Information Processing Systems 35:25278-25294. Available at: https://arxiv.org/abs/2210.08402. -Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing Systems 36. +Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing Systems 36. arXiv:2304.06939. -Zhai X, Mustafa B, Kolesnikov A, Beyer L (2023) Sigmoid Loss for Language Image Pre-Training (SigLIP). In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 11975-11986. 
+Zhai X, Mustafa B, Kolesnikov A, Beyer L (2023) Sigmoid Loss for Language Image Pre-Training (SigLIP). In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 11975-11986. https://doi.org/10.1109/iccv51070.2023.01100. Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, Gao Z, Cui E, Cao Y, Liu Y, Xu W, Li H, Wang J, Lv H, Chen D, Li S, He Y, Jiang T, Luo J, Wang Y, He C, Shi B, Zhang X, Shao W, He J, Xiong Y, Qu W, Sun P, Jiao P, Wu L, Zhang K, Deng H, Ge J, Chen K, Wang L, Dou M, Lu L, Zhu X, Lu T, Lin D, Qiao Y, Dai J, Wang W (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479. diff --git a/docs/en/part3/ch09_recaptioning_ocr.md b/docs/en/part3/ch09_recaptioning_ocr.md index f85bd324..c0e9bbdb 100644 --- a/docs/en/part3/ch09_recaptioning_ocr.md +++ b/docs/en/part3/ch09_recaptioning_ocr.md @@ -275,30 +275,30 @@ Blecher L, Cucurull G, Scialom T, Stojnic R (2023) Nougat: Neural Optical Unders Fu C, Chen P, Shen Y, Qin Y, Zhang M, Lin X, Qiu Z, Lin W, Yang J, Zheng X, Li K, Sun X, Wu E (2023) MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394. -Dou Z Y, Xu Y, Gan Z, Wang J, Wang S, Wang L, Zhu C, Zhang P, Yuan L, Peng N, Liu Z (2022) Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (FIBER). Advances in Neural Information Processing Systems 35:32942-32956. +Dou Z Y, Xu Y, Gan Z, Wang J, Wang S, Wang L, Zhu C, Zhang P, Yuan L, Peng N, Liu Z (2022) Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (FIBER). Advances in Neural Information Processing Systems 35:32942-32956. https://doi.org/10.52202/068431-2387. -Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. 
+Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. https://doi.org/10.1145/3503161.3548112. -Kim G, Moon S, Xu R, Yim J, Park S, Seo J, Baek J, Yoo M, Park S, Park S (2022) OCR-Free Document Understanding Transformer (Donut). In: European Conference on Computer Vision, pp 498-517. +Kim G, Moon S, Xu R, Yim J, Park S, Seo J, Baek J, Yoo M, Park S, Park S (2022) OCR-Free Document Understanding Transformer (Donut). In: European Conference on Computer Vision, pp 498-517. https://doi.org/10.1007/978-3-031-19815-1_29. Kirillov A, Mintun E, Ravi N, Mao H, Rolland C, Gustafson L, Xiao T, Whitehead S, Berg A C, Lo W Y, others (2023) Segment Anything (SAM). In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 4015-4026. Lee K, Joshi M, Turc I, Hu H, Liu F, Eisenschlos J, Khandelwal U, Shaw P, Chang M W, Toutanova K (2023) Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding. In: Proceedings of the 40th International Conference on Machine Learning, pp 18893-18912. -Li L H, Zhang P, Zhang H, Yang J, Li C, Zhong Y, Wang L, Yuan L, Zhang L, Hwang J N, Chang K W, Gao J (2022) Grounded Language-Image Pre-training (GLIP). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 10965-10975. +Li L H, Zhang P, Zhang H, Yang J, Li C, Zhong Y, Wang L, Yuan L, Zhang L, Hwang J N, Chang K W, Gao J (2022) Grounded Language-Image Pre-training (GLIP). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 10965-10975. https://doi.org/10.1109/cvpr52688.2022.01069. Liu Y, Duan H, Zhang Y, Li B, Zhang S, Zhao W, Yuan Y, Wang J, He C, Liu Z, others (2023b) MMBench: Is Your Multi-modal Model an All-around Player? arXiv preprint arXiv:2307.06281.
Liu S, Zeng Z, Ren T, Li F, Zhang H, Yang J, Li C, Yang J, Su H, Zhu J, Zhang L (2023c) Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection. arXiv preprint arXiv:2303.05499. -Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: CVPR 2024, pp 26296-26306. +Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: CVPR 2024, pp 26296-26306. https://doi.org/10.1109/cvpr52733.2024.02484. Lu P, Qiu L, Chang K W, Zhu W, Rajpurohit T, Clark P, Kalyan A (2022) Dynamic Prompt Learning via Policy Gradient for Semi-structured Mathematical Reasoning (TabMWP). arXiv preprint arXiv:2209.14610. -Masry A, Long D, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263-2279. +Masry A, Long D, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263-2279. https://doi.org/10.18653/v1/2022.findings-acl.177. Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, others (2021) Learning Transferable Visual Models From Natural Language Supervision (CLIP). In: ICML 2021, pp 8748-8763. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. Advances in Neural Information Processing Systems 35:25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. Advances in Neural Information Processing Systems 35:25278-25294. 
Available at: https://arxiv.org/abs/2210.08402. Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, Gao Z, Cui E, Cao Y, Liu Y, Xu W, Li H, Wang J, Lv H, Chen D, Li S, He Y, Jiang T, Luo J, Wang Y, He C, Shi B, Zhang X, Shao W, He J, Xiong Y, Qu W, Sun P, Jiao P, Wu L, Zhang K, Deng H, Ge J, Chen K, Wang L, Dou M, Lu L, Zhu X, Lu T, Lin D, Qiao Y, Dai J, Wang W (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479. diff --git a/docs/en/part3/ch10_video_audio.md b/docs/en/part3/ch10_video_audio.md index 17c2a03d..55fd0571 100644 --- a/docs/en/part3/ch10_video_audio.md +++ b/docs/en/part3/ch10_video_audio.md @@ -327,7 +327,7 @@ On this basis, the chapter added dense event labeling, audio-video mismatch dete Bain M, Huh J, Han T, Zisserman A (2023) WhisperX: Time-Accurate Speech Transcription of Long-Form Audio. arXiv preprint arXiv:2303.00747. -Bredin H, Yin R, Coria J M, Gelly G, Korshunov P, Lavechin M, Fustes D, Titeux H, Bouaziz W, Gill M P (2020) pyannote.audio: Neural Building Blocks for Speaker Diarization. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 7124-7128. +Bredin H, Yin R, Coria J M, Gelly G, Korshunov P, Lavechin M, Fustes D, Titeux H, Bouaziz W, Gill M P (2020) pyannote.audio: Neural Building Blocks for Speaker Diarization. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 7124-7128. https://doi.org/10.1109/icassp40776.2020.9052974. Brooks T, Peebles B, Holmes C, DePue W, Guo Y, Jing L, Schnurr D, Taylor J, Luhman T, Luhman E, others (2024) Video Generation Models as World Simulators (Sora). OpenAI Technical Report. 
diff --git a/docs/en/part3/ch11_cross_modal_alignment.md b/docs/en/part3/ch11_cross_modal_alignment.md index f23af210..01943942 100644 --- a/docs/en/part3/ch11_cross_modal_alignment.md +++ b/docs/en/part3/ch11_cross_modal_alignment.md @@ -349,10 +349,10 @@ Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B (2022) High-Resolution Image Sakoe H, Chiba S (1978) Dynamic Programming Algorithm Optimization for Spoken Word Recognition (DTW). IEEE Transactions on Acoustics, Speech, and Signal Processing 26(1):43-49. -Salvador S, Chan P (2007) Toward Accurate Dynamic Time Warping in Linear Time and Space (FastDTW). Intelligent Data Analysis 11(5):561-580. +Salvador S, Chan P (2007) Toward Accurate Dynamic Time Warping in Linear Time and Space (FastDTW). Intelligent Data Analysis 11(5):561-580. https://doi.org/10.3233/ida-2007-11508. van den Oord A, Vinyals O, Kavukcuoglu K (2017) Neural Discrete Representation Learning (VQ-VAE). Advances in Neural Information Processing Systems 30. -Wu Y, Chen K, Zhang T, Hui Y, Berg-Kirkpatrick T, Dubnov S (2023) Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation (CLAP). In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 1-5. +Wu Y, Chen K, Zhang T, Hui Y, Berg-Kirkpatrick T, Dubnov S (2023) Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation (CLAP). In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 1-5. https://doi.org/10.1109/icassp49357.2023.10095969. Dufumier B, Castillo-Navarro J, Tuia D, Thiran J P (2025) What to Align in Multimodal Contrastive Learning? In: Proceedings of the 13th International Conference on Learning Representations. arXiv preprint arXiv:2409.07402. 
diff --git a/docs/en/part4/ch12_sft.md b/docs/en/part4/ch12_sft.md index cfe02fc0..4e3ab4c2 100644 --- a/docs/en/part4/ch12_sft.md +++ b/docs/en/part4/ch12_sft.md @@ -703,13 +703,13 @@ Askell, A., Bai, Y., Chen, A., Drain, D., Ganguli, D., Henighan, T., et al. (202 OpenAI. (2024). *Introducing Structured Outputs in the API*. OpenAI Blog, August 6, 2024; OpenAI API Documentation: *Structured Model Outputs*. Accessed May 14, 2026. -Singhal, K., Azizi, S., Tu, T., Mahdavi, S. S., Wei, J., Chung, H. W., et al. (2023). Large Language Models Encode Clinical Knowledge. Nature. +Singhal, K., Azizi, S., Tu, T., Mahdavi, S. S., Wei, J., Chung, H. W., et al. (2023). Large Language Models Encode Clinical Knowledge. Nature. https://doi.org/10.1038/s41586-023-06291-2. -Gebru, T., Morgenstern, J., Vecchione, B., Vaughan, J. W., Wallach, H., Daumé III, H., & Crawford, K. (2021). *Datasheets for Datasets*. Communications of the ACM, 64(12), 86–92. +Gebru, T., Morgenstern, J., Vecchione, B., Vaughan, J. W., Wallach, H., Daumé III, H., & Crawford, K. (2021). *Datasheets for Datasets*. Communications of the ACM, 64(12), 86–92. https://doi.org/10.1145/3458723. -Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). *Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI*. Proceedings of the ACM Conference on Fairness, Accountability, and Transparency, 1776–1826. +Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). *Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI*. Proceedings of the ACM Conference on Fairness, Accountability, and Transparency, 1776–1826. https://doi.org/10.1145/3531146.3533231. -Mitchell, M., Wu, S., Zaldivar, A., Barnes, P., Vasserman, L., Hutchinson, B., et al. (2019). *Model Cards for Model Reporting*. Proceedings of the Conference on Fairness, Accountability, and Transparency, 220–229. +Mitchell, M., Wu, S., Zaldivar, A., Barnes, P., Vasserman, L., Hutchinson, B., et al. (2019). 
*Model Cards for Model Reporting*. Proceedings of the Conference on Fairness, Accountability, and Transparency, 220–229. https://doi.org/10.1145/3287560.3287596. Liang, P., Bommasani, R., Lee, T., Tsipras, D., Soylu, D., Yasunaga, M., et al. (2022). *Holistic Evaluation of Language Models*. arXiv:2211.09110. diff --git a/docs/en/part4/ch13_preference.md b/docs/en/part4/ch13_preference.md index 281148af..c6345e73 100644 --- a/docs/en/part4/ch13_preference.md +++ b/docs/en/part4/ch13_preference.md @@ -473,19 +473,19 @@ When preference data is understood and built in this way, it is elevated from be ## References -Christiano, P. F., Leike, J., Brown, T. B., Martic, M., Legg, S., & Amodei, D. (2017). Deep reinforcement learning from human preferences. *Advances in Neural Information Processing Systems*, 30. +Christiano, P. F., Leike, J., Brown, T. B., Martic, M., Legg, S., & Amodei, D. (2017). Deep reinforcement learning from human preferences. *Advances in Neural Information Processing Systems*, 30. arXiv:1706.03741. Ziegler, D. M., Stiennon, N., Wu, J., Brown, T. B., Radford, A., Amodei, D., Christiano, P., & Irving, G. (2019). Fine-tuning language models from human preferences. *arXiv preprint arXiv:1909.08593*. -Stiennon, N., Ouyang, L., Wu, J., Ziegler, D. M., Lowe, R., Voss, C., Radford, A., Amodei, D., & Christiano, P. (2020). Learning to summarize from human feedback. *Advances in Neural Information Processing Systems*, 33, 3008–3021. +Stiennon, N., Ouyang, L., Wu, J., Ziegler, D. M., Lowe, R., Voss, C., Radford, A., Amodei, D., & Christiano, P. (2020). Learning to summarize from human feedback. *Advances in Neural Information Processing Systems*, 33, 3008–3021. arXiv:2009.01325. Askell, A., Bai, Y., Chen, A., et al. (2021). A general language assistant as a laboratory for alignment. *arXiv preprint arXiv:2112.00861*. -Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training language models to follow instructions with human feedback. 
*Advances in Neural Information Processing Systems*, 35, 27730–27744. +Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training language models to follow instructions with human feedback. *Advances in Neural Information Processing Systems*, 35, 27730–27744. arXiv:2203.02155. Bai, Y., Jones, A., Ndousse, K., et al. (2022a). Training a helpful and harmless assistant with reinforcement learning from human feedback. *arXiv preprint arXiv:2204.05862*. -Rafailov, R., Sharma, A., Mitchell, E., et al. (2023). Direct preference optimization: Your language model is secretly a reward model. *Advances in Neural Information Processing Systems*, 36, 53728–53741. +Rafailov, R., Sharma, A., Mitchell, E., et al. (2023). Direct preference optimization: Your language model is secretly a reward model. *Advances in Neural Information Processing Systems*, 36, 53728–53741. arXiv:2305.18290. Bai, Y., Kadavath, S., Kundu, S., et al. (2022b). Constitutional AI: Harmlessness from AI feedback. *arXiv preprint arXiv:2212.08073*. @@ -495,26 +495,26 @@ Lightman, H., Kosaraju, V., Burda, Y., et al. (2024). Let's verify step by step. Uesato, J., Kushman, N., Kumar, R., et al. (2022). Solving math word problems with process- and outcome-based feedback. *arXiv preprint arXiv:2211.14275*. -Bradley, R. A., & Terry, M. E. (1952). Rank analysis of incomplete block designs: I. The method of paired comparisons. *Biometrika*, 39(3/4), 324–345. +Bradley, R. A., & Terry, M. E. (1952). Rank analysis of incomplete block designs: I. The method of paired comparisons. *Biometrika*, 39(3/4), 324–345. https://doi.org/10.2307/2334029. Roijers, D. M., Vamplew, P., Whiteson, S., et al. (2013). A survey of multi-objective sequential decision-making. *Journal of Artificial Intelligence Research*, 48, 67–113. -Deb, K., Pratap, A., Agarwal, S., et al. (2002). A fast and elitist multiobjective genetic algorithm: NSGA-II. *IEEE Transactions on Evolutionary Computation*, 6(2), 182–197. +Deb, K., Pratap, A., Agarwal, S., et al. 
(2002). A fast and elitist multiobjective genetic algorithm: NSGA-II. *IEEE Transactions on Evolutionary Computation*, 6(2), 182–197. https://doi.org/10.1109/4235.996017. -Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46. +Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46. https://doi.org/10.1177/001316446002000104. -Dawid, A. P., & Skene, A. M. (1979). Maximum likelihood estimation of observer error-rates using the EM algorithm. *Journal of the Royal Statistical Society: Series C (Applied Statistics)*, 28(1), 20–28. +Dawid, A. P., & Skene, A. M. (1979). Maximum likelihood estimation of observer error-rates using the EM algorithm. *Journal of the Royal Statistical Society: Series C (Applied Statistics)*, 28(1), 20–28. https://doi.org/10.2307/2346806. Snow, R., O'Connor, B., Jurafsky, D., et al. (2008). Cheap and fast—but is it good? Evaluating non-expert annotations for natural language tasks. *Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing*, 254–263. -Aroyo, L., & Welty, C. (2015). Truth is a lie: Crowd truth and the seven myths of human annotation. *AI Magazine*, 36(1), 15–24. +Aroyo, L., & Welty, C. (2015). Truth is a lie: Crowd truth and the seven myths of human annotation. *AI Magazine*, 36(1), 15–24. https://doi.org/10.1609/aimag.v36i1.2564. -Northcutt, C. G., Jiang, L., & Chuang, I. L. (2021). Confident learning: Estimating uncertainty in dataset labels. *Journal of Artificial Intelligence Research*, 70, 1373–1411. +Northcutt, C. G., Jiang, L., & Chuang, I. L. (2021). Confident learning: Estimating uncertainty in dataset labels. *Journal of Artificial Intelligence Research*, 70, 1373–1411. https://doi.org/10.1613/jair.1.12125. -Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for datasets. *Communications of the ACM*, 64(12), 86–92. 
+Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for datasets. *Communications of the ACM*, 64(12), 86–92. https://doi.org/10.1145/3458723. -Bender, E. M., & Friedman, B. (2018). Data statements for natural language processing: Toward mitigating system bias and enabling better science. *Transactions of the Association for Computational Linguistics*, 6, 587–604. +Bender, E. M., & Friedman, B. (2018). Data statements for natural language processing: Toward mitigating system bias and enabling better science. *Transactions of the Association for Computational Linguistics*, 6, 587–604. https://doi.org/10.1162/tacl_a_00041. -Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). Model cards for model reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229. +Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). Model cards for model reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229. https://doi.org/10.1145/3287560.3287596. Liang, P., Bommasani, R., Lee, T., et al. (2022). Holistic evaluation of language models. *arXiv preprint arXiv:2211.09110*. diff --git a/docs/en/part4/ch14_qa.md b/docs/en/part4/ch14_qa.md index dde05f5d..0e06871f 100644 --- a/docs/en/part4/ch14_qa.md +++ b/docs/en/part4/ch14_qa.md @@ -645,7 +645,7 @@ Wei, J., Bosma, M., Zhao, V. Y., et al. (2022). *Finetuned Language Models Are Z Ouyang, L., Wu, J., Jiang, X., et al. (2022). *Training Language Models to Follow Instructions with Human Feedback*. Advances in Neural Information Processing Systems, 35, 27730–27744. arXiv:2203.02155. -Christiano, P. F., Leike, J., Brown, T. B., et al. (2017). *Deep Reinforcement Learning from Human Preferences*. Advances in Neural Information Processing Systems, 30. +Christiano, P. F., Leike, J., Brown, T. B., et al. (2017). *Deep Reinforcement Learning from Human Preferences*. Advances in Neural Information Processing Systems, 30. arXiv:1706.03741. 
Stiennon, N., Ouyang, L., Wu, J., et al. (2020). *Learning to Summarize from Human Feedback*. Advances in Neural Information Processing Systems, 33, 3008–3021. arXiv:2009.01325. diff --git a/docs/en/part5/ch15_data_synthesis.md b/docs/en/part5/ch15_data_synthesis.md index 4aaf42c6..124205bd 100644 --- a/docs/en/part5/ch15_data_synthesis.md +++ b/docs/en/part5/ch15_data_synthesis.md @@ -564,25 +564,25 @@ For teams hoping to upgrade "calling models to generate data" into a factory-sca Honovich, O., Scialom, T., Levy, O., et al. (2023). Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor. *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 14409–14428. -Wang, Y., Kordi, Y., Mishra, S., et al. (2023). Self-Instruct: Aligning Language Models with Self-Generated Instructions. *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 13484–13508. +Wang, Y., Kordi, Y., Mishra, S., et al. (2023). Self-Instruct: Aligning Language Models with Self-Generated Instructions. *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 13484–13508. https://doi.org/10.18653/v1/2023.acl-long.754. Xu, C., Sun, Q., Zheng, K., et al. (2024). WizardLM: Empowering Large Language Models to Follow Complex Instructions. *International Conference on Learning Representations (ICLR 2024)*. arXiv:2304.12244. Alemohammad, S., Casco-Rodriguez, J., Luzi, L., et al. (2024). Self-Consuming Generative Models Go MAD. *International Conference on Learning Representations (ICLR 2024)*. arXiv:2307.01850. -Shumailov, I., Shumaylov, Z., Zhao, Y., et al. (2024). AI Models Collapse When Trained on Recursively Generated Data. *Nature*, 631(8022), 755–759. +Shumailov, I., Shumaylov, Z., Zhao, Y., et al. (2024). AI Models Collapse When Trained on Recursively Generated Data. *Nature*, 631(8022), 755–759. https://doi.org/10.1038/s41586-024-07566-y. Polyzotis, N., Roy, S., Whang, S. 
E., & Zinkevich, M. (2017). Data Management Challenges in Production Machine Learning. *Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD '17)*, 1723–1726. Sculley, D., Holt, G., Golovin, D., et al. (2015). Hidden Technical Debt in Machine Learning Systems. *Advances in Neural Information Processing Systems*, 28, 2503–2511. -Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for Datasets. *Communications of the ACM*, 64(12), 86–92. +Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for Datasets. *Communications of the ACM*, 64(12), 86–92. https://doi.org/10.1145/3458723. -Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. *Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency (FAccT '22)*, 1776–1826. +Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. *Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency (FAccT '22)*, 1776–1826. https://doi.org/10.1145/3531146.3533231. -Feng, S. Y., Gangal, V., Wei, J., et al. (2021). A Survey of Data Augmentation Approaches for NLP. *Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021*, 968–988. +Feng, S. Y., Gangal, V., Wei, J., et al. (2021). A Survey of Data Augmentation Approaches for NLP. *Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021*, 968–988. https://doi.org/10.18653/v1/2021.findings-acl.84. -Shorten, C., & Khoshgoftaar, T. M. (2019). A Survey on Image Data Augmentation for Deep Learning. *Journal of Big Data*, 6, Article 60. +Shorten, C., & Khoshgoftaar, T. M. (2019). A Survey on Image Data Augmentation for Deep Learning. *Journal of Big Data*, 6, Article 60. https://doi.org/10.1186/s40537-019-0197-0. Mukherjee, S., Mitra, A., Jawahar, G., et al. (2023). 
Orca: Progressive Learning from Complex Explanation Traces of GPT-4. arXiv:2306.02707. @@ -590,20 +590,20 @@ Zhou, C., Liu, P., Xu, P., et al. (2023). LIMA: Less Is More for Alignment. *Adv Chen, L., Li, S., Yan, J., et al. (2024). AlpaGasus: Training A Better Alpaca with Fewer Data. *International Conference on Learning Representations (ICLR 2024)*. arXiv:2307.08701. -Madaan, A., Tandon, N., Gupta, P., et al. (2023). Self-Refine: Iterative Refinement with Self-Feedback. *Advances in Neural Information Processing Systems*, 36. +Madaan, A., Tandon, N., Gupta, P., et al. (2023). Self-Refine: Iterative Refinement with Self-Feedback. *Advances in Neural Information Processing Systems*, 36. arXiv:2303.17651. Chen, Z., Deng, Y., Yuan, H., et al. (2024). Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models. *International Conference on Machine Learning (ICML 2024)*. arXiv:2401.01335. Bai, Y., Kadavath, S., Kundu, S., et al. (2022). Constitutional AI: Harmlessness from AI Feedback. arXiv:2212.08073. -Liu, Y., Iter, D., Xu, Y., et al. (2023). G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, 2511–2522. +Liu, Y., Iter, D., Xu, Y., et al. (2023). G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, 2511–2522. arXiv:2303.16634. -Zheng, L., Chiang, W.-L., Sheng, Y., et al. (2023). Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. *Advances in Neural Information Processing Systems*, 36. +Zheng, L., Chiang, W.-L., Sheng, Y., et al. (2023). Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. *Advances in Neural Information Processing Systems*, 36. arXiv:2306.05685. Lee, K., Ippolito, D., Nystrom, A., et al. (2022). Deduplicating Training Data Makes Language Models Better. 
*Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics*, 8424–8445. -Wang, Y., Mishra, S., Alipoormolabashi, P., et al. (2022). Super-NaturalInstructions: Generalization via Declarative Instructions on 1600+ NLP Tasks. *Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing*, 5085–5109. +Wang, Y., Mishra, S., Alipoormolabashi, P., et al. (2022). Super-NaturalInstructions: Generalization via Declarative Instructions on 1600+ NLP Tasks. *Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing*, 5085–5109. arXiv:2204.07705. -Sambasivan, N., Kapania, S., Highfill, H., et al. (2021). "Everyone Wants to Do the Model Work, Not the Data Work": Data Cascades in High-Stakes AI. *Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems*, Article 39. +Sambasivan, N., Kapania, S., Highfill, H., et al. (2021). "Everyone Wants to Do the Model Work, Not the Data Work": Data Cascades in High-Stakes AI. *Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems*, Article 39. https://doi.org/10.1145/3411764.3445518. -Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. *Advances in Neural Information Processing Systems*, 36. +Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. *Advances in Neural Information Processing Systems*, 36. arXiv:2303.11366. diff --git a/docs/en/part5/ch16_distillation.md b/docs/en/part5/ch16_distillation.md index 67d12da5..71f80c66 100644 --- a/docs/en/part5/ch16_distillation.md +++ b/docs/en/part5/ch16_distillation.md @@ -632,7 +632,7 @@ Ultimately, a mature distillation system enables students to stably take over hi ## References -Gou, J., Yu, B., Maybank, S. J., et al. (2021). Knowledge Distillation: A Survey. *International Journal of Computer Vision*, 129(6), 1789–1819. 
+Gou, J., Yu, B., Maybank, S. J., et al. (2021). Knowledge Distillation: A Survey. *International Journal of Computer Vision*, 129(6), 1789–1819. https://doi.org/10.1007/s11263-021-01453-z. Hinton, G., Vinyals, O., & Dean, J. (2015). Distilling the Knowledge in a Neural Network. *arXiv preprint arXiv:1503.02531*. @@ -640,7 +640,7 @@ Bucila, C., Caruana, R., & Niculescu-Mizil, A. (2006). Model Compression. *Proce Mukherjee, S., Mitra, A., Jawahar, G., et al. (2023). Orca: Progressive Learning from Complex Explanation Traces of GPT-4. *arXiv preprint arXiv:2306.02707*. -Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training Language Models to Follow Instructions with Human Feedback. *Advances in Neural Information Processing Systems*, 35, 27730–27744. +Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training Language Models to Follow Instructions with Human Feedback. *Advances in Neural Information Processing Systems*, 35, 27730–27744. arXiv:2203.02155. Romero, A., Ballas, N., Ebrahimi Kahou, S., et al. (2015). FitNets: Hints for Thin Deep Nets. *International Conference on Learning Representations*. @@ -652,15 +652,15 @@ Sanh, V., Debut, L., Chaumond, J., et al. (2019). DistilBERT, a Distilled Versio Jiao, X., Yin, Y., Shang, L., et al. (2020). TinyBERT: Distilling BERT for Natural Language Understanding. *Findings of the Association for Computational Linguistics: EMNLP 2020*, 4163–4174. -Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. *Advances in Neural Information Processing Systems*, 36. +Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. *Advances in Neural Information Processing Systems*, 36. arXiv:2302.04761. -Wang, Y., Kordi, Y., Mishra, S., et al. (2023). Self-Instruct: Aligning Language Models with Self-Generated Instructions. 
*Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 13484–13508. +Wang, Y., Kordi, Y., Mishra, S., et al. (2023). Self-Instruct: Aligning Language Models with Self-Generated Instructions. *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 13484–13508. https://doi.org/10.18653/v1/2023.acl-long.754. -Xu, C., Sun, Q., Zheng, K., et al. (2024). WizardLM: Empowering Large Language Models to Follow Complex Instructions. *International Conference on Learning Representations*. +Xu, C., Sun, Q., Zheng, K., et al. (2024). WizardLM: Empowering Large Language Models to Follow Complex Instructions. *International Conference on Learning Representations*. arXiv:2304.12244. -Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. *International Conference on Learning Representations*. +Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. *International Conference on Learning Representations*. arXiv:2210.03629. -Patil, S. G., Zhang, T., Wang, X., et al. (2024). Gorilla: Large Language Model Connected with Massive APIs. *Advances in Neural Information Processing Systems*, 37. +Patil, S. G., Zhang, T., Wang, X., et al. (2024). Gorilla: Large Language Model Connected with Massive APIs. *Advances in Neural Information Processing Systems*, 37. arXiv:2305.15334. Shazeer, N., Mirhoseini, A., Maziarz, K., et al. (2017). Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. *International Conference on Learning Representations*. @@ -668,16 +668,16 @@ Fedus, W., Zoph, B., & Shazeer, N. (2022). Switch Transformers: Scaling to Trill Du, N., Huang, Y., Dai, A. M., et al. (2022). GLaM: Efficient Scaling of Language Models with Mixture-of-Experts. *Proceedings of the 39th International Conference on Machine Learning*, 5547–5569. -Zheng, L., Chiang, W.-L., Sheng, Y., et al. (2023). 
Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. *Advances in Neural Information Processing Systems*, 36. +Zheng, L., Chiang, W.-L., Sheng, Y., et al. (2023). Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. *Advances in Neural Information Processing Systems*, 36. arXiv:2306.05685. -Liu, Y., Iter, D., Xu, Y., et al. (2023). G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, 2511–2522. +Liu, Y., Iter, D., Xu, Y., et al. (2023). G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, 2511–2522. arXiv:2303.16634. -Mirzadeh, S. I., Farajtabar, M., Li, A., et al. (2020). Improved Knowledge Distillation via Teacher Assistant. *Proceedings of the AAAI Conference on Artificial Intelligence*, 34(04), 5191–5198. +Mirzadeh, S. I., Farajtabar, M., Li, A., et al. (2020). Improved Knowledge Distillation via Teacher Assistant. *Proceedings of the AAAI Conference on Artificial Intelligence*, 34(04), 5191–5198. https://doi.org/10.1609/aaai.v34i04.5963. -Kim, Y., & Rush, A. M. (2016). Sequence-Level Knowledge Distillation. *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*, 1317–1327. +Kim, Y., & Rush, A. M. (2016). Sequence-Level Knowledge Distillation. *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*, 1317–1327. https://doi.org/10.18653/v1/d16-1139. Lightman, H., Kosaraju, V., Burda, Y., et al. (2024). Let's Verify Step by Step. *International Conference on Learning Representations (ICLR 2024)*. arXiv:2305.20050. -Wei, J., Wang, X., Schuurmans, D., et al. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. *Advances in Neural Information Processing Systems*, 35, 24824–24837. +Wei, J., Wang, X., Schuurmans, D., et al. (2022). 
Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. *Advances in Neural Information Processing Systems*, 35, 24824–24837. arXiv:2201.11903. Bai, Y., Kadavath, S., Kundu, S., et al. (2022). Constitutional AI: Harmlessness from AI Feedback. *arXiv preprint arXiv:2212.08073*. diff --git a/docs/en/part5/ch17_quality.md b/docs/en/part5/ch17_quality.md index d05c19bb..a08e2a96 100644 --- a/docs/en/part5/ch17_quality.md +++ b/docs/en/part5/ch17_quality.md @@ -576,25 +576,25 @@ Case post-mortems demonstrate that synthetic data quality deterioration typicall ## References -Tan, Z., Li, D., Wang, S., et al. (2024). Large Language Models for Data Annotation and Synthesis: A Survey. *Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing*, 930–957. +Tan, Z., Li, D., Wang, S., et al. (2024). Large Language Models for Data Annotation and Synthesis: A Survey. *Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing*, 930–957. https://doi.org/10.18653/v1/2024.emnlp-main.54. Long, L., Wang, R., Xiao, R., et al. (2024). On LLMs-Driven Synthetic Data Generation, Curation, and Evaluation: A Survey. *Findings of the Association for Computational Linguistics: ACL 2024*, 11065–11082. -Shumailov, I., Shumaylov, Z., Zhao, Y., et al. (2024). AI models collapse when trained on recursively generated data. *Nature*, 631, 755–759. +Shumailov, I., Shumaylov, Z., Zhao, Y., et al. (2024). AI models collapse when trained on recursively generated data. *Nature*, 631, 755–759. https://doi.org/10.1038/s41586-024-07566-y. -Alemohammad, S., Casco-Rodriguez, J., Luzi, L., et al. (2024). Self-Consuming Generative Models Go MAD. *International Conference on Learning Representations*. +Alemohammad, S., Casco-Rodriguez, J., Luzi, L., et al. (2024). Self-Consuming Generative Models Go MAD. *International Conference on Learning Representations*. arXiv:2307.01850. Gerstgrasser, M., Schaeffer, R., Dey, A., et al. (2024). 
Is Model Collapse Inevitable? Breaking the Curse of Recursion by Accumulating Real and Synthetic Data. arXiv:2404.01413. -Wang, Y., Kordi, Y., Mishra, S., et al. (2023). Self-Instruct: Aligning Language Models with Self-Generated Instructions. *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 13484–13508. +Wang, Y., Kordi, Y., Mishra, S., et al. (2023). Self-Instruct: Aligning Language Models with Self-Generated Instructions. *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 13484–13508. https://doi.org/10.18653/v1/2023.acl-long.754. Honovich, O., Scialom, T., Levy, O., et al. (2023). Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor. *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics*, 14409–14428. Xu, C., Sun, Q., Zheng, K., et al. (2024). WizardLM: Empowering Large Language Models to Follow Complex Instructions. *International Conference on Learning Representations (ICLR 2024)*. arXiv:2304.12244. -Zheng, L., Chiang, W.-L., Sheng, Y., et al. (2023). Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. *Advances in Neural Information Processing Systems*, 36. +Zheng, L., Chiang, W.-L., Sheng, Y., et al. (2023). Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. *Advances in Neural Information Processing Systems*, 36. arXiv:2306.05685. -Liu, Y., Iter, D., Xu, Y., et al. (2023). G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, 2511–2522. +Liu, Y., Iter, D., Xu, Y., et al. (2023). G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, 2511–2522. arXiv:2303.16634. Geirhos, R., Jacobsen, J.-H., Michaelis, C., et al. (2020). Shortcut learning in deep neural networks. *Nature Machine Intelligence*, 2, 665–673. 
@@ -602,14 +602,14 @@ Torralba, A., & Efros, A. A. (2011). Unbiased Look at Dataset Bias. *Proceedings Koh, P. W., Sagawa, S., Marklund, H., et al. (2021). WILDS: A Benchmark of in-the-Wild Distribution Shifts. *Proceedings of the 38th International Conference on Machine Learning*, 5637–5664. -Ribeiro, M. T., Wu, T., Guestrin, C., et al. (2020). Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. *Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics*, 4902–4912. +Ribeiro, M. T., Wu, T., Guestrin, C., et al. (2020). Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. *Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics*, 4902–4912. https://doi.org/10.18653/v1/2020.acl-main.442. Lee, K., Ippolito, D., Nystrom, A., et al. (2022). Deduplicating Training Data Makes Language Models Better. *Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics*, 8424–8445. Carlini, N., Ippolito, D., Jagielski, M., et al. (2023). Quantifying Memorization Across Neural Language Models. *International Conference on Learning Representations*. -Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for Datasets. *Communications of the ACM*, 64(12), 86–92. +Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for Datasets. *Communications of the ACM*, 64(12), 86–92. https://doi.org/10.1145/3458723. -Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). Model Cards for Model Reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229. +Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). Model Cards for Model Reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229. https://doi.org/10.1145/3287560.3287596. Raji, I. D., Smart, A., White, R. N., et al. (2020). Closing the AI Accountability Gap: Defining an End-to-End Framework for Internal Algorithmic Auditing. 
*Proceedings of the 2020 Conference on Fairness, Accountability, and Transparency*, 33–44. diff --git a/docs/en/part6/ch19_tool.md b/docs/en/part6/ch19_tool.md index 0c8de575..33097daa 100644 --- a/docs/en/part6/ch19_tool.md +++ b/docs/en/part6/ch19_tool.md @@ -528,17 +528,17 @@ Parisi, A., Zhao, Y., & Fiedel, N. (2022). TALM: Tool Augmented Language Models. Nakano, R., Hilton, J., Balaji, S., et al. (2021). WebGPT: Browser-Assisted Question-Answering with Human Feedback. arXiv:2112.09332. -Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. +Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. arXiv:2210.03629. -Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. +Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. arXiv:2302.04761. -Li, M., Zhao, Y., Yu, B., et al. (2023). API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs. Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 3102–3116. +Li, M., Zhao, Y., Yu, B., et al. (2023). API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs. Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 3102–3116. https://doi.org/10.18653/v1/2023.emnlp-main.187. Qin, Y., Liang, S., Ye, Y., et al. (2024). ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs. International Conference on Learning Representations. -Patil, S. G., Zhang, T., Wang, X., & Gonzalez, J. E. (2024). Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 37. 
+Patil, S. G., Zhang, T., Wang, X., & Gonzalez, J. E. (2024). Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 37. arXiv:2305.15334. -Zhuang, Y., Yu, Y., Wang, K., et al. (2023). ToolQA: A Dataset for LLM Question Answering with External Tools. Advances in Neural Information Processing Systems, 36. +Zhuang, Y., Yu, Y., Wang, K., et al. (2023). ToolQA: A Dataset for LLM Question Answering with External Tools. Advances in Neural Information Processing Systems, 36. https://doi.org/10.52202/075280-2180. Huang, Y., Shi, J., Li, Y., et al. (2023). MetaTool Benchmark for Large Language Models: Deciding Whether to Use Tools and Which to Use. arXiv:2310.03128. @@ -546,13 +546,13 @@ Patil, S. G., Mao, H., Yan, F., et al. (2025). The Berkeley Function Calling Lea Yao, S., Shinn, N., Razavi, P., & Narasimhan, K. (2025). τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains. International Conference on Learning Representations. -Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. Advances in Neural Information Processing Systems, 36. +Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. Advances in Neural Information Processing Systems, 36. arXiv:2303.11366. -Yang, J., Jimenez, C. E., Wettig, A., et al. (2024). SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering. Advances in Neural Information Processing Systems, 37. +Yang, J., Jimenez, C. E., Wettig, A., et al. (2024). SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering. Advances in Neural Information Processing Systems, 37. https://doi.org/10.52202/079017-1601. Ruan, Y., Dong, H., Wang, A., et al. (2024). Identifying the Risks of LM Agents with an LM-Emulated Sandbox. International Conference on Learning Representations. -Greshake, K., Abdelnabi, S., Mishra, S., et al. 
(2023). Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, 79–90. +Greshake, K., Abdelnabi, S., Mishra, S., et al. (2023). Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, 79–90. https://doi.org/10.1145/3605764.3623985. Liu, Y., Deng, G., Li, Y., et al. (2023). Prompt Injection Attack against LLM-Integrated Applications. arXiv:2306.05499. diff --git a/docs/en/part6/ch20_agent.md b/docs/en/part6/ch20_agent.md index c71a2f0a..d5876a11 100644 --- a/docs/en/part6/ch20_agent.md +++ b/docs/en/part6/ch20_agent.md @@ -476,29 +476,29 @@ This chapter particularly emphasizes that long-term memory must not be treated a Young, S., Gašić, M., Thomson, B., & Williams, J. D. (2013). *POMDP-Based Statistical Spoken Dialog Systems: A Review*. Proceedings of the IEEE, 101(5), 1160–1179. https://doi.org/10.1109/JPROC.2012.2225812. -Williams, J. D., Raux, A., Ramachandran, D., & Black, A. (2013). *The Dialog State Tracking Challenge*. Proceedings of the SIGDIAL 2013 Conference, 404–413. +Williams, J. D., Raux, A., Ramachandran, D., & Black, A. (2013). *The Dialog State Tracking Challenge*. Proceedings of the SIGDIAL 2013 Conference, 404–413. https://aclanthology.org/W13-4065/. -Budzianowski, P., Wen, T.-H., Tseng, B.-H., et al. (2018). *MultiWOZ - A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling*. Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 5016–5026. +Budzianowski, P., Wen, T.-H., Tseng, B.-H., et al. (2018). *MultiWOZ - A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling*. Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 5016–5026.
https://doi.org/10.18653/v1/d18-1547. -Yao, S., Zhao, J., Yu, D., et al. (2023). *ReAct: Synergizing Reasoning and Acting in Language Models*. International Conference on Learning Representations. +Yao, S., Zhao, J., Yu, D., et al. (2023). *ReAct: Synergizing Reasoning and Acting in Language Models*. International Conference on Learning Representations. arXiv:2210.03629. -Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). *Toolformer: Language Models Can Teach Themselves to Use Tools*. Advances in Neural Information Processing Systems, 36. +Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). *Toolformer: Language Models Can Teach Themselves to Use Tools*. Advances in Neural Information Processing Systems, 36. arXiv:2302.04761. Liu, N. F., Lin, K., Hewitt, J., et al. (2024a). *Lost in the Middle: How Language Models Use Long Contexts*. Transactions of the Association for Computational Linguistics, 12, 157–173. https://doi.org/10.1162/tacl_a_00638. Packer, C., Wooders, S., Lin, K., et al. (2023). *MemGPT: Towards LLMs as Operating Systems*. arXiv:2310.08560. -Wang, W., Dong, L., Cheng, H., et al. (2023). *Augmenting Language Models with Long-Term Memory*. Advances in Neural Information Processing Systems, 36. +Wang, W., Dong, L., Cheng, H., et al. (2023). *Augmenting Language Models with Long-Term Memory*. Advances in Neural Information Processing Systems, 36. https://doi.org/10.52202/075280-3259. Zhong, W., Guo, L., Gao, Q., et al. (2024). *MemoryBank: Enhancing Large Language Models with Long-Term Memory*. Proceedings of the AAAI Conference on Artificial Intelligence, 38(17), 19724–19731. https://doi.org/10.1609/aaai.v38i17.29946. -Park, J. S., O'Brien, J. C., Cai, C. J., et al. (2023). *Generative Agents: Interactive Simulacra of Human Behavior*. Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology. +Park, J. S., O'Brien, J. C., Cai, C. J., et al. (2023). *Generative Agents: Interactive Simulacra of Human Behavior*. 
Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology. https://doi.org/10.1145/3586183.3606763. -Lewis, P., Perez, E., Piktus, A., et al. (2020). *Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks*. Advances in Neural Information Processing Systems, 33, 9459–9474. +Lewis, P., Perez, E., Piktus, A., et al. (2020). *Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks*. Advances in Neural Information Processing Systems, 33, 9459–9474. arXiv:2005.11401. Asai, A., Wu, Z., Wang, Y., et al. (2024). *Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection*. International Conference on Learning Representations. -Shinn, N., Cassano, F., Gopinath, A., et al. (2023). *Reflexion: Language Agents with Verbal Reinforcement Learning*. Advances in Neural Information Processing Systems, 36. +Shinn, N., Cassano, F., Gopinath, A., et al. (2023). *Reflexion: Language Agents with Verbal Reinforcement Learning*. Advances in Neural Information Processing Systems, 36. arXiv:2303.11366. Liu, X., Yu, H., Zhang, H., et al. (2024b). *AgentBench: Evaluating LLMs as Agents*. International Conference on Learning Representations. diff --git a/docs/en/part7/ch21_rag_pipeline.md b/docs/en/part7/ch21_rag_pipeline.md index ffd5cb45..59149f40 100644 --- a/docs/en/part7/ch21_rag_pipeline.md +++ b/docs/en/part7/ch21_rag_pipeline.md @@ -933,35 +933,35 @@ On top of evaluation, this chapter further incorporated online failure-sample fe ## References -Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. +Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. 
In: Advances in Neural Information Processing Systems 33, pp 9459–9474. arXiv:2005.11401. Gao Y, Xiong Y, Gao X, Jia K, Pan J, Bi Y, Dai Y, Sun J, Wang M, Wang H (2023) Retrieval-Augmented Generation for Large Language Models: A Survey. arXiv preprint arXiv:2312.10997. Guu K, Lee K, Tung Z, Pasupat P, Chang M-W (2020) REALM: Retrieval-Augmented Language Model Pre-Training. In: Proceedings of the 37th International Conference on Machine Learning (ICML), pp 3929–3938. -Izacard G, Grave E (2021) Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics (EACL), pp 874–880. +Izacard G, Grave E (2021) Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics (EACL), pp 874–880. https://doi.org/10.18653/v1/2021.eacl-main.74. -Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision (ECCV). +Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision (ECCV). https://doi.org/10.1007/978-3-031-19815-1_29. -Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. +Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. https://doi.org/10.1145/3394486.3403172. 
-Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. +Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. https://doi.org/10.1145/3503161.3548112. -Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. +Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. https://doi.org/10.1109/iccv48922.2021.00103. -Smock B, Pesala R, Abraham R (2022) PubTables-1M: Towards Comprehensive Table Extraction from Unstructured Documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4634–4642. +Smock B, Pesala R, Abraham R (2022) PubTables-1M: Towards Comprehensive Table Extraction from Unstructured Documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4634–4642. https://doi.org/10.1109/cvpr52688.2022.00459. -Liu N F, Lin K, Hewitt J, Paranjape A, Bevilacqua M, Petroni F, Liang P (2024) Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12:157–173. +Liu N F, Lin K, Hewitt J, Paranjape A, Bevilacqua M, Petroni F, Liang P (2024) Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12:157–173. https://doi.org/10.1162/tacl_a_00638. 
-Karpukhin V, Oğuz B, Min S, Lewis P, Wu L, Edunov S, Chen D, Yih W-t (2020) Dense Passage Retrieval for Open-Domain Question Answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 6769–6781. +Karpukhin V, Oğuz B, Min S, Lewis P, Wu L, Edunov S, Chen D, Yih W-t (2020) Dense Passage Retrieval for Open-Domain Question Answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 6769–6781. https://doi.org/10.18653/v1/2020.emnlp-main.550. -Reimers N, Gurevych I (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp 3982–3992. +Reimers N, Gurevych I (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp 3982–3992. https://doi.org/10.18653/v1/d19-1410. -Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150-158. +Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150-158. https://doi.org/10.18653/v1/2024.eacl-demo.16. -Niu C, Wu Y, Zhu J, Xu S, Shum K, Zhong R, Song J, Zhang T (2024) RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (ACL), pp 10862-10878. 
+Niu C, Wu Y, Zhu J, Xu S, Shum K, Zhong R, Song J, Zhang T (2024) RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (ACL), pp 10862-10878. https://doi.org/10.18653/v1/2024.acl-long.585. -Manning C D, Raghavan P, Schütze H (2008) Introduction to Information Retrieval. Cambridge University Press. +Manning C D, Raghavan P, Schütze H (2008) Introduction to Information Retrieval. Cambridge University Press. https://doi.org/10.1017/cbo9780511809071. Thakur N, Reimers N, Rücklé A, Srivastava A, Gurevych I (2021) BEIR: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models. In: Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks. diff --git a/docs/en/part7/ch22_multimodal_rag_visual_retrieval.md b/docs/en/part7/ch22_multimodal_rag_visual_retrieval.md index 0edfd72f..011dd763 100644 --- a/docs/en/part7/ch22_multimodal_rag_visual_retrieval.md +++ b/docs/en/part7/ch22_multimodal_rag_visual_retrieval.md @@ -542,17 +542,17 @@ At the evaluation level, the chapter decomposed visual retrieval errors into par ## References -Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. +Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. https://doi.org/10.1145/3394486.3403172. -Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091.
+Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. https://doi.org/10.1145/3503161.3548112. -Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. +Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. https://doi.org/10.1109/iccv48922.2021.00103. -Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 498–517. +Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 498–517. https://doi.org/10.1007/978-3-031-19815-1_29. -Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 2200–2209. +Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 2200–2209. https://doi.org/10.1109/wacv48630.2021.00225. -Mathew M, Bagal V, Tito R, Karatzas D, Valveny E, Jawahar C V (2022) InfographicVQA. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 1697–1706. +Mathew M, Bagal V, Tito R, Karatzas D, Valveny E, Jawahar C V (2022) InfographicVQA. 
In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 1697–1706. https://doi.org/10.1109/wacv51458.2022.00264. Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning Transferable Visual Models From Natural Language Supervision. In: Proceedings of the 38th International Conference on Machine Learning (ICML), pp 8748–8763. @@ -562,11 +562,11 @@ Li J, Li D, Savarese S, Hoi S (2023) BLIP-2: Bootstrapping Language-Image Pre-tr Lee K, Joshi M, Turc I, Hu H, Liu F, Eisenschlos J, Khandelwal U, Shaw P, Chang M-W, Toutanova K (2023) Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding. In: Proceedings of the 40th International Conference on Machine Learning (ICML), pp 18893–18912. -Masry A, Long D X, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263–2279. +Masry A, Long D X, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263–2279. https://doi.org/10.18653/v1/2022.findings-acl.177. Liu F, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Altun Y, Collier N, Eisenschlos J M (2023a) MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 12756–12770. -Liu F, Eisenschlos J M, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Chen W, Collier N, Altun Y (2023b) DePlot: One-shot Visual Language Reasoning by Plot-to-Table Translation. In: Findings of the Association for Computational Linguistics: ACL 2023, pp 10381–10399. 
+Liu F, Eisenschlos J M, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Chen W, Collier N, Altun Y (2023b) DePlot: One-shot Visual Language Reasoning by Plot-to-Table Translation. In: Findings of the Association for Computational Linguistics: ACL 2023, pp 10381–10399. https://doi.org/10.18653/v1/2023.findings-acl.660. Ren S, He K, Girshick R, Sun J (2015) Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In: Advances in Neural Information Processing Systems 28, pp 91–99. @@ -576,7 +576,7 @@ Kirillov A, Mintun E, Ravi N, Mao H, Rolland C, Gustafson L, Xiao T, Whitehead S Nogueira R, Cho K (2019) Passage Re-ranking with BERT. arXiv preprint arXiv:1901.04085. -Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. +Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. https://doi.org/10.18653/v1/2024.eacl-demo.16. Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503–2511. 
diff --git a/docs/en/part7/ch23_online_feedback_knowledge_update.md b/docs/en/part7/ch23_online_feedback_knowledge_update.md index be8ef9b9..bb2f7d47 100644 --- a/docs/en/part7/ch23_online_feedback_knowledge_update.md +++ b/docs/en/part7/ch23_online_feedback_knowledge_update.md @@ -670,35 +670,35 @@ At the operational level, the chapter establishes metrics dashboards and operati ## References -Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291–300. +Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291–300. https://doi.org/10.1109/icse-seip.2019.00042. Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction. In: Proceedings of the IEEE International Conference on Big Data, pp 1123–1132. -Chapelle O, Zhang Y (2009) A Dynamic Bayesian Network Click Model for Web Search Ranking. In: Proceedings of the 18th International Conference on World Wide Web, pp 1–10. +Chapelle O, Zhang Y (2009) A Dynamic Bayesian Network Click Model for Web Search Ranking. In: Proceedings of the 18th International Conference on World Wide Web, pp 1–10. https://doi.org/10.1145/1526709.1526711. -Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. +Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. 
In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. https://doi.org/10.18653/v1/2024.eacl-demo.16. -Gama J, Žliobaitė I, Bifet A, Pechenizkiy M, Bouchachia A (2014) A Survey on Concept Drift Adaptation. ACM Computing Surveys 46(4):1–37. +Gama J, Žliobaitė I, Bifet A, Pechenizkiy M, Bouchachia A (2014) A Survey on Concept Drift Adaptation. ACM Computing Surveys 46(4):1–37. https://doi.org/10.1145/2523813. Gao Y, Xiong Y, Gao X, Jia K, Pan J, Bi Y, Dai Y, Sun J, Wang M, Wang H (2023) Retrieval-Augmented Generation for Large Language Models: A Survey. arXiv preprint arXiv:2312.10997. -Hu Y, Koren Y, Volinsky C (2008) Collaborative Filtering for Implicit Feedback Datasets. In: Proceedings of the 2008 IEEE International Conference on Data Mining, pp 263–272. +Hu Y, Koren Y, Volinsky C (2008) Collaborative Filtering for Implicit Feedback Datasets. In: Proceedings of the 2008 IEEE International Conference on Data Mining, pp 263–272. https://doi.org/10.1109/icdm.2008.22. Huyen C (2022) Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications. O'Reilly Media. -Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142. +Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142. https://doi.org/10.1145/775047.775067. -Joachims T, Swaminathan A, Schnabel T (2017) Unbiased Learning-to-Rank with Biased Feedback. In: Proceedings of the Tenth ACM International Conference on Web Search and Data Mining, pp 781–789. +Joachims T, Swaminathan A, Schnabel T (2017) Unbiased Learning-to-Rank with Biased Feedback.
In: Proceedings of the Tenth ACM International Conference on Web Search and Data Mining, pp 781–789. https://doi.org/10.1145/3018661.3018699. Koh P W, Sagawa S, Marklund H, Xie S M, Zhang M, Balsubramani A, Hu W, Yasunaga M, Phillips R L, Gao I, Lee T, David E, Stavness I, Guo W, Earnshaw B A, Haque I S, Beery S, Leskovec J, Kundaje A, Pierson E, Levine S, Finn C, Liang P (2021) WILDS: A Benchmark of in-the-Wild Distribution Shifts. In: Proceedings of the 38th International Conference on Machine Learning, pp 5637–5664. Kohavi R, Tang D, Xu Y (2020) Trustworthy Online Controlled Experiments: A Practical Guide to A/B Testing. Cambridge University Press. -Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879. +Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879. arXiv:2205.02302. -Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. +Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. arXiv:2005.11401. -Mallen A, Asai A, Zhong V, Das R, Khashabi D, Hajishirzi H (2023) When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 9802–9822. +Mallen A, Asai A, Zhong V, Das R, Khashabi D, Hajishirzi H (2023) When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories. 
In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 9802–9822. https://doi.org/10.18653/v1/2023.acl-long.546. Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503–2511. diff --git a/docs/en/part8/ch24_dataops_flywheel_team.md b/docs/en/part8/ch24_dataops_flywheel_team.md index 072cda56..14db7e29 100644 --- a/docs/en/part8/ch24_dataops_flywheel_team.md +++ b/docs/en/part8/ch24_dataops_flywheel_team.md @@ -819,7 +819,7 @@ Apache Airflow is the most mature data-workflow orchestration tool, suitable for ## References -Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291–300. +Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291–300. https://doi.org/10.1109/icse-seip.2019.00042. Baylor D, Breck E, Cheng H-T, Fiedel N, Foo C Y, Haque Z, Haykal S, Ispir M, Jain V, Koc L, Koo C Y, Lew L, Mewald C, Modi A N, Polyzotis N, Ramesh S, Roy S, Whang S E, Wicke M, Wilkiewicz J, Zhang X, Zinkevich M (2017) TFX: A TensorFlow-Based Production-Scale Machine Learning Platform. In: Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 1387–1395. @@ -837,7 +837,7 @@ Dehghani Z (2022) Data Mesh: Delivering Data-Driven Value at Scale. O'Reilly Med Forsgren N, Humble J, Kim G (2018) Accelerate: The Science of Lean Software and DevOps. IT Revolution Press. 
-Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. https://doi.org/10.1145/3458723. Humble J, Farley D (2010) Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation. Addison-Wesley. @@ -845,15 +845,15 @@ Humble J, Molesky J, O'Reilly B (2015) Lean Enterprise: How High Performance Org Kim G, Humble J, Debois P, Willis J, Forsgren N (2021) The DevOps Handbook: How to Create World-Class Agility, Reliability, and Security in Technology Organizations, 2nd Edition. IT Revolution Press. -Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879. +Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879. arXiv:2205.02302. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220–229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220–229. https://doi.org/10.1145/3287560.3287596. Project Management Institute (2021) A Guide to the Project Management Body of Knowledge (PMBOK Guide), 7th Edition. Project Management Institute. Reis J, Housley M (2022) Fundamentals of Data Engineering. O'Reilly Media. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. 
In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15. https://doi.org/10.1145/3411764.3445518. Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503–2511. diff --git a/docs/en/part8/ch25_data_versioning_experiment_tracking.md b/docs/en/part8/ch25_data_versioning_experiment_tracking.md index 69ec4ee4..18f97cf6 100644 --- a/docs/en/part8/ch25_data_versioning_experiment_tracking.md +++ b/docs/en/part8/ch25_data_versioning_experiment_tracking.md @@ -658,7 +658,7 @@ Zaharia et al.'s "Accelerating the Machine Learning Lifecycle with MLflow" (2018 ## References -Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. +Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. https://doi.org/10.1109/icse-seip.2019.00042. Armbrust M, Ghodsi A, Xin R, Zaharia M (2020) Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores. Proceedings of the VLDB Endowment 13(12):3411-3424. 
@@ -668,31 +668,31 @@ Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of Machine Learning and Systems 1, pp 334-347. -Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316-330. +Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316-330. https://doi.org/10.1007/3-540-44503-x_20. DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications. DVC Documentation (2024) Data Version Control Documentation. Available at: https://dvc.org/doc. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. -Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. +Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. 
https://doi.org/10.1145/3287560.3287596. Moreau L, Missier P (eds.) (2013) PROV-DM: The PROV Data Model. W3C Recommendation. -Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227. +Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227. https://doi.org/10.1126/science.1213847. Polyzotis N, Roy S, Whang S E, Zinkevich M (2017) Data Management Challenges in Production Machine Learning. In: Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD), pp 1723-1726. -Sandve G K, Nekrutenko A, Taylor J, Hovig E (2013) Ten Simple Rules for Reproducible Computational Research. PLOS Computational Biology 9(10):e1003285. +Sandve G K, Nekrutenko A, Taylor J, Hovig E (2013) Ten Simple Rules for Reproducible Computational Research. PLOS Computational Biology 9(10):e1003285. https://doi.org/10.1371/journal.pcbi.1003285. Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503-2511. -Simmhan Y L, Plale B, Gannon D (2005) A Survey of Data Provenance in e-Science. ACM SIGMOD Record 34(3):31-36. +Simmhan Y L, Plale B, Gannon D (2005) A Survey of Data Provenance in e-Science. ACM SIGMOD Record 34(3):31-36. https://doi.org/10.1145/1084805.1084812. -Stodden V, Leisch F, Peng R D (eds.) (2014) Implementing Reproducible Research. CRC Press. +Stodden V, Leisch F, Peng R D (eds.) (2014) Implementing Reproducible Research. CRC Press. https://doi.org/10.1201/b16868. Vartak M, Subramanyam H, Lee W-E, Viswanathan S, Husnoo S, Madden S, Zaharia M (2016) ModelDB: A System for Machine Learning Model Management. In: Proceedings of the Workshop on Human-In-the-Loop Data Analytics (HILDA), Article 14. 
diff --git a/docs/en/part8/ch26_data_platform_observability.md b/docs/en/part8/ch26_data_platform_observability.md index e2628fe9..f45c3e7f 100644 --- a/docs/en/part8/ch26_data_platform_observability.md +++ b/docs/en/part8/ch26_data_platform_observability.md @@ -674,7 +674,7 @@ PagerDuty is the most widely used incident response tool in the industry, suppor ## References -Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. +Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. https://doi.org/10.1109/icse-seip.2019.00042. Baylor D, Breck E, Cheng H-T, Fiedel N, Foo C Y, Haque Z, Haykal S, Ispir M, Jain V, Koc L, Koo C Y, Lew L, Mewald C, Modi A N, Polyzotis N, Ramesh S, Roy S, Whang S E, Wicke M, Wilkiewicz J, Zhang X, Zinkevich M (2017) TFX: A TensorFlow-Based Production-Scale Machine Learning Platform. In: Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 1387-1395. @@ -686,15 +686,15 @@ Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of Machine Learning and Systems 1, pp 334-347. -Dean J, Barroso L A (2013) The Tail at Scale. Communications of the ACM 56(2):74-80. +Dean J, Barroso L A (2013) The Tail at Scale. Communications of the ACM 56(2):74-80. https://doi.org/10.1145/2408776.2408794. 
Hellerstein J M, Sreekanti V, Gonzalez J E, Dalton J, Dey A, Nag S, Ramachandran K, Arora S, Bhattacharyya A, Das S, Donsky A, Fierro G, Kumar C, Mazzariol M, Narayanan S, Parameswaran A, Rahman T, Shah R, She C, Storey M, Turman C, Wu E (2017) Ground: A Data Context Service. In: Proceedings of CIDR. Kleppmann M (2017) Designing Data-Intensive Applications. O'Reilly Media. -Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. +Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302. -National Institute of Standards and Technology (2006) Guide to Computer Security Log Management. NIST Special Publication 800-92. +National Institute of Standards and Technology (2006) Guide to Computer Security Log Management. NIST Special Publication 800-92. https://doi.org/10.6028/nist.sp.800-92. Nygard M T (2018) Release It!: Design and Deploy Production-Ready Software, 2nd Edition. Pragmatic Bookshelf. @@ -704,7 +704,7 @@ OpenTelemetry Authors (2024) OpenTelemetry Specification. Available at: https:// Polyzotis N, Roy S, Whang S E, Zinkevich M (2017) Data Management Challenges in Production Machine Learning. In: Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD), pp 1723-1726. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. 
Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503-2511. @@ -712,4 +712,4 @@ Sigelman B H, Barroso L A, Burrows M, Stephenson P, Moshchuk A, Osina D, Fikes J Turnbull J (2014) The Art of Monitoring. James Turnbull. -Xu W, Huang L, Fox A, Patterson D, Jordan M I (2009) Detecting Large-Scale System Problems by Mining Console Logs. In: Proceedings of the ACM SIGOPS 22nd Symposium on Operating Systems Principles (SOSP), pp 117-132. \ No newline at end of file +Xu W, Huang L, Fox A, Patterson D, Jordan M I (2009) Detecting Large-Scale System Problems by Mining Console Logs. In: Proceedings of the ACM SIGOPS 22nd Symposium on Operating Systems Principles (SOSP), pp 117-132. https://doi.org/10.1145/1629575.1629587. \ No newline at end of file diff --git a/docs/en/part9/ch27_data_catalog_and_metadata_governance.md b/docs/en/part9/ch27_data_catalog_and_metadata_governance.md index 4abd619c..28e62f80 100644 --- a/docs/en/part9/ch27_data_catalog_and_metadata_governance.md +++ b/docs/en/part9/ch27_data_catalog_and_metadata_governance.md @@ -592,15 +592,15 @@ Abedjan Z, Golab L, Naumann F (2015) Profiling relational data: a survey. The VL Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of the 2nd SysML Conference (MLSys). -Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316–330. +Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316–330. https://doi.org/10.1007/3-540-44503-x_20. -Cai L, Zhu Y (2015) The challenges of data quality and data quality assessment in the big data era. 
Data science journal, 2015, 14: 2-2. +Cai L, Zhu Y (2015) The challenges of data quality and data quality assessment in the big data era. Data Science Journal 14:2. https://doi.org/10.5334/dsj-2015-002. DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications, Basking Ridge. -Fernandez R C, Abedjan Z, Koko F, Yuan G, Madden S, Stonebraker M (2018) Aurum: A Data Discovery System. In: 2018 IEEE 34th International Conference on Data Engineering (ICDE), pp 1001–1012. +Fernandez R C, Abedjan Z, Koko F, Yuan G, Madden S, Stonebraker M (2018) Aurum: A Data Discovery System. In: 2018 IEEE 34th International Conference on Data Engineering (ICDE), pp 1001–1012. https://doi.org/10.1109/icde.2018.00094. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. https://doi.org/10.1145/3458723. Halevy A, Korn F, Noy N F, Olston C, Polyzotis N, Roy S, Whang S E (2016) Goods: Organizing Google's Datasets. In: Proceedings of the 2016 ACM SIGMOD International Conference on Management of Data, pp 795–806. @@ -608,15 +608,15 @@ Hellerstein J M, Sreekanti V, Gonzalez J E, Dalton J, Dey A, Nag S, Ramachandran Herschel M, Diestelkämper R, Ben Lahmar H (2017) A survey on provenance: What for? What form? What from? The VLDB Journal 26(6):881–906. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. 
In: Proceedings of the Conference on Fairness, Accountability, and Transparency (FAT*), pp 220–229. https://doi.org/10.1145/3287560.3287596. Noy N F, Musen M A (2000) PROMPT: Algorithm and Tool for Automated Ontology Merging and Alignment. In: Proceedings of the 17th National Conference on Artificial Intelligence (AAAI), pp 450–455. -Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17–28. +Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17–28. https://doi.org/10.1145/3299887.3299891. Rahm E, Bernstein P A (2001) A survey of approaches to automatic schema matching. The VLDB Journal 10(4):334–350. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15. https://doi.org/10.1145/3411764.3445518. Sandhu R S, Coyne E J, Feinstein H L, Youman C E (1996) Role-Based Access Control Models. IEEE Computer 29(2):38–47. @@ -626,4 +626,4 @@ Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young Stonebraker M, Bruckner D, Ilyas I F, Beskales G, Cherniack M, Zdonik S, Pagan A, Xu S (2013) Data Curation at Scale: The Data Tamer System. In: 6th Biennial Conference on Innovative Data Systems Research (CIDR). -Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5–33. 
\ No newline at end of file +Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5–33. https://doi.org/10.1080/07421222.1996.11518099. \ No newline at end of file diff --git a/docs/en/part9/ch28_data_productization_and_data_contracts.md b/docs/en/part9/ch28_data_productization_and_data_contracts.md index 236ece2b..a01d1ee1 100644 --- a/docs/en/part9/ch28_data_productization_and_data_contracts.md +++ b/docs/en/part9/ch28_data_productization_and_data_contracts.md @@ -302,23 +302,23 @@ Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for M Dehghani Z (2022) Data Mesh: Delivering Data-Driven Value at Scale. O'Reilly Media, Sebastopol. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. Kleppmann M (2017) Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems. O'Reilly Media, Sebastopol. -Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale machine learning systems in real-world industrial settings: A review of challenges and solutions. Information and Software Technology 127:106368. +Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale machine learning systems in real-world industrial settings: A review of challenges and solutions. Information and Software Technology 127:106368. https://doi.org/10.1016/j.infsof.2020.106368. -Machado I A, Costa C, Santos M Y (2022) Data Mesh: Concepts and Principles of a Paradigm Shift in Data Architectures. Procedia Computer Science 196:263-271. 
+Machado I A, Costa C, Santos M Y (2022) Data Mesh: Concepts and Principles of a Paradigm Shift in Data Architectures. Procedia Computer Science 196:263-271. https://doi.org/10.1016/j.procs.2021.12.013. -Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29. +Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29. https://doi.org/10.1145/3533378. -Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28. +Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28. https://doi.org/10.1145/3299887.3299891. Rahm E, Bernstein P A (2001) A survey of approaches to automatic schema matching. The VLDB Journal 10(4):334-350. Redman T C (1998) The Impact of Poor Data Quality on the Typical Enterprise. Communications of the ACM 41(2):79-82. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. Schelter S, Lange D, Schmidt P, Celikel M, Biessmann F, Grafberger A (2018) Automating Large-Scale Data Quality Verification. Proceedings of the VLDB Endowment 11(12):1781-1794. 
@@ -326,6 +326,6 @@ Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young Shankar S, Garcia R, Hellerstein J M, Parameswaran A G (2022) Operationalizing Machine Learning: An Interview Study. arXiv preprint arXiv:2209.09125. -Strong D M, Lee Y W, Wang R Y (1997) Data Quality in Context. Communications of the ACM 40(5):103-110. +Strong D M, Lee Y W, Wang R Y (1997) Data Quality in Context. Communications of the ACM 40(5):103-110. https://doi.org/10.1145/253769.253804. -Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5-33. \ No newline at end of file +Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5-33. https://doi.org/10.1080/07421222.1996.11518099. \ No newline at end of file diff --git a/docs/en/part9/ch29_data_valuation_and_reuse.md b/docs/en/part9/ch29_data_valuation_and_reuse.md index 1f242f2f..8a37e475 100644 --- a/docs/en/part9/ch29_data_valuation_and_reuse.md +++ b/docs/en/part9/ch29_data_valuation_and_reuse.md @@ -593,15 +593,15 @@ Ultimately, data valuation and reuse aim to help an organization know its own da Brynjolfsson E, Hitt L M, Kim H H (2011) Strength in Numbers: How Does Data-Driven Decisionmaking Affect Firm Performance? Available at SSRN 1819486. -Fleckenstein M, Obaidi A, Tryfona N (2023) A Review of Data Valuation Approaches and Building and Scoring a Data Valuation Model. Harvard Data Science Review 5(1). +Fleckenstein M, Obaidi A, Tryfona N (2023) A Review of Data Valuation Approaches and Building and Scoring a Data Valuation Model. Harvard Data Science Review 5(1). https://doi.org/10.1162/99608f92.c18db966. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daume III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. 
+Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daume III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. Ghorbani A, Zou J (2019) Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning (ICML), pp 2242-2251. Gunasekar S, Zhang Y, Aneja J, Mendes C C T, Del Giorno A, Gopi S, Javaheripi M, Kauffmann P, de Rosa G, Saarikivi O, Salim A, Shah S, Behl H S, Wang X, Bubeck S, Eldan R, Kalai A T, Lee Y T, Li Y (2023) Textbooks Are All You Need. arXiv preprint arXiv:2306.11644. -Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, Hennigan T, Noland E, Millican K, van den Driessche G, Damoc B, Guy A, Osindero S, Simonyan K, Elsen E, Rae J W, Vinyals O, Sifre L (2022) Training Compute-Optimal Large Language Models. In: Advances in Neural Information Processing Systems 35. +Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, Hennigan T, Noland E, Millican K, van den Driessche G, Damoc B, Guy A, Osindero S, Simonyan K, Elsen E, Rae J W, Vinyals O, Sifre L (2022) Training Compute-Optimal Large Language Models. In: Advances in Neural Information Processing Systems 35. arXiv:2203.15556. Jia R, Dao D, Wang B, Hubis F A, Hynes N, Gurel N M, Li B, Zhang C, Song D, Spanos C J (2019) Towards Efficient Data Valuation Based on the Shapley Value. In: Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics (AISTATS), pp 1167-1176. @@ -611,21 +611,21 @@ Laney D B (2017) Infonomics: How to Monetize, Manage, and Measure Information as Lee K, Ippolito D, Nystrom A, Zhang C, Eck D, Callison-Burch C, Carlini N (2022) Deduplicating Training Data Makes Language Models Better. 
In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (ACL), pp 8424-8445. -Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Kuttler H, Lewis M, Yih W-t, Rocktaschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459-9474. +Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Kuttler H, Lewis M, Yih W-t, Rocktaschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459-9474. arXiv:2005.11401. Moody D, Walsh P (1999) Measuring the Value of Information: An Asset Valuation Approach. In: Proceedings of the 7th European Conference on Information Systems (ECIS), pp 496-512. Northcutt C G, Athalye A, Mueller J (2021) Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks. In: Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks. -Pei J (2022) A Survey on Data Pricing: From Economics to Data Science. IEEE Transactions on Knowledge and Data Engineering 34(10):4586-4608. +Pei J (2022) A Survey on Data Pricing: From Economics to Data Science. IEEE Transactions on Knowledge and Data Engineering 34(10):4586-4608. https://doi.org/10.1109/tkde.2020.3045927. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. 
Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503-2511. Settles B (2009) Active Learning Literature Survey. Computer Sciences Technical Report 1648, University of Wisconsin-Madison. -Sorscher B, Geirhos R, Shekhar S, Ganguli S, Morcos A S (2022) Beyond Neural Scaling Laws: Beating Power Law Scaling via Data Pruning. In: Advances in Neural Information Processing Systems 35, pp 19523-19536. +Sorscher B, Geirhos R, Shekhar S, Ganguli S, Morcos A S (2022) Beyond Neural Scaling Laws: Beating Power Law Scaling via Data Pruning. In: Advances in Neural Information Processing Systems 35, pp 19523-19536. https://doi.org/10.52202/068431-1419. Thakur N, Reimers N, Ruckle A, Srivastava A, Gurevych I (2021) BEIR: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models. In: Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks. diff --git a/docs/en/part9/ch30_internal_data_market_and_sharing_governance.md b/docs/en/part9/ch30_internal_data_market_and_sharing_governance.md index 173d56b1..e084b88d 100644 --- a/docs/en/part9/ch30_internal_data_market_and_sharing_governance.md +++ b/docs/en/part9/ch30_internal_data_market_and_sharing_governance.md @@ -475,7 +475,7 @@ Ultimately, the internal data market solves the problem of how data can flow cre Abraham R, Schneider J, vom Brocke J (2019) Data governance: A conceptual framework, structured review, and research agenda. International Journal of Information Management 49:424-438. -Alhassan I, Sammon D, Daly M (2016) Data governance activities: an analysis of the literature. Journal of Decision Systems 25(sup1):64-75. +Alhassan I, Sammon D, Daly M (2016) Data governance activities: an analysis of the literature. Journal of Decision Systems 25(sup1):64-75. 
https://doi.org/10.1080/12460125.2016.1187397. DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications. @@ -483,11 +483,11 @@ Dehghani Z (2022) Data Mesh: Delivering Data-Driven Value at Scale. O'Reilly Med Ferraiolo D F, Kuhn D R (1992) Role-Based Access Controls. In: Proceedings of the 15th National Computer Security Conference, pp 554-563. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daume III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daume III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. -Hu V C, Ferraiolo D, Kuhn R, Schnitzer A, Sandlin K, Miller R, Scarfone K (2014) Guide to Attribute Based Access Control (ABAC) Definition and Considerations. NIST Special Publication 800-162. +Hu V C, Ferraiolo D, Kuhn R, Schnitzer A, Sandlin K, Miller R, Scarfone K (2014) Guide to Attribute Based Access Control (ABAC) Definition and Considerations. NIST Special Publication 800-162. https://doi.org/10.6028/nist.sp.800-162. -Khatri V, Brown C V (2010) Designing data governance. Communications of the ACM 53(1):148-152. +Khatri V, Brown C V (2010) Designing data governance. Communications of the ACM 53(1):148-152. https://doi.org/10.1145/1629175.1629210. Ladley J (2019) Data Governance: How to Design, Deploy, and Sustain an Effective Data Governance Program, 2nd Edition. Academic Press. @@ -497,9 +497,9 @@ Moody D, Walsh P (1999) Measuring the Value of Information: An Asset Valuation A National Institute of Standards and Technology (2020a) Security and Privacy Controls for Information Systems and Organizations. NIST Special Publication 800-53 Revision 5. -National Institute of Standards and Technology (2020b) NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. 
+National Institute of Standards and Technology (2020b) NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. https://doi.org/10.6028/nist.cswp.10. -Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244. +Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244. https://doi.org/10.1007/s12599-011-0162-8. Reis J, Housley M (2022) Fundamentals of Data Engineering. O'Reilly Media. @@ -509,6 +509,6 @@ Schomm F, Stahl F, Vossen G (2013) Marketplaces for data: an initial survey. ACM Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503-2511. -Weber K, Otto B, Osterle H (2009) One Size Does Not Fit All: A Contingency Approach to Data Governance. ACM Journal of Data and Information Quality 1(1):4. +Weber K, Otto B, Osterle H (2009) One Size Does Not Fit All: A Contingency Approach to Data Governance. ACM Journal of Data and Information Quality 1(1):4. https://doi.org/10.1145/1515693.1515696. -Wilkinson M D, Dumontier M, Aalbersberg I J, Appleton G, Axton M, Baak A, Blomberg N, Boiten J-W, da Silva Santos L B, Bourne P E, Bouwman J, Brookes A J, Clark T, Crosas M, Dillo I, Dumon O, Edmunds S, Evelo C T, Finkers R, Gonzalez-Beltran A, Gray A J G, Groth P, Goble C, Grethe J S, Heringa J, 't Hoen P A C, Hooft R, Kuhn T, Kok R, Kok J, Lusher S J, Martone M E, Mons A, Packer A L, Persson B, Rocca-Serra P, Roos M, van Schaik R, Sansone S-A, Schultes E, Sengstag T, Slater T, Strawn G, Swertz M A, Thompson M, van der Lei J, van Mulligen E, Velterop J, Waagmeester A, Wittenburg P, Wolstencroft K, Zhao J, Mons B (2016) The FAIR Guiding Principles for scientific data management and stewardship. Scientific Data 3:160018. 
\ No newline at end of file +Wilkinson M D, Dumontier M, Aalbersberg I J, Appleton G, Axton M, Baak A, Blomberg N, Boiten J-W, da Silva Santos L B, Bourne P E, Bouwman J, Brookes A J, Clark T, Crosas M, Dillo I, Dumon O, Edmunds S, Evelo C T, Finkers R, Gonzalez-Beltran A, Gray A J G, Groth P, Goble C, Grethe J S, Heringa J, 't Hoen P A C, Hooft R, Kuhn T, Kok R, Kok J, Lusher S J, Martone M E, Mons A, Packer A L, Persson B, Rocca-Serra P, Roos M, van Schaik R, Sansone S-A, Schultes E, Sengstag T, Slater T, Strawn G, Swertz M A, Thompson M, van der Lei J, van Mulligen E, Velterop J, Waagmeester A, Wittenburg P, Wolstencroft K, Zhao J, Mons B (2016) The FAIR Guiding Principles for scientific data management and stewardship. Scientific Data 3:160018. https://doi.org/10.1038/sdata.2016.18. \ No newline at end of file diff --git a/docs/zh/appendix_a_tools_and_frameworks_quick_reference.md b/docs/zh/appendix_a_tools_and_frameworks_quick_reference.md index 5ff1d549..7a10426a 100644 --- a/docs/zh/appendix_a_tools_and_frameworks_quick_reference.md +++ b/docs/zh/appendix_a_tools_and_frameworks_quick_reference.md @@ -287,9 +287,9 @@ Agent 工具调用数据与普通问答数据最大的不同,是其中的中 ## 参考文献 -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12): 86-92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12): 86-92. https://doi.org/10.1145/3458723. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. 
https://doi.org/10.1145/3287560.3287596. Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. diff --git a/docs/zh/appendix_b_compliance_and_release_checklist.md b/docs/zh/appendix_b_compliance_and_release_checklist.md index 12794de0..68031ff6 100644 --- a/docs/zh/appendix_b_compliance_and_release_checklist.md +++ b/docs/zh/appendix_b_compliance_and_release_checklist.md @@ -319,4 +319,4 @@ National Institute of Standards and Technology (2023) AI Risk Management Framewo European Parliament and Council of the European Union (2024) Regulation (EU) 2024/1689 laying down harmonised rules on artificial intelligence (Artificial Intelligence Act). Available at: https://eur-lex.europa.eu/eli/reg/2024/1689/oj. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596. diff --git a/docs/zh/appendix_c_cost_estimation_and_resource_templates.md b/docs/zh/appendix_c_cost_estimation_and_resource_templates.md index 0b7c8fe5..5eb1751b 100644 --- a/docs/zh/appendix_c_cost_estimation_and_resource_templates.md +++ b/docs/zh/appendix_c_cost_estimation_and_resource_templates.md @@ -315,9 +315,9 @@ C_i = C_i^{fixed} + C_i^{variable} + C_i^{risk} Patterson D, Gonzalez J, Le Q, Liang C, Munguia L, Rothchild D, So D, Texier M, Dean J (2021) Carbon Emissions and Large Neural Network Training. arXiv preprint arXiv:2104.10350. 
-Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Catanzaro B (2021) Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. +Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Catanzaro B (2021) Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. arXiv:2104.04473. -Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, pp 611-626. +Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, pp 611-626. https://doi.org/10.1145/3600006.3613165. Kubernetes Authors (2026) Kubernetes Documentation. Available at: https://kubernetes.io/docs/. diff --git a/docs/zh/appendix_d_paper_to_implementation_guide.md b/docs/zh/appendix_d_paper_to_implementation_guide.md index 907fa901..cb350e0d 100644 --- a/docs/zh/appendix_d_paper_to_implementation_guide.md +++ b/docs/zh/appendix_d_paper_to_implementation_guide.md @@ -394,12 +394,20 @@ ## 参考文献 -Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. +Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. https://doi.org/10.1016/j.patter.2023.100804. + +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. 
https://doi.org/10.1145/3458723. Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. Longpre S, Mahari R, Chen A, Obeng-Marnu N, Sileo D, Brannon W, Muennighoff N, Khazam N, Kabbara J, Perisetla K, Wu X, Shippole E, Bollacker K, Wu T, Villa L, Pentland S, Hooker S (2024) A large-scale audit of dataset licensing and attribution in AI. Nature Machine Intelligence 6(8):975-987. -Mazumder M, Banbury C, Yao X, et al. (2023) DataPerf: Benchmarks for Data-Centric AI Development. In: Advances in Neural Information Processing Systems 36, Datasets and Benchmarks Track. +Mazumder M, Banbury C, Yao X, et al. (2023) DataPerf: Benchmarks for Data-Centric AI Development. In: Advances in Neural Information Processing Systems 36, Datasets and Benchmarks Track. https://doi.org/10.52202/075280-0235. + +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596. + +Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231. + +Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28. Zha D, Bhat Z P, Lai K-H, Yang F, Jiang Z, Zhong S, Hu X (2023) Data-centric Artificial Intelligence: A Survey. arXiv preprint arXiv:2303.10158. 
diff --git a/docs/zh/appendix_e_common_bug_debugging_manual.md b/docs/zh/appendix_e_common_bug_debugging_manual.md index 48ce826c..d4aace6d 100644 --- a/docs/zh/appendix_e_common_bug_debugging_manual.md +++ b/docs/zh/appendix_e_common_bug_debugging_manual.md @@ -446,10 +446,12 @@ Agent 在自动回写时没有检查版本,导致新结果覆盖了旧结果 Blecher L, Cucurull G, Scialom T, Stojnic R (2023) Nougat: Neural Optical Understanding for Academic Documents. arXiv preprint arXiv:2308.13418. -Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Companion of the 2024 International Conference on Management of Data, pp 120-134. +Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction. In: IEEE International Conference on Big Data, pp 1123-1132. + +Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Companion of the 2024 International Conference on Management of Data, pp 120-134. https://doi.org/10.1145/3626246.3653385. Chen Y, Shetty M, Somashekar G, Ma M, Simmhan Y, Mace J, Bansal C, Wang R, Rajmohan S (2025) AIOpsLab: A Holistic Framework to Evaluate AI Agents for Enabling Autonomous Clouds. arXiv preprint arXiv:2501.06706. -Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. +Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. https://doi.org/10.1016/j.patter.2023.100804. Pfitzmann B, Auer C, Dolfi M, Nassar A S, Staar P (2022) DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3743-3751. 
diff --git a/docs/zh/appendix_f_terminology_and_chinese_english_mapping.md b/docs/zh/appendix_f_terminology_and_chinese_english_mapping.md index a99820e1..7b4de80f 100644 --- a/docs/zh/appendix_f_terminology_and_chinese_english_mapping.md +++ b/docs/zh/appendix_f_terminology_and_chinese_english_mapping.md @@ -400,8 +400,14 @@ PETs 是一组方法的总称,联邦学习、差分隐私、MPC、TEE 和同 Bommasani R, Klyman K, Zhang D, Liang P (2023) The Foundation Model Transparency Index. arXiv preprint arXiv:2310.12941. -Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. -Wang B, Chen W, Pei H, et al. (2023) DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. In: Advances in Neural Information Processing Systems 36. +Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. arXiv:2211.09110. -Weidinger L, Uesato J, Rauh M, Griffin C, Huang P-S, Mellor J, Glaese A, Cheng M, Balle B, Kasirzadeh A, Kenton Z, Brown S, Hawkins W, Stepleton T, Birhane A, Haas J, Rimell L, Hendricks L A, Isaac W, Legassick S, Irving G, Gabriel I (2022) Taxonomy of Risks posed by Language Models. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 214-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596. + +Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. 
In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231. + +Wang B, Chen W, Pei H, et al. (2023) DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-1361. + +Weidinger L, Uesato J, Rauh M, Griffin C, Huang P-S, Mellor J, Glaese A, Cheng M, Balle B, Kasirzadeh A, Kenton Z, Brown S, Hawkins W, Stepleton T, Birhane A, Haas J, Rimell L, Hendricks L A, Isaac W, Legassick S, Irving G, Gabriel I (2022) Taxonomy of Risks posed by Language Models. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 214-229. https://doi.org/10.1145/3531146.3533088. diff --git a/docs/zh/part1/ch02_quality_framework.md b/docs/zh/part1/ch02_quality_framework.md index 01795a08..4585d0b8 100644 --- a/docs/zh/part1/ch02_quality_framework.md +++ b/docs/zh/part1/ch02_quality_framework.md @@ -504,17 +504,17 @@ def detect_tab_drift(prev_texts, curr_texts, z_threshold=2.0): ## 参考文献 -Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. +Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. https://doi.org/10.1177/001316446002000104. -Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A New Generation of Perspective API: Efficient Multilingual Character-level Transformers. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3197-3207. +Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A New Generation of Perspective API: Efficient Multilingual Character-level Transformers. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3197-3207. https://doi.org/10.1145/3534678.3539147. 
-Nadeem M, Bethke A, Reddy S (2021) StereoSet: Measuring Stereotypical Bias in Pretrained Language Models. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 5356-5371. +Nadeem M, Bethke A, Reddy S (2021) StereoSet: Measuring Stereotypical Bias in Pretrained Language Models. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 5356-5371. https://doi.org/10.18653/v1/2021.acl-long.416. Zhao J, Wang T, Yatskar M, Ordonez V, Chang K W (2018) Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods (WinoBias). In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics, pp 15-20. -Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems 35:27730-27744. +Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems 35:27730-27744. arXiv:2203.02155. -Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728-53741. +Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728-53741. arXiv:2305.18290. 
Chen M, Tworek J, Jun H, Yuan Q, Pinto H P d O, Kaplan J, Edwards H, Burda Y, Joseph N, Brockman G, Ray A, Puri R, Krueger G, Petrov M, Khlaaf H, Sastry G, Mishkin P, Chan B, Gray S, Ryder N, Pavlov M, Power A, Kaiser L, Bavarian M, Winter C, Tillet P, Such F P, Cummings D, Plappert M, Chantzis F, Barnes E, Herbert-Voss A, Guss W H, Nichol A, Paino A, Tezak N, Tang J, Babuschkin I, Balaji S, Jain S, Saunders W, Hesse C, Carr A N, Leike J, Achiam J, Misra V, Morikawa E, Radford A, Knight M, Brundage M, Murati M, Mayer K, Welinder P, McGrew B, Amodei D, Sutskever I, Zaremba W (2021) Evaluating Large Language Models Trained on Code (HumanEval). arXiv preprint arXiv:2107.03374. @@ -531,7 +531,7 @@ Shi W, Ajith A, Xia M, Huang Y, Liu D, Blevins T, Chen D, Zettlemoyer L (2023) D Golchin S, Surdeanu M (2024) Time Travel in LLMs: Tracing Data Contamination in Large Language Models. In: International Conference on Learning Representations. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. Bai Y, Kadavath S, Kundu S, Askell A, Kernion J, Jones A, Chen A, Goldie A, Mirhoseini A, McKinnon C, Chen C, Olsson C, Olah C, Hernandez D, Drain D, Ganguli D, Li D, Tran-Johnson E, Perez E, Kerr J, Mueller J, Ladish J, Landau J, Ndousse K, Lukosuite K, Lovitt L, Sellitto M, Elhage N, Schiefer N, Mercado N, DasSarma N, Lasenby R, Larson R, Ringer S, Johnston S, Kravec S, El Showk S, Fort S, Lanham T, Telleen-Lawton T, Conerly T, Henighan T, Hume T, Bowman S R, Hatfield-Dodds Z, Mann B, Amodei D, Joseph N, McCandlish S, Brown T, Kaplan J (2022) Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073. 
diff --git a/docs/zh/part1/ch03_data_stack.md b/docs/zh/part1/ch03_data_stack.md index e922c6d9..1063ab15 100644 --- a/docs/zh/part1/ch03_data_stack.md +++ b/docs/zh/part1/ch03_data_stack.md @@ -342,13 +342,13 @@ Zaharia M, Xin R S, Wendell P, Das T, Armbrust M, Dave A, Meng X, Rosen J, Venka Moritz P, Nishihara R, Wang S, Tumanov A, Liaw R, Liang E, Elibol M, Yang Z, Paul W, Jordan M I, Stoica I (2018) Ray: A Distributed Framework for Emerging AI Applications. In: Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation, pp 561-577. -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900. Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. Robertson S, Zaragoza H (2009) The Probabilistic Relevance Framework: BM25 and Beyond. Foundations and Trends in Information Retrieval 3(4):333-389. -Malkov Y A, Yashunin D A (2020) Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs (HNSW). IEEE Transactions on Pattern Analysis and Machine Intelligence 42(4):824-836. +Malkov Y A, Yashunin D A (2020) Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs (HNSW). IEEE Transactions on Pattern Analysis and Machine Intelligence 42(4):824-836. https://doi.org/10.1109/tpami.2018.2889473. Apache Software Foundation (2024) Apache Iceberg: Table Specification and Documentation. (accessed 2024-11). 
diff --git a/docs/zh/part10/ch31_agent_architecture.md b/docs/zh/part10/ch31_agent_architecture.md index 542744eb..acd3606f 100644 --- a/docs/zh/part10/ch31_agent_architecture.md +++ b/docs/zh/part10/ch31_agent_architecture.md @@ -578,21 +578,21 @@ MVP 阶段必须清醒认识以下局限性,避免过早将 MVP 当作生产 ## 参考文献 -Besta M, Blach N, Kubicek A, Gerstenberger R, Podstawski M, Gianinazzi L, Gajda J, Lehmann T, Niewiadomski H, Nyczyk P, Hoefler T (2024) Graph of Thoughts: Solving Elaborate Problems with Large Language Models. In: Proceedings of the AAAI Conference on Artificial Intelligence 38(16):17682-17690. +Besta M, Blach N, Kubicek A, Gerstenberger R, Podstawski M, Gianinazzi L, Gajda J, Lehmann T, Niewiadomski H, Nyczyk P, Hoefler T (2024) Graph of Thoughts: Solving Elaborate Problems with Large Language Models. In: Proceedings of the AAAI Conference on Artificial Intelligence 38(16):17682-17690. https://doi.org/10.1609/aaai.v38i16.29720. -Gao L, Madaan A, Zhou S, Alon U, Liu P, Yang Y, Callan J, Neubig G (2023) PAL: Program-aided Language Models. In: Proceedings of the 40th International Conference on Machine Learning, pp 10764-10799. +Gao L, Madaan A, Zhou S, Alon U, Liu P, Yang Y, Callan J, Neubig G (2023) PAL: Program-aided Language Models. In: Proceedings of the 40th International Conference on Machine Learning, pp 10764-10799. arXiv:2211.10435. Karpas E, Abend O, Belinkov Y, Lenz B, Lieber O, Ratner N, Shoham Y, Bata H, Levine Y, Leyton-Brown K, Muhlgay D, Rozen N, Schwartz E, Shashua A, Shuster K, Tenenbaum J, Wolf L, Zettlemoyer L, Riedel S (2022) MRKL Systems: A Modular, Neuro-Symbolic Architecture That Combines Large Language Models, External Knowledge Sources and Discrete Reasoning. arXiv preprint arXiv:2205.00445. Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. 
-Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. In: Advances in Neural Information Processing Systems 36. +Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. In: Advances in Neural Information Processing Systems 36. arXiv:2303.17651. Mialon G, Dessì R, Lomeli M, Nalmpantis C, Pasunuru R, Raileanu R, Rozière B, Schick T, Dwivedi-Yu J, Celikyilmaz A, Grave E, LeCun Y, Scialom T (2023) Augmented Language Models: A Survey. Transactions on Machine Learning Research. Nakano R, Hilton J, Balaji S, Wu J, Ouyang L, Kim C, Hesse C, Jain S, Kosaraju V, Saunders W, Jiang X, Cobbe K, Eloundou T, Krueger G, Button K, Knight M, Chess B, Schulman J (2021) WebGPT: Browser-assisted question-answering with human feedback. arXiv preprint arXiv:2112.09332. -Park J S, O'Brien J C, Cai C J, Morris M R, Liang P, Bernstein M S (2023) Generative Agents: Interactive Simulacra of Human Behavior. In: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, Article 2. +Park J S, O'Brien J C, Cai C J, Morris M R, Liang P, Bernstein M S (2023) Generative Agents: Interactive Simulacra of Human Behavior. In: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, Article 2. https://doi.org/10.1145/3586183.3606763. Patil S G, Zhang T, Wang X, Gonzalez J E (2023) Gorilla: Large Language Model Connected with Massive APIs. arXiv preprint arXiv:2305.15334. 
@@ -600,7 +600,7 @@ Qin Y, Liang S, Ye Y, Zhu K, Yan L, Lu Y, Lin Y, Cong X, Tang X, Qian B, Zhao S, Schick T, Dwivedi-Yu J, Dessì R, Raileanu R, Lomeli M, Hambro E, Zettlemoyer L, Cancedda N, Scialom T (2023) Toolformer: Language Models Can Teach Themselves to Use Tools. In: Advances in Neural Information Processing Systems 36. -Shinn N, Cassano F, Gopinath A, Narasimhan K, Yao S (2023) Reflexion: Language Agents with Verbal Reinforcement Learning. In: Advances in Neural Information Processing Systems 36. +Shinn N, Cassano F, Gopinath A, Narasimhan K, Yao S (2023) Reflexion: Language Agents with Verbal Reinforcement Learning. In: Advances in Neural Information Processing Systems 36. arXiv:2303.11366. Wang L, Ma C, Feng X, Zhang Z, Yang H, Zhang J, Chen Z, Tang J, Chen X, Lin Y, Zhao W X, Wei Z, Wen J-R (2023) A Survey on Large Language Model based Autonomous Agents. arXiv preprint arXiv:2308.11432. @@ -608,6 +608,6 @@ Wu Q, Bansal G, Zhang J, Wu Y, Li B, Zhu E, Jiang L, Zhang X, Zhang S, Liu J, Aw Xi Z, Chen W, Guo X, He W, Ding Y, Hong B, Zhang M, Wang J, Jin S, Zhou E, Zheng R, Fan X, Wang X, Xiong L, Zhou Y, Wang W, Jiang C, Zou Y, Liu X, Yin Z, Dou S, Weng R, Cheng W, Zhang Q, Qin W, Zheng Y, Qiu X, Huang X, Gui T (2023) The Rise and Potential of Large Language Model Based Agents: A Survey. arXiv preprint arXiv:2309.07864. -Yao S, Zhao J, Yu D, Du N, Shafran I, Narasimhan K, Cao Y (2023) ReAct: Synergizing Reasoning and Acting in Language Models. In: International Conference on Learning Representations. +Yao S, Zhao J, Yu D, Du N, Shafran I, Narasimhan K, Cao Y (2023) ReAct: Synergizing Reasoning and Acting in Language Models. In: International Conference on Learning Representations. arXiv:2210.03629. -Yao S, Yu D, Zhao J, Shafran I, Griffiths T L, Cao Y, Narasimhan K (2023) Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In: Advances in Neural Information Processing Systems 36. 
+Yao S, Yu D, Zhao J, Shafran I, Griffiths T L, Cao Y, Narasimhan K (2023) Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In: Advances in Neural Information Processing Systems 36. arXiv:2305.10601. diff --git a/docs/zh/part10/ch32_auto_collection_parsing_cleaning.md b/docs/zh/part10/ch32_auto_collection_parsing_cleaning.md index 135daa4e..d52a0795 100644 --- a/docs/zh/part10/ch32_auto_collection_parsing_cleaning.md +++ b/docs/zh/part10/ch32_auto_collection_parsing_cleaning.md @@ -417,13 +417,13 @@ Agent 自动生成的清洗规则需要版本管理——当规则出现问题 ## 参考文献 -Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. +Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. https://doi.org/10.18653/v1/2021.acl-demo.15. Blecher N, Cresci G, Ballas N, Bautista M (2023) Nougat: Neural Optical Understanding for Academic Documents. arXiv preprint arXiv:2308.13418. Carlini N, Tramer F, Wallace E, Jagielski M, Herbert-Voss A, Lee K, Roberts A, Brown T, Song D, Erlingsson U, Oprea A, Raffel C (2021) Extracting Training Data from Large Language Models. In: Proceedings of the 30th USENIX Security Symposium, pp 2633-2650. -Chen J, Yan X, Lin D, Qu X, Wang Y, Huang X, Zhao Z, Yu T, Zhang Z, Li H, Zheng Y, Xu R, Zhu J, Qiu X (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Proceedings of the ACM SIGMOD International Conference on Management of Data, pp 4436-4449. +Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models.
In: Companion of the 2024 International Conference on Management of Data, pp 120-134. https://doi.org/10.1145/3626246.3653385. Chowdhery A, Narang S, Devlin J, Bosma M, Mishra G, Roberts A, Barham P, Chung H W, Sutton C, Gehrmann S, Schuh P, Shi K, Tsvyashchenko S, Maynez J, Rao A, Barnes P, Tay Y, Shazeer N, Prabhakaran V, Reif E, Du N, Hutchinson B, Pope R, Bradbury J, Austin J, Isard M, Gur-Ari G, Yin P, Duke T, Levskaya A, Ghemawat S, Dev S, Michalewski H, Garcia X, Misra V, Robinson K, Fedus L, Zhou D, Ippolito D, Luan D, Lim H, Zoph B, Spiridonov A, Sepassi R, Dohan D, Agrawal S, Omernick M, Dai A M, Pillai T S, Pellat M, Lewkowycz A, Moreira E, Child R, Polozov O, Lee K, Zhou Z, Wang X, Saeta B, Diaz M, Firat O, Catasta M, Wei J, Meier-Hellstern K, Eck D, Dean J, Petrov S, Fiedel N (2022) PaLM: Scaling Language Modeling with Pathways. Journal of Machine Learning Research 24(240):1-113. @@ -431,9 +431,9 @@ Dodge J, Sap M, Marasović A, Agnew W, Ilharco G, Groeneveld D, Mitchell M, Gard Gao L, Biderman S, Black S, Golding L, Hoppe T, Foster C, Phang J, He H, Thite A, Nabeshima N, Presser S, Leahy C (2020) The Pile: An 800GB Dataset of Diverse Text for Language Modeling. arXiv preprint arXiv:2101.00027. -Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. +Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. https://doi.org/10.1145/3503161.3548112. -Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision, pp 498-517. +Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer.
In: European Conference on Computer Vision, pp 498-517. https://doi.org/10.1007/978-3-031-19815-1_29. Laurençon H, Saulnier L, Wang T, Akiki C, del Moral A V, Le Scao T, Von Werra L, Mou C, González Ponferrada E, Nguyen H, Frohberg J, Šaško M, Lhoest Q, McMillan-Major A, Dupont G, Biderman S, Rogers A, Allal L B, De Toni F, Pistilli G, Nguyen O, Nikpoor S, Masoud M, Labbé S, Vial T, Reusch A, Yogatama D, Raffel C, Wolf T, BigScience Workshop (2022) The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset. In: Advances in Neural Information Processing Systems 35, Datasets and Benchmarks Track. @@ -441,7 +441,7 @@ Lee K, Ippolito D, Nystrom A, Zhang C, Eck D, Callison-Burch C, Carlini N (2022) Longpre S, Mahari R, Chen A, Obeng-Marnu N, Sileo D, Brannon W, Muennighoff N, Khazam N, Kabbara J, Perisetla K, Wu X, Shippole E, Bollacker K, Wu T, Villa L, Pentland S, Hooker S (2024) A large-scale audit of dataset licensing and attribution in AI. Nature Machine Intelligence 6(8):975-987. -Nguyen T, et al. (2024) CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation. +Nguyen T, et al. (2024) CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation. https://doi.org/10.63317/5iz6z5g7eit3. Ortiz Suárez P J, Sagot B, Romary L (2020) A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages. In: Proceedings of the 12th Language Resources and Evaluation Conference, pp 1703-1714. 
diff --git a/docs/zh/part10/ch33_labeling_synthesis_evaluation.md b/docs/zh/part10/ch33_labeling_synthesis_evaluation.md index 72931639..7764c50f 100644 --- a/docs/zh/part10/ch33_labeling_synthesis_evaluation.md +++ b/docs/zh/part10/ch33_labeling_synthesis_evaluation.md @@ -432,13 +432,13 @@ Agent 辅助标注的关键能力不是"理解标准",而是"将标准拆解 ## 参考文献 -Alemohammad S, Casco-Rodriguez J, Luzi L, et al. (2024) Self-Consuming Generative Models Go MAD. In: International Conference on Learning Representations. +Alemohammad S, Casco-Rodriguez J, Luzi L, et al. (2024) Self-Consuming Generative Models Go MAD. In: International Conference on Learning Representations. arXiv:2307.01850. Bai Y, Kadavath S, Kundu S, et al. (2022) Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073. Cui G, Yuan L, Ding N, Yao G, Zhu W, Ni Y, Xie G, Liu Z, Sun M (2023) UltraFeedback: Boosting Language Models with Scaled AI Feedback. arXiv preprint arXiv:2310.01377. -Dubois Y, Li X, Taori R, Zhang T, Gulrajani I, Ba J, Guestrin C, Liang P, Hashimoto T B (2023) AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback. In: Advances in Neural Information Processing Systems 36. +Dubois Y, Li X, Taori R, Zhang T, Gulrajani I, Ba J, Guestrin C, Liang P, Hashimoto T B (2023) AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-1308. Gerstgrasser M, Schaeffer R, Dey A, et al. (2024) Is Model Collapse Inevitable? Breaking the Curse of Recursion by Accumulating Real and Synthetic Data. arXiv preprint arXiv:2404.01413. @@ -450,26 +450,26 @@ Koh P W, Sagawa S, Marklund H, et al. (2021) WILDS: A Benchmark of in-the-Wild D Lambert N, Pyatkin V, Morrison J, Miranda L, Lin B Y, Chandu K, Dziri N, Kumar S, Zick T, Choi Y, Smith N A, Hajishirzi H (2024) RewardBench: Evaluating Reward Models for Language Modeling. arXiv preprint arXiv:2403.13787. 
-Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. +Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. arXiv:2211.09110. -Liu Y, Iter D, Xu Y, et al. (2023) G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp 2511-2522. +Liu Y, Iter D, Xu Y, et al. (2023) G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp 2511-2522. arXiv:2303.16634. Lin B Y, et al. (2024) WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild. arXiv preprint arXiv:2406.04770. -Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P, Leike J, Lowe R (2022) Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems 35, pp 27730-27744. +Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P, Leike J, Lowe R (2022) Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems 35, pp 27730-27744. arXiv:2203.02155. -Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. +Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. 
In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. arXiv:2202.03286. -Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In: Advances in Neural Information Processing Systems 36. +Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In: Advances in Neural Information Processing Systems 36. arXiv:2305.18290. -Ribeiro M T, Wu T, Guestrin C, Singh S (2020) Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp 4902-4912. +Ribeiro M T, Wu T, Guestrin C, Singh S (2020) Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp 4902-4912. https://doi.org/10.18653/v1/2020.acl-main.442. -Shumailov I, Shumaylov Z, Zhao Y, et al. (2024) AI models collapse when trained on recursively generated data. Nature 631:755-759. +Shumailov I, Shumaylov Z, Zhao Y, et al. (2024) AI models collapse when trained on recursively generated data. Nature 631:755-759. https://doi.org/10.1038/s41586-024-07566-y. -Wang Y, Kordi Y, Mishra S, et al. (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484-13508. +Wang Y, Kordi Y, Mishra S, et al. (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484-13508. https://doi.org/10.18653/v1/2023.acl-long.754. -Zheng L, Chiang W-L, Sheng Y, et al. (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. 
In: Advances in Neural Information Processing Systems 36. +Zheng L, Chiang W-L, Sheng Y, et al. (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. arXiv:2306.05685. Zhu L, Wang X, Wang Y, et al. (2023) JudgeLM: Fine-tuned Large Language Models are Scalable Judges. arXiv preprint arXiv:2310.17631. -Zhou C, Liu P, Xu P, et al. (2023) LIMA: Less Is More for Alignment. In: Advances in Neural Information Processing Systems 36. +Zhou C, Liu P, Xu P, et al. (2023) LIMA: Less Is More for Alignment. In: Advances in Neural Information Processing Systems 36. arXiv:2305.11206. diff --git a/docs/zh/part10/ch34_dataops_agent.md b/docs/zh/part10/ch34_dataops_agent.md index c35a2cc5..5f6d81dc 100644 --- a/docs/zh/part10/ch34_dataops_agent.md +++ b/docs/zh/part10/ch34_dataops_agent.md @@ -468,11 +468,11 @@ AgentOps 不是取代 DataOps,而是在其之上增加一层——Agent 管理 ## 参考文献 -Amershi S, Begel A, Bird C, Devanbu P, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291-300. +Amershi S, Begel A, Bird C, Devanbu P, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291-300. https://doi.org/10.1109/icse-seip.2019.00042. Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of Machine Learning and Systems 1, pp 334-347. -Dang Y, Lin Q, Huang P (2019) AIOps: Real-World Challenges and Research Innovations. In: Proceedings of the 41st International Conference on Software Engineering: Companion Proceedings, pp 4-5. +Dang Y, Lin Q, Huang P (2019) AIOps: Real-World Challenges and Research Innovations. 
In: Proceedings of the 41st International Conference on Software Engineering: Companion Proceedings, pp 4-5. https://doi.org/10.1109/icse-companion.2019.00023. He S, He P, Chen Z, Yang T, Su Y, Lyu M R (2021) A Survey on Automated Log Analysis for Reliability Engineering. ACM Computing Surveys 54(6):1-37. @@ -482,22 +482,22 @@ Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Ov Makinen S, Skogstrom H, Laaksonen E, Mikkonen T (2021) Who Needs MLOps: What Data Scientists Seek to Accomplish and How Can MLOps Help? In: Proceedings of the 2021 IEEE/ACM 1st Workshop on AI Engineering - Software Engineering for AI, pp 109-112. -Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale Machine Learning Systems in Real-world Industrial Settings: A Review of Challenges and Solutions. Information and Software Technology 127:106368. +Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale Machine Learning Systems in Real-world Industrial Settings: A Review of Challenges and Solutions. Information and Software Technology 127:106368. https://doi.org/10.1016/j.infsof.2020.106368. NIST (2023) Artificial Intelligence Risk Management Framework (AI RMF 1.0). National Institute of Standards and Technology. NIST (2024) Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile. NIST AI 600-1. -Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29. +Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29. https://doi.org/10.1145/3533378. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. 
+Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. -Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. +Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. https://doi.org/10.1109/synasc51798.2020.00015. -Testi M, Ballabio M, Frontoni E, Iannello G, Moccia S, Soda P, Vessio G (2022) MLOps: A Taxonomy and a Methodology. IEEE Access 10:63606-63618. +Testi M, Ballabio M, Frontoni E, Iannello G, Moccia S, Soda P, Vessio G (2022) MLOps: A Taxonomy and a Methodology. IEEE Access 10:63606-63618. https://doi.org/10.1109/access.2022.3181730. Treveil M, Omont N, Stenac C, Lefevre K, Phan D, Zentici J, Lavoillotte A, Miyazaki M, Heidmann L (2020) Introducing MLOps: How to Scale Machine Learning in the Enterprise. O'Reilly Media. -Vela D, Sharp A, Zhang R, Nguyen T, Hoang A, Pianykh O S (2022) Temporal quality degradation in AI models. Scientific Reports 12:11654. +Vela D, Sharp A, Zhang R, Nguyen T, Hoang A, Pianykh O S (2022) Temporal quality degradation in AI models. Scientific Reports 12:11654. https://doi.org/10.1038/s41598-022-15245-z. Zhu J, He S, Liu J, He P, Xie Q, Zheng Z, Lyu M R (2019) Tools and Benchmarks for Automated Log Parsing. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 121-130. 
diff --git a/docs/zh/part10/ch35_security_permission_collaboration.md b/docs/zh/part10/ch35_security_permission_collaboration.md index 5f224c5b..f7c50c04 100644 --- a/docs/zh/part10/ch35_security_permission_collaboration.md +++ b/docs/zh/part10/ch35_security_permission_collaboration.md @@ -485,9 +485,9 @@ Debenedetti E, Zhang J, Balunović M, et al. (2024) AgentDojo: A Dynamic Environ Ganguli D, Lovitt L, Kernion J, Askell A, Bai Y, Kadavath S, Mann B, Perez E, Schiefer N, Ndousse K, Jones A, Bowman S R, Chen A, Conerly T, DasSarma N, Drain D, Elhage N, El-Showk S, Fort S, Hatfield-Dodds Z, Henighan T, Hernandez D, Hume T, Johnston S, Joseph N, Kravec S, Nanda N, Olsson C, Olah C, Amodei D, Brown T, Clark J, Kaplan J, McCandlish S, Olsson C, Olah C, Amodei D (2022) Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned. arXiv preprint arXiv:2209.07858. -Greshake K, Abdelnabi S, Mishra S, et al. (2023) Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. In: Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, pp 79-90. +Greshake K, Abdelnabi S, Mishra S, et al. (2023) Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. In: Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, pp 79-90. https://doi.org/10.1145/3605764.3623985. -Hendrycks D, Mazeika M, Zou A, Patel S, Zhu C, Navarro J, Mu J, Song D, Li B, Steinhardt J (2021) The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 8340-8349. +Hendrycks D, Mazeika M, Zou A, Patel S, Zhu C, Navarro J, Mu J, Song D, Li B, Steinhardt J (2021) The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. 
In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 8340-8349. https://doi.org/10.1109/iccv48922.2021.00823. Huang Y, Gupta S, Xia M, Li K, Chen D (2024) Catastrophic Jailbreak of Open-source LLMs via Exploiting Generation. In: International Conference on Learning Representations. @@ -495,7 +495,7 @@ Lapid R, Langberg R, Sipper M (2023) Open Sesame! Universal Black Box Jailbreaki Liu Y, Deng G, Li Y, et al. (2023) Prompt Injection Attack against LLM-Integrated Applications. arXiv preprint arXiv:2306.05499. -Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. +Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. arXiv:2202.03286. Ruan Y, Dong H, Wang A, Pitis S, Zhou Y, Ba J, Dubois Y, Maddison C J, Hashimoto T B (2024) Identifying the Risks of LM Agents with an LM-Emulated Sandbox. In: International Conference on Learning Representations. @@ -509,6 +509,6 @@ Wei A, Haghtalab N, Steinhardt J (2023) Jailbroken: How Does LLM Safety Training Yi J, Xie Y, Zhu B, Hines K, Kiciman E, Sun G, Xie X, Wu F (2023) Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models. arXiv preprint arXiv:2312.14197. -Zhan Q, Liang Z, Ying Z, Kang D (2024) InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. In: Findings of the Association for Computational Linguistics: ACL 2024, pp 10471-10506. +Zhan Q, Liang Z, Ying Z, Kang D (2024) InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. 
In: Findings of the Association for Computational Linguistics: ACL 2024, pp 10471-10506. https://doi.org/10.18653/v1/2024.findings-acl.624. Zou A, Wang Z, Carlini N, Nasr M, Kolter J Z, Fredrikson M (2023) Universal and Transferable Adversarial Attacks on Aligned Language Models. arXiv preprint arXiv:2307.15043. diff --git a/docs/zh/part11/ch36_compliance_framework_and_governance.md b/docs/zh/part11/ch36_compliance_framework_and_governance.md index cc57e478..351923c6 100644 --- a/docs/zh/part11/ch36_compliance_framework_and_governance.md +++ b/docs/zh/part11/ch36_compliance_framework_and_governance.md @@ -1183,25 +1183,25 @@ Cavoukian A, others (2009) Privacy by Design: The 7 Foundational Principles. Inf Gürses S F, Troncoso C, Díaz C (2011) Engineering Privacy by Design. Technical report. -Spiekermann S, Cranor L F (2009) Engineering Privacy. IEEE Transactions on Software Engineering, 35(1), 67-82. +Spiekermann S, Cranor L F (2009) Engineering Privacy. IEEE Transactions on Software Engineering, 35(1), 67-82. https://doi.org/10.1109/tse.2008.88. European Union Agency for Cybersecurity (ENISA) (2022) Data Protection Engineering. ENISA Report. Zieni B, Spagnuelo D, Heckel R (2021) Transparency by Default: GDPR Patterns for Agile Development. In Electronic Government and the Information Systems Perspective, Springer International Publishing, pp 89-102. -Kosenkov O, Zabardast E, Fucci D, Mendez D, Unterkalmsteiner M (2026) Privacy by Design: Aligning GDPR and Software Engineering Specifications with a Requirements Engineering Approach. Information and Software Technology, 190, 107946. +Kosenkov O, Zabardast E, Fucci D, Mendez D, Unterkalmsteiner M (2026) Privacy by Design: Aligning GDPR and Software Engineering Specifications with a Requirements Engineering Approach. Information and Software Technology, 190, 107946. https://doi.org/10.1016/j.infsof.2025.107946. -Hoepman J-H (2014) Privacy Design Strategies. 
In IFIP International Information Security Conference, pp 446-459. +Hoepman J-H (2014) Privacy Design Strategies. In IFIP International Information Security Conference, pp 446-459. https://doi.org/10.1007/978-3-642-55415-5_38. Perera C, Liu C, Ranjan R, Wang L, Zomaya A Y (2016) Privacy-Knowledge Modeling for the Internet of Things: A Look Back. Computer, 49(12), 60-68. -Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. +Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. https://doi.org/10.1007/978-3-540-79228-4_1. Shokri R, Shmatikov V (2015) Privacy-Preserving Deep Learning. In 2015 53rd Annual Allerton Conference on Communication, Control, and Computing (Allerton), pp 909-910. -Anthonysamy P, Rashid A, Chitchyan R (2017) Privacy Requirements: Present & Future. In 2017 IEEE/ACM 39th International Conference on Software Engineering: Software Engineering in Society Track (ICSE-SEIS), pp 13-22. +Anthonysamy P, Rashid A, Chitchyan R (2017) Privacy Requirements: Present & Future. In 2017 IEEE/ACM 39th International Conference on Software Engineering: Software Engineering in Society Track (ICSE-SEIS), pp 13-22. https://doi.org/10.1109/icse-seis.2017.3. -Oetzel M C, Spiekermann S (2014) A Systematic Methodology for Privacy Impact Assessments: A Design Science Approach. European Journal of Information Systems, 23(2), 126-150. +Oetzel M C, Spiekermann S (2014) A Systematic Methodology for Privacy Impact Assessments: A Design Science Approach. European Journal of Information Systems, 23(2), 126-150. https://doi.org/10.1057/ejis.2013.18. Notario N, Crespo A, Martín Y-S, del Alamo J M, Le Métayer D, Antignac T, Kung A, Kroener I, Wright D (2015) PRIPARE: Integrating Privacy Best Practices into a Privacy Engineering Methodology. In 2015 IEEE Security and Privacy Workshops, pp 151-158. 
diff --git a/docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md b/docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md index 88b1021f..7465222d 100644 --- a/docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md +++ b/docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md @@ -441,7 +441,7 @@ TEE 并不是银弹。它依赖硬件可信根,而硬件本身也存在供应 Shokri R, Stronati M, Song C, Shmatikov V (2017) Membership Inference Attacks against Machine Learning Models. In 2017 IEEE Symposium on Security and Privacy (SP), pp 3-18. -Zhu L, Liu Z, Han S (2019) Deep Leakage from Gradients. Advances in Neural Information Processing Systems, 32. +Zhu L, Liu Z, Han S (2019) Deep Leakage from Gradients. Advances in Neural Information Processing Systems, 32. arXiv:1906.08935. Geiping J, Bauermeister H, Dröge H, Moeller M (2020) Inverting Gradients: How Easy Is It to Break Privacy in Federated Learning? Advances in Neural Information Processing Systems, 33, 16937-16947. @@ -451,15 +451,15 @@ Abadi M, Chu A, Goodfellow I, McMahan H B, Mironov I, Talwar K, Zhang L (2016) D Erlingsson Ú, Pihur V, Korolova A (2014) RAPPOR: Randomized Aggregatable Privacy-Preserving Ordinal Response. In Proceedings of the 2014 ACM SIGSAC Conference on Computer and Communications Security, pp 1054-1067. -McMahan H B, Ramage D, Talwar K, Zhang L (2018) Learning Differentially Private Recurrent Language Models. International Conference on Learning Representations. +McMahan H B, Ramage D, Talwar K, Zhang L (2018) Learning Differentially Private Recurrent Language Models. International Conference on Learning Representations. arXiv:1710.06963. -Kairouz P, McMahan H B (2021) Advances and Open Problems in Federated Learning. Foundations and Trends in Machine Learning, 14(1-2), 1-210. +Kairouz P, McMahan H B (2021) Advances and Open Problems in Federated Learning. Foundations and Trends in Machine Learning, 14(1-2), 1-210. 
https://doi.org/10.1561/2200000083. Bagdasaryan E, Veit A, Hua Y, Estrin D, Shmatikov V (2020) How To Backdoor Federated Learning. In Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics, pp 2938-2948. Yang Q, Liu Y, Chen T, Tong Y (2019) Federated Machine Learning: Concept and Applications. ACM Transactions on Intelligent Systems and Technology, 10(2), 1-19. -Bonawitz K, Ivanov V, Kreuter B, Marcedone A, McMahan H B, Patel S, Ramage D, Segal A, Seth K (2017) Practical Secure Aggregation for Privacy-Preserving Machine Learning. In Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, pp 1175-1191. +Bonawitz K, Ivanov V, Kreuter B, Marcedone A, McMahan H B, Patel S, Ramage D, Segal A, Seth K (2017) Practical Secure Aggregation for Privacy-Preserving Machine Learning. In Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, pp 1175-1191. https://doi.org/10.1145/3133956.3133982. McMahan B, Moore E, Ramage D, Hampson S, y Arcas B A (2017) Communication-Efficient Learning of Deep Networks from Decentralized Data. In Artificial Intelligence and Statistics, pp 1273-1282. @@ -467,16 +467,16 @@ Zhao Y, Li M, Lai L, Suda N, Civin D, Chandra V (2018) Federated Learning with N Li T, Sahu A K, Zaheer M, Sanjabi M, Talwalkar A, Smith V (2020) Federated Optimization in Heterogeneous Networks. Proceedings of Machine Learning and Systems, 2, 429-450. -Mohassel P, Zhang Y (2017) SecureML: A System for Scalable Privacy-Preserving Machine Learning. In 2017 IEEE Symposium on Security and Privacy (SP), pp 19-38. +Mohassel P, Zhang Y (2017) SecureML: A System for Scalable Privacy-Preserving Machine Learning. In 2017 IEEE Symposium on Security and Privacy (SP), pp 19-38. https://doi.org/10.1109/sp.2017.12. Gilad-Bachrach R, Dowlin N, Laine K, Lauter K, Naehrig M, Wernsing J (2016) CryptoNets: Applying Neural Networks to Encrypted Data with High Throughput and Accuracy. 
In International Conference on Machine Learning, pp 201-210. Tramèr F, Boneh D (2019) Slalom: Fast, Verifiable and Private Execution of Neural Networks in Trusted Hardware. International Conference on Learning Representations. -Hu E J, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W, others (2022) LoRA: Low-Rank Adaptation of Large Language Models. International Conference on Learning Representations. +Hu E J, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W, others (2022) LoRA: Low-Rank Adaptation of Large Language Models. International Conference on Learning Representations. arXiv:2106.09685. -Kuang W, Qian B, Li Z, Chen D, Gao D, Pan X, Xie Y, Li Y, Ding B, Zhou J (2024) FederatedScope-LLM: A Comprehensive Package for Fine-Tuning Large Language Models in Federated Learning. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 5260-5271. +Kuang W, Qian B, Li Z, Chen D, Gao D, Pan X, Xie Y, Li Y, Ding B, Zhou J (2024) FederatedScope-LLM: A Comprehensive Package for Fine-Tuning Large Language Models in Federated Learning. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 5260-5271. https://doi.org/10.1145/3637528.3671573. Blanchard P, El Mhamdi E M, Guerraoui R, Stainer J (2017) Machine Learning with Adversaries: Byzantine Tolerant Gradient Descent. Advances in Neural Information Processing Systems, 30. -Sheller M J, Edwards B, Reina G A, others (2020) Federated Learning in Medicine: Facilitating Multi-Institutional Collaborations without Sharing Patient Data. Scientific Reports, 10(1), 12598. +Sheller M J, Edwards B, Reina G A, others (2020) Federated Learning in Medicine: Facilitating Multi-Institutional Collaborations without Sharing Patient Data. Scientific Reports, 10(1), 12598. https://doi.org/10.1038/s41598-020-69250-1. 
diff --git a/docs/zh/part12/ch38_text_corpora_transparent_ledger.md b/docs/zh/part12/ch38_text_corpora_transparent_ledger.md index 85075924..79ffa88c 100644 --- a/docs/zh/part12/ch38_text_corpora_transparent_ledger.md +++ b/docs/zh/part12/ch38_text_corpora_transparent_ledger.md @@ -594,15 +594,24 @@ FineWeb 展示了开放 Web 文本从网页快照转化为训练语料的处理 ## 参考文献 -1. - Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Benchmarks Track. https://arxiv.org/abs/2406.17557. -- Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb. -- Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py. -- Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove. -- Luccioni, S., & Viviano, J. (2021). What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732. - -2. - Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. https://arxiv.org/abs/2402.00159. -- Allen Institute for AI. (2023). Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64. -- AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma. -- AllenAI. (2026). Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma. -- AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md. -- Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). 
OLMo: Accelerating the Science of Language Models. https://arxiv.org/abs/2402.00838. +Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Benchmarks Track. https://arxiv.org/abs/2406.17557. + +Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb. + +Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py. + +Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove. + +Luccioni, S., & Viviano, J. (2021). What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732. + +Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. https://arxiv.org/abs/2402.00159. + +Allen Institute for AI. (2023). Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64. + +AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma. + +AllenAI. (2026). Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma. + +AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md. + +Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). OLMo: Accelerating the Science of Language Models. https://arxiv.org/abs/2402.00838. 
diff --git a/docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md b/docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md index 75e0fdd8..79c362aa 100644 --- a/docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md +++ b/docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md @@ -261,9 +261,14 @@ LAION-5B 展示了开放图文候选池如何从网页 URL 和 alt text 变成 ## 参考文献 -- Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., et al. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. NeurIPS 2022 Datasets and Benchmarks Track. https://arxiv.org/abs/2210.08402. -- LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. https://laion.ai/blog/laion-5b/. -- LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec. -- Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org/abs/2304.14108. -- DataComp Team. (2026). DataComp Benchmark Documentation. https://www.datacomp.ai/dcclip/. -- ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp. +Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., et al. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. NeurIPS 2022 Datasets and Benchmarks Track. https://arxiv.org/abs/2210.08402. + +LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. https://laion.ai/blog/laion-5b/. + +LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec. + +Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org/abs/2304.14108. + +DataComp Team. (2026). 
DataComp Benchmark Documentation. https://www.datacomp.ai/dcclip/. + +ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp. diff --git a/docs/zh/part12/ch40_visual_document_table_data_engineering.md b/docs/zh/part12/ch40_visual_document_table_data_engineering.md index 3751f370..0f8f3f14 100644 --- a/docs/zh/part12/ch40_visual_document_table_data_engineering.md +++ b/docs/zh/part12/ch40_visual_document_table_data_engineering.md @@ -743,13 +743,13 @@ Levenshtein, V.I. (1965). Binary Codes Capable of Correcting Deletions, Insertio Liu, H., Xue, W., Chen, Y., et al. (2024). A Survey on Hallucination in Large Vision-Language Models. *arXiv preprint arXiv:2402.00253*. -Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. +Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. https://doi.org/10.1109/wacv48630.2021.00225. Niu, J., Liu, Z., Gu, Z., et al. (2025). MinerU 2.5: A Decoupled Vision-Language Model for Efficient High-Resolution Document Parsing. *arXiv preprint*. Park, S., Shin, S., Lee, B., et al. (2019). CORD: A Consolidated Receipt Dataset for Post-OCR Parsing. *NeurIPS Workshop on Document Intelligence*. -Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. +Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. arXiv:2305.18290. Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O. (2017). Proximal Policy Optimization Algorithms. *arXiv preprint arXiv:1707.06347*. @@ -775,7 +775,7 @@ Cui, C., Sun, T., Liang, S., et al. (2025). PaddleOCR-VL: Boosting Multilingual Guo, D., Yang, D., Zhang, H., et al. (2025). 
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. *arXiv preprint arXiv:2501.12948*. -Hunyuan Vision Team, Lyu, P., Wan, X., et al. (2025). HunyuanOCR Technical Report. *arXiv preprint*. +Hunyuan Vision Team (2025). HunyuanOCR Technical Report. *arXiv preprint*. Li, Y., Yang, G., Liu, H., Wang, B., and Zhang, C. (2025a). Dots.OCR: Multilingual Document Layout Parsing in a Single Vision-Language Model. *arXiv preprint*. @@ -787,9 +787,9 @@ Wang, W., Gao, Z., Gu, L., et al. (2025). InternVL3.5: Advancing Open-Source Mul Zhang, J., Liu, Y., Wu, Z., et al. (2025). MonkeyOCR v1.5 Technical Report: Unlocking Robust Document Parsing for Complex Patterns. *arXiv preprint*. -Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. +Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. https://doi.org/10.1109/cvpr52688.2022.00459. -Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. +Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. https://doi.org/10.18653/v1/2021.acl-long.254. Pandas Development Team. (2026). pandas Documentation. https://pandas.pydata.org/docs/. diff --git a/docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md b/docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md index 57975dcb..3e9c39d3 100644 --- a/docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md +++ b/docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md @@ -824,17 +824,17 @@ MedImage-ToolVQA 展示了医学图像 VQA 数据工程的一种扩展方向: ## 参考文献 -Masry, A., Long, D. X., Tan, J. 
Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. +Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. Findings of ACL 2022. https://doi.org/10.18653/v1/2022.findings-acl.177. -Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. WACV 2020. +Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. WACV 2020. https://doi.org/10.1109/wacv45572.2020.9093523. Kahou, S. E., Michalski, V., Atkinson, A., Kádár, Á., Trischler, A., & Bengio, Y. (2017). FigureQA: An Annotated Figure Dataset for Visual Reasoning. arXiv:1710.07300. -Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018. +Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018. https://doi.org/10.1109/cvpr.2018.00592. -Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. +Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225. -Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). Chartqapro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 19123-19151). +Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). ChartQAPro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 19123-19151). https://doi.org/10.18653/v1/2025.findings-acl.978. 
Xie, T., Lin, M., Liu, M., Ye, Y., Chen, C., & Liu, S. (2026). Infochartqa: A benchmark for multimodal question answering on infographic charts. Advances in Neural Information Processing Systems, 38. @@ -850,7 +850,7 @@ He, X., Zhang, Y., Mou, L., Xing, E., & Xie, P. (2020). PathVQA: 30000+ Question Liu, B., Zhan, L.-M., Xu, L., Ma, L., Yang, Y., & Wu, X.-M. (2021). SLAKE: A Semantically-Labeled Knowledge-Enhanced Dataset for Medical Visual Question Answering. IEEE 18th International Symposium on Biomedical Imaging. https://doi.org/10.1109/ISBI48211.2021.9434010. -Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. +Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. arXiv:2210.03629. Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. diff --git a/docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md b/docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md index 10f2dab0..00e9536e 100644 --- a/docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md +++ b/docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md @@ -449,8 +449,8 @@ Latent-Switch-69K 展示了推理轨迹数据工程的核心问题:如何在 ## 参考文献 -1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. +1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. arXiv:2201.11903. 2. Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., & Cobbe, K. (2023). 
Let's Verify Step by Step. arXiv:2305.20050. 3. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. -4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. -5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. +4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948. +5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874. diff --git a/docs/zh/part13/ch44_pretrain_recipes.md b/docs/zh/part13/ch44_pretrain_recipes.md index 496e3543..ed05764c 100644 --- a/docs/zh/part13/ch44_pretrain_recipes.md +++ b/docs/zh/part13/ch44_pretrain_recipes.md @@ -305,6 +305,6 @@ Sennrich R, Haddow B, Birch A (2016) Neural Machine Translation of Rare Words wi Shao Z, Wang P, Zhu Q, Xu R, Song J, Zhang M, Li Y, Wu Y, Guo D (2024) DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv preprint arXiv:2402.03300. -Su J, Lu Y, Pan S, Murtadha A, Wen B, Liu Y (2024) RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE). Neurocomputing 568:127063. +Su J, Lu Y, Pan S, Murtadha A, Wen B, Liu Y (2024) RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE). Neurocomputing 568:127063. https://doi.org/10.1016/j.neucom.2023.127063. -Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. 
+Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. arXiv:2203.11171. diff --git a/docs/zh/part13/ch45_posttrain_recipes.md b/docs/zh/part13/ch45_posttrain_recipes.md index b9f8fb46..219d4922 100644 --- a/docs/zh/part13/ch45_posttrain_recipes.md +++ b/docs/zh/part13/ch45_posttrain_recipes.md @@ -52,7 +52,7 @@ SFT 的核心任务是“格式化”。它负责把基座模型从一个无意 如果说 SFT 是教模型“应该怎么答”,那么偏好对齐层就是教模型“两个合格的回答中,哪一个更好”。这一层建立了模型的奖励曲面(Reward Surface)。 偏好数据可以服务于多种不同的训练范式:用于训练 RM 以支持 RLHF,或者直接用于 DPO、IPO、KTO、GRPO 或 RLVR 等直接偏好优化方法。其常见规模跨度极大,从 $10^5$ 到 $10^7$ 个偏好对不等,这取决于数据构建过程中是否包含大量自动生成、多轮采样以及在线反馈。 -在这些方法中,偏好学习并不只有“二选一排序”这一种理解方式。KTO 等方法把人类反馈解释为更接近前景理论的收益/损失信号,理论分析也在尝试统一 RLHF、DPO 与更一般的人类偏好学习范式(Ethayarajh et al. 2024; Gheshlaghi et al. 2024)。这提醒数据工程侧不要只保存最终标签,还要保存反馈来源、评审理由和候选组结构。 +在这些方法中,偏好学习并不只有“二选一排序”这一种理解方式。KTO 等方法把人类反馈解释为更接近前景理论的收益/损失信号,理论分析也在尝试统一 RLHF、DPO 与更一般的人类偏好学习范式(Ethayarajh et al. 2024; Gheshlaghi Azar et al. 2024)。这提醒数据工程侧不要只保存最终标签,还要保存反馈来源、评审理由和候选组结构。 **第三层:在线持续优化数据** 模型发布上线并不是后训练的终点,而是起点。第三层决定了模型能否随着真实业务演进而自我修复。 @@ -412,6 +412,6 @@ Shao Z, Wang P, Zhu Q, Xu R, Song J, Bi X, Zhang H, Zhang M, Li Y, Wu Y, Guo D ( Zhou K, Zhu Y, Chen Z, Chen W, Zhao W X, Chen X, Lin Y, Wen J-R, Han J (2023) Don't Make Your LLM an Evaluation Benchmark Cheater. arXiv preprint arXiv:2311.01964. -Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623. +Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623. arXiv:2306.05685. 
-Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. +Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. arXiv:2305.20050. diff --git a/docs/zh/part13/ch46_rl_reasoning_data.md b/docs/zh/part13/ch46_rl_reasoning_data.md index f584e0ec..70e368af 100644 --- a/docs/zh/part13/ch46_rl_reasoning_data.md +++ b/docs/zh/part13/ch46_rl_reasoning_data.md @@ -610,7 +610,7 @@ Zelikman E, Wu Y, Mu J, Goodman N (2022) STaR: Bootstrapping Reasoning with Reas Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. Advances in Neural Information Processing Systems, 36, 46534-46594. -Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. +Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. arXiv:2305.20050. Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623. @@ -622,8 +622,8 @@ Shi F, Suzgun M, Freitag M, Wang X, Srivats S, Vosoughi S, Chung H W, Tay Y, Rud Jaech A, Kalai A, Lerer A, Richardson A, El-Kishky A, Low A, Helyar A, Madry A, Beutel A, Carney A, others (2024) OpenAI o1 System Card. arXiv preprint arXiv:2412.16720. 
-Ott S, Hebenstreit K, Liévin V, others (2023) ThoughtSource: A Central Hub for Large Language Model Reasoning Data. Scientific Data, 10(1), 528. +Ott S, Hebenstreit K, Liévin V, others (2023) ThoughtSource: A Central Hub for Large Language Model Reasoning Data. Scientific Data, 10(1), 528. https://doi.org/10.1038/s41597-023-02433-3. Hsieh C-Y, Li C-L, Yeh C-K, Nakhost H, Fujii Y, Ratner A, Krishna R, Lee C-Y, Pfister T (2023) Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes. Findings of the Association for Computational Linguistics: ACL 2023, pp 8003-8017. -Patil S G, Zhang T, Wang X, Gonzalez J E (2024) Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 38. +Patil S G, Zhang T, Wang X, Gonzalez J E (2024) Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 38. arXiv:2305.15334. diff --git a/docs/zh/part13/ch47_vlm_data_recipes.md b/docs/zh/part13/ch47_vlm_data_recipes.md index 20a7894b..0b24e41d 100644 --- a/docs/zh/part13/ch47_vlm_data_recipes.md +++ b/docs/zh/part13/ch47_vlm_data_recipes.md @@ -325,9 +325,9 @@ Chen Z, Wu J, Wang W, Su W, Chen G, Xing S, Zhong M, Liu Q, Lu Y, Li B, others ( Chen Z, Wang W, Tian H, Ye S, Gao Z, Cui E, Tong X, Hu J, Luo J, Ma S, others (2024) InternVL3: Exploring Advanced Training and Test-Time Scaling for Vision-Language Models. arXiv preprint arXiv:2504.10479. -Dao T, Fu D Y, Ermon S, Rudra A, Ré C (2022) FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In: Advances in Neural Information Processing Systems 35:16344-16359. +Dao T, Fu D Y, Ermon S, Rudra A, Ré C (2022) FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In: Advances in Neural Information Processing Systems 35:16344-16359. https://doi.org/10.52202/068431-1189. 
-Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. +Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. Available at: https://arxiv.org/abs/2304.14108. Laurençon A, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, others (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. arXiv preprint arXiv:2306.16527. @@ -339,14 +339,14 @@ Liu S, Zeng Z, Ren T, Li F, Zhang H, Yang J, Li C, Yang J, Su H, Zhu J, others ( Lu P, Bansal H, Xia T, Liu J, Li C, Hajishirzi H, Cheng H, Chang K W, Galley M, Gao J (2023) MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts. arXiv preprint arXiv:2310.02255. -Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pp 2200-2209. +Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pp 2200-2209. https://doi.org/10.1109/wacv48630.2021.00225. Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, others (2021) Learning Transferable Visual Models from Natural Language Supervision (CLIP). In: Proceedings of the 38th International Conference on Machine Learning, pp 8748-8763. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. 
In: Advances in Neural Information Processing Systems 35:25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. Available at: https://arxiv.org/abs/2210.08402. Wang P, Bai S, Tan S, Wang S, Fan Z, Bai J, Chen K, Liu X, Wang J, Ge W, others (2024) Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191. -Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567. +Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567. https://doi.org/10.1109/cvpr52733.2024.00913. Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. arXiv preprint arXiv:2304.06939. diff --git a/docs/zh/part13/ch48_t2i_t2v.md b/docs/zh/part13/ch48_t2i_t2v.md index faebe04a..cd042e2e 100644 --- a/docs/zh/part13/ch48_t2i_t2v.md +++ b/docs/zh/part13/ch48_t2i_t2v.md @@ -402,17 +402,17 @@ T2I 与 T2V 的数据工程,已经从早期的“采集—清洗”升级为 PySceneDetect Contributors (2026) PySceneDetect Documentation. Available at: https://www.scenedetect.com/docs/latest/. -Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. 
In: Advances in Neural Information Processing Systems 36. +Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. Available at: https://arxiv.org/abs/2304.14108. -Ghosh S, Bhatt U, Bhattacharya R, Parmar P, Patel S, Islam M, Reddy K K, others (2023) GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment. In: Advances in Neural Information Processing Systems 36. +Ghosh S, Bhatt U, Bhattacharya R, Parmar P, Patel S, Islam M, Reddy K K, others (2023) GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-2270. -Kirstain Y, Polyak A, Singer U, Matiana S, Penna J, Levy O (2023) Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation (PickScore). In: Advances in Neural Information Processing Systems 36. +Kirstain Y, Polyak A, Singer U, Matiana S, Penna J, Levy O (2023) Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation (PickScore). In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-1594. Open-Sora Team (2024) Open-Sora: Democratizing Efficient Video Production for All. arXiv preprint arXiv:2412.20404. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. 
Available at: https://arxiv.org/abs/2210.08402. -Wang W, Lv Q, Yu W, Hong W, Qi J, Wang Y, Ji J, Yang Z, Zhao L, Song X, others (2023) CogVLM: Visual Expert for Pretrained Language Models. In: Advances in Neural Information Processing Systems 36. +Wang W, Lv Q, Yu W, Hong W, Qi J, Wang Y, Ji J, Yang Z, Zhao L, Song X, others (2023) CogVLM: Visual Expert for Pretrained Language Models. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/079017-3860. Wu X, Sun K, Zhu F, Zhao R, Li H (2023) Human Preference Score v2: A Solid Benchmark for Evaluating Human Preferences of Text-to-Image Synthesis (HPSv2). arXiv preprint arXiv:2306.09341. diff --git a/docs/zh/part14/p03_llava_instruct.md b/docs/zh/part14/p03_llava_instruct.md index 5519bfaf..f8eb6800 100644 --- a/docs/zh/part14/p03_llava_instruct.md +++ b/docs/zh/part14/p03_llava_instruct.md @@ -1136,8 +1136,8 @@ P03 这类项目特别适合把高频错误沉淀为 replay 集。比如: ## 参考文献 -1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. -2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. +1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. https://doi.org/10.52202/075280-1516. +2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. https://doi.org/10.1007/978-3-319-10602-1_48. 3. Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., et al. (2021). Learning Transferable Visual Models From Natural Language Supervision. ICML 2021. -4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. -5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. +4. 
Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225. +5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. https://doi.org/10.18653/v1/2022.findings-acl.177. diff --git a/docs/zh/part14/p05_mm_rag.md b/docs/zh/part14/p05_mm_rag.md index a8928144..fc9e61e9 100644 --- a/docs/zh/part14/p05_mm_rag.md +++ b/docs/zh/part14/p05_mm_rag.md @@ -1210,8 +1210,8 @@ P05 之所以适合往工作台方向扩展,是因为它已经具备几个重 ## 参考文献 -1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. -2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. +1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. https://doi.org/10.52202/075280-1516. +2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. https://doi.org/10.1007/978-3-319-10602-1_48. 3. Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., et al. (2021). Learning Transferable Visual Models From Natural Language Supervision. ICML 2021. -4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. -5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. +4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225. +5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. 
https://doi.org/10.18653/v1/2022.findings-acl.177. diff --git a/docs/zh/part14/p06_prm.md b/docs/zh/part14/p06_prm.md index 2a32a9d1..a3084d4d 100644 --- a/docs/zh/part14/p06_prm.md +++ b/docs/zh/part14/p06_prm.md @@ -1137,8 +1137,8 @@ P06 这类项目还有一项很值得长期保留的资产,就是 replay 集 ## 参考文献 -1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. +1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. arXiv:2201.11903. 2. Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., & Cobbe, K. (2023). Let's Verify Step by Step. arXiv:2305.20050. 3. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. -4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. -5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. +4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948. +5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874. diff --git a/docs/zh/part14/p09_privacy_pipeline.md b/docs/zh/part14/p09_privacy_pipeline.md index 2a3a0439..c160a4c3 100644 --- a/docs/zh/part14/p09_privacy_pipeline.md +++ b/docs/zh/part14/p09_privacy_pipeline.md @@ -1127,7 +1127,7 @@ P09 当前以规则、策略和运行流程为主,这很适合作为最小可 ## 参考文献 1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation. 
https://eur-lex.europa.eu/eli/reg/2016/679/oj. -2. NIST. (2020). NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. +2. NIST. (2020). NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. https://doi.org/10.6028/nist.cswp.10. 3. Dwork, C., & Roth, A. (2014). The Algorithmic Foundations of Differential Privacy. Foundations and Trends in Theoretical Computer Science. 4. Kairouz, P., McMahan, H. B., Avent, B., Bellet, A., Bennis, M., Bhagoji, A. N., et al. (2021). Advances and Open Problems in Federated Learning. Foundations and Trends in Machine Learning. 5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/. diff --git a/docs/zh/part14/p11_mini_deepseek.md b/docs/zh/part14/p11_mini_deepseek.md index 1850bbeb..12f27df5 100644 --- a/docs/zh/part14/p11_mini_deepseek.md +++ b/docs/zh/part14/p11_mini_deepseek.md @@ -62,7 +62,7 @@ Mini-DeepSeek;项目实战;可复现数据工程;数据流水线;验收 ## 背景与目标 -在预训练数据工程中,“按比例缩放(Scaling Laws)”(Kaplan et al. 2020) 不仅适用于模型参数,同样适用于数据配方的实验与验证。我们在前作 项目 1(Mini-C4)中,已经走通了单源语料的清洗流水线;但真实的工业级大模型(如 DeepSeek-V3 (Liu et al. 2024))从来不是在单一语料上训练出来的,而是由网页、代码、数学、学术论文等多种数据源精确混合而成。 +在预训练数据工程中,“按比例缩放(Scaling Laws)”(Kaplan et al. 2020) 不仅适用于模型参数,同样适用于数据配方的实验与验证。我们在前作 项目 1(Mini-C4)中,已经走通了单源语料的清洗流水线;但真实的工业级大模型(如 DeepSeek-V3 (DeepSeek-AI et al. 2024))从来不是在单一语料上训练出来的,而是由网页、代码、数学、学术论文等多种数据源精确混合而成。 为什么我们需要一个 Mini 版的预训练流水线? @@ -171,7 +171,7 @@ unique.save_to_disk("./data/mixed_1b_dedup") ### Step 3: 训练 150K 超大 Tokenizer -DeepSeek-V3 (Liu et al. 2024) 采用了一个规模为 150K 左右的超大词表(相较于 Llama-2 的 32K 提升巨大),这使其在处理中文与代码时效率极高。在此步骤,我们将以混合且去重后的数据训练 BPE Tokenizer。 +DeepSeek-V3 (DeepSeek-AI et al. 
2024) 采用了一个规模为 150K 左右的超大词表(相较于 Llama-2 的 32K 提升巨大),这使其在处理中文与代码时效率极高。在此步骤,我们将以混合且去重后的数据训练 BPE Tokenizer。 ```python from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers diff --git a/docs/zh/part14/p12_r1_reasoning_flywheel.md b/docs/zh/part14/p12_r1_reasoning_flywheel.md index 148bf0e1..ffe8371f 100644 --- a/docs/zh/part14/p12_r1_reasoning_flywheel.md +++ b/docs/zh/part14/p12_r1_reasoning_flywheel.md @@ -504,7 +504,7 @@ Guo D, Yang D, Zhang H, Song J, Zhang R, Xu R, Zhu Q, Ma S, Wang P, Bi X, others Guha E, Marten R, Keh S, Raoof N, Smyrnis G, Bansal H, Nezhurina M, Mercat J, Vu T, Sprague Z, others (2025) OpenThoughts: Data Recipes for Reasoning Models. arXiv preprint arXiv:2506.04178. -Hendrycks D, Burns C, Kadavath S, Arora A, Basart S, Tang E, Song D, Steinhardt J (2021) Measuring Mathematical Problem Solving with the MATH Dataset. In: Advances in Neural Information Processing Systems 34:24262-24273. +Hendrycks D, Burns C, Kadavath S, Arora A, Basart S, Tang E, Song D, Steinhardt J (2021) Measuring Mathematical Problem Solving with the MATH Dataset. In: Advances in Neural Information Processing Systems 34:24262-24273. arXiv:2103.03874. Hui B, Yang J, Cui Z, Yang J, Liu D, Zhang L, Liu B, Yu B, Lu K, Chi K, others (2024) Qwen2.5 Technical Report. arXiv preprint arXiv:2412.15115. diff --git a/docs/zh/part14/p13_multimodal_instruction_factory.md b/docs/zh/part14/p13_multimodal_instruction_factory.md index 1df75305..39c45497 100644 --- a/docs/zh/part14/p13_multimodal_instruction_factory.md +++ b/docs/zh/part14/p13_multimodal_instruction_factory.md @@ -547,10 +547,10 @@ Bai S, Chen K, Liu X, Wang J, Ge W, Song S, Dang K, Wang P, Wang S, Tang J, et a Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, et al. (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479. 
-Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the 29th ACM Symposium on Operating Systems Principles, pp 611-626. +Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the 29th ACM Symposium on Operating Systems Principles, pp 611-626. https://doi.org/10.1145/3600006.3613165. -Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, et al. (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35, pp 25278-25294. +Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, et al. (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35, pp 25278-25294. Available at: https://arxiv.org/abs/2210.08402. -Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. +Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. arXiv:2203.11171. -Zheng L, Chiang W L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E P, Zhang H, Gonzalez J E, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. 
+Zheng L, Chiang W L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E P, Zhang H, Gonzalez J E, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. arXiv:2306.05685. diff --git a/docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md b/docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md index e5e77e10..52ec0cca 100644 --- a/docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md +++ b/docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md @@ -902,7 +902,7 @@ NL2SQL -> CSV -> 图表 -> Markdown 报告 -> 业务交付 ## 参考文献 1. Yu, T., Zhang, R., Yang, K., Yasunaga, M., Wang, D., Li, Z., et al. (2018). Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task. EMNLP 2018. -2. Wang, B., Shin, R., Liu, X., Polozov, O., & Richardson, M. (2020). RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers. ACL 2020. +2. Wang, B., Shin, R., Liu, X., Polozov, O., & Richardson, M. (2020). RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers. ACL 2020. https://doi.org/10.18653/v1/2020.acl-main.677. 3. Schick, T., Dwivedi-Yu, J., Dessì, R., Raileanu, R., Lomeli, M., Hambro, E., Zettlemoyer, L., Cancedda, N., & Scialom, T. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. arXiv:2302.04761. 4. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. 5. dbt Labs. (2026). dbt Documentation. https://docs.getdbt.com/. diff --git a/docs/zh/part2/ch05_cleaning_dedup.md b/docs/zh/part2/ch05_cleaning_dedup.md index 0ee6cdc4..e4f88ff5 100644 --- a/docs/zh/part2/ch05_cleaning_dedup.md +++ b/docs/zh/part2/ch05_cleaning_dedup.md @@ -411,7 +411,7 @@ def detect_and_redact_pii( return text, found ``` -**命名实体识别(NER)模型**则覆盖规则难以枚举的 PII 类型,如真实人名、地址和机构名。推荐使用 spaCy (Honnibal et al. 
2020) 的中文模型(`zh_core_web_trf`)或 HuggingFace 上开源的中文 NER 模型,对人名(PER)、地点(LOC)、机构(ORG)等命名实体进行识别,再根据上下文判断是否需要脱敏。 +**命名实体识别(NER)模型**则覆盖规则难以枚举的 PII 类型,如真实人名、地址和机构名。推荐使用 spaCy (Honnibal et al. 2023) 的中文模型(`zh_core_web_trf`)或 HuggingFace 上开源的中文 NER 模型,对人名(PER)、地点(LOC)、机构(ORG)等命名实体进行识别,再根据上下文判断是否需要脱敏。 --- @@ -624,11 +624,11 @@ class DocumentQualityScore: ## 参考文献 -Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. +Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900. Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. -Honnibal M, Montani I, Van Landeghem S, Boyd A (2020) spaCy: Industrial-strength Natural Language Processing in Python. Available at: https://spacy.io/ (Accessed 2024-11). +Honnibal M, Montani I, Van Landeghem S, Boyd A (2023) explosion/spaCy: v3.7.2: Fixes for APIs and requirements. Zenodo. https://doi.org/10.5281/zenodo.1212303. Indyk P, Motwani R (1998) Approximate Nearest Neighbors: Towards Removing the Curse of Dimensionality. In: Proceedings of the 30th Annual ACM Symposium on Theory of Computing, pp 604-613. diff --git a/docs/zh/part2/ch06_tokenization_loading.md b/docs/zh/part2/ch06_tokenization_loading.md index 693eb764..95256ddf 100644 --- a/docs/zh/part2/ch06_tokenization_loading.md +++ b/docs/zh/part2/ch06_tokenization_loading.md @@ -462,7 +462,7 @@ dataloader = DataLoader( Bengio Y, Louradour J, Collobert R, Weston J (2009) Curriculum Learning. In: Proceedings of the 26th Annual International Conference on Machine Learning, pp 41-48.
-Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. In: Advances in Neural Information Processing Systems 33, pp 1877-1901. +Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. In: Advances in Neural Information Processing Systems 33, pp 1877-1901. arXiv:2005.14165. Grattafiori A, Dubey A, Jauhri A, Pandey A, Kadian A, Al-Dahle A, Letman A, Mathur A, Schelten A, Vaughan A, others (2024) The Llama 3 Herd of Models. arXiv preprint arXiv:2407.21783. diff --git a/docs/zh/part2/ch07_data_operations.md b/docs/zh/part2/ch07_data_operations.md index 63532dbf..5547620b 100644 --- a/docs/zh/part2/ch07_data_operations.md +++ b/docs/zh/part2/ch07_data_operations.md @@ -388,7 +388,7 @@ Chen M, Tworek J, Jun H, Yuan Q, Pinto H P d O, Kaplan J, Edwards H, Burda Y, Jo Cobbe K, Kosaraju V, Bavarian M, Chen M, Jun H, Kaiser L, Plappert M, Tworek J, Hilton J, Nakano R, Hesse C, Schulman J (2021) Training Verifiers to Solve Math Word Problems (GSM8K). arXiv preprint arXiv:2110.14168. -Covington M A, McFall J D (2010) Cutting the Gordian Knot: The Moving-Average Type–Token Ratio (MATTR). Journal of Quantitative Linguistics 17(2):94-100. +Covington M A, McFall J D (2010) Cutting the Gordian Knot: The Moving-Average Type–Token Ratio (MATTR). Journal of Quantitative Linguistics 17(2):94-100. https://doi.org/10.1080/09296171003643098. 
Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. @@ -398,9 +398,9 @@ Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A Ne DVC Team and Contributors (2024) DVC: Data Version Control - Git for Data & Models. Documentation: . Source repository: . -Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28. +Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28. https://doi.org/10.1145/3299887.3299891. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the ACM CHI Conference on Human Factors in Computing Systems, pp 1-15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the ACM CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. Whang S E, Roh Y, Song H, Lee J G (2023) Data Collection and Quality Challenges in Deep Learning: A Data-Centric AI Perspective. The VLDB Journal 32(4):791-813. 
diff --git a/docs/zh/part3/ch08_multimodal_image.md b/docs/zh/part3/ch08_multimodal_image.md index 0c8bd130..6f21f6c9 100644 --- a/docs/zh/part3/ch08_multimodal_image.md +++ b/docs/zh/part3/ch08_multimodal_image.md @@ -320,11 +320,11 @@ Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehg Gadre S Y, Ilharco G, Fang A, Hayase J, Smyrnis G, Nguyen T, Marten R, Wortsman M, Ghosh D, Zhang J, Orgad E, Entezari R, Daras G, Pratt S, Ramanujan V, Bitton Y, Marathe K, Mussmann S, Vencu R, Cherti M, Krishna R, Koh P W, Saukh O, Ratner A, Song S, Hajishirzi H, Farhadi A, Beaumont R, Oh S, Dimakis A, Jitsev J, Carmon Y, Shankar V, Schmidt L (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. Advances in Neural Information Processing Systems 36. -Laurençon H, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, Cord M, Wolf T (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. Advances in Neural Information Processing Systems 36. +Laurençon H, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, Cord M, Wolf T (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. Advances in Neural Information Processing Systems 36. arXiv:2306.16527. Liu H, Li C, Wu Q, Lee Y J (2023) Visual Instruction Tuning (LLaVA). Advances in Neural Information Processing Systems 36:34892-34916. -Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 26296-26306. +Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 26296-26306. https://doi.org/10.1109/cvpr52733.2024.02484. NVIDIA (2023) NVIDIA Data Loading Library (DALI). GitHub repository. . 
@@ -332,9 +332,9 @@ Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, M Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, Schramowski P, Kundurthy S, Crowson K, Schmidt L, Kaczmarczyk R, Jitsev J (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. Advances in Neural Information Processing Systems 35:25278-25294. -Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing Systems 36. +Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing Systems 36. arXiv:2304.06939. -Zhai X, Mustafa B, Kolesnikov A, Beyer L (2023) Sigmoid Loss for Language Image Pre-Training (SigLIP). In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 11975-11986. +Zhai X, Mustafa B, Kolesnikov A, Beyer L (2023) Sigmoid Loss for Language Image Pre-Training (SigLIP). In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 11975-11986. https://doi.org/10.1109/iccv51070.2023.01100. Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, Gao Z, Cui E, Cao Y, Liu Y, Xu W, Li H, Wang J, Lv H, Chen D, Li S, He Y, Jiang T, Luo J, Wang Y, He C, Shi B, Zhang X, Shao W, He J, Xiong Y, Qu W, Sun P, Jiao P, Wu L, Zhang K, Deng H, Ge J, Chen K, Wang L, Dou M, Lu L, Zhu X, Lu T, Lin D, Qiao Y, Dai J, Wang W (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479. 
diff --git a/docs/zh/part3/ch09_recaptioning_ocr.md b/docs/zh/part3/ch09_recaptioning_ocr.md index 4781b8b7..c2dce5b0 100644 --- a/docs/zh/part3/ch09_recaptioning_ocr.md +++ b/docs/zh/part3/ch09_recaptioning_ocr.md @@ -270,9 +270,9 @@ Blecher L, Cucurull G, Scialom T, Stojnic R (2023) Nougat: Neural Optical Unders Fu C, Chen P, Shen Y, Qin Y, Zhang M, Lin X, Qiu Z, Lin W, Yang J, Zheng X, Li K, Sun X, Wu E (2023) MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394. -Dou Z Y, Xu Y, Gan Z, Wang J, Wang S, Wang L, Zhu C, Zhang P, Yuan L, Peng N, Liu Z (2022) Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (FIBER). Advances in Neural Information Processing Systems 35:32942-32956. +Dou Z Y, Xu Y, Gan Z, Wang J, Wang S, Wang L, Zhu C, Zhang P, Yuan L, Peng N, Liu Z (2022) Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (FIBER). Advances in Neural Information Processing Systems 35:32942-32956. https://doi.org/10.52202/068431-2387. -Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. +Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. https://doi.org/10.1145/3503161.3548112. Kim G, Moon S, Xu R, Yim J, Park J, Seo J, Baek J, Yoo M, Park S, Park S (2022) OCR-Free Document Understanding Transformer (Donut). In: European Conference on Computer Vision, pp 498-517. @@ -280,17 +280,17 @@ Kirillov A, Mintun E, Ravi N, Mao H, Rolland C, Gustafson L, Xiao T, Whitehead S Lee J, Jia M, Sangkloy P, Krishnamurthy J, Han S, Chang S F, Hutchinson B (2023) Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding. 
In: Proceedings of the 40th International Conference on Machine Learning, pp 18893-18912. -Li L H, Zhang P, Zhang H, Yang J, Li C, Zhong Y, Wang L, Yuan L, Zhang L, Hwang J N, Chang K W, Gao J (2022) Grounded Language-Image Pre-training (GLIP). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 10965-10975. +Li L H, Zhang P, Zhang H, Yang J, Li C, Zhong Y, Wang L, Yuan L, Zhang L, Hwang J N, Chang K W, Gao J (2022) Grounded Language-Image Pre-training (GLIP). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 10965-10975. https://doi.org/10.1109/cvpr52688.2022.01069. Liu H, Li C, Wu Q, Lee Y J (2023b) MMBench: Is Your Multi-modal Model an All-around Player? arXiv preprint arXiv:2307.06281. Liu S, Zeng Z, Ren T, Li F, Zhang H, Yang J, Li C, Yang J, Su H, Zhu J, Zhang L (2023c) Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection. arXiv preprint arXiv:2303.05499. -Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: CVPR 2024, pp 26296-26306. +Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: CVPR 2024, pp 26296-26306. https://doi.org/10.1109/cvpr52733.2024.02484. Lu P, Qiu L, Chang K W, Zhu W, Rajpurohit T, Clark P, Kalyan A (2022) Dynamic Prompt Learning via Policy Gradient for Semi-structured Mathematical Reasoning (TabMWP). arXiv preprint arXiv:2209.14610. -Masry A, Long D, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263-2279. +Masry A, Long D, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263-2279. https://doi.org/10.18653/v1/2022.findings-acl.177. 
Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning Transferable Visual Models From Natural Language Supervision (CLIP). In: ICML 2021, pp 8748-8763. diff --git a/docs/zh/part3/ch10_video_audio.md b/docs/zh/part3/ch10_video_audio.md index 9963ca46..39b152b8 100644 --- a/docs/zh/part3/ch10_video_audio.md +++ b/docs/zh/part3/ch10_video_audio.md @@ -345,7 +345,7 @@ DataLoader worker 0: Pipe broken, resetting shard iterator. Skipping shard. Bain M, Huh J, Han T, Zisserman A (2023) WhisperX: Time-Accurate Speech Transcription of Long-Form Audio. arXiv preprint arXiv:2303.00747. -Bredin H, Yin R, Coria J M, Gelly G, Korshunov P, Lavechin M, Fustes D, Titeux H, Bouaziz W, Gill M P (2020) pyannote.audio: Neural Building Blocks for Speaker Diarization. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 7124-7128. +Bredin H, Yin R, Coria J M, Gelly G, Korshunov P, Lavechin M, Fustes D, Titeux H, Bouaziz W, Gill M P (2020) pyannote.audio: Neural Building Blocks for Speaker Diarization. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 7124-7128. https://doi.org/10.1109/icassp40776.2020.9052974. Brooks T, Peebles B, Holmes C, DePue W, Guo Y, Jing L, Schnurr D, Taylor J, Luhman T, Luhman E, Lyu C, Ying P (2024) Video Generation Models as World Simulators (Sora). OpenAI Technical Report. diff --git a/docs/zh/part3/ch11_cross_modal_alignment.md b/docs/zh/part3/ch11_cross_modal_alignment.md index 7893e118..e8d69744 100644 --- a/docs/zh/part3/ch11_cross_modal_alignment.md +++ b/docs/zh/part3/ch11_cross_modal_alignment.md @@ -367,10 +367,10 @@ Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B (2022) High-Resolution Image Sakoe H, Chiba S (1978) Dynamic Programming Algorithm Optimization for Spoken Word Recognition (DTW). IEEE Transactions on Acoustics, Speech, and Signal Processing 26(1):43-49. 
-Salvador S, Chan P (2007) Toward Accurate Dynamic Time Warping in Linear Time and Space (FastDTW). Intelligent Data Analysis 11(5):561-580. +Salvador S, Chan P (2007) Toward Accurate Dynamic Time Warping in Linear Time and Space (FastDTW). Intelligent Data Analysis 11(5):561-580. https://doi.org/10.3233/ida-2007-11508. van den Oord A, Vinyals O, Kavukcuoglu K (2017) Neural Discrete Representation Learning (VQ-VAE). Advances in Neural Information Processing Systems 30. -Wu Y, Chen K, Zhang T, Hui Y, Berg-Kirkpatrick T, Dubnov S (2023) Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation (CLAP). In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 1-5. +Wu Y, Chen K, Zhang T, Hui Y, Berg-Kirkpatrick T, Dubnov S (2023) Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation (CLAP). In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 1-5. https://doi.org/10.1109/icassp49357.2023.10095969. Dufumier B, Castillo-Navarro J, Tuia D, Thiran J P (2025) What to Align in Multimodal Contrastive Learning? In: International Conference on Learning Representations. arXiv preprint arXiv:2409.07402. diff --git a/docs/zh/part4/ch12_sft.md b/docs/zh/part4/ch12_sft.md index 81dfd97a..d5a495c0 100644 --- a/docs/zh/part4/ch12_sft.md +++ b/docs/zh/part4/ch12_sft.md @@ -707,13 +707,13 @@ Askell, A., Bai, Y., Chen, A., Drain, D., Ganguli, D., Henighan, T., et al. (202 OpenAI. (2024). *Introducing Structured Outputs in the API*. OpenAI Blog, August 6, 2024; OpenAI API Documentation: *Structured Model Outputs*. Accessed May 14, 2026. -Singhal, K., Azizi, S., Tu, T., Mahdavi, S. S., Wei, J., Chung, H. W., et al. (2023). Large Language Models Encode Clinical Knowledge. Nature. +Singhal, K., Azizi, S., Tu, T., Mahdavi, S. S., Wei, J., Chung, H. W., et al. (2023). Large Language Models Encode Clinical Knowledge. Nature. 
https://doi.org/10.1038/s41586-023-06291-2. -Gebru, T., Morgenstern, J., Vecchione, B., Vaughan, J. W., Wallach, H., Daumé III, H., & Crawford, K. (2021). *Datasheets for Datasets*. Communications of the ACM, 64(12), 86–92. +Gebru, T., Morgenstern, J., Vecchione, B., Vaughan, J. W., Wallach, H., Daumé III, H., & Crawford, K. (2021). *Datasheets for Datasets*. Communications of the ACM, 64(12), 86–92. https://doi.org/10.1145/3458723. -Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). *Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI*. Proceedings of the ACM Conference on Fairness, Accountability, and Transparency, 1776–1826. +Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). *Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI*. Proceedings of the ACM Conference on Fairness, Accountability, and Transparency, 1776–1826. https://doi.org/10.1145/3531146.3533231. -Mitchell, M., Wu, S., Zaldivar, A., Barnes, P., Vasserman, L., Hutchinson, B., et al. (2019). *Model Cards for Model Reporting*. Proceedings of the Conference on Fairness, Accountability, and Transparency, 220–229. +Mitchell, M., Wu, S., Zaldivar, A., Barnes, P., Vasserman, L., Hutchinson, B., et al. (2019). *Model Cards for Model Reporting*. Proceedings of the Conference on Fairness, Accountability, and Transparency, 220–229. https://doi.org/10.1145/3287560.3287596. Liang, P., Bommasani, R., Lee, T., Tsipras, D., Soylu, D., Yasunaga, M., et al. (2022). *Holistic Evaluation of Language Models*. arXiv:2211.09110. diff --git a/docs/zh/part4/ch13_preference.md b/docs/zh/part4/ch13_preference.md index 553d8adf..4d27ec65 100644 --- a/docs/zh/part4/ch13_preference.md +++ b/docs/zh/part4/ch13_preference.md @@ -473,19 +473,19 @@ if __name__ == "__main__": ## 参考文献 -Christiano, P. F., Leike, J., Brown, T. B., Martic, M., Legg, S., & Amodei, D. (2017). Deep reinforcement learning from human preferences. 
*Advances in Neural Information Processing Systems*, 30. +Christiano, P. F., Leike, J., Brown, T. B., Martic, M., Legg, S., & Amodei, D. (2017). Deep reinforcement learning from human preferences. *Advances in Neural Information Processing Systems*, 30. arXiv:1706.03741. Ziegler, D. M., Stiennon, N., Wu, J., Brown, T. B., Radford, A., Amodei, D., Christiano, P., & Irving, G. (2019). Fine-tuning language models from human preferences. *arXiv preprint arXiv:1909.08593*. -Stiennon, N., Ouyang, L., Wu, J., Ziegler, D. M., Lowe, R., Voss, C., Radford, A., Amodei, D., & Christiano, P. (2020). Learning to summarize from human feedback. *Advances in Neural Information Processing Systems*, 33, 3008–3021. +Stiennon, N., Ouyang, L., Wu, J., Ziegler, D. M., Lowe, R., Voss, C., Radford, A., Amodei, D., & Christiano, P. (2020). Learning to summarize from human feedback. *Advances in Neural Information Processing Systems*, 33, 3008–3021. arXiv:2009.01325. Askell, A., Bai, Y., Chen, A., et al. (2021). A general language assistant as a laboratory for alignment. *arXiv preprint arXiv:2112.00861*. -Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training language models to follow instructions with human feedback. *Advances in Neural Information Processing Systems*, 35, 27730–27744. +Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training language models to follow instructions with human feedback. *Advances in Neural Information Processing Systems*, 35, 27730–27744. arXiv:2203.02155. Bai, Y., Jones, A., Ndousse, K., et al. (2022a). Training a helpful and harmless assistant with reinforcement learning from human feedback. *arXiv preprint arXiv:2204.05862*. -Rafailov, R., Sharma, A., Mitchell, E., et al. (2023). Direct preference optimization: Your language model is secretly a reward model. *Advances in Neural Information Processing Systems*, 36, 53728–53741. +Rafailov, R., Sharma, A., Mitchell, E., et al. (2023). 
Direct preference optimization: Your language model is secretly a reward model. *Advances in Neural Information Processing Systems*, 36, 53728–53741. arXiv:2305.18290. Bai, Y., Kadavath, S., Kundu, S., et al. (2022b). Constitutional AI: Harmlessness from AI feedback. *arXiv preprint arXiv:2212.08073*. @@ -495,26 +495,26 @@ Lightman, H., Kosaraju, V., Burda, Y., et al. (2024). Let's verify step by step. Uesato, J., Kushman, N., Kumar, R., et al. (2022). Solving math word problems with process- and outcome-based feedback. *arXiv preprint arXiv:2211.14275*. -Bradley, R. A., & Terry, M. E. (1952). Rank analysis of incomplete block designs: I. The method of paired comparisons. *Biometrika*, 39(3/4), 324–345. +Bradley, R. A., & Terry, M. E. (1952). Rank analysis of incomplete block designs: I. The method of paired comparisons. *Biometrika*, 39(3/4), 324–345. https://doi.org/10.2307/2334029. Roijers, D. M., Vamplew, P., Whiteson, S., et al. (2013). A survey of multi-objective sequential decision-making. *Journal of Artificial Intelligence Research*, 48, 67–113. -Deb, K., Pratap, A., Agarwal, S., et al. (2002). A fast and elitist multiobjective genetic algorithm: NSGA-II. *IEEE Transactions on Evolutionary Computation*, 6(2), 182–197. +Deb, K., Pratap, A., Agarwal, S., et al. (2002). A fast and elitist multiobjective genetic algorithm: NSGA-II. *IEEE Transactions on Evolutionary Computation*, 6(2), 182–197. https://doi.org/10.1109/4235.996017. -Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46. +Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46. https://doi.org/10.1177/001316446002000104. -Dawid, A. P., & Skene, A. M. (1979). Maximum likelihood estimation of observer error-rates using the EM algorithm. *Journal of the Royal Statistical Society: Series C (Applied Statistics)*, 28(1), 20–28. +Dawid, A. P., & Skene, A. M. (1979). 
Maximum likelihood estimation of observer error-rates using the EM algorithm. *Journal of the Royal Statistical Society: Series C (Applied Statistics)*, 28(1), 20–28. https://doi.org/10.2307/2346806. Snow, R., O'Connor, B., Jurafsky, D., et al. (2008). Cheap and fast—but is it good? Evaluating non-expert annotations for natural language tasks. *Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing*, 254–263. -Aroyo, L., & Welty, C. (2015). Truth is a lie: Crowd truth and the seven myths of human annotation. *AI Magazine*, 36(1), 15–24. +Aroyo, L., & Welty, C. (2015). Truth is a lie: Crowd truth and the seven myths of human annotation. *AI Magazine*, 36(1), 15–24. https://doi.org/10.1609/aimag.v36i1.2564. -Northcutt, C. G., Jiang, L., & Chuang, I. L. (2021). Confident learning: Estimating uncertainty in dataset labels. *Journal of Artificial Intelligence Research*, 70, 1373–1411. +Northcutt, C. G., Jiang, L., & Chuang, I. L. (2021). Confident learning: Estimating uncertainty in dataset labels. *Journal of Artificial Intelligence Research*, 70, 1373–1411. https://doi.org/10.1613/jair.1.12125. -Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for datasets. *Communications of the ACM*, 64(12), 86–92. +Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for datasets. *Communications of the ACM*, 64(12), 86–92. https://doi.org/10.1145/3458723. -Bender, E. M., & Friedman, B. (2018). Data statements for natural language processing: Toward mitigating system bias and enabling better science. *Transactions of the Association for Computational Linguistics*, 6, 587–604. +Bender, E. M., & Friedman, B. (2018). Data statements for natural language processing: Toward mitigating system bias and enabling better science. *Transactions of the Association for Computational Linguistics*, 6, 587–604. https://doi.org/10.1162/tacl_a_00041. -Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). 
Model cards for model reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229. +Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). Model cards for model reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229. https://doi.org/10.1145/3287560.3287596. Liang, P., Bommasani, R., Lee, T., et al. (2022). Holistic evaluation of language models. *arXiv preprint arXiv:2211.09110*. diff --git a/docs/zh/part4/ch14_qa.md b/docs/zh/part4/ch14_qa.md index 72e88006..bda3ed29 100644 --- a/docs/zh/part4/ch14_qa.md +++ b/docs/zh/part4/ch14_qa.md @@ -647,7 +647,7 @@ Wei, J., Bosma, M., Zhao, V. Y., et al. (2022). *Finetuned Language Models Are Z Ouyang, L., Wu, J., Jiang, X., et al. (2022). *Training Language Models to Follow Instructions with Human Feedback*. Advances in Neural Information Processing Systems, 35, 27730–27744. arXiv:2203.02155. -Christiano, P. F., Leike, J., Brown, T. B., et al. (2017). *Deep Reinforcement Learning from Human Preferences*. Advances in Neural Information Processing Systems, 30. +Christiano, P. F., Leike, J., Brown, T. B., et al. (2017). *Deep Reinforcement Learning from Human Preferences*. Advances in Neural Information Processing Systems, 30. arXiv:1706.03741. Stiennon, N., Ouyang, L., Wu, J., et al. (2020). *Learning to Summarize from Human Feedback*. Advances in Neural Information Processing Systems, 33, 3008–3021. arXiv:2009.01325. diff --git a/docs/zh/part6/ch19_tool.md b/docs/zh/part6/ch19_tool.md index b98a8957..92c19f89 100644 --- a/docs/zh/part6/ch19_tool.md +++ b/docs/zh/part6/ch19_tool.md @@ -529,17 +529,17 @@ Parisi, A., Zhao, Y., & Fiedel, N. (2022). TALM: Tool Augmented Language Models. Nakano, R., Hilton, J., Balaji, S., et al. (2021). WebGPT: Browser-Assisted Question-Answering with Human Feedback. arXiv:2112.09332. -Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. 
International Conference on Learning Representations. +Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. arXiv:2210.03629. -Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. +Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. arXiv:2302.04761. -Li, M., Zhao, Y., Yu, B., et al. (2023). API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs. Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 3102–3116. +Li, M., Zhao, Y., Yu, B., et al. (2023). API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs. Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 3102–3116. https://doi.org/10.18653/v1/2023.emnlp-main.187. Qin, Y., Liang, S., Ye, Y., et al. (2024). ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs. International Conference on Learning Representations. -Patil, S. G., Zhang, T., Wang, X., & Gonzalez, J. E. (2024). Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 37. +Patil, S. G., Zhang, T., Wang, X., & Gonzalez, J. E. (2024). Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 37. arXiv:2305.15334. -Zhuang, Y., Yu, Y., Wang, K., et al. (2023). ToolQA: A Dataset for LLM Question Answering with External Tools. Advances in Neural Information Processing Systems, 36. +Zhuang, Y., Yu, Y., Wang, K., et al. (2023). ToolQA: A Dataset for LLM Question Answering with External Tools. Advances in Neural Information Processing Systems, 36. https://doi.org/10.52202/075280-2180. Huang, Y., Shi, J., Li, Y., et al. 
(2023). MetaTool Benchmark for Large Language Models: Deciding Whether to Use Tools and Which to Use. arXiv:2310.03128. @@ -547,13 +547,13 @@ Patil, S. G., Mao, H., Yan, F., et al. (2025). The Berkeley Function Calling Lea Yao, S., Shinn, N., Razavi, P., & Narasimhan, K. (2025). τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains. International Conference on Learning Representations. -Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. Advances in Neural Information Processing Systems, 36. +Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. Advances in Neural Information Processing Systems, 36. arXiv:2303.11366. -Yang, J., Jimenez, C. E., Wettig, A., et al. (2024). SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering. Advances in Neural Information Processing Systems, 37. +Yang, J., Jimenez, C. E., Wettig, A., et al. (2024). SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering. Advances in Neural Information Processing Systems, 37. https://doi.org/10.52202/079017-1601. Ruan, Y., Dong, H., Wang, A., et al. (2024). Identifying the Risks of LM Agents with an LM-Emulated Sandbox. International Conference on Learning Representations. -Greshake, K., Abdelnabi, S., Mishra, S., et al. (2023). Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, 79–90. +Greshake, K., Abdelnabi, S., Mishra, S., et al. (2023). Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, 79–90. https://doi.org/10.1145/3605764.3623985. Liu, Y., Deng, G., Li, Y., et al. (2023). 
Prompt Injection Attack against LLM-Integrated Applications. arXiv:2306.05499. diff --git a/docs/zh/part6/ch20_agent.md b/docs/zh/part6/ch20_agent.md index 2d11cc18..1f7ed8e8 100644 --- a/docs/zh/part6/ch20_agent.md +++ b/docs/zh/part6/ch20_agent.md @@ -483,29 +483,29 @@ AI 助手类场景中的多轮数据,通常最强调长期偏好与轻量任 Young, S., Gašić, M., Thomson, B., & Williams, J. D. (2013). *POMDP-Based Statistical Spoken Dialog Systems: A Review*. Proceedings of the IEEE, 101(5), 1160–1179. https://doi.org/10.1109/JPROC.2012.2225812. -Williams, J. D., Raux, A., Ramachandran, D., & Black, A. (2013). *The Dialog State Tracking Challenge*. Proceedings of the SIGDIAL 2013 Conference, 404–413. +Williams, J. D., Raux, A., Ramachandran, D., & Black, A. (2013). *The Dialog State Tracking Challenge*. Proceedings of the SIGDIAL 2013 Conference, 404–413. https://aclanthology.org/W13-4065/. -Budzianowski, P., Wen, T.-H., Tseng, B.-H., et al. (2018). *MultiWOZ - A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling*. Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 5016–5026. +Budzianowski, P., Wen, T.-H., Tseng, B.-H., et al. (2018). *MultiWOZ - A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling*. Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 5016–5026. https://doi.org/10.18653/v1/d18-1547. -Yao, S., Zhao, J., Yu, D., et al. (2023). *ReAct: Synergizing Reasoning and Acting in Language Models*. International Conference on Learning Representations. +Yao, S., Zhao, J., Yu, D., et al. (2023). *ReAct: Synergizing Reasoning and Acting in Language Models*. International Conference on Learning Representations. arXiv:2210.03629. -Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). *Toolformer: Language Models Can Teach Themselves to Use Tools*. Advances in Neural Information Processing Systems, 36. +Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023).
*Toolformer: Language Models Can Teach Themselves to Use Tools*. Advances in Neural Information Processing Systems, 36. arXiv:2302.04761. Liu, N. F., Lin, K., Hewitt, J., et al. (2024a). *Lost in the Middle: How Language Models Use Long Contexts*. Transactions of the Association for Computational Linguistics, 12, 157–173. https://doi.org/10.1162/tacl_a_00638. Packer, C., Wooders, S., Lin, K., et al. (2023). *MemGPT: Towards LLMs as Operating Systems*. arXiv:2310.08560. -Wang, W., Dong, L., Cheng, H., et al. (2023). *Augmenting Language Models with Long-Term Memory*. Advances in Neural Information Processing Systems, 36. +Wang, W., Dong, L., Cheng, H., et al. (2023). *Augmenting Language Models with Long-Term Memory*. Advances in Neural Information Processing Systems, 36. https://doi.org/10.52202/075280-3259. Zhong, W., Guo, L., Gao, Q., et al. (2024). *MemoryBank: Enhancing Large Language Models with Long-Term Memory*. Proceedings of the AAAI Conference on Artificial Intelligence, 38(17), 19724–19731. https://doi.org/10.1609/aaai.v38i17.29946. Park, J. S., O’Brien, J. C., Cai, C. J., et al. (2023). *Generative Agents: Interactive Simulacra of Human Behavior*. Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology. -Lewis, P., Perez, E., Piktus, A., et al. (2020). *Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks*. Advances in Neural Information Processing Systems, 33, 9459–9474. +Lewis, P., Perez, E., Piktus, A., et al. (2020). *Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks*. Advances in Neural Information Processing Systems, 33, 9459–9474. arXiv:2005.11401. Asai, A., Wu, Z., Wang, Y., et al. (2024). *Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection*. International Conference on Learning Representations. -Shinn, N., Cassano, F., Gopinath, A., et al. (2023). *Reflexion: Language Agents with Verbal Reinforcement Learning*. 
Advances in Neural Information Processing Systems, 36. +Shinn, N., Cassano, F., Gopinath, A., et al. (2023). *Reflexion: Language Agents with Verbal Reinforcement Learning*. Advances in Neural Information Processing Systems, 36. arXiv:2303.11366. Liu, X., Yu, H., Zhang, H., et al. (2024b). *AgentBench: Evaluating LLMs as Agents*. International Conference on Learning Representations. diff --git a/docs/zh/part7/ch21_rag_pipeline.md b/docs/zh/part7/ch21_rag_pipeline.md index ade4cc5f..34210690 100644 --- a/docs/zh/part7/ch21_rag_pipeline.md +++ b/docs/zh/part7/ch21_rag_pipeline.md @@ -931,35 +931,35 @@ if __name__ == "__main__": ## 参考文献 -Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. +Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. arXiv:2005.11401. Gao Y, Xiong Y, Gao X, Jia K, Pan J, Bi Y, Dai Y, Sun J, Wang M, Wang H (2023) Retrieval-Augmented Generation for Large Language Models: A Survey. arXiv preprint arXiv:2312.10997. Guu K, Lee K, Tung Z, Pasupat P, Chang M-W (2020) REALM: Retrieval-Augmented Language Model Pre-Training. In: Proceedings of the 37th International Conference on Machine Learning (ICML), pp 3929–3938. -Izacard G, Grave E (2021) Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics (EACL), pp 874–880. +Izacard G, Grave E (2021) Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering. 
In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics (EACL), pp 874–880. https://doi.org/10.18653/v1/2021.eacl-main.74. -Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision (ECCV). +Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision (ECCV). https://doi.org/10.1007/978-3-031-19815-1_29. -Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. +Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. https://doi.org/10.1145/3394486.3403172. -Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. +Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. https://doi.org/10.1145/3503161.3548112. -Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. +Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. 
https://doi.org/10.1109/iccv48922.2021.00103. -Smock B, Pesala R, Abraham R (2022) PubTables-1M: Towards Comprehensive Table Extraction from Unstructured Documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4634–4642. +Smock B, Pesala R, Abraham R (2022) PubTables-1M: Towards Comprehensive Table Extraction from Unstructured Documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4634–4642. https://doi.org/10.1109/cvpr52688.2022.00459. -Liu N F, Lin K, Hewitt J, Paranjape A, Bevilacqua M, Petroni F, Liang P (2024) Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12:157–173. +Liu N F, Lin K, Hewitt J, Paranjape A, Bevilacqua M, Petroni F, Liang P (2024) Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12:157–173. https://doi.org/10.1162/tacl_a_00638. -Karpukhin V, Oğuz B, Min S, Lewis P, Wu L, Edunov S, Chen D, Yih W-t (2020) Dense Passage Retrieval for Open-Domain Question Answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 6769–6781. +Karpukhin V, Oğuz B, Min S, Lewis P, Wu L, Edunov S, Chen D, Yih W-t (2020) Dense Passage Retrieval for Open-Domain Question Answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 6769–6781. https://doi.org/10.18653/v1/2020.emnlp-main.550. -Reimers N, Gurevych I (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp 3982–3992. +Reimers N, Gurevych I (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. 
In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp 3982–3992. https://doi.org/10.18653/v1/d19-1410. -Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150-158. +Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150-158. https://doi.org/10.18653/v1/2024.eacl-demo.16. -Niu C, Wu Y, Zhu J, Xu S, Shum K, Zhong R, Song J, Zhang T (2024) RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (ACL), pp 10862-10878. +Niu C, Wu Y, Zhu J, Xu S, Shum K, Zhong R, Song J, Zhang T (2024) RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (ACL), pp 10862-10878. https://doi.org/10.18653/v1/2024.acl-long.585. -Manning C D, Raghavan P, Schütze H (2008) Introduction to Information Retrieval. Cambridge University Press. +Manning C D, Raghavan P, Schütze H (2008) Introduction to Information Retrieval. Cambridge University Press. https://doi.org/10.1017/cbo9780511809071. Thakur N, Reimers N, Rücklé A, Srivastava A, Gurevych I (2021) BEIR: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models. In: Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks.
diff --git a/docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md b/docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md index 4acb73da..947aca45 100644 --- a/docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md +++ b/docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md @@ -542,17 +542,17 @@ $$ ## 参考文献 -Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. +Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. https://doi.org/10.1145/3394486.3403172. -Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. +Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. https://doi.org/10.1145/3503161.3548112. -Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. +Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. https://doi.org/10.1109/iccv48922.2021.00103. -Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 498–517. 
+Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 498–517. https://doi.org/10.1007/978-3-031-19815-1_29. -Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 2200–2209. +Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 2200–2209. https://doi.org/10.1109/wacv48630.2021.00225. -Mathew M, Bagal V, Tito R, Karatzas D, Valveny E, Jawahar C V (2022) InfographicVQA. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 1697–1706. +Mathew M, Bagal V, Tito R, Karatzas D, Valveny E, Jawahar C V (2022) InfographicVQA. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 1697–1706. https://doi.org/10.1109/wacv51458.2022.00264. Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning Transferable Visual Models From Natural Language Supervision. In: Proceedings of the 38th International Conference on Machine Learning (ICML), pp 8748–8763. @@ -562,11 +562,11 @@ Li J, Li D, Savarese S, Hoi S (2023) BLIP-2: Bootstrapping Language-Image Pre-tr Lee K, Joshi M, Turc I, Hu H, Liu F, Eisenschlos J, Khandelwal U, Shaw P, Chang M-W, Toutanova K (2023) Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding. In: Proceedings of the 40th International Conference on Machine Learning (ICML), pp 18893–18912. -Masry A, Long D X, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. 
In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263–2279. +Masry A, Long D X, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263–2279. https://doi.org/10.18653/v1/2022.findings-acl.177. Liu F, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Altun Y, Collier N, Eisenschlos J M (2023a) MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 12756–12770. -Liu F, Eisenschlos J M, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Chen W, Collier N, Altun Y (2023b) DePlot: One-shot Visual Language Reasoning by Plot-to-Table Translation. In: Findings of the Association for Computational Linguistics: ACL 2023, pp 10381–10399. +Liu F, Eisenschlos J M, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Chen W, Collier N, Altun Y (2023b) DePlot: One-shot Visual Language Reasoning by Plot-to-Table Translation. In: Findings of the Association for Computational Linguistics: ACL 2023, pp 10381–10399. https://doi.org/10.18653/v1/2023.findings-acl.660. Ren S, He K, Girshick R, Sun J (2015) Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In: Advances in Neural Information Processing Systems 28, pp 91–99. @@ -576,7 +576,7 @@ Kirillov A, Mintun E, Ravi N, Mao H, Rolland C, Gustafson L, Xiao T, Whitehead S Nogueira R, Cho K (2019) Passage Re-ranking with BERT. arXiv preprint arXiv:1901.04085. -Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. 
+Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. https://doi.org/10.18653/v1/2024.eacl-demo.16. Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503–2511. diff --git a/docs/zh/part7/ch23_online_feedback_knowledge_update.md b/docs/zh/part7/ch23_online_feedback_knowledge_update.md index bf2d5f46..10509269 100644 --- a/docs/zh/part7/ch23_online_feedback_knowledge_update.md +++ b/docs/zh/part7/ch23_online_feedback_knowledge_update.md @@ -671,35 +671,35 @@ print(route_feedback(event)) ## 参考文献 -Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291–300. +Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291–300. https://doi.org/10.1109/icse-seip.2019.00042. Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction. In: Proceedings of the IEEE International Conference on Big Data, pp 1123–1132. -Chapelle O, Zhang Y (2009) A Dynamic Bayesian Network Click Model for Web Search Ranking. In: Proceedings of the 18th International Conference on World Wide Web, pp 1–10. +Chapelle O, Zhang Y (2009) A Dynamic Bayesian Network Click Model for Web Search Ranking. 
In: Proceedings of the 18th International Conference on World Wide Web, pp 1–10. https://doi.org/10.1145/1526709.1526711. -Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. +Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. https://doi.org/10.18653/v1/2024.eacl-demo.16. -Gama J, Žliobaitė I, Bifet A, Pechenizkiy M, Bouchachia A (2014) A Survey on Concept Drift Adaptation. ACM Computing Surveys 46(4):1–37. +Gama J, Žliobaitė I, Bifet A, Pechenizkiy M, Bouchachia A (2014) A Survey on Concept Drift Adaptation. ACM Computing Surveys 46(4):1–37. https://doi.org/10.1145/2523813. Gao Y, Xiong Y, Gao X, Jia K, Pan J, Bi Y, Dai Y, Sun J, Wang M, Wang H (2023) Retrieval-Augmented Generation for Large Language Models: A Survey. arXiv preprint arXiv:2312.10997. -Hu Y, Koren Y, Volinsky C (2008) Collaborative Filtering for Implicit Feedback Datasets. In: Proceedings of the 2008 IEEE International Conference on Data Mining, pp 263–272. +Hu Y, Koren Y, Volinsky C (2008) Collaborative Filtering for Implicit Feedback Datasets. In: Proceedings of the 2008 IEEE International Conference on Data Mining, pp 263–272. https://doi.org/10.1109/icdm.2008.22. Huyen C (2022) Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications. O’Reilly Media. -Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142. +Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. 
In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142. https://doi.org/10.1145/775047.775067. -Joachims T, Swaminathan A, Schnabel T (2017) Unbiased Learning-to-Rank with Biased Feedback. In: Proceedings of the Tenth ACM International Conference on Web Search and Data Mining, pp 781–789. +Joachims T, Swaminathan A, Schnabel T (2017) Unbiased Learning-to-Rank with Biased Feedback. In: Proceedings of the Tenth ACM International Conference on Web Search and Data Mining, pp 781–789. https://doi.org/10.1145/3018661.3018699. Koh P W, Sagawa S, Marklund H, Xie S M, Zhang M, Balsubramani A, Hu W, Yasunaga M, Phillips R L, Gao I, Lee T, David E, Stavness I, Guo W, Earnshaw B A, Haque I S, Beery S, Leskovec J, Kundaje A, Pierson E, Levine S, Finn C, Liang P (2021) WILDS: A Benchmark of in-the-Wild Distribution Shifts. In: Proceedings of the 38th International Conference on Machine Learning, pp 5637–5664. Kohavi R, Tang D, Xu Y (2020) Trustworthy Online Controlled Experiments: A Practical Guide to A/B Testing. Cambridge University Press. -Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879. +Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879. arXiv:2205.02302. -Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. +Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. arXiv:2005.11401.
-Mallen A, Asai A, Zhong V, Das R, Khashabi D, Hajishirzi H (2023) When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 9802–9822. +Mallen A, Asai A, Zhong V, Das R, Khashabi D, Hajishirzi H (2023) When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 9802–9822. https://doi.org/10.18653/v1/2023.acl-long.546. Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503–2511. diff --git a/docs/zh/part8/ch25_data_versioning_experiment_tracking.md b/docs/zh/part8/ch25_data_versioning_experiment_tracking.md index 1f476a8c..5ffc10f0 100644 --- a/docs/zh/part8/ch25_data_versioning_experiment_tracking.md +++ b/docs/zh/part8/ch25_data_versioning_experiment_tracking.md @@ -655,7 +655,7 @@ Zaharia 等人的《Accelerating the Machine Learning Lifecycle with MLflow》 ## 参考文献 -Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. +Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. https://doi.org/10.1109/icse-seip.2019.00042. Armbrust M, Ghodsi A, Xin R, Zaharia M (2020) Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores. 
Proceedings of the VLDB Endowment 13(12):3411-3424. @@ -665,31 +665,31 @@ Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of Machine Learning and Systems 1, pp 334-347. -Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316-330. +Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316-330. https://doi.org/10.1007/3-540-44503-x_20. DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications. DVC Documentation (2024) Data Version Control Documentation. Available at: https://dvc.org/doc. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. -Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. +Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. 
In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596. Moreau L, Missier P (eds.) (2013) PROV-DM: The PROV Data Model. W3C Recommendation. -Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227. +Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227. https://doi.org/10.1126/science.1213847. Polyzotis N, Roy S, Whang S E, Zinkevich M (2017) Data Management Challenges in Production Machine Learning. In: Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD), pp 1723-1726. -Sandve G K, Nekrutenko A, Taylor J, Hovig E (2013) Ten Simple Rules for Reproducible Computational Research. PLOS Computational Biology 9(10):e1003285. +Sandve G K, Nekrutenko A, Taylor J, Hovig E (2013) Ten Simple Rules for Reproducible Computational Research. PLOS Computational Biology 9(10):e1003285. https://doi.org/10.1371/journal.pcbi.1003285. Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503-2511. -Simmhan Y L, Plale B, Gannon D (2005) A Survey of Data Provenance in e-Science. ACM SIGMOD Record 34(3):31-36. +Simmhan Y L, Plale B, Gannon D (2005) A Survey of Data Provenance in e-Science. ACM SIGMOD Record 34(3):31-36. https://doi.org/10.1145/1084805.1084812. -Stodden V, Leisch F, Peng R D (eds.) (2014) Implementing Reproducible Research. CRC Press. +Stodden V, Leisch F, Peng R D (eds.) (2014) Implementing Reproducible Research. CRC Press. https://doi.org/10.1201/b16868. Vartak M, Subramanyam H, Lee W-E, Viswanathan S, Husnoo S, Madden S, Zaharia M (2016) ModelDB: A System for Machine Learning Model Management. In: Proceedings of the Workshop on Human-In-the-Loop Data Analytics (HILDA), Article 14. 
diff --git a/docs/zh/part8/ch26_data_platform_observability.md b/docs/zh/part8/ch26_data_platform_observability.md index 4fdeb8c8..6929922c 100644 --- a/docs/zh/part8/ch26_data_platform_observability.md +++ b/docs/zh/part8/ch26_data_platform_observability.md @@ -682,7 +682,7 @@ PagerDuty 是业界最广泛使用的事故响应工具,支持多级告警路 ## 参考文献 -Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. +Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. https://doi.org/10.1109/icse-seip.2019.00042. Baylor D, Breck E, Cheng H-T, Fiedel N, Foo C Y, Haque Z, Haykal S, Ispir M, Jain V, Koc L, Koo C Y, Lew L, Mewald C, Modi A N, Polyzotis N, Ramesh S, Roy S, Whang S E, Wicke M, Wilkiewicz J, Zhang X, Zinkevich M (2017) TFX: A TensorFlow-Based Production-Scale Machine Learning Platform. In: Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 1387-1395. @@ -694,15 +694,15 @@ Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of Machine Learning and Systems 1, pp 334-347. -Dean J, Barroso L A (2013) The Tail at Scale. Communications of the ACM 56(2):74-80. +Dean J, Barroso L A (2013) The Tail at Scale. Communications of the ACM 56(2):74-80. https://doi.org/10.1145/2408776.2408794. 
Hellerstein J M, Sreekanti V, Gonzalez J E, Dalton J, Dey A, Nag S, Ramachandran K, Arora S, Bhattacharyya A, Das S, Donsky A, Fierro G, Kumar C, Mazzariol M, Narayanan S, Parameswaran A, Rahman T, Shah R, She C, Storey M, Turman C, Wu E (2017) Ground: A Data Context Service. In: Proceedings of CIDR. Kleppmann M (2017) Designing Data-Intensive Applications. O'Reilly Media. -Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. +Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302. -National Institute of Standards and Technology (2006) Guide to Computer Security Log Management. NIST Special Publication 800-92. +National Institute of Standards and Technology (2006) Guide to Computer Security Log Management. NIST Special Publication 800-92. https://doi.org/10.6028/nist.sp.800-92. Nygard M T (2018) Release It!: Design and Deploy Production-Ready Software, 2nd Edition. Pragmatic Bookshelf. @@ -712,7 +712,7 @@ OpenTelemetry Authors (2024) OpenTelemetry Specification. Available at: https:// Polyzotis N, Roy S, Whang S E, Zinkevich M (2017) Data Management Challenges in Production Machine Learning. In: Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD), pp 1723-1726. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518. 
Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28, pp 2503-2511. @@ -720,4 +720,4 @@ Sigelman B H, Barroso L A, Burrows M, Stephenson P, Moshchuk A, Osina D, Fikes J Turnbull J (2014) The Art of Monitoring. James Turnbull. -Xu W, Huang L, Fox A, Patterson D, Jordan M I (2009) Detecting Large-Scale System Problems by Mining Console Logs. In: Proceedings of the ACM SIGOPS 22nd Symposium on Operating Systems Principles (SOSP), pp 117-132. \ No newline at end of file +Xu W, Huang L, Fox A, Patterson D, Jordan M I (2009) Detecting Large-Scale System Problems by Mining Console Logs. In: Proceedings of the ACM SIGOPS 22nd Symposium on Operating Systems Principles (SOSP), pp 117-132. https://doi.org/10.1145/1629575.1629587. \ No newline at end of file diff --git a/docs/zh/part9/ch27_data_catalog_and_metadata_governance.md b/docs/zh/part9/ch27_data_catalog_and_metadata_governance.md index 29002778..f1fbd387 100644 --- a/docs/zh/part9/ch27_data_catalog_and_metadata_governance.md +++ b/docs/zh/part9/ch27_data_catalog_and_metadata_governance.md @@ -591,15 +591,15 @@ Abedjan Z, Golab L, Naumann F (2015) Profiling relational data: a survey. The VL Breck E, Polyzotis N, Roy S, Whang S E, Zinkevich M (2019) Data Validation for Machine Learning. In: Proceedings of the 2nd SysML Conference (MLSys). -Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316–330. +Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316–330. https://doi.org/10.1007/3-540-44503-x_20. -Cai L, Zhu Y (2015) The challenges of data quality and data quality assessment in the big data era. 
Data science journal, 2015, 14: 2-2. +Cai L, Zhu Y (2015) The challenges of data quality and data quality assessment in the big data era. Data Science Journal 14:2. https://doi.org/10.5334/dsj-2015-002. DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications, Basking Ridge. -Fernandez R C, Abedjan Z, Koko F, Yuan G, Madden S, Stonebraker M (2018) Aurum: A Data Discovery System. In: 2018 IEEE 34th International Conference on Data Engineering (ICDE), pp 1001–1012. +Fernandez R C, Abedjan Z, Koko F, Yuan G, Madden S, Stonebraker M (2018) Aurum: A Data Discovery System. In: 2018 IEEE 34th International Conference on Data Engineering (ICDE), pp 1001–1012. https://doi.org/10.1109/icde.2018.00094. -Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. +Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. https://doi.org/10.1145/3458723. Halevy A, Korn F, Noy N F, Olston C, Polyzotis N, Roy S, Whang S E (2016) Goods: Organizing Google's Datasets. In: Proceedings of the 2016 ACM SIGMOD International Conference on Management of Data, pp 795–806. @@ -607,15 +607,15 @@ Hellerstein J M, Sreekanti V, Gonzalez J E, Dalton J, Dey A, Nag S, Ramachandran Herschel M, Diestelkämper R, Ben Lahmar H (2017) A survey on provenance: What for? What form? What from? The VLDB Journal 26(6):881–906. -Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency (FAT*), pp 220–229. +Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting.
In: Proceedings of the Conference on Fairness, Accountability, and Transparency (FAT*), pp 220–229. https://doi.org/10.1145/3287560.3287596. Noy N F, Musen M A (2000) PROMPT: Algorithm and Tool for Automated Ontology Merging and Alignment. In: Proceedings of the 17th National Conference on Artificial Intelligence (AAAI), pp 450–455. -Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17–28. +Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17–28. https://doi.org/10.1145/3299887.3299891. Rahm E, Bernstein P A (2001) A survey of approaches to automatic schema matching. The VLDB Journal 10(4):334–350. -Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15. +Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) "Everyone wants to do the model work, not the data work": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15. https://doi.org/10.1145/3411764.3445518. Sandhu R S, Coyne E J, Feinstein H L, Youman C E (1996) Role-Based Access Control Models. IEEE Computer 29(2):38–47. @@ -625,4 +625,4 @@ Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young Stonebraker M, Bruckner D, Ilyas I F, Beskales G, Cherniack M, Zdonik S, Pagan A, Xu S (2013) Data Curation at Scale: The Data Tamer System. In: 6th Biennial Conference on Innovative Data Systems Research (CIDR). -Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5–33. 
+Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5–33. https://doi.org/10.1080/07421222.1996.11518099. diff --git a/docs/zh/part9/ch28_data_productization_and_data_contracts.md b/docs/zh/part9/ch28_data_productization_and_data_contracts.md index 58e8c3db..6aac6d54 100644 --- a/docs/zh/part9/ch28_data_productization_and_data_contracts.md +++ b/docs/zh/part9/ch28_data_productization_and_data_contracts.md @@ -315,7 +315,7 @@ Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawf Kleppmann M (2017) Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems. O'Reilly Media, Sebastopol. -Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale machine learning systems in real-world industrial settings: A review of challenges and solutions. Information and Software Technology 127:106368. +Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale machine learning systems in real-world industrial settings: A review of challenges and solutions. Information and Software Technology 127:106368. https://doi.org/10.1016/j.infsof.2020.106368. Machado I A, Costa C, Santos M Y (2022) Data Mesh: Concepts and Principles of a Paradigm Shift in Data Architectures. Procedia Computer Science 196:263–271. diff --git a/docs/zh/part9/ch29_data_valuation_and_reuse.md b/docs/zh/part9/ch29_data_valuation_and_reuse.md index d954cfd0..748b727d 100644 --- a/docs/zh/part9/ch29_data_valuation_and_reuse.md +++ b/docs/zh/part9/ch29_data_valuation_and_reuse.md @@ -668,7 +668,7 @@ $$ Brynjolfsson E, Hitt L M, Kim H H (2011) Strength in Numbers: How Does Data-Driven Decisionmaking Affect Firm Performance? Available at SSRN 1819486. -Fleckenstein M, Obaidi A, Tryfona N (2023) A Review of Data Valuation Approaches and Building and Scoring a Data Valuation Model. Harvard Data Science Review 5(1). 
+Fleckenstein M, Obaidi A, Tryfona N (2023) A Review of Data Valuation Approaches and Building and Scoring a Data Valuation Model. Harvard Data Science Review 5(1). https://doi.org/10.1162/99608f92.c18db966. Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. @@ -676,7 +676,7 @@ Ghorbani A, Zou J (2019) Data Shapley: Equitable Valuation of Data for Machine L Gunasekar S, Zhang Y, Aneja J, Mendes C C T, Del Giorno A, Gopi S, Javaheripi M, Kauffmann P, de Rosa G, Saarikivi O, Salim A, Shah S, Behl H S, Wang X, Bubeck S, Eldan R, Kalai A T, Lee Y T, Li Y (2023) Textbooks Are All You Need. arXiv preprint arXiv:2306.11644. -Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, Hennigan T, Noland E, Millican K, van den Driessche G, Damoc B, Guy A, Osindero S, Simonyan K, Elsen E, Rae J W, Vinyals O, Sifre L (2022) Training Compute-Optimal Large Language Models. In: Advances in Neural Information Processing Systems 35. +Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, Hennigan T, Noland E, Millican K, van den Driessche G, Damoc B, Guy A, Osindero S, Simonyan K, Elsen E, Rae J W, Vinyals O, Sifre L (2022) Training Compute-Optimal Large Language Models. In: Advances in Neural Information Processing Systems 35. arXiv:2203.15556. Jia R, Dao D, Wang B, Hubis F A, Hynes N, Gürel N M, Li B, Zhang C, Song D, Spanos C J (2019) Towards Efficient Data Valuation Based on the Shapley Value. In: Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics (AISTATS), pp 1167–1176. 
diff --git a/docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md b/docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md index 06514b1c..6af33602 100644 --- a/docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md +++ b/docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md @@ -534,7 +534,7 @@ Abraham R, Schneider J, vom Brocke J (2019) Data governance: A conceptual framework, structured review, and research agenda. International Journal of Information Management 49:424-438. -Alhassan I, Sammon D, Daly M (2016) Data governance activities: an analysis of the literature. Journal of Decision Systems 25(sup1):64-75. +Alhassan I, Sammon D, Daly M (2016) Data governance activities: an analysis of the literature. Journal of Decision Systems 25(sup1):64-75. https://doi.org/10.1080/12460125.2016.1187397. DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications. @@ -544,9 +544,9 @@ Ferraiolo D F, Kuhn D R (1992) Role-Based Access Controls. In: Proceedings of th Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. -Hu V C, Ferraiolo D, Kuhn R, Schnitzer A, Sandlin K, Miller R, Scarfone K (2014) Guide to Attribute Based Access Control (ABAC) Definition and Considerations. NIST Special Publication 800-162. +Hu V C, Ferraiolo D, Kuhn R, Schnitzer A, Sandlin K, Miller R, Scarfone K (2014) Guide to Attribute Based Access Control (ABAC) Definition and Considerations. NIST Special Publication 800-162. https://doi.org/10.6028/nist.sp.800-162. -Khatri V, Brown C V (2010) Designing data governance. Communications of the ACM 53(1):148-152. +Khatri V, Brown C V (2010) Designing data governance. Communications of the ACM 53(1):148-152. https://doi.org/10.1145/1629175.1629210. 
Ladley J (2019) Data Governance: How to Design, Deploy, and Sustain an Effective Data Governance Program, 2nd Edition. Academic Press. @@ -556,9 +556,9 @@ Moody D, Walsh P (1999) Measuring the Value of Information: An Asset Valuation A National Institute of Standards and Technology (2020a) Security and Privacy Controls for Information Systems and Organizations. NIST Special Publication 800-53 Revision 5. -National Institute of Standards and Technology (2020b) NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. +National Institute of Standards and Technology (2020b) NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. https://doi.org/10.6028/nist.cswp.10. -Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244. +Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244. https://doi.org/10.1007/s12599-011-0162-8. Reis J, Housley M (2022) Fundamentals of Data Engineering. O'Reilly Media. @@ -570,4 +570,4 @@ Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young Weber K, Otto B, Österle H (2009) One Size Does Not Fit All: A Contingency Approach to Data Governance. ACM Journal of Data and Information Quality 1(1):4. 
-Wilkinson M D, Dumontier M, Aalbersberg I J, Appleton G, Axton M, Baak A, Blomberg N, Boiten J-W, da Silva Santos L B, Bourne P E, Bouwman J, Brookes A J, Clark T, Crosas M, Dillo I, Dumon O, Edmunds S, Evelo C T, Finkers R, Gonzalez-Beltran A, Gray A J G, Groth P, Goble C, Grethe J S, Heringa J, 't Hoen P A C, Hooft R, Kuhn T, Kok R, Kok J, Lusher S J, Martone M E, Mons A, Packer A L, Persson B, Rocca-Serra P, Roos M, van Schaik R, Sansone S-A, Schultes E, Sengstag T, Slater T, Strawn G, Swertz M A, Thompson M, van der Lei J, van Mulligen E, Velterop J, Waagmeester A, Wittenburg P, Wolstencroft K, Zhao J, Mons B (2016) The FAIR Guiding Principles for scientific data management and stewardship. Scientific Data 3:160018. \ No newline at end of file +Wilkinson M D, Dumontier M, Aalbersberg I J, Appleton G, Axton M, Baak A, Blomberg N, Boiten J-W, da Silva Santos L B, Bourne P E, Bouwman J, Brookes A J, Clark T, Crosas M, Dillo I, Dumon O, Edmunds S, Evelo C T, Finkers R, Gonzalez-Beltran A, Gray A J G, Groth P, Goble C, Grethe J S, Heringa J, 't Hoen P A C, Hooft R, Kuhn T, Kok R, Kok J, Lusher S J, Martone M E, Mons A, Packer A L, Persson B, Rocca-Serra P, Roos M, van Schaik R, Sansone S-A, Schultes E, Sengstag T, Slater T, Strawn G, Swertz M A, Thompson M, van der Lei J, van Mulligen E, Velterop J, Waagmeester A, Wittenburg P, Wolstencroft K, Zhao J, Mons B (2016) The FAIR Guiding Principles for scientific data management and stewardship. Scientific Data 3:160018. https://doi.org/10.1038/sdata.2016.18. 
\ No newline at end of file diff --git a/publishing/accessibility/springer_alt_text_inventory.csv b/publishing/accessibility/springer_alt_text_inventory.csv index d1ef65b7..5778158c 100644 --- a/publishing/accessibility/springer_alt_text_inventory.csv +++ b/publishing/accessibility/springer_alt_text_inventory.csv @@ -1,31 +1,31 @@ row_id,language,part,unit,unit_title,source_markdown,line,figure_number,caption,image_alt_in_markdown,image_file,image_exists,image_dimensions,image_format,alt_text,long_description,decorative,review_status,reviewer,notes,zh_reference_caption,zh_reference_alt_text -ALT-0001,en,Front Matter,front_matter_guide,Front Matter Guide,docs/en/front_matter_guide.md,7,front_matter_guide-1,"Book architecture. Source: original illustration by the authors. The figure organizes the fourteen parts, forty-eight chapters, fifteen projects, and eight appendices around the data lifecycle, with layers for foundations, collection and processing, cross-cutting capabilities, model alignment, application governance, security and compliance, specialized practice, and project delivery; Alt text: book architecture diagram showing the data lifecycle, layered manuscript structure, DataOps flywheel, platform support capabilities, and engineering principles that run through the book. The book's core contributions appear in four areas","Book architecture of Data Engineering for Large Foundation Models, showing the data lifecycle, foundation layer, data collection and processing layer, cross-cutting capabilities, model alignment and capability enhancement layer, application governance layer, security and specialized practice layer, project case layer, DataOps flywheel, and platform support capabilities",docs/images/book_structure_en.png,yes,1536x1024,png,"book architecture diagram showing the data lifecycle, layered manuscript structure, DataOps flywheel, platform support capabilities, and engineering principles that run through the book. 
The book's core contributions appear in four areas.","Review against the figure and surrounding text in Front Matter Guide; expand if Springer requests a longer description for the visual relationships, axes, or sequence.",no,Draft,,,全书架构图。来源:本书自绘。该图以数据全生命周期为主线,将十四篇、四十八章、十五个项目和八个附录组织为基础、采集处理、横切能力、模型对齐、应用治理、安全合规、专项实践和项目交付等层级;Alt text:《大模型数据工程》全书架构图,展示数据全生命周期、主体篇章分层、DataOps 飞轮、平台支撑能力和贯穿全书的工程原则。 本书的核心贡献体现在四个方面,《大模型数据工程》全书架构图,展示数据全生命周期、基础层、数据采集与处理层、横切能力、模型对齐与能力增强层、应用治理层、安全合规与专项实践层、项目实战层、DataOps 飞轮和平台支撑能力之间的关系 -ALT-0002,en,Part 1: Overview and Infrastructure,Ch01,Chapter 1: The Data Revolution in the Era of Large Models,docs/en/part1/ch01_data_change.md,153,1-1,"Figure 1-1: LLM-Era Data Engineering Role Restructuring Diagram. Source: original illustration. The figure depicts the role flywheel loop spanning platform architecture, data collection, model fine-tuning and validation, and product-research iteration; Alt text: LLM-era data engineering role restructuring diagram showing the closed-loop interfaces among platform, data, algorithms, annotation, product, and compliance roles","Figure 1-1: LLM-Era Data Engineering Role Restructuring Diagram, showing the closed-loop interfaces among platform, data, algorithms, annotation, product, and compliance roles",docs/images/part1/data_engineering_roles_1775830393574.svg,yes,768x768,svg,"LLM-era data engineering role restructuring diagram showing the closed-loop interfaces among platform, data, algorithms, annotation, product, and compliance roles.",,no,Draft,,,图1-1:大模型时代数据工程职责重构图。来源:本书自绘。该图展现了从平台架构、数据采集到模型微调验证再到产研迭代的角色飞轮闭环;Alt text:大模型时代数据工程职责重构图,展示平台、数据、算法、标注、产品与合规角色之间的闭环接口,图1-1:大模型时代数据工程职责重构图,展示平台、数据、算法、标注、产品与合规角色之间的闭环接口 -ALT-0003,en,Part 1: Overview and Infrastructure,Ch01,Chapter 1: The Data Revolution in the Era of Large Models,docs/en/part1/ch01_data_change.md,226,1-2,"Figure 1-2: Full Fourteen-Part Lifecycle Map. Source: original illustration. 
The figure uses infrastructure as its foundation, threading through pretraining, multimodal data, alignment, applications, platform governance, compliance, and hands-on projects; Alt text: full fourteen-part lifecycle map showing the knowledge structure spanning general principles, pretraining, multimodal, alignment, applications, platform, compliance, and hands-on projects","Figure 1-2: Full Fourteen-Part Lifecycle Map, showing the knowledge structure spanning general principles, pretraining, multimodal, alignment, applications, platform, compliance, and hands-on projects",docs/images/part1/data_lifecycle_map_1775830407042.svg,yes,768x708,svg,"full fourteen-part lifecycle map showing the knowledge structure spanning general principles, pretraining, multimodal, alignment, applications, platform, compliance, and hands-on projects.",,no,Draft,,,图1-2:全书十四篇制生命周期地图。来源:本书自绘。该图以基础设施为底座,串联预训练、多模态、对齐、应用、平台治理、合规与项目实战;Alt text:全书十四篇制生命周期地图,展示从总论、预训练、多模态、对齐、应用、平台、合规到项目实战的知识结构,图1-2:全书十四篇制生命周期地图,展示从总论、预训练、多模态、对齐、应用、平台、合规到项目实战的知识结构 -ALT-0004,en,Part 1: Overview and Infrastructure,Ch02,Chapter 2: LLM Data Lifecycle and Quality Evaluation Framework,docs/en/part1/ch02_quality_framework.md,104,2-1,"Figure 2-1: Multi-dimensional quality layering architecture from a lifecycle perspective. Source: original illustration from this book. The upper half is a horizontal four-stage pipeline: pre-training (scale/diversity/low duplication), SFT (instruction coverage/format compliance/factual accuracy), RLHF/DPO preference alignment (contrastive signal/annotation consistency/value alignment), and RAG application (timeliness/retrieval accuracy/traceability); the lower half is a triangular mapping structure showing the bidirectional relationships among offline data quality, proxy model evaluation, and real online business metrics. 
Alt text: horizontal four-stage pipeline diagram from pre-training to RAG application showing key quality metrics at each stage; the triangle below shows the relationships among offline data quality, proxy model evaluation, and real business metrics","Figure 2-1: Multi-dimensional quality layering architecture from a lifecycle perspective, showing how metric weights shift across stages from scale and diversity toward truthfulness and helpfulness",docs/images/part1/data_quality_hierarchy_1775835516841.svg,yes,768x768,svg,"horizontal four-stage pipeline diagram from pre-training to RAG application showing key quality metrics at each stage; the triangle below shows the relationships among offline data quality, proxy model evaluation, and real business metrics.","Review against the figure and surrounding text in Chapter 2: LLM Data Lifecycle and Quality Evaluation Framework; expand if Springer requests a longer description for the visual relationships, axes, or sequence.",no,Draft,,,图2-1:生命周期视角下的多维度质量分层架构。来源:本书自绘。上半部分为水平四阶段流水线:预训练(规模/多样性/低重复率)→ 指令微调SFT(指令覆盖/格式合规/事实准确)→ 偏好对齐RLHF/DPO(对比信号/标注一致性/价值观贴合)→ RAG应用(时效性/检索精度/可追溯性);下半部分为三角映射结构,展示离线数据质量、代理模型评测和真实业务在线三者的双向关联。Alt text:水平四阶段流水线图,从预训练到RAG应用展示各阶段关键质量指标;下方三角形展示离线数据质量、代理模型评测与真实业务指标的相互关系,图2-1:生命周期视角下的多维度质量分层架构,展示不同阶段质量指标权重从规模、多样性转向真实性、帮助性 -ALT-0005,en,Part 1: Overview and Infrastructure,Ch02,Chapter 2: LLM Data Lifecycle and Quality Evaluation Framework,docs/en/part1/ch02_quality_framework.md,148,2-2,"Figure 2-2: Cross-mapping matrix of large language model data defects and quality metrics. Source: original illustration from this book. The matrix rows are six defect classes: noise, repetition, benchmark contamination, systematic bias, structural incompleteness, and staleness; the columns are five quality metrics: accuracy, consistency, diversity, coverage, and traceability. Each cell uses a filled circle (strong impact), half circle (medium impact), or empty circle (weak impact) to mark impact strength. 
Alt text: 6-by-5 cross-mapping matrix with six data defect classes as rows and five quality metrics as columns; each cell uses a filled circle, half circle, or empty circle to indicate impact strength, with a legend for the three symbols at the bottom","Figure 2-2: Cross-mapping diagram of large language model data defects and quality metrics, showing the relationships between six defect classes and accuracy, consistency, diversity, coverage, and traceability",docs/images/part1/defect_metric_radar_1775835533937.svg,yes,768x768,svg,"6-by-5 cross-mapping matrix with six data defect classes as rows and five quality metrics as columns; each cell uses a filled circle, half circle, or empty circle to indicate impact strength, with a legend for the three symbols at the bottom.","Review against the figure and surrounding text in Chapter 2: LLM Data Lifecycle and Quality Evaluation Framework; expand if Springer requests a longer description for the visual relationships, axes, or sequence.",no,Draft,,,图2-2:大模型数据缺陷与质量指标交叉映射矩阵。来源:本书自绘。矩阵行为六类缺陷:噪声、重复、基准污染、系统偏差、结构缺失、时效衰败;矩阵列为五项质量指标:准确度、一致性、多样性、覆盖度、可追溯性。各单元格以实心圆(强影响)、半圆(中等影响)、空圆(弱影响)标注影响程度。Alt text:6行×5列交叉映射矩阵,行为六类数据缺陷,列为五项质量指标,每个单元格用实心圆、半圆或空圆表示影响强弱;底部附图例说明三种符号含义,图2-2:大模型数据缺陷与质量指标交叉映射图,展示六类缺陷与准确度、一致性、多样性、覆盖度和可追溯性之间的关系 -ALT-0006,en,Part 1: Overview and Infrastructure,Ch02,Chapter 2: LLM Data Lifecycle and Quality Evaluation Framework,docs/en/part1/ch02_quality_framework.md,299,2-3,"Figure 2-3: Automated blocking and governance flow driven by the data scorecard. Source: original illustration from this book. 
The figure shows how hard gates, soft gates, manual review, and rollback actions collectively block contaminated or degraded data samples; Alt text: automated blocking and governance flow driven by the data scorecard, showing hard gates, soft gates, manual review, and rollback actions","Figure 2-3: Automated blocking and governance flow driven by the data scorecard, showing hard gates, soft gates, manual review, and rollback actions",docs/images/part1/data_quality_gates_1775835548587.svg,yes,768x634,svg,"automated blocking and governance flow driven by the data scorecard, showing hard gates, soft gates, manual review, and rollback actions.",,no,Draft,,,图2-3:数据评分卡驱动的自动截断与治理流。来源:本书自绘。该图展示硬闸门、软闸门、人工复核和回滚动作如何共同阻隔被污染或劣化的数据样本;Alt text:数据评分卡驱动的自动截断与治理流,展示硬闸门、软闸门、人工复核和回滚动作,图2-3:数据评分卡驱动的自动截断与治理流,展示硬闸门、软闸门、人工复核和回滚动作 -ALT-0007,en,Part 1: Overview and Infrastructure,Ch03,Chapter 3: AI-Native Data Stack and Cost Governance,docs/en/part1/ch03_data_stack.md,72,3-1,"Figure 3-1: Five-layer architecture of an AI-native data stack. Source: original illustration from this book. 
The figure shows how ingestion and access, processing orchestration, storage and indexing, evaluation operations, and governance and security layers jointly move data from raw corpus to trainable datasets; Alt text: five-layer architecture of an AI-native data stack showing the data flow among ingestion and access, processing orchestration, storage and indexing, evaluation operations, and governance and security layers",Figure 3-1: Five-layer architecture of an AI-native data stack,docs/images/part1/ai_data_stack_architecture.svg,yes,768x768,svg,"five-layer architecture of an AI-native data stack showing the data flow among ingestion and access, processing orchestration, storage and indexing, evaluation operations, and governance and security layers.","Review against the figure and surrounding text in Chapter 3: AI-Native Data Stack and Cost Governance; expand if Springer requests a longer description for the visual relationships, axes, or sequence.",no,Draft,,,图3-1:AI 原生数据栈五层架构。来源:本书自绘。该图展示采集接入、处理编排、存储索引、评测运营和治理安全层如何协同驱动数据从原始语料流向可训练数据集;Alt text:AI 原生数据栈五层架构,展示采集接入、处理编排、存储索引、评测运营和治理安全层之间的数据流,图3-1:AI原生数据栈五层架构,展示采集接入、处理编排、存储索引、评测运营和治理安全层之间的数据流 -ALT-0008,en,Part 1: Overview and Infrastructure,Ch03,Chapter 3: AI-Native Data Stack and Cost Governance,docs/en/part1/ch03_data_stack.md,264,3-2,"Figure 3-2: Training-data cost-governance loop. Source: original illustration from this book. 
The figure shows a cross-version iteration cycle that starts from budget planning, passes through cost monitoring, ROI evaluation, and optimization decisions, and returns to budget review; Alt text: training-data cost-governance loop showing the cycle of budget planning, cost monitoring, ROI evaluation, optimization decisions, and budget review",Figure 3-2: Training-data cost-governance loop,docs/images/part1/cost_governance_loop.svg,yes,768x768,svg,"training-data cost-governance loop showing the cycle of budget planning, cost monitoring, ROI evaluation, optimization decisions, and budget review.",,no,Draft,,,图3-2:训练数据成本治理闭环。来源:本书自绘。该图展示从预算规划出发,经过成本监控、ROI 评估和优化决策,最终回归预算复盘的跨版本迭代过程;Alt text:训练数据成本治理闭环图,展示预算规划、成本监控、ROI 评估、优化决策和预算复盘的循环,图3-2:训练数据成本治理闭环图,展示预算规划、成本监控、ROI评估、优化决策和预算复盘的循环 -ALT-0009,en,Part 2: Text Pre-training Data Engineering,Ch04,"Chapter 4: Data Sources, Acquisition, and Copyright",docs/en/part2/ch04_data_sources.md,63,4-1,"Figure 4-1: Layered map of pre-training data sources. The three-layer taxonomy positions mainstream sources by processing complexity, knowledge density, and license risk, with typical reference ranges for mixing. 
Source: original illustration from this book; Alt text: layered map of pretraining data sources showing the quality and compliance positions of open web, forums and Q&A, encyclopedias, code, academic papers, books, enterprise internal data, and user feedback data",Figure 4-1: Layered map of pre-training data sources,docs/images/part2/pretrain_data_source_map.svg,yes,768x768,svg,"layered map of pretraining data sources showing the quality and compliance positions of open web, forums and Q&A, encyclopedias, code, academic papers, books, enterprise internal data, and user feedback data.",,no,Draft,,,图4-1:预训练数据源分层地图 —— 三层分类体系按照处理复杂度、知识密度和许可风险对主流数据来源进行定位,并给出典型的配比参考区间。来源:本书自绘;Alt text:预训练数据源分层地图,展示开放网页、论坛问答、百科、代码、学术论文、书籍、企业内部数据和用户反馈数据的质量与合规位置,图4-1:预训练数据源分层地图 -ALT-0010,en,Part 2: Text Pre-training Data Engineering,Ch04,"Chapter 4: Data Sources, Acquisition, and Copyright",docs/en/part2/ch04_data_sources.md,266,4-2,"Figure 4-2: Data ingestion and provenance chain. From source contact to final archive, each processing stage appends metadata records to the ""Provenance Ledger,"" forming a complete auditable data-lineage chain. 
Source: original illustration from this book; Alt text: data ingestion and provenance chain diagram showing the links among source contact, acquisition, parsing, cleaning, storage, and audit records",Figure 4-2: Data ingestion and provenance chain,docs/images/part2/data_ingestion_provenance_chain.svg,yes,768x768,svg,"data ingestion and provenance chain diagram showing the links among source contact, acquisition, parsing, cleaning, storage, and audit records.",,no,Draft,,,"图4-2:数据采集与权属存证流程——从数据源触达到最终归档,每个处理阶段均向""Provenance Ledger(权属账本)""追加元数据记录,形成完整的可审计数据血缘链路。来源:本书自绘;Alt text:数据采集与权属存证流程图,展示来源触达、采集、解析、清洗、入库和审计记录之间的链路",图4-2:数据采集与权属存证流程图 -ALT-0011,en,Part 2: Text Pre-training Data Engineering,Ch05,"Chapter 5: Cleaning, Deduplication, and Decontamination",docs/en/part2/ch05_cleaning_dedup.md,55,5-1,"Figure 5-1: Overview Flowchart of the Cleaning and Decontamination Pipeline — A multi-stage quality gate gradually refines raw corpus into candidate training corpus. The proportions in the figure are illustrative only; real retention rates depend on source quality, filtering thresholds, and compliance requirements. 
Source: original illustration from this book; Alt text: overview flowchart of the cleaning and decontamination pipeline, showing the sequential relationship among rule-based filtering, model scoring, deduplication, PII redaction, decontamination, and manual spot-checks",Figure 5-1: Overview Flowchart of the Cleaning and Decontamination Pipeline,docs/images/part2/cleaning_pipeline_overview.svg,yes,768x768,svg,"overview flowchart of the cleaning and decontamination pipeline, showing the sequential relationship among rule-based filtering, model scoring, deduplication, PII redaction, decontamination, and manual spot-checks.","Review against the figure and surrounding text in Chapter 5: Cleaning, Deduplication, and Decontamination; expand if Springer requests a longer description for the visual relationships, axes, or sequence.",no,Draft,,,图5-1:清洗与去污染全景流程图 —— 多阶段质量闸门从原始语料逐步精炼为候选训练语料,图中比例仅为示意,真实留存率取决于来源质量、过滤阈值和合规要求。来源:本书自绘;Alt text:清洗与去污染全景流程图,展示规则过滤、模型评分、去重、PII 脱敏、去污染和人工抽检的顺序关系,图5-1:清洗与去污染全景流程图 -ALT-0012,en,Part 2: Text Pre-training Data Engineering,Ch05,"Chapter 5: Cleaning, Deduplication, and Decontamination",docs/en/part2/ch05_cleaning_dedup.md,410,5-2,"Figure 5-2: Quality Filtering Funnel and Spot-Check Feedback Loop — The funnel on the left shows the data retention rate at each stage; the feedback loop on the right shows how manual spot-checks drive continuous iterative optimization of filtering rules. 
Source: original illustration; Alt text: Quality filtering funnel and spot-check feedback loop diagram, showing the cyclic relationship among rule-based filtering, model scoring, deduplication, manual spot-checks, and rule write-back",Figure 5-2: Quality Filtering Funnel and Spot-Check Feedback Loop,docs/images/part2/quality_filter_funnel_loop.svg,yes,768x768,svg,"Quality filtering funnel and spot-check feedback loop diagram, showing the cyclic relationship among rule-based filtering, model scoring, deduplication, manual spot-checks, and rule write-back.",,no,Draft,,,图5-2:质量过滤漏斗与抽检闭环 —— 左侧漏斗展示每阶段的数据留存率,右侧闭环展示人工抽检如何驱动过滤规则的持续迭代优化。来源:本书自绘;Alt text:质量过滤漏斗与抽检闭环图,展示规则过滤、模型评分、去重、人工抽检和规则回写之间的循环关系,图5-2:质量过滤漏斗与抽检闭环图 -ALT-0013,en,Part 2: Text Pre-training Data Engineering,Ch06,"Chapter 6: Tokenization, Serialization, and Efficient Loading",docs/en/part2/ch06_tokenization_loading.md,324,6-1,"Figure 6-1: Throughput bottleneck diagnosis flowchart — starting from abnormal GPU utilization, a three-level decision tree is used to locate disk I/O bottlenecks, CPU preprocessing bottlenecks, and PCIe transfer bottlenecks, with corresponding remediation steps. 
Source: original illustration from this book; Alt text: throughput bottleneck diagnosis flowchart showing the decision paths from abnormal GPU utilization to disk I/O, CPU preprocessing, and PCIe transfer investigation",Figure 6-1: Throughput Bottleneck Diagnosis Flowchart,docs/images/part2/io_bottleneck_diagnosis_flow.svg,yes,768x768,svg,"throughput bottleneck diagnosis flowchart showing the decision paths from abnormal GPU utilization to disk I/O, CPU preprocessing, and PCIe transfer investigation.",,no,Draft,,,图6-1:吞吐瓶颈诊断流程图 —— 从 GPU 利用率异常出发,通过三级决策树定位磁盘 I/O 瓶颈、CPU 预处理瓶颈和 PCIe 传输瓶颈,并给出对应的修复方案。来源:本书自绘;Alt text:吞吐瓶颈诊断流程图,展示从 GPU 利用率异常到磁盘 I/O、CPU 预处理和 PCIe 传输排查的决策路径,图6-1:吞吐瓶颈诊断流程图 -ALT-0014,en,Part 2: Text Pre-training Data Engineering,Ch06,"Chapter 6: Tokenization, Serialization, and Efficient Loading",docs/en/part2/ch06_tokenization_loading.md,400,6-2,"Figure 6-2: LLM training input pipeline layered architecture — the complete five-stage path from tokenization, serialization, data mixing, and packing to DataLoader GPU feeding, with the two highest-frequency bottleneck risk points (disk I/O and CPU-GPU transfer) annotated at the bottom. 
Source: original illustration from this book; Alt text: training input pipeline layer diagram showing the sequential relationship between tokenization, serialization, mixing, packing, DataLoader, and GPU feeding",Figure 6-2: Training Input Pipeline Layer Diagram,docs/images/part2/training_input_pipeline_layers.svg,yes,768x768,svg,"training input pipeline layer diagram showing the sequential relationship between tokenization, serialization, mixing, packing, DataLoader, and GPU feeding.","Review against the figure and surrounding text in Chapter 6: Tokenization, Serialization, and Efficient Loading; expand if Springer requests a longer description for the visual relationships, axes, or sequence.",no,Draft,,,图6-2:LLM 训练输入管道分层架构 —— 从分词、序列化、数据混采、Packing 到 DataLoader GPU 馈送的五阶段完整路径,底部标注了两个最高频的瓶颈风险点(磁盘 I/O 和 CPU↔GPU 传输)。来源:本书自绘;Alt text:训练输入管道分层图,展示分词、序列化、混采、Packing、DataLoader 和 GPU 馈送之间的顺序关系,图6-2:训练输入管道分层图 -ALT-0015,en,Part 2: Text Pre-training Data Engineering,Ch07,"Chapter 7: Data Evaluation, Quality Closed Loop, and Operational Iteration",docs/en/part2/ch07_data_operations.md,63,7-1,"Figure 7-1: Data Operations Flywheel — The left side shows the high-cost startup zone; the right side shows the gradually accumulated cycle of automated, high-quality data assets formed after long-term model evaluation and root-cause analysis feedback. Source: Original illustration by the authors. 
Alt text: Data operations flywheel diagram showing the cyclical relationship among data production, model evaluation, root-cause analysis, rule write-back, and asset reuse",Figure 7-1: Data Operations Flywheel,docs/images/part2/data_operations_flywheel.svg,yes,768x768,svg,"Data operations flywheel diagram showing the cyclical relationship among data production, model evaluation, root-cause analysis, rule write-back, and asset reuse.",,no,Draft,,,图7-1:数据运营飞轮图 —— 左侧展示高成本的起步区,右侧展示经过长期模型评估与根因分析反哺后,逐渐形成的自动化、高质量数据资产积累循环。来源:本书自绘;Alt text:数据运营飞轮图,展示数据生产、模型评估、根因分析、规则回写和资产复用之间的循环关系,图7-1:数据运营飞轮图 -ALT-0016,en,Part 2: Text Pre-training Data Engineering,Ch07,"Chapter 7: Data Evaluation, Quality Closed Loop, and Operational Iteration",docs/en/part2/ch07_data_operations.md,280,7-2,"Figure 7-2: Data Evaluation Feedback Loop — A circular architecture proceeding from sampling-based blind review to root-cause investigation triggered by metric anomalies, followed by targeted system governance actions. Source: Original illustration by the authors. Alt text: Data evaluation feedback loop diagram showing the closed-loop relationship among sampling evaluation, metric anomalies, root-cause investigation, governance actions, and rule updates",Figure 7-2: Data Evaluation Feedback Loop,docs/images/part2/data_evaluation_loop.svg,yes,848x708,svg,"Data evaluation feedback loop diagram showing the closed-loop relationship among sampling evaluation, metric anomalies, root-cause investigation, governance actions, and rule updates.",,no,Draft,,,图7-2:数据评估闭环图 —— 从抽取式盲审到针对评估指标启动根因排查,再针对具体现象采取系统治理动作的环形架构。来源:本书自绘;Alt text:数据评估闭环图,展示抽样评估、指标异常、根因排查、治理动作和规则更新之间的闭环,图7-2:数据评估闭环图 -ALT-0017,en,Part 3: Multimodal Data Engineering,Ch08,Chapter 8: Image-Text Pair Data Engineering,docs/en/part3/ch08_multimodal_image.md,58,8-1,"Figure 8-1: Overview of multimodal image-text data engineering. 
The pipeline starts from DOM-tree crawling and PDF parsing, then moves through format parsing, watermark filtering, CLIP semantic alignment, interleaved-sequence assembly, and tokenized representation. Distributed computing and metadata form the foundation across the pipeline. Source: drawn for this book. Alt text: an overview of image-text data engineering showing DOM extraction, image download, format parsing, filtering, semantic alignment, recaptioning, and sequence assembly",Figure 8-1: Overview of image-text data engineering,docs/images/part3/multimodal_data_panorama.svg,yes,768x768,svg,"an overview of image-text data engineering showing DOM extraction, image download, format parsing, filtering, semantic alignment, recaptioning, and sequence assembly.",,no,Draft,,,图8-1:多模态图文数据工程全景图 —— 从最左侧的 DOM 树抓取与 PDF 解析起始,依次穿过格式解析、水印过滤、CLIP 语义对齐、直至最右侧的交错序列拼装与 Token 化表示。分布式计算与 Metadata 是横跨底层的核心支撑。来源:本书自绘;Alt text:图文数据工程全景图,展示 DOM 抽取、图片下载、格式解析、过滤、语义对齐、重标注和序列拼装之间的流程,图8-1:图文数据工程全景图 -ALT-0018,en,Part 3: Multimodal Data Engineering,Ch08,Chapter 8: Image-Text Pair Data Engineering,docs/en/part3/ch08_multimodal_image.md,256,8-2,"Figure 8-2: Image semantic alignment and filtering flow. A CLIP- and heuristic-rule-based quantitative decision tree filters out low-match samples, sends medium-match but high-value images to the recaptioning pipeline, and finally stores zero-padded or dynamically sliced images in the training pool. Source: drawn for this book. 
Alt text: an image semantic alignment and filtering flow showing quality filtering, CLIP scoring, recaptioning, dynamic slicing, and training-pool ingestion",Figure 8-2: Image semantic alignment and filtering flow,docs/images/part3/image_semantic_alignment_flow.svg,yes,768x840,svg,"an image semantic alignment and filtering flow showing quality filtering, CLIP scoring, recaptioning, dynamic slicing, and training-pool ingestion.",,no,Draft,,,图8-2:图像语义对齐与过滤流程图 —— 展示基于 CLIP 与启发式规则的量化决策树,将低匹配样本筛出,将中等匹配但高价值图片送往 Re-captioning 流水线,最后将图片 Zero-pad 或动态切分后存入训练池。来源:本书自绘;Alt text:图像语义对齐与过滤流程图,展示质量过滤、CLIP 打分、重标注、动态切分和训练池入库之间的路径,图8-2:图像语义对齐与过滤流程图 -ALT-0019,en,Part 3: Multimodal Data Engineering,Ch08,Chapter 8: Image-Text Pair Data Engineering,docs/en/part3/ch08_multimodal_image.md,272,8-3,"Figure 8-3: AnyRes dynamic multi-resolution patching. The core idea is that the high-resolution panoramic input on the left is no longer forced into a square. Instead, it is divided by an adaptive grid into $1 \times 3$ native-resolution local patches and paired with a global thumbnail in the upper-right corner before entering the vision encoder, preserving both high-frequency local features and global semantics. Source: drawn for this book. 
Alt text: AnyRes dynamic multi-resolution patching showing a panorama divided into local patches and combined with a global thumbnail",Figure 8-3: AnyRes dynamic multi-resolution patching,docs/images/part3/anyres_dynamic_patching.svg,yes,768x382.5,svg,AnyRes dynamic multi-resolution patching showing a panorama divided into local patches and combined with a global thumbnail.,,no,Draft,,,图8-3:AnyRes 动态多分辨率切割算法原理图 —— 展示 AnyRes 的核心思想:左侧的超长全景图(High-Res Input)不再被强制压缩,而是被自适应网格(Adaptive Grid)划分为 $1 \times 3$ 个原生分辨率的局部图像块(Local Patches),同时结合右上方全局缩略图(Global Thumbnail)一同送入 Vision Encoder,以保留高频局部特征与宏观语义。来源:本书自绘;Alt text:AnyRes 动态多分辨率切割算法原理图,展示全景图被切成局部块并与全局缩略图共同输入视觉编码器,图8-3:AnyRes 动态多分辨率切割算法原理图 -ALT-0020,en,Part 3: Multimodal Data Engineering,Ch09,Chapter 9: Re-captioning and Document Understanding,docs/en/part3/ch09_recaptioning_ocr.md,165,9-1,"Figure 9-1: Recaptioning and OCR dual-track enhancement. The left side shows a semantic vision track for dense narrative descriptions; the right side shows a structural text track containing DOM layout segmentation and table matrices. The two streams are fused into a unified hybrid supervision template. Source: drawn for this book. 
Alt text: a recaptioning and OCR dual-track enhancement diagram showing visual recaptioning, OCR structure extraction, BBox injection, and hybrid supervision",Figure 9-1: Recaptioning and OCR dual-track enhancement,docs/images/part3/recaptioning_ocr_pipeline.svg,yes,768x768,svg,"a recaptioning and OCR dual-track enhancement diagram showing visual recaptioning, OCR structure extraction, BBox injection, and hybrid supervision.",,no,Draft,,,图9-1:重标注与 OCR 增强联合的双轨管道图(Dual-track Pipeline) —— 左侧展示语义密集叙述流(Semantic Vision Track),右侧展示包含 DOM 排版分割与表格矩阵的高密度结构流(Structural Text Track),最终融合为统一的混合监督模板格式。来源:本书自绘;Alt text:重标注与 OCR 双流线增强图,展示视觉重描述、OCR 结构提取、BBox 注入和混合监督格式之间的关系,图9-1:重标注与 OCR 双流线增强图 -ALT-0021,en,Part 3: Multimodal Data Engineering,Ch09,Chapter 9: Re-captioning and Document Understanding,docs/en/part3/ch09_recaptioning_ocr.md,197,9-2,"Figure 9-2: Document structure layout-to-token mapping. The left side shows a fragment of a two-column academic report. The system first uses bounding-box arrays to locate titles, body text, charts, and formula regions. The right side shows how outputs from specialized models such as Nougat and PaddleOCR are post-processed into hierarchical Markdown text and rich text streams with discrete coordinates [xy]. Source: drawn for this book. 
Alt text: document layout-to-token mapping showing a page converted by layout detection, OCR, formula parsing, and coordinate labeling into hierarchical text",Figure 9-2: Document structure layout-to-token mapping,docs/images/part3/document_structure_sample.svg,yes,768x768,svg,"document layout-to-token mapping showing a page converted by layout detection, OCR, formula parsing, and coordinate labeling into hierarchical text.",,no,Draft,,,图9-2:文档结构 Layout-to-Token 映射图(Document Structure Layout-to-Token Mapping) —— 左半区展示一份双栏学术报告残页;系统首先通过 Bounding Box 阵列定位标题、正文、图表和公式区域;右半区展示 Nougat、PaddleOCR 等特化模型输出如何经脚本后处理,归并为层级化 Markdown 文本与离散坐标 [xy] 的富文本数据流。来源:本书自绘;Alt text:文档结构 Layout-to-Token 映射图,展示文档页面被版面检测、OCR、公式解析和坐标标注转换为层级文本序列,图9-2:文档结构 Layout-to-Token 映射图 -ALT-0022,en,Part 3: Multimodal Data Engineering,Ch10,Chapter 10: Video and Audio Data Engineering,docs/en/part3/ch10_video_audio.md,56,10-1,"Figure 10-1: Distributed audio-video alignment pipeline. Raw mixed videos in the video lake are split into visual and acoustic tracks. Visual-frame extractors and acoustic separators extract features independently before the streams meet in a temporal alignment engine, which produces aligned multimodal JSONL samples with closed timestamp constraints. Source: drawn for this book. 
Alt text: an audio-video alignment pipeline showing a raw video split into visual, audio, and text tracks and transformed into JSONL by a temporal alignment engine",Figure 10-1: Distributed audio-video alignment pipeline,docs/images/part3/av_sample_pipeline.svg,yes,768x768,svg,"an audio-video alignment pipeline showing a raw video split into visual, audio, and text tracks and transformed into JSONL by a temporal alignment engine.","Review against the figure and surrounding text in Chapter 10: Video and Audio Data Engineering; expand if Springer requests a longer description for the visual relationships, axes, or sequence.",no,Draft,,,图10-1:音视频对齐分布式管线图(Audio-Video Pipeline: Temporal Alignment) —— 左侧原始 Video Lake 中的混合视频被剥离为视觉(Visual Track)和声学(Acoustic Track)双轨并行管线,视觉帧提取器与声学分离器各自提取特征后,最终汇集入跨模态时间对齐引擎(Temporal Alignment Engine),生成带时间戳闭合约束的多模态输入样本(Aligned Multimodal JSONL)。来源:本书自绘;Alt text:音视频对齐分布式管线图,展示原始视频被拆分为视觉轨、音频轨和文本轨,并通过时间对齐引擎生成 JSONL 样本,图10-1:音视频对齐分布式管线图 -ALT-0023,en,Part 3: Multimodal Data Engineering,Ch10,Chapter 10: Video and Audio Data Engineering,docs/en/part3/ch10_video_audio.md,68,10-2,"Figure 10-2: Adaptive shot-boundary detection and semantic leakage prevention. The upper track extracts aggregated HSV-channel color-space differences, while the lower track extracts optical-flow pixel displacement to capture subtle motion posture. The two tensor differences flow into dual-threshold triage. When the jump score $\Delta$ exceeds the hard-cut threshold, the engine splits the clip and prevents semantic leakage across scenes. Source: drawn for this book. 
Alt text: adaptive shot-boundary detection showing HSV difference, optical-flow difference, and dual-threshold routing",Figure 10-2: Adaptive shot-boundary detection and semantic leakage prevention,docs/images/part3/av_shot_boundary_hsv.svg,yes,840x768,svg,"adaptive shot-boundary detection showing HSV difference, optical-flow difference, and dual-threshold routing.",,no,Draft,,,图10-2:自适应镜头边界检测与语义防泄漏架构图(Adaptive Shot Boundary Detection & Semantic Leakage Prevention) —— 展示双轨特征侦测逻辑:上层提取 HSV 多通道色彩空间聚合差分,下层提取光流像素位移(Optical Flow)以捕捉细微运动姿态。两种张量差分在右侧汇入“双重阈值路由(Dual-Threshold Triage)”。当突变分值 $\Delta$ 超过硬切阈值(Hard Cut Threshold)时,引擎切分片段,避免场景转换导致视觉切片语义泄漏。来源:本书自绘;Alt text:自适应镜头边界检测图,展示 HSV 差分、光流差分和双阈值路由如何共同判断镜头切分点,图10-2:自适应镜头边界检测与语义防泄漏架构图 -ALT-0024,en,Part 3: Multimodal Data Engineering,Ch10,Chapter 10: Video and Audio Data Engineering,docs/en/part3/ch10_video_audio.md,84,10-3,"Figure 10-3: Large-scale ASR extraction and temporal calibration. Traditional ASR can suffer cumulative temporal drift and semantic errors, such as mishearing I love apples. as maples. WhisperX uses VAD slicing, multi-path acoustic decoding, and a DTW phoneme-level forced-alignment matrix for temporal calibration. The bottom shows word tokens aligned with waveform troughs through vertical dashed lines. Source: drawn for this book. Alt text: ASR extraction and temporal calibration showing traditional ASR drift, WhisperX calibration, and word-level timestamp alignment",Figure 10-3: Large-scale ASR extraction and temporal calibration,docs/images/part3/asr_whisperx_comparison.svg,yes,768x768,svg,"ASR extraction and temporal calibration showing traditional ASR drift, WhisperX calibration, and word-level timestamp alignment.",,no,Draft,,,图10-3:大规模 ASR 提取与时间轴动态校准对比图(Large-Scale ASR Extraction & Temporal Calibration) —— 展示传统 ASR 管道在长序列中可能产生累积性时间漂移(Cumulative Temporal Drift)和语义错误(将 I love apples. 
误听写为 maples.);中间展示 WhisperX 通过 VAD 切分、多路声学解码与 DTW(音素级强制对齐)矩阵进行时间校准;底部展示词汇 Token 与音频波谷通过垂直虚线对齐后的输出。来源:本书自绘;Alt text:ASR 提取与时间轴校准对比图,展示传统 ASR 漂移、WhisperX 校准和词级时间戳对齐结果,图10-3:大规模 ASR 提取与时间轴动态校准对比图 -ALT-0025,en,Part 3: Multimodal Data Engineering,Ch10,Chapter 10: Video and Audio Data Engineering,docs/en/part3/ch10_video_audio.md,106,10-4,"Figure 10-4: Cross-modal temporal calibration and geometric alignment. The cyan top track is visual key frames, the gray middle track is acoustic features, and the coral bottom track is discrete text tokens. At t=4.2s, the temporal lock binds the visual action ""raising a cup,"" waveform features near the trough, and Nb1Ox&EwC#eSwYjJN!^f2v6a)nNN9*mt>gDWa zZ|>}D&*JUq&Q0X76Fxl{_Q*KQGhHPWF>WJBra!xbEi#Ubt`M4Ew3}8JBQa>!TGY*79Drb zT7%doC0|QfU`ECBs?1{{%e1u*=HKIN)M-SPFi~cwfssY@i?me*ojmtqgLfbmQjBB* zj;vE+{Az2mAg z-1azNq>(XwqkI!+|9Ty|XnL7RG+rhwhkB zWPkLO{Zl26l~vS7EVaim_L)YJ&=}4SF5WgSM7yr|;*hXxmw8X}j(@arlnS=LI6BQ9JN+V7;lgcj>hvzZAdd75NbyiJ4 zRpplyck;pV4poHLaT^RBkBUQak3p0IAN<&j3T1AipS5&h)3Fz)?qNyB9MH{T*8q%T>hPKCSwrGQ8rXG zCjY9CD<8{54p$`@kxQBZ=lh?c%uquTCB-1ICnX}0G0EUE(8c~OPH5tyBQ-!MQOo8^ zOoiK136K6T$^5s&VI->%dR5lFwhaRENg&c~!=qKDNlc5kuHHc9Vd;RXgtB zRGsnDUzq>B8RzfCL?!?`F`21XE%oov8oNPb`m`!zULQLytSjgkyXlL)RXy$?Foo6S z4m0Cr@;}#8;dmHy`CnkGKkTYL>?9E8>1{j~#?6_dhKyBlRC7YKttXfk3cXL+nF=_m zcTgI2qyHwt~uT^mZWCEVW2B)3#Gv8|+p}!N^Dy1L^&g z`gibbr4ras&zln9?5{ZACf4MMmn!G0LDMJla%|B-{PgP1s4}o99QI3Ky|Q{)Ut~k$ z`_gRy-=@z4_~wN&WA)xL&ju1j)({@F2KA=7Vk(8Z6~nxR+C9d=hEP;T7Ec63crp zz)C^8tUZ0PRZz0SRZNH+wik_^@9txgTGXm^7J(@#Cj0L%alus#HzT0(!u=W3g0d|W^)~Rmv zeOJ>vn$QJSv7*i~@5K6(YckK@!>gcp+hvC3p?87zSNWCE7e^J@HA;(Qcm0L8mNLh< z_|dG?+}8-dY95p`mX7s7{reV(^?s*&m*&oNglVC_o^hw=`$bOoHqCx~kvHk9LfU5} zh!jHSo^d*z@R+$SezbORB#6^ON1iI$oid?ayxt@3%T4q9EW`Z~6(=<^5XM}@ zzgvMDsP*91yREo&MRKlE=!aC3e7Tzi(wA*C4D=g>4WuxUF1Z+3{NDzlmqRr)GX|i` zhj6btP*(~@r8C!W@wE_JkPo$}Od z^2mph8kv@Q$6x=#dgH=tc#V!gV_MDgno{0*BR*W?$ymGW>#&48|CQ!5Osl|?_a@K? 
zf0?VWRLeno&(|Q^pkB*Fq_)(ceEEf#L{)Lj!b4!>+*5XBlz@!Xs!U_p;nR5Eu}kK< zS&tne@l_D@%^}EzWiv-hF@A(#8|CVkN3{+gRTjO(KjTroc9n$ZaE*?^>@w1L_*9J! zay|LH7t$(Y>*a1ky#uC^QBLb)qFNx&p{~QduJgr6J*7!97n3BGz8dS90Y{0)a=U7c zlCU8)5!MT_@z2Av^3a3-zjMl`$+A1iycMjG{Di5^_wPRzT3T8#G#=|ahsU#ohn@Tf zrnwdP==E^@D$wgYZ$(>cn2r|avvk1Y;qKmk8GT;52b8CiwnCQYNzDPbUUdL(mQ{t= zJD>aW^nK4?Mbx%gYe&mpKfa!x!@IwBk>Quen2*piIuHB7R&=exuB-auD5=9rXE9gs^e0*>4a^Mi-_$5?EO3Q7!nZ-w^l^VHITA8FyaMab6kF_-i z)h!yOhL3IaaHzD7<{KKUntxaSl#iMet=*s{%q%Y1#qCYj@R6v88#x5%g|ako?!}dr z=@Tnn6yVC$N7c7Yup& z1sin;PTbKv>^u5WK>RcRYnk2SMUj5G=o88_YUiQcaSHZC(_tl0a1rfq>l3QTtVRs+ z2}kKCZ_PW)Ff_kS%?-kVTYij%1lumKZ#l~WE2rvlrLY2zxvdpcB|JE;P+>aE5I~Dh z4bSQWQL4uag9SIE&T;)c&{QO$l(Uzz^B=jyv035Xx4c#_-f|dXw;GRzy`?nVXi{T7 z^i6(rnbY=(%?K8N4?R~PJVWK=KKr=`TcnFF?p0I2^ZI!@{OanPpHpMo-CBG5UqzPX zG+c$oxKn$=yJo3_x1zsx1#KPL4E`@o!e+HGD@px#YlgNyr}yL!&~H9R2jHo=?YyM^ z!9EAq+p^J@2=H2BKtSKG8_t*O^T=-3#wH$heL}_>+*27K9Q-nUA<>^wUq8C`^VOl4 zi}UGux~}mdNk~}xSDTqh2IYHpE2(jFnusQkmFZ=lgRSSxqem=HOB%~+8tz%fzpK9C z5BtG+!jwn!J8=KibL-tLqA&DdU*q4E9PrrKT_ti!U*ClU9?a8ji67BBDG&&oxhkE5H($!L?lMO6e9z1a~G(K$G?cHU8yD^`^C*TwA zH)-+d>R9gy878bm%*uv`&0GOg}V$Co>i z;sx-`{_Im=qjI+5Ikb~o@b)^LJ!yzn7kF+Dd{3&r9QV+@uBz$dne$`6oF$h!<}NuNveq%9fw6%hAnGL*+cmvcQN3@|M7>?Z$g-7 z5*5C${dD1d%}=~y-@dOvDNMCU^BCXy5=G=JDd6dQj(Uly2b zR&F8kbK;yMcxno)1-Ur&FxCj&?qpwGN*0)#XQls2j;b{a$TG1O)b^7DEGQiRB} zzJ#udkn31J9{myIFWWv8VM*#Nku$?t4fJ#d#${v~A?6vvc+%0PmSmuCC;(An>fS0W zo7@0Y9AspJ+2Av;-r%dRUU?Y2*TiOE_h;y$=dzQ@)W?#F*|FuA8$iKg4|tSx43|0J z?lwu8?QT%bimI47P2paS*MvlomLDn4ze=EA>Ycr~N{EvQ5iPIqs$gW?6a0*08{H}; zi7hP!)%z0xt!QP5jo}l5G(hhBfZqDDEXxqGdP_eMniFp zMoTA2=xy?Xw*I2_bh0klQI~R%ftgo-^_WR%C+P&@N(WHbc@ECYS>M}yG z^MGkmK%)QhUONx#MNCb8{&BZm=|%Yy&92SwZ`!@mu8yPl`i!hNzX1WP%Y{l_$GF-D z=MH!0;*X^1N^j1@PFN{GZjsy9YK=n**Qwo zZUydqH=;{&G864y&Mx<9Kkl=-Hb_^E2Ic!X3MkU(y_nML*1O-#lxp;9z0PDSmt?(!}dcFLhh_OCfc`v{y|Sh^tDri6udY} zN5lU!KT0rfYy=daFh?R5d}bicn+zOfQkU-*7tS}4(b1IcT4>GedaABf=kDXanU_*& zZKQ!HF*NGo?#q39;wN>u?@SEBzR7$E>-k-Zol3AKiNpo8&M{sTvHuc2>#miN*Q*mt 
zRxj@&FM)esp~x0;y&3)`VXYi$$eLN`Y50ieHcXLmc^OiEW0%-byYCQ8`B{UVSx*7= zJam5VzOgo#>#WRB}TI9z>Kc=PqN5rv+*GolE1^$PLT>;}eO^gvMT z))f{Adh|eAsNPjCIjP~XHaJQw05KJxDfJr1sPg#u-IZdnNAP>6Ca4F zJftSUc-<#SFd{zuK(?NJYtbl=yDNR>dHwm*(+GP3eL8;0&?hRleA;*|YjE!{0cD(y zk#$JPy@IrUT})TX(MaVk|M#P``Rr$B8i*J;$RHs1ZJ4eBa*FRTj*1}J1Cm3J_!qw{yPdWx zCA2NR47YT#>3{gWVKI>7dn2Cn#82zzi!CukT_ax)I>zywuCB%8#f?mL}*BdJ|InZ;JTV8pFiQ9wVd zp8>xluC9~hw&Xdh5}gP5<}D1J?V(RzIs5~5x`bW>NpDecbs|Q;Iu9FFb+<+=j3MKV zI&Um-?wre7-vcsgf@^(eUXv59HYb9_4BbBJPad-@T}##~tZ5=neBqYXHrumLdnza2 z2Ini^eCTV8Sy&Cd&IMa^r0E7~ooMA*w^Z}KupG^WHoBeJ+FB)dh>eX=POeto-F^qa zuQDH5`|q*P^JrH9P_Oxb(2r&mlEy1!Hf7kz+RcJK#lb5YYl7YUI(=L$#3^RN=eVyJ z(>o{Jri-7&RHHQ0dUhT^{LJF~Vm@V9G_`}m{JUhH`gt^NavBOa!Mb(~Fz>2XNp8RW z4!is`K+y5gu09R?T3Dc6xU5!*0h({Wv$+oa*DL$~JWQ)Q;GrchF{4vkQ;(^_{l={R zB-3=P&TIHnLDWV691=jw5JazSyY%EG8^EzWe8tFuEA{`4Ltd;o!e!ZLH0?f^YycxM zIg%=#;MrPL!l5_9r()h!TvgbPu|~wj%+IJNFMp^_g>c}mgTEFdjS&GjjF|4xqp&f{ zQNI`S2$fVmG7UsQZa~@mTc&K$!4!)o{n|9=;;MkNuL>EsRj0$pRKg)HudQ|5k`zWF zrM1@-ua2Ud0I)Mdw-C$o(3U`yW4Lb{jgQ(==T#fcp^8oJSM}P32I_seFv4XrBEyn= zN4+Mq@74yj!y|U6f^PsDuSG`7%0=IkKa~b8Pvr6SfzJ`!zFR)TK+ge=pg6&0=xXNb+H2u?3xA{FE)Z4MlMj1PZ|yKsKmaggR) z{PLCWoa)DY^}o!LTk*YfKZKl{?P2rn455JK!O6J5sx_Z6U^#9YZtoG1&#LXF;l?iZ z2V%qdn}rB0y5PF)G@Xi{!<-NIgG21d5AKI_tyu^1U-@Y~*F z%2n1u-on98GQSs|{O3Lg%eT=?e-OL;fQas_5~~mQ7(M7lb=K0S zBUyJP(UUuX2cMXpuMN;;vq>5*H16BG;cdB@{?Zhf5%z3c(1WLAA|79Tp?)+Jvbd*p z>(y~m@Xo|++>FecE&zh-X=mOh_=N`zi7#e_^w%hW6Ne7WeZDF*;L{0jHT^r-JAPfF ztDrN&yKY_L#r}-sFY2byfSwxB2-Uay->EDu9F)$$kHN=$wl$X1DFa{ahTW#=>f`cj zWB;7Eq&bPT5D=JF2ay#hDO`3W zrblh(K1N!dC38NI=~Taw{`J8=TX@6^gV5!MZP_So+ZW@(ZY$j$i22Z~Y42L4;|G14 zuirR<0vCVY8XtM*vF`xbQ78@_hx(k)%&tpZat!JXd%`f~jatJWK9nQ(c}S6NxGd7W0$ zj4>m&^!GL#9!I9IP55_OPZ?Ftz8i=>k@KRoyuPt2p3ZE+*F5TnihLqfqwk_Mu}9JlPya7vRRoe^OQCpg^FAak}>`xF=6i zh#7^$lWvtV!cY=4yz-aU@kE*dS`lt4$@Hi$nzkB;nHfDksC}iqC;z~(Bj`l_G6SjM z>)p?4mexBF52;4YNYcyP*tHh&TWG+ZJB_#3Lk3O1#3`kP&5|mp2|sB(4DLIzIJDfq z>h>CDe;X8RDyZNPNz;2>3Jkj6RyR{69oHqJAk6P@epyg-GtPVF@-SjD--8b6dc|b2 
z?Qu)gFCnJLZqU^UQ^A6ayEu#zRxLuMZ1~#>4R3w5}R9O)^3(}z0sjdXoJ4dTaj(^(`?_N z?|F1=!mib$3)c9;=R_zEkRCqbkI&>{h{RO_dFLt2WXKp-a?$ZcGTtPQ5a;T{$C>6w z`9BQ88q4D52i1@H6-F4|3q!ze^5pazU3?0R&Nc%O?G1-N(h-vg*dr7H-Q+cLrVz+Z zJ7~{@v=GQb3o_AU6qLxsU*Q>7HA-5!G0CY;Sjs~lepRL_CeKa6YWi6$08vqY>V6HshD#*m zMOl}ix}iyZtqS(y#S#|KdOEXMlGZ9*t-^O7A#cBPw0Aqhr5^DG|kd6!rJF-nRD604O@6;JVE{w0V6k=2|7_~^?HEv7H zL<{IUCpUsN^g5CCPc2IKW#{WyDac)-&i3|l&>f}L@Zs6YWs}%s69Qpqa(cCdZCwEZ zsswWEqEIAuH^3~{KvTa8+igW+BDV%E%AnvyDSji|H~A=s%C2cA=%1rp|8%v31MJzJ zd%2!bvzbb752^|B9`WvdG!@>DA>|p7AmbC*QG&P*f+32LK~VAgz;N>)o_i0@-PZfz z8*(^O+gJnt+*7oTL1oEgp;mlygKazP4trQ+POj1nH$X6a6J!MtX5vIu##fG$kTQ4c zhgMnCE@popD^WmpIJcG&$Eq$0#7I`<%Mph$ z@$Qcj+Zv%t_CI_-+rD(tBUuZ>VKsjby3o^WxUZHY}L_^%2f)bx**(K~wBWima zrv%BFe0yOBm<*A7uL=yMw!cFxWg$~9)H=+mGNb*@K8&e#p<=n^cqAlw=7Tku`FT4= zp$3kK`Ls;V+i;_Kh&SjbE2MbyRbmBN9}OJe=o_W3RHHjNwXLnJ9o_7XZihh#Fy?-c z!s#x%rre_EG|GOFJRcwMrAY;}1{?aWJp1|c?|3p^JJD18V(a8ysR|#S!c5Ep88v2k zy$GS)jG6f+y*v|*H)~^dSq8sG%%25Ewc)XJEmMvcEXL03zSvJJF7a3ZI}%OEUWZKP zx8AVND2jDi@9k?@9c!3d;Ga!5K&R%;CPO5Yj}uA~?1Zt&q8T#H&R(sI)qjx7$@tt?yslT?nim_539)ZT|Q?cHNv<9rb;2@c^A` zgIF{cVH_dI>_6zkr%w-0Svt}7X;GE1vdh98s0WN;0>&Lai-A66T-;VFKtHBrJ=>nI zo*;=(c5jtXg!ii>rU_j;~Q1izAF*6TVvZQfi?zu3 zC3pxy)j|W9l_hYm+AtwoVMHzuircJO)G8qklkP2j`MKwBl{#doxd~qq%!q_m5y4<0YcSZ~ z@&P15T8l2Tyi>GinYdNO<`}n0vw_Zvbo=@|`gsmTc&EnE&wuZF(8!0~7@q*{vu3&n z0Bt3kt?>BT&@J~+7@qY(%Q)Tb_Sv}J;wPVUi)cPRD({VO|Fjf`Ktl~>HhQ?U4OoMp zfdCN;&OWXlG9eV{#uPImhcvciusBq-4C%e0^phJ$ZDR-wvVr0I_3BHvn7EeNI8jH; znuK2acAVPGnHhBz=7GR?Y&PmQ!>4i}@9y(sA_LJ*ijGi-Y&EozE5Va~(NmE(BW_S> zNHO)`8s}O}gR2WkJbi_*5FhfIdR}f;)|Zx)rXhG3OE}!3l*@-LCA|&9=4P@dKWC^lT)TPkksMG}5F);K6F1BMOHQ_SQaYgDx zL*tU7;DdDPt&YVMkX-6`vDQ z>J6@evEUju(-|}PWlC$(+2uLavHWQGk22aHnHXwrIb=hFAch6A1*E}CFaVInrF zQ?<8ABpm;D*O6S2O;^O+d+AZYc~gKjZxt*2_$WGyKl)OSkyqr)J*n2%C+tO?t+kzh z@P6m@Lc+&_+4W_#XN0hRR#AKoR@w9T1at;UlT!7vdj|qexvIW#&NBIlr+scm6C9Q? 
zaL~}58Ig{rK_XferXmXKTF-d;-T z1e!#WHN^&B-kEio+{^E=8&+<%F|pz|O}UIt!5n(Ko!A%m>2-CmJdIaCHr9N zXEZ%;A4Q-4<_7h}LuPC}O=x=mAE*bTaFV_?Drx(H&1w4uk;fi%N0NTi#XEQ79J_NA z`2!gp=Yqays#flaElsT5@ZuM9@{jZWmMDLvZ5O&W3~2ptY*XFUq$onBO>#LYdePaDOok2Kk6=&dl5|=1 z7~=Imm=?MsqX^KWvhMQ?GG3ShA$=&?r89J$14ifJqz6JJKBXK z!&VA4O3`Cfmq7O6o1{d9a-IXV%n`lBG5l9i^nP?=Ot(Zx+C5b#g9X`@0sSZP)?>3~ zxgHn+J~|`y*kpT5v|K(HjU|1CI@GYVPfXm91XgvTL4GNs%$tI9Oe!n!PeQL_ zLT`w7ZhX#RvG$&+8C+dct;&c4BXXHevU-R^JiVWiC??ujVob~B67~U>v)Z~QOqBV< z>{0>M#Z43{Sb3|cru`r)<1ZTjbKuCxXQwjG@$qw-ZReK841ay&4@qf|F+b9SWY~y_TdBkoTLb&s0*DlL;Yk~bUgt~#(IP3EtSjMJ9~D!K?Pu@# zH}9DYBhMNWWYBa=ma)HfYp?AaQz^MI!7PSAEd57(vmN>Kx5+{@JJkVAS%HVBj9nu` zIw70R;UD5S@qp4m2E-Ix1}rFptfEnxo~%BvFVI|5f4yVJM}nQ1dLGFg+RN_^m=Z*R zZh`uw>x>v${52yaw>3Zco4s~J5QP}BnP?TXk`-ji!kYcMPxRA)t(0vPOVSL&l zL7Tu-vFeT#*!pKEl|ZWl(t9FmH#cp`tP}YUF{M%Q5C$8_qe}3`Fw3{*4VkOg zQdWz(whKJ7-sSaw0BEDsP!8C6ag5r;Y}ON#o{0da-5a7F)|#EQ#{cM zW=d^x9Cb^0%skPG8}bRDa-Z03*FMymJO_4u!D$6mg2#hfWljf8QF5LcLTVg_npr^< zo1DL;k{sRO3FG2>qFboatA*wuHrM)Wl2C`(DU`)oJj`{T&q(||(h%2dVu^cne*C^| zi3(&ESO{THvzW`jGysFApiStVnd-$^VPv!vZLBA`9$QDahP5FTMrqL zHzmwi>mpUNcXYRLJ+oL8*3_-@(NLL3-*#A9o|lWPeprYNlH-A@3u-=K?rv0HUS$jp z+cZJxvRzeP5u~il9GSAA6vxyH`Kg=u9!}7~6fVLQ)qhG6Wys*xfGxWim7rmHPyVQa zg#@LW9Qx}asx2CD$efLF&k>5CLY~9+0-kZZHsDmf|3l`Ha3*$ ze;5SpUO3ghDWfbF0v}Iw@l;;K6MO&hUoZByM=jS+G@8{J_|>5Xx#mB8fefG--o75u z0CU6-O~P-)Mq0!h4Kym8Ft=eK(3!$ic+aedUwVy!%TcuuYoNbshfkWc&8#22(VM9= z8nm%d=0}MqpN|;>!r@e50}kma2|c=Nx1U1b@dNjSh7|k}7vnl%!)hgwUe%yWcX&s6 zO$y`lWl(83$kEs5QAAdcGK*(x;*?DXPOmOXI&YGL#l^Mvw^d_uYB4bLB!5UG?6YXf z_I+Q~dr!?9_(~+mC>qHSHeND`+>jdVjg_9;Jg!?S1L9qt3lJ#$vat z2YzSQ_Me5&aNoA0SVh-z+bpJD6wyDI)uge+g#t%cwOju(Ox#Io(@0)*+oAOW8ALV< z_CCqm^JYg-*2J$QQf{E;VPsCUzqo*N@gq=z?Bk*;B_LimfS!nm+)i!ZOYBJrQvc$J zlwJ(QEyTQBh)u0+%AChDvuKg^hGB(PriNMOj|0aN1rKgZB&1pNlHo^}sHLufDI#&4 zucIjPfy#|9lB+gT1;Otww_D!=JYQLzx*=~ang2cB>j9I1@Ytxl8v=UIn>WMMpet<7>VxSP(;FZH=k)dv1SxYTrC17D0IT*LR5 zP9eOab+NmC%61lXQbK5yztb`9l&1yuGFkmZUECLd9{*t0SLAanj=VvZY=+%t0H 
z;3o4@vb^kgum>gTGNq-)0{{>A-eUw&Ed9%jM|efgZU6FD6JuE=Ls@ig-u45Xu9Hmt74@6ddV2y6*9fmUnr7tky}JA?LynoO2xRI7Rd?RKVR^71x_?>X!dRlZ}Qdh zVD#Jb3Xb)Wo#oqE*)2aozp;<&*e@>SYbr$eF3KI~ry$k#Yuqb{cni77o%+!v0gUn6 ztMYHi>X1v{vi2UOWaIpg@caS@284ZYYy&2N+`zLQWUuXnHOP$y#_lc@r{eDhEVM4y zaK&mAKKQ}xd{77|%J_1)H0nG%9k1KOyMBQlkXL+4YZc7$>EI8|m`<5I>q2&x${9T8 z$Z0gvuf-k{#Dr7P~TUUq`;=LKY5+!M2-^yQ=24;wq+L=*z|u!hK-jPib}%%J-LZo^oOnI)Z9EGW*mohs1HeU%gW) z7{{n^r6``=iSL}6hYDwwnZ*G(CdJ9=S)ZSPEJ2W5CG+IPRF1GmAM%~+?5Ub3j_XeY zbsqR%@LM5b=h;U2O`u=2Jc_G&Xor`=2Y(w)+w=Z|AAx_h9Rry0 zI8H96%gS^2tx-%8B)HJAV%HXn9!pRJK0XbYYIkciHux^ILcT`EpTn{EKA~yFdb2W5 zgerW8kmgvdc0U(NBQAz;S=Y) z8$0~0DselQ8Q?XO8;pX{1S#ZGtVi+FZ^{Q2XM*h0*((J#FJJ4XmQ#=Gf$eMQx39;u zPu{OgZ6^sGNFSQPe7eO*Dn~k zmOMXQZS?aGRXYB<9d!-T$Y*RY2gX~sM8+{>0)xVVoV29Gc@YlIPl*tifi`%dnhY-3 z^47)pf*>t@KHu|;>T)1P@|dgGU5i(LB0D)maZJbw(@r)aTTRnWB>2gMmuOByHn>~+ zHjumwMU=vNaJ>AJ&>B=45})Jt%(#CmY?RyyD(K{>Wy=VAOt_=ngt-6nBb(e*Q?+06(cI& z{+yntfLWs=@f&hn;xgCC!E(u72EAUA7qn4NACxKdiY$a^)=V{vij&Tt(|&TtI--5q zYsq5Nh^Ah*^sfvzQ_jadgS_FUmc}3t3n|kj4jYii{{=4LxKw*3cs1JXKAVAerCpB& z?%(=+z5u&j-AEC2yY8)S2RaA02%1df8O(;ikE-v!;S(c~wpG5WdNjYI(`P+<75Jk0 z_XVI^SDdV)OC6c92}P7bT4k|je-w2k?Dr<@^A;{}bZSLM)kC1SvB(Dhg3h%-&dIFr zcFr%lPndQaX%QD<2y4&Oene}a5247s@?FW;B(Q4s{MY?x;_LA1uev+ifETV;G}&;t zNRWZwx#EU94Qtwh^qG?ED{T1?uJ9L$EpDb7VMb=fm6N#OR58h=d%PpPmTSB5UpFtU z1vTKoJlX268?HH>FZfT zV6f}>uAW%o-s#a|YM-gSHm5L6Pi-5L0nbU()=n?huJv4sS`tALsKp%Z@VNt0QsO?0 zxiTGJkHTzg2uACIa&Sv-gtd)sM&vy~Z9LoIW%DNH^5PjCbChfZMU3)HZsA`ADFeV< z=AOxs-BuPRLHv&4hHsSAXsm@rz`m*fjV*z!0@4n=q+&dQa!x{dcLK^XDBO>rI4=z> zNAEJw>!m>I|}*R4IsUW@c3z5`Dth)_aB02QxezTva1JzDbq-pav$=vK8R=q}jj zWpN*~0ljQm6QuB*_`_6hoH0UYx`JLGLFc&Gl`+!nJ-w*nqX?gxCo__ofOKpLo!2yY zd*miGLA*uMv;~p`$Tiu?=V(zYazxFjO4hTZy{kb~*>r*DpEp+RO?9Uz0COA3)GabL zI%!n^SSur9>!a{sgjYAQ!OY4E=##eUNQ=COjeV$y$64CQ^S~q;4 z*0DoQ29y;Bq8b9|M^=Mz3*^w-7Z{3KwBX?yiJ+U_->y~lTYR_}L_?V{KfcDo3{_o0 zk$WCh6h-c;8ZcrhkxHyALTDBn@gW4pX+II@mj2*Cvs@qha|~q2rfu@m*@l!H7~&wop2_lfY>8Tz&x!sGASBgfeR6EX#`$d#Fk#NcrrD#s|X$xpd4AX@N>8^J71uE3N 
z;Vv_F6jrDddTcLb}{XK`~bNo=`$Zwyh$Wx}8Ps^8&j^%1-6z0s;#+01Tfu9RK zNroN&fOb{4m|*_aMA! zFFt3yQu_ShqPO~?!PqQp%SWNHo3Da$!hbF*HrK3GfpnZ{)eJ!A$*28l%lGlBb}v@S z1ucmwxTf(Md@_3akq1?yjmS8#?c@JIVt=TCr2cKEOm%3`$*+Uowm{Y0XMGdksj-qH z#uEE`VZtEkQZSz;l5j8MX~5ubLyBPRU5V0q=%B?s=K1)oSE>6Q+*51HyU{~uiF*YR zR*88aHidUG)Dq(74Za+W-Ctdjk3c0X)q)}I=REL^-GeFwp*ZmvwVuB`^tB`{tAv{_ zT_3e0fBTb`c#2D+q?MYY5wl*~)N6}vl_cBR+VpJ`s_`A(DTWWMfTU+HS}{5(Z6|hc zOm{C9gY)KwZDnIVu%J4<5SJPH_*|C6+us4`BHF|^M)OGEY-7n}{Anf0zB^tqjA@Ox zE9yj!-B;++K1iKUSbkzuO`LNz5zQ*;ijkOEe2*j(vM#=zzqphCg!!s6?wNZlZsQ^%L6P0_qDg%QR2jmiR) zL6H8y)`a>~W8dHHKM>+nyq-J~9PB_zn;C29r46R+ife!95Udy@Lo*N6x);s-CgxCg zO!5&t7C(?&RFJ$fbrflQ#*nD)^e6oPi?Z1=$v=tfa%av7i-%aMV|88DFBX)D_+;e{Rw&HYZrf~Nqu61mWZI?4x1Ny%fd|!ADd| zl?kKqX(E`uDvE^Mlj=-umqeVLDK*J}2dcx-hg4|yXq}k*R4m096_R_e{xCVkYMK-+YvHyL0^eD(6H=sJE-Wc8=Y(2rQ-dLf52QwNI=T@#M-x zQvo}esw|szv@G*$F5)<8+Pu@;dTcN9i6a1o-m6gdChCUqP8np1C1dz0N6sH@4 zg&L=dYk*tnxoXESk7~ck5AofUB8=rdx{qy{+e$8qY)Tr)`wh?zBH<`GHDAr6uvw!l z$MVy#g`o=7xnYr)whP6ycojq}v1AK?#muJ23p<9ZADpj=9Gyyfc8=1YdlQv8*Nv{{7#9mt ze+6?L1m4zOc{Z{M%5854Q54#Ag-dKOB6r&`ErItwhQQn*7+T;x?ANlGLOv3Vto;v< zJuR~T7Ag~@6z(!VB*A=GUk975-3`!IjT+ivviLY&+c~@JbO;0-a-)19L+qUgAW@Edt zb7I?ePHZ)5Y@1CQ8x0#Z?`i+P=lzoFe4KsnnOSSEK?qIO2+&)(#zZq~#j<4vk`6?5 zkeVHk7vP(@WbasVrQ3GJ%-=n^1;;2T0O7DUafJDuC;@|mf6GP1IKic2#bwrG{j8yy ze>iuizK4|Nm1?l?Z~4;;bsunIy&U`QS&-}o3p<{*4vYy`&UIx7UgdchqC zE$ebK|HY@35SV6GR3ns!Hb;~^!7X4UGXuJg)vPR5D}@lG&}@86)rXhu>olRjUP)ynv}mi%*uv?9wmBRoeD-XV{QchK)9$pM_@@RGc{bkyjRoC}E9E zU_3iaZm&qu>e0B!qJbeeJLQg`rd-EmU!P3Z+xJyW_gtQLB;+aZmwxs$@@Wc(U%vzK z9L=;JCHad8GJgpEo+6XivFB{tnb*Fb{_0<7-wJXV-Cc`nz~Mixb$Fdmn|hRsk32v5^q@vv$Yk=cAmgWKllGq?J$V1Cu7>e?5qtA>)c7pJ{H;Ne^%M&&@`@ zVdIVni{n;Aw_qlh%|*il%y0od+tg$%o~4zvB;dsuhHBwU2nM{h3c1l@p(KH^ijFQO zg|kNm5jiVXe;eIfjW_z5z^7>|s>}8wLK-UCAbLbtcg?g;_jbh<)YmA994F21;i>!G zn)Xh%Cc|3p0$j^p#?qd5Ouo^fFZ0;&zp z3COd4H7PPs0@CplwO3%Lc?Kn*W#Gk;KqkFpgp1nATOiz^oAun5@PEjAD@-?&i|;vN z^@Zp6>+$1Kn8Vvy96^I&!i#!$s5hPAzy%fy=r{=@1XVI-;1AXp0K3z9osb7whI55f 
zB`>~ckV$N%6Q5Bqppl*>^1m*IT^Fw5k&XN|WBy&$QaMO+ti&aew6IJH=m)JV(l~HLINgrmmSX z_9)F!)1a?=PEAndjtSoCxL0lcLsFuZIcDoqTr=V_X|V(w(&%l}H#Rl+brDoY<6k>e z08P0?qqZkRPNNa$?Np8yM%a0nk~>faF+mUhm8{6K@%PhOpg*3>lq1d82?jHeV=dN+ z!PotF;Z^YPP(9!EyxEi_(*JK3uKgD#r){6l@)?awxVPjCYtC5IeZPiXQ&AQ6p#%GL zpGiwkwcftF=TC=m&;U8W;^gd@p4*E3`Ar5*%zcAv`{6S^m}U(lfmNo%VLZ8-gfr62 zZAY4OILCbrs10Za&qq1<0;ZEXH??RY(bV$!?7>cse9 zRWi4sUp80vt`9S!LJy_#23C?2XdBW*^k0|N!R3m#3IpI@BPU~>h(}#-=wRh9&x!$u ziw1XFo;@o8X)pUx)||`8d$A?v1)mICTzq8BF6&q6Y6D3X&?W$jLR%=Y%1ve+%C;?$&-Z)*R6iRvX?08r@sM`K!1(ZZyDSq|+*e3UI6dhN zUwTe(yvODU;fh!B2V=E2*w|%jt+s(b*$eS^mOJHDW|EpTycPKPaGJ|-pH*TsOM5n`nx>8_f@AnzrSp{tsMs{@?wmrQLe zm!=^ezk=lV%!QG5_Yuy310Vf;8VPAH^D(#_B#{5u$E{K7Q^2%v?X5O)nNrvz#~|Wc z9s@+8Ws#^Fm>>!YGr_(EC~DqO_sRZFe;{7FEjM4pnmKfBnlDdW3ZW@=%$i1g|NcxK ze|MB+qRC4bXuKs;>P?v|3qXl9;Wl^x<+WCNP+luw8%q5e=_NwS&SpTEN;dYtCYEA? zq;KUS@o6~qpf>I_VnO$6sjsI&$b=1s%`Y86=)^p%14l6A{;bmkj>(o%azToo^MBo3 zH|0`1qOQ6&4@l^px=M@1Pg_}4uCqSD+09{*ebMA*Ak_+>OM#3`6_Jn(PsVdOR=-a! z{TBFi|6qNS3%%)?z}n~P3Sbf&^-eJ?*G&=ZCXU67Tz#=|MJIM(9so|0 z`2$!D2yK$<;-HpVlYplivJ!4gDa4}z&@$*rMU>%Ro`+f~`{ccpcOy$HZWeX<%AUqY zjIrsRvF$lzL}qdf!?g~7E*rb?+Q;vkZi=!Gv;>tW3RkE}tsE#HA!fs*4slJ!#H!^YxlXv|IKS9ESnl+zHKS?iP* znh|Hr^<|&=M-AU-hs(PA6c?Ot(g)UwP_BC35}5H~-X!hzRCO{X0t+Rk`WaBgDpC{X z{bXgMXj!I&FnS$|Tqj4vA0BP6~(>cHvruem(41q>REfH+w75VpO< zqa;ake$w!D&L?$Q8jX^soNKQEwAzXNVKQ>IT7p#E(f@e@xzlnCHM0|A2ZuPgx%Ym? 
z!T>>)79qE3kKqHyKDnUfryQ=YAPhNO)=9ayWh^v|^M~#su!}XcXxtmZ+e0#*#M)#H zoG7amYxi(MP?ik=nEx67;&!xfLza@L$;)?|ICWQBjx4ppxbE*9S@OVu2+A9XNVSte zjs2OfvUic0vB;68vD3K{{6%^hKm6M7B4|yBGf+Ap72#nI7ydu5n4rYRUs<_e=JzO;chCk%>_gJsVwL zQTG;KgfgWoP9TOS4aE&`v*S%);yIp6|22MRInb#H|GXNhEz+O*>=l2mlkZ`w+KA8Ho(2sS3bU`p1OdZ4Wl@QcO zprStDxE+quz_NDjOzWY+y@B$)^X$CCy36o9^awLC+ldoF9TW zLu3T}&pSStXps_8Ek~Jn#1YAVFXHnc^m*EP*WsbX>u}iKh|=+D0`qGU>BBYluzE}Wt zMXzGA!-%#PxncTeS7LA1vQ@eDF;7GALsirPNaD1GVsKm&q@e`1XgV#5Y~#YSQj#%6 zT1Pf-Y^DAV-ZGoJUsPuG#g*9=uiX-*Q6(@Xl%GSQ9*D_6-PgrIRn0+{crRMP33rAl z64x@$JtVO;QJe{3m{CEpSmP@ZZsHHZUC$Xa6veaNn#4&0{rBD~rWF5Z-R5TPoH4@le z9Pf#?N5Ba41?Hx4jx`lor*8r5Xje$}B>?_equ%wQMY(3*Tu?^5C9_A#$(%vDgWw}5 z8}l{i0Q#}WeX?YNB^@CaC&lCv{|`$&0Lb@N)Rs4sr8`TC_=k~C(Bq-Gf5hP$eBd8$mhxq2@ODG2{AKRHHwpujH8>`L1l zf0bzEh#|ul!_cSrdz3rUx4&-av41&$5C8poC)}}=Jvti)3pC%?*IrxpgS@^cy!22L z{<~d&B9lbYYvS#(k@lAJc8u}K0dHMONF!Ae$gzltBPpcjl7uVdrA4v~6p4%Dyi?^Q zg@JJ)YSe)L@`5R-e3mw{v(92A@07oG&O=7G)a+YkRKE!-m@XDAI&op=;T)@(}?f>pxdE%mia zl3jE2^&x}J+I9Xl;=QI8p;N}~MWxW{fb>|6aKD{!ds(Q!My_jZERWnL3_Ks6pz|Nb ze)6T>v^X&+lzSHgt+FH<*C151JVUFUEYT)kV5=oeH5~tkiXHE_!F@?j*JKy!zlJEt4>Cm8-)6^p>eI!?8`3nIO|~0WvwoF(u&c z&Mm5)yc;S8Q{)CCl2EV|rIRWQnEpf{M_O@eGT_~!sq|)lWfD)&FN|7gUv^DT*8C$i zSfEL}-jb4e??igXd!5G0>58#B)gPG!Dv7e=;71l4MjH<)mOkSuk`)(uYbYI1vmz6v zR{dwQ?WW5u%mEi$)NgKOKJ9gpiqYPDAb*sW4!j$)g}b3ylGz!NqV6&qcB?sd21-o1X7J^UMK7_B8v zMOZ3yHKe7U$)eHVx1lzt0x!cc(t0XawbqG!Z`HkRC_?cQ91+^nc~0wK)fD{JZ)#Hz zSYk_9?SCOQC)Kc`kYY}80LeP~fLJ9&DgGkFL&w6n?ov!3gi z-S+B3ZMj}F+xrgh{`{s!h{G`7h*5|kki+{~!on=Pw$FLuFeAER^)TSkB6ak8Y}0xM z$o34LFn^Hk=eLq~Zr;V~ow-u;SV`T0u^*A-;s;_NxcB;IzzeWG0j{4ZD=mx2V|sPX zsMOJ%zSff+@LmhC-g7GNh{1?IaH`=nGhq?SHYr5sioX=v66p4T1CdlGqNFiBJs!#Y z8R$`OyFt>y1WxUF!|=?_TJa`;EHU}!y||edRql(>lNV@hLsq9FF3_9VGZ(Eh$K?Dn zs^AX)&{?>|9xy|*0NYHXwChJ+viC^2F6f(V-zH?T0+d|nC-tO=#+!|pmBS2@zFVJK z66{V?(@sfROQA;g%jMZY>gq4rhbr+WpOzpCzGf>>WZj@y@aysxkA<$Eqa=r4tPzt- z{x4DH*e<(SS!Yhcy4wq!FV56%+NVWxP(a23w96ID0{G2iWg=pmP7`HGBs0isdsQ;X 
zONu+O96k;^vG6;cJKbUe^)9$_$V0Z`K+cN~f~8Yw1KifC92Y%aa2aFD;}MfWSHxGQ zNtMpKgRbxvLKkHMzx__3r!`_Dx_~cD1xg#*I5C7z z3Qy9!5t(Br8hqEb2R>YvG;||gl*4#Ar667e8YGmB9;<-mAjgCUbs%=x@^3NcuE_3v zqqt%dGT1GwNzlG~o4d%jx99;#+?swD+siYWYUh~lQP;OdmD`9Q)TUAe98>OL=x3ftw`vzk5i4|la*P;GN>+}3_~yJmUk zU~Y*b$jjX+4n>CB+Jba~i6}uLf+i1pnGttPB#e_A@`UKlZu{*Z>1RM(7O;?Vx)ViT z5i_B*<9$_<{ZKj0uqF+Ggs!1|R9ikQVR)IL>^0${EFx`FA&th9)aL3;sx4?hF58GR zpeO4#h0{zh*)mpXLGs1evWkvU$y{SgF>0Dy-E?spD`pPg6jq!_%25W1BBEXPbprvH zZsfx+d2Euu_}^wT0F~a)Q1knVTAGKJninJHrq+LC=*?W*xSYNcC~8!)NE-s+Rq3?ZLUN-_FeJXjL8P zw^p3wC_^b1@>w9Ta&6AJw!K0NBl;t8MGujr2a<>thA3KZnLQjo4g*27G9ZJjRx z1{U@;>^ul2Li;5%nZVbt1MTyhJFAk{Us0lUZ4;s9 z7_7(Jq<5r2^^tSaHLAC7c=&Lw3II4hZj1!vp*4*@by<=n=EfYrE+aWk^k|BdBb54q zn0M3vBAA7Lj7+qte9=^{7Zo`pT>D}G(f1|0#|M4sa!bPMGXL&e542eYvo%DvnBYC8 zY^mBqM4fxYY+__hQ5O_v8+?lEq52cYZpz%WaHI?uvGlr$L>jzMfKJ0%u?jZ*{t`dk zqdXE&upkJFUmv^8)HoFFhVz@#G30XlP)#3_HnqM!Qf@$5Kgr@AtgpCQYJ=KRCzHz< zs}{1T6eS{K;7k^!8%mIbS6TN|OMLFTcNdQSt|6nz;2WP1b^9HKNn>;N)eET|XNY(V zqGFO?^2?XUjoL&6@o0 z$zC1&?apLj3q#KJGrcD<_A2mkErlm0Ox*h*-xctYA@ulrg7|gN&evx+|C?qW#aHa< z=?*&76{|^-6*FpU3ATUVaALqbGk|1GSNQBSJ^? z70s^7Kay)ym{p;XoR2KY7mZ>K9im_Y58UXF*5$d<+^O(7819ghMDweS!-k5qWq1z5t*m@NiN*kBC5|HMrV=olmcl|0qgg<@CF_595@2kXc7 zqV?N>hucFV011J|zx86E9{?aMluA$m+6d0? 
z@jf?Nv?=2XQL$uo-6@U3qb4O5m_>PjruSO(26*c=K4`V2Va zZ=hHIRW01qIXdgz!8$!u`m8fai?jGM3JOpIA8UeaOe61Qi6Mf|H$p)%x@K-(ifXBVq0~Y}OO+MX32JCH(8qAUKE7@;32x(~vr%I) zSxb4n^~WU&PaR0+BNiK4!cU`Rqc`x^$S!{H7}o^A8>BOUbFnc-yI?f3!b$K&dzbuP z8l(N~Gb8(V;)seW5mLFQT&rWopOmc2{lylQ#tZe>68Y7^x=rRZMPJhec;Nm-vrUyn z)|J*`Iv1i#`Xd{dueK!#FHK*f!%pYY-zFyDMUHCWJ3F8QeS9P6+?`=)g2t!(#NgDh<(BF(L58W1Ne2< z9GvgxPCrv}hW9`Z#{u$yjXMh8o*WE&%BKuN)-#?3K`dk%7*l)U1rYtPr2-_e>{j=v z4_*A?Js1-*^B|**$o>D0!^OK8ZKr+}P|a$VQ2!0R_kfkSO}E7l(MHA48#>$15%jtb zeLthNPcam3wW{klGIO8E&#(ED;`y`r2h;Pgzh!A#E2*;xvd~OPQSA9t9K2jpKk4Rg#L~`3cyy;@S0{EHe04{jz#7lBJ7q_ZAgah zS7GZ~5FMCo;r2a=kNF7k4t$pNGh+ctv+`#@afI*4_{z{?HciHwZZ_u_A1XVXeWRDE zPpLIBgBQK3A|I=pz~#T3LL(vgSsS$Lp~`!!-uDnCen~L#8J}2K{|xVA%v%)y2z!^?SN5xJ2lU)0J~s8L{p{2m2X_T{~K{g zI=uDn1B*k@={2se>kNJPx6~webhTo%{u{T2QI}f4b8)w5aPNqA?kGv&`a;j%2qiKZ zEj9pWVp2O-pM~$peACx-dyrdeXf`?FPUqHs%oU`g>L#gR*2>ZrV#9N7M~7zHNa+my zmW5~}hvN_qzu-!Ikv}1r-7Xt+#_sZMd=wC^z&zd$?a2k@0Rmy$?D8r4{$p}}!doFlycNry{jl!QOG=BpzHzcZr*o4SpX*qNJ z2~dOK0Q6WZS4*Eqy5{8%4~c8HKA-AinjfRyrLVj>ZVC7OHN+8a1#TIe0gI_jivR&l zu6s9O=#S7c=>Yx;y9Q6I<6hxVS>849+#8ZXb{okQ<94NnwwJv;fPc11BN0#vrB`y=I-#v{-+dgP$R~4&Kl)8i&Xt z0QRq9S);w9`}z4&wz1)7w_%G6P2s@&S(@4|h=ztBos%Fv+V8tQ>VX526CC{9njut3 zc2O$$ok4|IUQi)6_g^6vKlAG%vLbI`5IoOfUubW1kclO<=z=tPJLO2@J5GVar+NWi zF;v;_GT|G#*ZbWa=6LWkb<}uaBsRpTGE#yF;HRN_QF${{h9&~( zKHquN?WzZu$0-G9Bo-9cd{p+AnVwEyu!Pdo@bTzspX`7ZqAgVilL!=<-W371;-1hm zZ2GqDYNuAruhsXb(l{_FPz1ze<j1ezEOKQl>m|d z%Do#HJ0??aDmL%&{-<0*;HI0Xk1Cs}Do!+S_V2t2wX-wQE}-aGkCI)yl%y=oi{xBX zmi!)_XUzVzjvo;J_X{P>Ih6VF`o$)-H+YiPY&=YG@0*G&f{Oe@t9lBSQ8ay zrUD!WtGpLySn0fUxY*Hb)z+pl=VeVvKSeby4v^-lq+ob0Y-tBps1XkmCxulm6e9Wl z*H5llhUR=BvjAGDTedFq*a|8+7Q23^o=ylv&E)Hg?bsk`OwsE38-^wWO--0~f3Rfl^Z!QOVoVz)InpG(Xb%&uu zT2E0G1?Y6c*-1YUC-ylHYPFB~{6z5*4W>Nklz#TP7i4rga7(F$00|5b5)XS(pA&;r zmM0yFF(`_HD3hy+PD5WG`JqBV1_K!h(1OVj46;eTp^fMtb zX}tBS*Kk*MEExldf;3gnvTwk3ln4sR;WOTG$q< z%ak+yUFJ$o(fAk^;1}&%>=W>FyC*TZML3GcEGD*f@Cu~H>sn!8bgKs%r+SrovtvXd 
zNn+~IYOpH5&6Hc#&lMs5@Rm|S#Q3JBaW>#g*owA{@b-l)hlgxt>mmdB=3PfRKi9e( zy6@2^LDY?tcH5&!ghE$b`1r*Xiz%aBk*szSSg0c|zdGdy&~hxQz3*;4w4IrIO>ObS zp2VqBI0A*?!pB@`VbMmRnGRq%l2$p!riy~i(uowRi^I=JO(?XkY}%6 z>JI?_H-=MR%MJ$*jG&S+r6~BRg4yyKnyNp3*WAzwGvaT!O}c&0n-3H!#e|8wp{eqF z?7qlC1rk$Yf{A=Tf4_z}Zzmnio9l0iQf`gRWP&<`ro|Fomflka1cxP8OfJmGF^pisWmy2O& zITF+cNhE&Q`a(6=aew#SB(+z}iX$&-yPP}rllsEGpuA76lAO$oF+JbBTUst$MKY$6 z=6FyikGn*&MfZS)}I#=Nk0WkzUE6Q4JYNIxDI!7I|jhLi%G@t&b-++s2Q_wSZ z-Lk?wzE=x~w41o*t)mr!YV{|}8`MZf#afs$#{VlpfSgv3Dyz5r zn%C9oLgD;z0XEXqeB??yje&hbd89(SG=SJ8cT^(2*?SBmge6hu)XAoRW7{-UMsq%= zVbR4zFg7trgH!=|MSyXDv+y_%c)+g^w=O$4d2ZMdg9S?WISsS~m=WI}&v>KW9Fr>9 zpFVrC>ai{eQc9aa!arK>VJ*NeXY~WE>gSSF75kGrHFXQzRKth76 zFW8W6-zsLH?ZwX}tB-id7T-pOvyBpA3%O)}NVLG9KPpE*#Xk1mgX^^~%fHq*zf&~# zwp+Wb-tb{Ua>F&iH>3pFNeV#>o?%FdqW#bUxR^bQU1V}0RG}*eKUgMyN0}cV@!=2= znxUs}bO#%5B>XPnxI&*vwP_<1zg^T9(q#D~bVY8QoRb!=h)b3@0R>r!@D~}|mk>p` zIx!b#*K45CgnIU(DaWbpJ}{34YC2mAm`{(Huf7UV@>2<2A01l*mXy9qP*OwpBK5)q zKoDx`4;9@ zyxsh7H|+&6T{DC}Y?jmDVA5~VDWiM4fFUOd$0T9y&A# zH&;w7k#%SsGs@haKN~8Bh7^+$>1MiTleNp6$GpT|Z=aUe!ee`o@+s$|CNqqkNiM_2 zfx<2sP9~>GF(VJEC=Bz;P^Ib{pfs1ve+Ir93Qy95g`kri3oxv-4t z{D}4a$aDJ;nr+BqMK>C84XL`;XChSba~9e;oft7xwg0MmFq|(9@@|6-iQ=ruG9JN^ zr}x)MP<_SUK0RK4i|uZ$R1UOYgXWu9q-W4F-o%<1WqgA=BSwgJhRoR>fYejJAsoNe zYadl(3Qs(SOwEDH!p^2fbbOSsI)#pt)>|QrD7E#ei`E`#d_{68HG|ZfT$q!SC@$Kn zB|?lkazNjn7Obe}JDu(?*Y!q|Z^`KxrRfbJiVAeTnJ`5r=S3ouG37wCNsa3BbM4jP4Dz;mM7^N8T5(kcS)~CXJ~jL_4v7adCvt zHN|U~%dLmE3Z>nas@vLs;G9C$XF1`uM4m#p2|g5~Oc3ac;v{nsb16~D=Fed!oywBa z2~$9@i#4sOae!AvzR6XJ%P}CRDO899vD1VULsW9<*P^H2asjUZL?@emkq@=Kb%=@6 zL|+cJmS|B-c64iTMhIwcG&WhuLZ7+jTW{ZBO1m1@O9sU?=tyXos{LfciwZMRjWX?sQy12Pf({%fo+9G-lXA!vd3H%qdr0{&U%2dV8pJ>h@ zH~ELGnTnBi#wXSQb~5ZRwKkK>Od%rEM+ z1ac9Dpf`Y&MSXM&$(KVSxiT`_9*c6-Z1LBRUCQ@@ub^d{W3cc*ZDd5=@>Tu?hX8MZ z5i$8QwM4xEF?RRL3loA+a+C#qkNh57)an>qZwlM(EShaV^Q^#>-q0a^dRtbB-a{FD z)^`m*?HjnU5MFw2+4yKW7Y+NHPuS|WVrV5)--VXb0T*jzrO z-pcs#G#kxVX>&sjv+|bYOXAhFXAw58C<^=PxsaxCEpy6oKN+P~gyi$aAFBSe#iPKK 
zE;FAmxJFX|AH0L`?v`mSXgA75zu-i;5>2jxrZu|tK~A_~wrqo)rcU`)>!uinXW3q7 z{AR%E+sONK-1Uc+zsvW|mrP}~KkN+B_RgseX+8M8`VYnMS%w<+)VvV4q;K)47`;hY zOK9SY!kNN-PT!mM6o2_z=K19tEw?KOn5K{MB(Nj`E~oV^i!kj+yFxF&j?|>)DYl_A zsvk76uahD{IVS~c)TH1;FS89;=aXS4g6@gQpkdU(<;>Uf8^5xr*9~G(%h7f64A;=E z5AJT^g?~k%uMYo?3T9F8!OPrSi!c)@k&XlqoV& zR9Dl7Ato>RZ?t?$wb4`c_1o!n<@&~V-hXt(GkF@ z#u}HYQ(r4~crUqKV@*t#gCV%#fVL_-7MBv0unSWnoH%dg;YXu6ZzUq(AJ#GCUtTe_ z@PWL~$HX5Dbm3yQ@XKRZgG~`OXwmA(0_?AT&ih+w8=J0wHXxm@?A;$4KPGEF?!DbL zFNz@%F4{7cC{ye;QE4pBc0+Z9kZu8B$>5c*66~aF+j75{x?YW6ex|U7w@Mu0`G;Y4 zwV0J79Li0$hV1LT5@bW*cLc)aj1P|;gvjEz6VonsSA(-D=sznln3FXIcH+V4#6p~& z51G!HV*MbjK0IrGY&1xJxrWQCv+J_pN2SmY5m#-~zY)`2tVW9`)AxZ8L)!%e9vEPt zjK#r~<1#T&7bsD+&p?RkfK8Kfg2dut8`H{T)r|Q{?h!Z$oWl`rhtT_d9+KTAc%Zfo{tu^ zC(J7?c4--FiqyOPri$7^%TXGvR%n&g^V#8I4N&++t{X{B;Y6I;F#R7gF|QTA9T5R8 zrDdcD7-h13Ifz>1(^A+gUDOygVZ70ZBw^%W(WUq!?~_o-g@BMnwLx1U}G%gn$2r@R#;1b(a2&cqK*na`568PS$qYVmGgalphz)lemyxS0CnvL z_UM{rBdmo|y*Q8RsY^n3EI#FRy5rK&)TWcRr~n4^WjQHru(hU5{r^Z4x}?nd#^(m+ z!8)&pqZyxB;cLMLSkVoCxtyR*&4rv)IiD0$IhL`IK%c~9Lo_4+q`Q5fC=nr;N=#{w zkB;9HLPa9F)Va8ejZ^AM*ja(F0j^$Y(hdNMzBrf55&Me*J2qOz_d^KOi4A@9JgRCb z+hoJUFC-g8c`GGdVR`qgV^Ct=gvjteb%1-&yJ(-PtGp~4)&HfZK{nUd4wu#!H2lkZ_ z8_H>?twef8ietk_tc_IdIZ}S*}ml0Z2bVSZX>~{YN>Cfn~9M<(8-2o|ex3 z4|^{g_1mC-`f^0}L$TFI31+qlM~l{pL$-ZFBZWPyYK_Vk>OhQ?DW&pka3d+%e_$$q z;cbhfe@oN1$tQ*C7CUWC}xHNA4Ngw?=g{^_6e3l_vDZ3B_<#BY)WK3NKHpeOq<9d#>6H zGt^Z^-c+dtODp!NyCoXIaQ|G59?Sv?bP;Lcml~UC^k5PZ9&gXdmfrFvXcj zBp${e&UjF{96w6DXTMhp>mcW|F#Yx<@ktOi0=!SZY`ncpgq@w~U%wrm0nVwMu1-g4 z@jup&-w0%03fE*b^nKLz_11o0X#?c^kmJ;r=R7xpWc7!K<+8r<(&6BS2u8}Lypu1K zk|$BWDwd81p;fAgAvu?gBLz*(E$tPr~8(huhC(l}khM z%HKPxmkG-uOISUWQd5PNTP8?5hzSX@l_56C6JeWZ=CGXZkii@k|KNA1%C&*A**8^c z{r}5mfosVI3LhW-mRhHnXzJU2#N*~FIMLJM%0CsS0$?^8j5f$cR5xf!! 
zaOlt{Y!qz~mFlLH z%f-QJUr`SYWlJ=MsIyKpS?nd%2D=+WPlb00+vcg8)9etx=FvV0*+oct;m5$f3X3b2 z$8Su$dD{X{ErQ=nI?{KI(OMyHZ=Oc{9HYEFlfOV+0|;I=y+wjE6-`2s--(UHj-5&> zIv9zoULI|39W8(NtzXeTdM1~D*f78-^$>%!RdYzqXQ7QYNJA}A%!KJ%tymj_d!-uK z{j6#q`CgT6Y4bbv-iAPG|Ldpc{bf+=1*GUpJ}y(=gH{&DyF&1}X&%P}l(Wzl&ll9{ zJ)dto-$12Tq`DZ+*(Ba5pZQNnr#jh81)aK;ou!1uggXOZzp)nvuAqg>GL&Q$Q<)?K>QIQ&$ zmt7?6&hPA>rEa^FyA@UmX~?Nm>Z!}5PX%IsF9Rr-X#KBuSji!?ZU~l9l7p)5^$3Ki z{IjL!c}-<1}40}9QUvFEV4vPWjj{Xs0m!(@b%#0cz0nTQ-Wjs%A{)eXS4-O9_ zryCt4J%};eW#@l}W&Ug>oHN{QFNizHdFa)+`j@`9VY;{2wx1edoT0_Et5VAnN$*q) zssPGRN_Lh^kiz*T#jh%uaa>CNURgsHN+FWcw|Hbb->kBP5k~)9XB1?;?X1EF^IM0n zo({q(ST&6Tj@}b=X|kPAb!P20?^*+s&Ec)HEY{A{p4Rb=7cC0K9guYA{ZDswH6t>B)t>H2frX zXsc4uL5uVrQ9<^LCZMifcMA-smw(PK56)BQN(Jo+7A(8ua|tXko5(4HXBb)qR5V)( zv!vf#O3sBWJW-j(c>!M*Kc6!kzn`-_ed6mA{yy4&cD=|_wTffsWT@5Kq2d2t=6Phr zXHgL~rRsB9$(;(X9?tvDtw*F9DhUrd3`{AUX43yZ%u@WvuQTbA5S3D^F zF{sqJstX`Mj#iBE5+I#LN+tO!4X>&xANiPrQ5;ePl@ux<97+ow`OS=Mx3c@MJLX{y zZv2rKCZW$hl;h7;=cLJwS-v=mmQMng8Mcryyv6a=6G4XLHGwS#g+CiQ9VPp+UCf^? 
zM5n9mktbB!NtQROf<^A1S;KMlJ4=R)^}U_PfQQf-^}1X@rJ<05eAA-#!9^huUgq=x zs>@#4eklO&aNr(ndxYB%Y!5a|_|r1vZAg{)m;XG|jRvl>eMuYaGZJZLAd4RK(LD>i zQ6%;eE#^Ut&dzTLgo!Y-eScnwKHOg>4!vn5LVOJ6L`_o|T_d61KX2s2<5L+|>GKLj z%4+}@=KbRWm~Fo&w-W{&EEBveyNS6|PzTtrXm1;`K$A|uEessYXjxlgF+YgXBWKbo zG5oVJV${}dO&F?ao*Z|CeQ$wDWeZD%`19}+q;i8gw5CjNhlz=GU5;7QiI#C*@bPXI zO+H%x&73Tn{CSR6+?YSLn*3P+n_8duO=H0O>unk+$**YV&m~E+@Dc<0XBL#*Y1DKcpxEu~ed8R4Jh3V^DdD{-0kW;sOgH;bet z2$5BqKA%65(b9B$6WGOyqo3j`_NpYYNf=d!J!2~*$=`T4Ou#{Z(P}{s>QSfwV3{aG z7{8zD);fCnVQr@MTxx;M+omzW-Q^y4j^l+&oBnCx(L`$_=^Jd>wW_4pZl~qp1UoXf z6>)w26fDW*=H{O;SP(W<+ZipwzhsRrckxv7?6^q?+p+^vg>{G|4oZMmGN}o}tD`=^>s`|gLjPl^HfK)aE;bj5V4^_HQRZtF)la*pDl*uBkyjKuU zkWWvPcWEe1-eOZjKfM?ZhbyVbmi}}mdEoQJTgiix6_%UB6{BU~av8fSQ%+XRYJ=1C z<&Dt^Fqi;?cMeKjQDYRzyO2WH5dz!QnjZtP0K0X8<`Y2gmM(Wfut~nGX?~wo=Thn& zZI2Hmh2u$wUch9*g|f}DDxoNhhUTo(%=}Cs`@3NDh+HL|ZqmJ%s7DWzCrACMgPxZ` zaoB1^AP%v)csES)Q{bxTc;>2c_@5gBI36&7p!{f%taY8pu!iv%M-BsO7d%&KVV!#Gw zGj1f*I%28SMH*TT(I-%5@eR8F(MoGvqh!%GZ7C`ldPv#mICBWbP?@Y)asrc<>&xqu>1Xb?KrI29oi|GpXVhPfx-$lS&wiWNJ90qrC1*YRqS z3SYvJ=#IGI*q)wUdihXEhe5Vdz)AN+ta79 zpe7csvhe*`axMClI zUx{DU55XqpBzF!P9-XK-tWroq4M-!6&+1&(r-)$#x6jE@<#UWe55{9Z2AkSn#=2bA z0q)biZAW?f=P;evQtPDt5$^a>s@$@u`e^@;rn3x+YirgpPH=aE2X}XO2<{f#-F;&L zLV^d#Ai>?;f)j$f4(^@_9{kQZ_k91TsiJD{UTgL0e*5X(7A-oS7!i`=dg&%({iQ|f zbBv@h^K??op(`*&LAnJnK-%iye21zkn=dS>6o`uX?!798a(bSpcU>9zI+qS%d3AiO z`vY&>i4rL>wIaEskP)wg%mA2yp(+J5)jB@7uFbx3<=xSDTqvFaXgt_PNw?&idlWxh z8Tp*oyiQ!*EPe3nCPn?(7yCAiiXRs>OqIC)=W(Z_TGYl4&8rSF;3Y{WnV^*(;x;-n zsj0Oh5KG9o9!>gLsQ(2+u>!j53@YCDSC7%H2`9NnswNdb&~GEXQ!#L?!*#`W#b$k% z;!cLLZS9V~kSa_DRkq35Sfz=uTZkt?s?nx+;`&fzQfsYHmGX~=4~t0<(}j8;95!z_r% z)py(W%}In1Ukibnu1;v7o3A+p1E;ag3}Cm^!|RURN)LIvwS=K68` zOk{Hw!qd1OtQOYqN7WW~wrc&SLlvnOou8D3aX2-as1uKg%FA7e|?Goz{fux)%EyrV7@9bg162cZY=_*hE;52u8LMb}EUKhM0_GoHH@l9SAixZ5VqL$+kwviIGXjQs1eON(-*X@F1|J z5-(~&@t4!DaRh$aKGDtUae7jZeooFAEt|k(#tNmM1Eo>BvK8wna%p*rq5EgsPkJv{ zhV3YoQL$xcKZC!;GA8F!+tWk;Uu|UkYZfeb03i%df+8s;f0!I`SyHkjp#_!9+j~4} 
z(^WU3Dxh_!De=6&>{!jQeGzGwE_6$>Y`?-FH1Ot|Hj_=Y#RSs|9>0*(%6;%pHe}$A zF&@NQ?2Zbyx3P@iJ{M>t46-Rt@#>$ga`3A7*1Co@bROX|OS)>hcR<0LRHGj|zfz;S z*kG%)>`!@fz62|Ls^zCHMpIxHeXaUums{J{V%D31M{acz`mw7@DhoC_4>3hbia5Fw zA2DX~Pvr9LoG$q#FK*KT*b>TePg?-8OsEdw#sAJ`a(dZZ(3MkB-s=f&7e@T%=opsN ztN2v_up!q9w|iL%^9(q{K%66Hxw>H&$A2z3n>|yE z-#yLZx{k|hqvx*xcj~=QjqjnS%l-93W$DI*2z2+YU**pJx-m?wTnPvFy9~itfVjdi zzrae;%eqhn021XGQG@eHJZHqs;iNYyV6e3J4;BoaVMVH1zNGvSlIYv~)Lic7bmo{N zI~|h=VJQL2vJ`aY#h)# zqRjQASeOX+scV=81HdXB%3w)~X~X^9{8p{e6nkE$ETlM1HuhaL}v z60GmN-nwven<8*?kLu<$yHRC-5Arp2plcDl4qXc-*$CIO1~;XwhSGbmmirItuZ(p*2SC@>p6&1PTJBCjB?u^CTDLIW=1ISyL=R$u* zwl%zEt$MfzG57dFi}e~B)R+#LrK}2NySJp>K0cSf;KZyIq#>%VKq%qbc&d7HkS36? z=&Rk-G)*Hato?QO=F9EFU}yab)Z~!SlU_njxnh+Dy2K-d1+BsLJe7GfTQD*!zSi0a zx3JOkz6i3klk_n5-h_|?fb8`_$nD@U9(2`QT)J`UcI4ZGEWmH;>O3C$_Imj@GO2NN z^%Qg(gj!)dzeH-?_!J%bld2@WAea+L0?etSKsFR44*gpgjNP7pq?DJ5?8B2{pMh^} z-{381f9C7>W+rj^rO~nhExbzpGh#+@%2(>aBJ_8a_Tb`kZpB?-{fptpv%Rl|!fKL^ z>Un&J;ZKa5FP!s3l-uS{I8iolBcmvD1WcspMB%tvc;Pg!?O=~L57naNX|>#l|AqDxXI0f>dP!ljLAcKx(wuu!Gn-@3I^jak z>Xn58-%64MpftEz|Ec%20pifN8LNi1E=wUGvVK(Q8Tld>0;pjW19_|B8hP7`+1 z^_~b8^L&<$VR?^u_7v}y{Q0F|7koCHqPuL0A0rqgZGMSgip6lf+=R(5k6CDvr| z4RHe8BnL^3FL*;z{P}iil1%vXl1Pb0zL#|V?SJ+6p|w|a`(1u_c6D-MwXsXO zq~*CueclRk>X|T^0Q18CU2+g;7`1o_{VDw+9awBaYyt{G|KldXH{Bf!>x94eh zVdCV3_Vw1ZLnPq$<4xMwi{IFdLWUzx25^)+mFpG^fIFMpJbsVGQ@Xy2^PEmqxjead zKJ5K?ozQkW&=uQ3%&h!ixbgQw%Ozk@vD16~_4oI?Ywyw- z#jG(cgM$WqBz6@lJo0#1WC_ww6$$bMkVktu;kudF;XEeRZXvRS-IE}_UL&R>Hfvu)}Qbp>5kon4vNGU}f7i?VH+WF?AA96g3B^0vaq?)_{ z+_6Z!d_i0xj>kg|1jBx9#>uH%0{GP?Gw}tLE+bx~$X2JJ^q9TdeP^o$a|Fu^55BgV z|BT_J5EC*!MYfdgsN&|Rf(EtRLT>16=;LMnfkc4fK+@somuFDC8&PGV?TFCDVj>xj zfTP=#%XrnoMsRyKx3}i>{MEEwg3u2YH+&e@;vsw-PpqY5=DFI2u;c{`b-d8sR5!ogj zn)Leh2HoNBj``T3L5YBqnjskB{BD^SR6QMGUDF>ZPbQT1)`b;fo#^D7uW4d9mSv^) z{jQ!$+l%@k66qK!eNG6!Q_1r5)HCy9-nT@=P;R*|?6s#W;V0$dzVU(K$#p>a>VTAz zsEA+PDEXj9oAPZZRyO-Z>^H4_8I4Ea1OBtB;>ak!&c6{K;7#BU|V$#0#w_YtA`++cIoh%@tG2#Rb_!kXQ}&+i0Fq1k&|2Z?PT@CVUO$lXqL$1uSy}vs 
zeRsfH%erq#O9c5jaIz!!`XmP&6LETm$(!~8mD@dsIr;;DS?RWP)}x-mK{o-?Z)51; zVIM@6Fo~_iB-(*Ix>JkSZAT2Zu?QpJujiTJF2Hb3GYe^xO6XNJ&ug(5y%RprZc#Vr zToxU%@tS?O5U?{5NhhVwo6`3rWb<+g7^_mp&I#TS8mZ#WJUy-UT2iK1r%iNR>HP%^ z*QIB&8m1q7XSQZgQ49C^(5gk3$ZHFz2Nl_V2dRN>bVKy~IQe5M<1uJJ>E?ZlpRDQf zjy4fzPwUILa-e|dP8@vrT@?#eCXxPCE(`7@v{gZy7wW{Gk~0gSPE51urukk{Zj!j; z_AiUujpohbPYE;Af|>;bwlcqVFwmacRr(4g^IKm$D&yKGO@Lzm19wKCe-%hmc*AK}@M3obWjY$TI5n`z`(Du)<#Awow&*xdB zt>S7{wLJk^RoRvc!3&a}BtMly7OekCa!Xqo*+Sh;xHcxTf)}215Z&zs!M}*%Xk2*2 z(8g4!2201~HRu(KA^`z0cu} zaU5l>XSeC-YZ_UvKJIL$JtM$&HNOLgvS8o-~Q#cD;?!bUtk_ z7ONr&-=Rp#eXWozj2Xe)1p2>PzgmVh82Nvc!)6t$8z5dP*HU`92|+*3rp{@vSb;%m z31XTPQ|p9DKLxmZv`2cSW;!s6Ae`=s%$VdaD-Nym4l{yEYuT^l3?sjk9O;9p+MToM z__kBUpsEWb6xljOymRk^QJHwV+4DjE>w1I0T{ZCFdh>e#dg=8{1yA1D{9qY7b;5d9 z$bBRNE-6B~yQN#2uBqoKHt!%NHwTcwM2^dkh}~g-fM1m=8(D70^8-*BIE}JvY}4Pa zJFiqgZHbUQz-v=4_jRl5A=>rDz~Guf^tBi8%*7|K?YaEEPJOyu4>gQsl%ErsK+WG) zz4kk^^Boz;zQm7qx1|r8Mjs6KeGWUYIMXB90fZsm^q^9?h=VrYrQA;NvKxv~{;v6? zed9gAW6cj(m|r1t=l78-ZcO!&ziIc=`ttkHJsO7f8p(jXS#`;RtOV}(fbO{@BwTxi z{d{+86tRQ?Z8|)=K|1L8p-*Sz35&(pSNRVno<`*atspe)WLa>^U}72JU8Px2aObMS zg{vT#Nw!9?F2fSKPU<#(oc0}i7gW_hJw*a6d18tz<`!oa+GoD4!*7M#!U)OZ{H^+R ze5WWU!%pe~{+6pXWCl|S`dDwyJM0we?^fZet3lJ^ zuvmLxc|z)X!KrjRB!H2cvWA&-lHySP`Pv#G76w^`DnykMRb9bS#=Hr^vu_4hu?G)W zheMZ?#AxM$ITk26TP}m=B_Q<)+e^{>RYLAkh`BQ7O|&VnzIeG6?RWLw#lqwHJVHW6 zAn5+;^B&FziL$nf0Yk{|b#P(hLe^4Od*j9^k+-eoy#;N!-@(s^)7pC^CBgdcODmuA zDgmNByhM1(348LoeIp-I^d>`kQtmN87T$n9Le-s<(|=4BTZ@}`dULEVOuViytY!-v z@gK8U!du&ihHX!jLG*^;)Yx*FFHfuRHFI&0WI`e@T1q}#1yE1>o0)1WugyX`IVQd^R!Fw1_ivpt3&3gTWWx&j20%+(8 zJ6|BDk@6&2Re4JT)o@IMJJI+JXDMQ9Nk4AirQ8m{7ltHrS;NhAw;AEQ{q#;U`XIwTRwxXXSY)-^ zhV&bAy0fp_H&^@k9pQ%1U%&&VQ*O&*HQ5eUB~DLrxxRkdBf_N<#S{@<}H0&sq@qf7XZ6J>$CL;(T@pYtu*RIZ*Ps8U%AA zQ#$0bP>pxf8o*D=JtYtC&mWhoPcN!+I_cLFiNJ8`wcJ(TVfvooknpmggTG<VS}5lzp>iA$r@W8BjCyN=(X^Sa>3rURd8Q1xSilVPbAWA3c7?Y1BR zUXo6S`P|#vU!zSxZr5i5YbYmya>;Icv)4{mlx}BtQyt+u%B>Ox zUkO!e#|&))W(kJiXh8|p5>;=VUUjwQXc}8euznvQz>~?inB$GqMGyZ81EmXgz81y~ 
z@y3NFQ-xUC+Y58!6D4RtYu0&M(6ImYY_jWU(msryuLVGGLr75i8qfBxDu2u_F2@Do zc{fN|d%*uv-*$1Pdmh8E(cUas+Wu6R{3TKHN0xv?=k|{JLeGc2&+y)}+h-(h$HlLw z0T3uskre=y3*26+{Prmq4#%2{KQ3B!-M4P6cfLI7H-A)lDKzex8o#`5ZvEi+kaW-$ z{M%FvIPYqk%lrlSPUad`)~r{vu{}ZK{~W_2QI6c8)~Wl_T7RsXNA<-;eWFEu{b#rD zm)7R02x7cxt0i-SWfO99N$vKey>fWg5xaSPo%Rt)DIOYoUF67eNc@<%Mt1UjQhFM- zu?hIpR5DjriN6=cg`{TXPw6c9mmqv!k)ZQ$81adC54=pL5@U|ZDRL$XduGr`YF9ta zX@30Sj=uGKwnVib^28KRnzN$AtzH-w(Uig`)i`V#uE{PomQ~6s%)68wHuoGN#W9>t zzJy`JP|Zq9-njK0VJSLfUTzmdz-`Yz#-7aY*1Og-*++OLi5}b*QK;i;;NY|W9g$`m z2*>zyIbZn6L5vXRd#}V7&+VV29uI)8MkN9=y_f*zb8e?+(E%Z9%9y-^R;$K8`(1I4 z_*2p@qESv?V3zrF*+qtm-O)6&NIsacTKF#2Z$I#6WCEJb^Ogjq==*o9k{H;G@hU?3=IJ zkPxB0D46G{#$=b~>tL1Ez<-==-3tz#;UY3NNIgJVWm@vNRjnGNAgC~&w( zFo8YUiRm(SdkFZG5kZ|Av2QP1>cyt5$KGZY{m}R6m3LKM_0I`5_M60SVVm~;_Vz}U zxL6RZLKEnZ*FN2wVo33T4PAG7Zb2^UvGIzxHsOEdgIn@reYK2 zb+-eXOPd=tp;yljqjImeGLl_lk0?3*EfdmnNrupUD~$(OM~I5}qrJ!0>Tj&SOR=+p z-!)8jv!x4fi41cqp}_5^AeSV`^v189+5-)EYS#@D&K;LVHOyP@2d3tLCrGuthTd%d z4KDgU<=kP^Cp^LtJAWo!FYPH>GH=cwqfL;(hW7Pb`lD+?SM+PIg2!2q2rFvPYP9!&H8Zk4?weFwU%A{j+n{I~w;r zi>m88o!;U=Y?@VE6|35-L%!wgl#01YmtDNCeOqfj zQ7^s;+XR*~O_)XHRxsu_E~wzM)$G%|u775l;cI(u6kj>x$h(`6aViipgXm5fbKshx1Os5NKI{3pniH6az@BOWiNZZ@G0~kB4OxH_W{V!pp%PhLiG5=42)|6 z72aNRU#Cz+60VubkBlGb#Kg6qSjnxF=!{?p69N^Fyggu`ESOS?EX*04`6YxSBU{v| zYGm}Q;TsP{tM2FDzvT-(!4{2Q;>gZGgsd!wDU?{N4(Kf;etg*O`j44rE1)|2xIeX- zhD;*b^f})gJ%DQ$9je}W;e*)(hYd@4f${2J`P?uuw`YFvzg5dT?20^kcsWF5Y{rEu&_oL3Q^=H@k?JJ>mRWtR8(;#E4OjX!|95z6-;LybTi-aFlDItxHT?j> z^0lHgDZ$Y5C4hY|$x$7kPT^7Elrw|aS=egp2qfAHk$C;G?$%WCmHC26RyoYOD|%Px zXOaBD=Z~78+fC-(19E0`O{lK1ARFTRv0RK5sGPtt1wo-RduLqR8inS4hEtBIpbsBz^=fCTb?*+3UElEQT-2C=zqQwMJ^K8q zuJiKwn)$lW8BqGL2X^CWbsU>oTC!dIF!{{y^_Zxi;g16;%0F!}Acefl#dk`~$q|0Y zF0XkuFbN-Bx*hXuoOx0OW#P z&bKq~7K-D&4xfTJ_$N(jy5(7z#hl|Yty&bbaI9r*5gcG>6_ZfnS0pb(6ILV@MQtNV zeBH;9PUdsDjE(DtazGw++AtM_D71GBN%=y@JBqo zk^M$}=|LpES(63dW{=LjbiTP_?M-h}-5Ksg<6?f-G2xHoB3!>4RMJsljaOXPufe05 zGkx{mpNDJrbm-}YQ1nJqF$FBy4R`fEozuC$unJ$_-s9ZDt#-PYZSOS{8lqD%LfF|1 
z4!0C0kFl(5H(0HY0ZkM+fG21zyDkdyyYRa}jsGzHX@dE1$<@n}zk_=n(r9!fUy_+X zzswvG_!tJXWsXtwYA0~M{Jw7(gZD^6!4;CQCMlDjOs2?A((W=k!|(ZTh8$cay`h^W zfxW$d;5~ZzAoRMZzHvDQzH|Y83sWw*PgMBXCft^VwzqW)F9Clk>`%*H#QwGtEiQ%H zkS>iq^?L!KlReKL{r0cG<3!M{K;{SB=C1F?@Rj}XIvi$Bjh{bS`&CT>ISKYTS3W;) ze^5N1^)ZUJS}IR%+Znfkyu*0YRodA6Xg3JoYQ`sDDrKByVJB<5%+%d#4ZR5>Tl&a+ zp@EB3)(ZtD0KS&O1j{Dm$fK{Bp#HN?5O(M~)tH+Ze8GwBWLMVPmF$Q30oCnDRW89c$ zLPndjmo~Psz;XN@*Q?P3VG0uB!-Oqm=6?FvF)z6`I4 z{`{o~m7N6XiGO4GQd>UkeeQI~Q`S^8u;CgtSq&+go^ zpTp!!F0wcF=$+)Ulf7-K3ZKhiv-h5^t}LAi~k* zCpj6ko1UDZPCaPLw_-T=^yS7^q1eNezZZS`T-k$si)I=s_#(rQ+YHE*D|NbydOHKm z)=|fCPT(;2%NrJf(Xt$#Kc&F(YzA{ zQ#IYCWT2&>cm@4MeLY0C=(gY&%bSJGqiSf*w0dh~@3x<_B>l*8R%{PFZ}Cg|4qCu2 zZxGLuB&)gI4P~jbEw;ne+LvKbfuB5UP>~Y%dP@utkWxgEB7^Wrk-^j3C&Sz6`0F|1 z>#|$+ni91`vZ23v2ZvQZ;iiMv_+yOeX9$vO2vvT^DHPlwZUYM?iY5FuDTP$w_UwX| z2CRmLtM3TpBCn;N%tjyf_4i&B`_>}4ix!>P@5w2krG*!|jxA^Ous?V`ju$6qQO##!Ztw`fC9bX< z)Q5&n*4|W6n6I-NMGGezHkgJw4KUmUm8|eOLPX}6<0krKL{bDA7mHG-U~7Lx&YIDG zL$=HD3}Mf9(NW^Y&;xi)5RQFlaW~$6B2GTGXYCLne@{0pDp(+hMOgZpyMcu>r}E=L zT+Rg%WVWKH@xK@=Yj)OcwQ~nm3DUP4@1_6k$&>#Z%j+M%*B!$HUP+zqe}(Y~n~m$8 zBkbLiCnyU6RReBYE?hKdL&7ZiYT}i zt3um3^odvdE~D0m^j(u7th}{e7>+sO8}2^M{;g^lDa8>+>Ibt#+yZ!tcd=h!edyp3 zn?DmDy`VyoiH|s>@9K7q&p3V;dCl%8ys!z7dCc#B5Dmc(sN~fAzNY(O0)4viW$8e{ z^mPnC5P2y|Q%SqD-c1}YYFV+irm#Y*hZNR?{)~Yvwo08iqpp`ZG`hXDB(O*{en|4g zU)arS@`|NjzCagApWEOuI$;7iu{Qhy9|Y8IYA(^DQj%#erWZgXIQzK_bFc zZ<&O&CviOJX9pzVxpcy&Q_$-qPq#JNiB_u5+_2WBq?R&~O zhQD{JJ5q5?Cso8oK^RhV5DLo9VdZdoBFBSd;gh&j`dwUYgj|5RN<5fX4LOH3xWst5 z+2*Fv7jzo@^Nu{<%$_E<)dM2JZ`Dsa@jOBl8S8|4cs;z22^G2?pJO8=-|~Yd()hG~p4wO@_k?DanQ$pkm`etYNkSOYgX*bTVt( zN@7Z0izs@c3m4b|N$%`itr zX;k6Al_Q7+WVe#JJ)&P(KD;bthe%r?yS?Q0ShGEty+{LZE9c~*?(sG7F78UsYm;zu zW`jA0UVuq)*jPo^w{Mh@5ax@;%%{(MK9RfKj0>_Dp132xnbX7#JYrkP9A&o@Fqr3V zk`}xv$D=C4D|%zYAo=gj5{QN&k4}Ii)nI4^QTG6@{|!5 zQ-e;I8_jiwF8q{<--11ZS=(9j5L?99bC^|CXISpss^QqT(-OIzX(QbV{zb9uLgCw$ zLObw|A7#wIStyZFct8Al4@@DYUmhml=?5&(>h&5{(L;0 z2oRvQ-5o8r@|IpZj}`!+l~8r!=m_X+CB~^ 
zUW@0dlMgC-HP+u*d23^d?a(-y-Z&nd8naY)GMWZkQ|9N^?FYgwDp%B}&jb&A>GuCM zr3#W8srg=zPo`Ll_3;?N#o#X!6ngv9vH_H`M;yK1nZIFr_?Wcu)n(B79}XHgX5*D8 zB@%9KxEOE3hbP%+daD#6H=F#6hiDMhsFr8Q$CoNF6W&QDW zsc@yBgEFIQV3@h9&e1U8F4+KubE}|CQ_$dKKpQlVT56(gArKsC%|=@vfRL)bPiIYQ zCoFT$b)-AsUST2hFr*u(3HF5IeKdZvT86?9xSukl@W(zK^RyAcN>cQK7lG;`C|fKV zt1jev-W8_m+T9iG0$M@=6Ic1lpqbalrsl|v~nK4Us~oA>PFR;<&|m=a;;NTWXj7@NN5l}dXw;*i|xAPraG38=_O<~m)lJlMgPn%s*^SEA|?r(2DK z2XUtMOo<_1w9k*dd7@Da7&Vjs7a+JM%*;x~$SE#sy%`!;^|^(-RCiVv?E;UdL#o<8sn~SRF zRQp(0gS0VyW}nMV`}ue5mc*Gj8ENknIyF3)Yi^b5oz+)f-?j4>W-0SwvNUvNF$4%$ z&E-H5dhT`M8ccjx0vwaBb4+`c91ZQf}l@n!;YxC-Ve1u?tg$Alq`|gHV=-(Hm;1YEr=aVR14wS}kMA z|Kk%$yoIep_JeO$p!^rk?^&audQ}}wyLXUr8OUgeFvS$xR&CF<+06dH+E&OjUw3?| z)lmy%?+&!mg7q=y^lRf#^Fz+te%5TYP8TEsQO`;0M>$0 ztK^)WdPNC;)jbzH!c8-=f9)d!?Vm`c`m(r!+oO>H^?&EPcAa^}FFpGA%EMcw7{j9% zP=9dRml#~|er0YC&+~fPP61ZkM>9S!YLN-268sXlkYF?MV?pWqU62aR4j1>na<^sE z4{k?Plv761Ut!m84HRbBkcuG3S#ni0&9<-vkyQoG6Ey#A5pwXcqI{Rb;?lt>+tuT^ zGtgxU?ftK^Xj*K?YR|1dj`K#E_t_fSD=mHk@6NsKz~v(P6eRF(ek~gpc!wIS4MjHJ z&cJFR^Qz)}zh8yI)$=$`1guwL_IinhF$f5rI!ofy;$rs>h3R2@9d%L`2KZxNFWHo> zg;8S+W%hh<70A?jPw0>`miOqXxr`=g09xkIUlBI@%gsmy?=q@Vr#`N6cLldR^ozcg zVIR0Cj5*qfe9a@n%a-2N`nZLu@!7Qv)uS`a>QOsPx1;K?O*`MI<;~+MPi|pg?B>OE z)1+f37o59FM$NmvNAg*ocU}8wi)a~1p9>686gR_73gY%6#+8v);ISVS$JxZ?_<)$R z!m}UL|2sfX$PxV@CuG~y^p>5~u}^s|@)!9-b773)P0&T<3D_kxhq1HvS?l};mu#`S zk%q{`f_k@5O;@PG>Bd-oM6U2A5EXLpUQIKntZYJ>mbq~{77h&~2}MiOIZOv%ySQK( zQ$w?gQiragYx?IBdqM#-AZzJPt@axYr?OM8)lwZstXO#iH6COwW6+?M`Oe7PqV#Ix znYQ`V+Gccbi7|b~$e4&vXfzJpnZU4p?JB9pttfKchGh)%&5W>`$Q4Zgr*a%M!2v+0 zgo1yLRIgtoE}9T}8a6a}8(@xuo?CDl6~!1N*QpvAgVg@THi3T{GQ5l$!1H?0+Muf| zi&5v!(z^f^G-$qnRIONEq`Fll4f%=*q7@&wwpCngVBTtVp<$8=oAX>Of~(MYYBY!6xc4Z6&2*SJH+YGwzvH-?|7 z*^x_dc!IJ;%M#!*PYN#k16!Ucdij`b3Al^|3nM4S{Xm_z3qWC%E&e$(6tm_nJkk$O zvO0W(v7|3wK!oP3ps&^J|6cph_NPX8wqQO|K+xb%$DQT#gdF7QCVo{SDTvF9+S#ES zTBI{xO+0Vm%}msrsz~H$HRG2(Z2w>iJKh}M;Zsn-KlQn6{E7xulmmJqbm6amz|=!0 
zpFNA*LLp||pE1o-Cq5NRk=1IgInV4OU^^4>8HddhFd2zDtSM>%4Vp}X?I*Yf9uaZN8|Gq;--b$ebo^dj!{w>MDd>upLcsmzQ? z>|x}Q%Ze^{0JkC0bOMv-Aw8@||~FgIl=8XeR&rJK9w0XCStzYIB1hLxEAlac#_~YUW3B zyu;CDLfJ59wAoY)QF*OAskO+;gqLGh^VJ(&H z4JcVLPDAo6lFO0WOzrR&sBjh345(!E1<4(Z%_>#I$~yUNbeOMI*!xRGXpiK64ki0gIM|cdqfu=HTx|1x65qS$FZ%@Fuj)&tYG>nOQ_}U) zU$W_Ry>&pz*g3F$$ORp7o0EnzY2PAVigtgw*fhlNCnF_1F=0f9jch{pn{Vm%z-d!x zN0)~dcEo0og^xD*9~MnE1f^w0$z=t*?}ETYvS5`>sW;9=o`wsQtcXthDNHqq0NI`n z&Qi%>W?4#)H}F>+4RW$Ox(N{soC2ddmBJuTEE07E#8==O$Kd4S5VZxo4feG`Q%q2wum& z*rYf0SYM?ZQiZYGs}-A6eSIw;P*Gd-TsH+9eAFUTnMCMRzl}0mV@R?f;$ZL^Lv_2z8 zSP&AF3S1X)xybhSPyW-B&dRLTvwka2$B=N#Krh5cNdV=LJhdnZR{j}KQU1vnw$kmO ziy=|G?0fE&8*mZc>b(K&;uUMtGz~Ec3Yz>7w6%GEW7_p0_3lBVEev(_3!r7mU5HbMiX`*DI_D^yNNt^MU^RD6|Zw9`WUNvF0;y(aaNDY zh8^NebT+*)EE0pR2HsaXFAG4OHX=G;NXClucHU%P200@%W)5G-3WSZ53!s_|ehvIo z_hldt1Qf*!G%XcaIDH428#tuS9YVEfyacKJ>*Yw8-$GK#IWTFUnfidE?t0lmv-Jy&zHKSoqA$ zH{HUpv`zPEgVv$%xVH46!vN)DZ(?yQsK}#~Z9ZWAD|mEdiC_JyB&(wSXUbIS3XhqK5)$**(|1F~l3cP+B5CV{)IUh8I*jA;((X(B}Tl>-!<)Bx$hFb0HSgl+B_ z+BQ|jejCdqchC`u2`U=}?hy>PGrm8Zp<{M0cLPC;=3LJFi`)5hz2>qm*pVNomc5nGEtMI{U{(_+eyx0n`pefMC!5IL-Sa((qeZQpueGms>2kk*MrPVEBeSNxw* z-^y8&Gycu8I*)~cA03*;q0-UTL{ouYcE{|^?Ts(ZHjsY@=)v&j_nS_?1q32(f4tA8 z$oMp8C7F%MO+Ae;k|u?h7Ac8W3_-JxFgJ#oXh}KABU$Jp#uHFh6#OI+=Go*uwp@4X z;J&Tn74ss);?&7Id@RZYp@|^ov59}f$&-%qdkA$>$9x>&qs%zMacX`3dZ52BDXb-L z5=KA0;^saYOj&#RH2*vTHGtz7v4=bV^}k0Jjmn%N5|}jI&br@N$*okc87tY(8yB`7 z5|*;fN)lwp!~GSW1fA8IRDSn{+edDA9vKA`I7OvhgqAQA{tX5CfZujuMrURjb_c?@ zXEc3Y^U3P31rR(+}1(ZPtIvbX!o* zPS-vFO!$MXtReI717IizM*n^r{?eGKsG+HOf--x$M^l7_5e~0KMKmNgf_MaJxpZa2 zM|Cj$fr*eYD>5ndp2~!I4E6uPMb{2oQ>8vk0^OebjiAe%*ONHl*%;=lh7sdY8eE=p zD$bA%xAcYl7Szp#&mQ4ukj6*4(HhDOi9lj>Hvq4+WXJ*M#y#*xm zfc&(^t_0&ndFAu~(Zg(6o&`<*r%8`1!Mtqs}ZNwdr31N=ZYbDHyC7 zPJzfg?LaHuGSnzFo8nZ~U;sMvW?i@T0XnQG{$s{MmdcZGW4v3g&SBqoLB<*n*^n|k z&zAD^9qt7!^0IVE`q*qSEw+45Do;$UW`qbAL?O9~Vp-gCh@y^)WD7z=lkYI$x>jY% zgH@&I@m}^19uH*^Jy0gMO8?^DvPx_ir8EN7j_)AHMV8&mK&5FNO`sEDb}WOHpcx8h 
zXFy;qO0K_KVdGksH1sO`TWk7V@!G@~{Fi|Kl*UE!*({7qlEdax=`$2Ki{7<}LFB!@FsMsk|19axq3(E9BY=OxO>-xPl8?z1n5;bg{tjQyxzn8Gg!R!k}^mYJPcKdThG z+zvgL%y-_uxn!l*X0x{*#O%%Oz^$lo*WmL)SC3}q>gOkyTeAw^HK*#nXk==nR> zaUAd#_65HVXiY;nL=Jf2LfPpMZA|A%$j^(Y7WjNFY6%(fIx zD$m+@-b3)7E^&<;n}6?-=L!d+dV8(79yJ>7opeU_mM%yUnT9JBR-eY4jOv^L7@nCW zpdA2rANTsLlPul7k2h@3mMh7?OZRQ#=Ce&}DqQ^P)jxjq{#5C*w6O{BQ*Osz{jr!} zBsa$nn|@>h+l#AYlOD@U(vKRN5=D)_phnfDqlR^99*tcJnRt{Mq+XB?CzZ<9U6daA zw@r2Znr(LV)pJDTYC6cr#tMpFopk*lQCAt3Ro8Us?k?#r>F!SH?(P<*AV}wTeDuA(A6)bE9%j#;z1Eu9`+U1fy}L@Kn{3#Tce=Z{U!^%4uma4S zK~W&+BZu95*P1uyHtkA@8e%3FBU*_XIktphq9LK18b5-Y+WiR?EBHus)T*|&q(~KV zcHo~J7C{2fD21ei3%h2k-4=`E)x~&lGMPa`Wc#$pmOX8J1>Mw+>xaHJ>kpvG-V6tp zMiK}1p!A1hB770O+(zXWNRg1)=W?`~fv?g*=t+tYfJQt|f4Z=;_isopxJB^dSr47g zli%w{15vwfVux2Fiv_m=v))((v@1A9h?zx-HQIl`)9j7hO!#=PT>TB|!4)ZGk6b@4 zrb|P-!U}|)v3PviIzcs605JYG!GXbJg@I3_yv0JvhToX?Hqjz*j~wE+r`2SF)p(ho zkOtZzgLr^6B);5h+5gN&zysA|y8TBnO@#DuBG&(%P;)^OSFPM0J0zOQZL-!p8 zNXRstc>STjqG%hrB>huG(K>R;#?*s&Khd)XYR$rq19?X*L&*jAAGl10Tp2(piy>I! 
zFYX%$Rh})YPS}vL5|e&_dLC9xyXfF zDhMyOly4Lt_z5*r=U}>aQzk3f*y-(8#C-Vyc6y{0v3u4HsBMVKkEmf`iIvRGf4gm6 z2jVTa{#$v#Lku{*d-&zdzRz$Qq0{X&2RM^1A4bSrJXt$s2%6-nHIOLtRD`Gny`*TC z)#uej3!9YNAm^Nw9L14eEi!MMtSCWE+`$KtUi_GuQ+;yLm`nItl-3KE9x-stZKSxW z!S=h{aMdcm;n#`(yfO(y4yM&6HKr^7Cztf}bxXMMk#~gOpo|$A*Tj5Fb%InisvMw$ zDeM`KR{kO8PIMM|=E*yg9EznuK^^dZo^D@lUBJl({~98fl|Z|9R~ZD?s--hx<|GTG zr87A;%oozJ@2dP5T*W{D^cP!nag7rDnD;J?WMF}vhYn?~mEDXSWhmtTF(*@VAwN;1 zSI+I6y4-UvtJF2*f`m$2FI!uqF-`$OD| zOv?nJisNdvY@JbTf#(X0ZXkB{I;hiNX!(@{O74njo@C#nrgU>_;#O{$W4gi+wnNS% zfnKjQ)&X*j0cKqZxO_)UyFDr(%9=kK1c&bt7KUH<7zcPZ@4IudX000upNeb3wx490Sq zvEt~w17d+ObS62>wOmiccreB??-dcfj{^q1di`*c7)&|E-v5Vsw=9hl6gj@ay+$rH3EMiJQmKt)sX1R8eChtzto6ab zB+q77gY_tR9O?Bx-_VB4X7ho4lZUI&l~o7u`UTl7e&0p?)8QB4U*HS6czBSu<1~Qr z*;{2u+cvvyERHxGDBEtZIAsaXz5|Ps?X~g`ScP>W+h{IhH)y9P)3|grDPwX)2bWl| zTb;_#jM0Q#cu_Je%ih9uovAg-Ps3tmK(jKm3qA#7+7x38A(TpEJqGkpeXKc#Vox`&MbY1H-e+V%?yTVZXNx#Z+#W2^!Iw2eT>S0f$1-Q6ItbQBKaHsgg zx-B$Zf_Gp@mC->1qI5=yzru5bIn}A@B5Bk@93rZeyaz{7ob->qn1E32ti0u+haJF7 z0qGNc>RAi9=wU?x0csEN$6z66)SDRP)PW|&%D5xT)m_ak`irB5AxDK=5a-=m`W}sI$sP2 z;r;kF-T%_!j}|dc%@yST5DpvwU@;}}<{Gl4UlQq=gNL&(zU&a4)S8-=`jZ*Tu+mClmAi_4-hvyRS&xoXOU@A^Rml&a(0 z6}44o9`*E9kP-%tU&)d9u9)g^b{#xeKUHxK0N+cBw>G%WVTsEj|K%upW(%Fo6VLCf zJX5Q)_RRVOed%(|#p|TK7K1jVZyL3qi0(3)ViF-(L3HoiboHkCxWrBo=Xb$7lyn1( z`ye3#mJic5imzd|Dl{*kt03I|A5|}8W_)}Myjnj7$1E0g$aKEGfX`?E4$t9{>*?U- zHf#I6ya&#Ny9{jTYQy{AZz6Fa9I~(J80Ma%T)0R=C-D#kU4Z|o)N7BNsdo#eumD3X%_p#F8&MeDM3!iL7fvphdKe()icmT3`@aZJNv#Sar!VR_ z|9LQxtp41uj;A*;6jGD*w@TvTtdV77@Nr+FriEcA={<-KmcU=uM3%39~ZnuDH}wt9~yj7llLn? 
zt{01!iR4-i*IDCHh@7bJM1V`w6u-7;_*ptpuZ!#roZ}*d6mpdXhG6du$RV|ej=zQx zuhfKsDUzhuN`v@^wBMG2NxaFj`}A^>Yv4UU(stGtEn_@|Jk@l+9xx(eu{q?Z3s8rN zc~24SNlTulIXQJ2^iXpzAB|Cee;pB~7!~Y_2;;t$CdSqleAZ-MxvJtH&Z%(dtmySr zt|_BD^{pno)m*c=%(v=L)U$G8+6+0Eqn!mgkj?;sSbzu6j}E`KM{U{Ev; zoF)tb23tsH76x3{eQ;+MSzF!DWMEP2v`m-3q?(P?R)g6TlOVTJosDEmIU|Jl%@+kS z!mCaJvkbSqm*O?3@GrLt%?%0R{>6Kv5y3MY$fks(cdUNdm4?E1aXk!-Ui3mkwKQ5<|VTJ;dGtX)t) zL^d6>Pe$l3y%0X_lYVB=5gUVmhWx^ajXe@uwCJ}gBnc@E591GJu-U<0cp~EBH{u3QkhVH%m-w9NBfko_!LwXjbp#08^|geFB-VD-{@QT)Tog% z6VPl!dYMY#1rKC0UGe%obl3XUfAiw5P5UW3#Dr=$e}rv+6{NEssVA+i5&Z*Ku7;Vp zk$;!$*=KHF@Q?_(hWT*;@tbMDZaWZj6ugsKlrn862KeBXrpVn`UD(o0>6+ivuCscr zmEWw9{mf`GehLKdP_g!e`JPVL!&IaYp@t|Nzn0LxE5PCiY05&WfO!aR$|7p3DN9Pz zVr@y;#xJ4*9dra86wiDuFR0lw^?aK(PC~YJvEFm?l zOEVAT=^I5mHbjNEO3RSh(F`+CxTrEo^V9qbpnLaCp-QL9*?sgu_v1*1FX!QN@?Z*2 zyiB|*90ScYK1GVZ0wSS!p;Cl;$dg#161oBXoIEm3f-d6)ugRMH5cIJ|7&8?32&qEP zEzF`X@^=RrKM4CwJnabySBk))lQMvyEo^13J==UhpWroRYAd1NARC(0zNJziL$PjvX!P0i|*U_I!b-qrPWpaxk=u#9028G?$doGuKeo<8q?F^`# zmY#h-+Iwn!)5{77qT_3uiP#s2;hF%BJLOvXSaB+!#ph3FoJsYm zxz*@v*`L2H%G4XbWzBu>dsuE;9``AUDp!2@{gbpD9hsMejH*C#aIUGSs(^=N+SGS8 za1LCM;vkFyl1ci5w5elRz=7pvtbV=X>X*Y7XNTvygn%qs3Blq3h?7Y~2OE961D1MC zd9JOn`_G2}!y7&d;+ni)SR-99av~mxXKJt6jGn7<4`&i0^u$w^Rva;>kZ|g8ZL}|& zLi`R64kwULZ-9>*WQEoDg>N3l$jyZv?YZP^8~dUx`{`N|F_qxPF!US9GEgog-@x6O zQA4wqr2drDmy+vG(zV1SNXnf38M)0er|0$oq2+|wHoWj_C5Tqc^p}0M1 ze`}=LR#3tG7D`oHx{~?dRw!E*2fB znYb@dMidOkW*@4$|4!dF%|TP%A+yt(02 zr>WUfKd5xY7}OnpjP4-Vf|&v6VM2}5wI*WXOk2vaETiL0ZzsaG@S#BN4(QtjGR)>m z$!*>*BQVl_TKwFko=5c*P82+t=ckvlPdu}K8tjf2p2}Hn3m?}xtA1i9!86RqGH*P! 
ztHC*p(wXeHM-AgSWq6ugye9*o>60n=#Wh`in6a%abCwpa{=A>@`hzAxu+-1gDl;Z{ zmQpJWdc1uCdum%12WqwuBE*$BO!z~D`}?z}DZEEp>iYtIiu-J4i6WI(wL_UOF4}m@ z!^TEAH>_>H_ro9dDRKO!%}kDq(~id}ES1U<*$K%l`Vpc7Bt{7i;iG#1AEW<_1xlz>97XOt zholoCeLXphKdmnqPF+_+V;hctugZQVmG65LQRpMsfXmXDRk?7QpHav;=wDAT?B2@b z>+{;eT@^CB%kcQgkrs#qIIQw@@XeA9u2>>D{IIC97)4M3By@=n;S~V;(b;^Wh|Axj zZ79n*i3no|z-w4AQRAiIP;4mKWfSXnC+(;E)Vz=Dxr5;b( zZB)elw)z~*2da`Cj6GV79O6El?*tqnm0~XnLTtjVmkVuqg!6P0@xIV?Ea9bxc%#o1QW#y)j(bJOq+p|xq;)nXxelO zANV3-L&(J{5Mzu)T|x~B?=MM#OS?(`-+{NtMpR?(s$%RqMRLWP?-u{OeiuN@nZ?v0XK&9 zl7Y7CA783gnpU1;*SfQX6Q>D8g2IrBotq}xGR}cF+A|O3vbGiE2XHHMF~mWbcxr(U zAMqjI(~UJk=Bo;Y^Ye`VhZ(qomPt9$$=;^*xs#HeKD*B6_8sLsKu%c0)Z&F#k5L#Z zm$`BiZrb-Fj9W{5Bsz>th=~1kE&R>0I&aZ|`FQm?f0mCpfkmU8%Pw+VGg~Y7)vufE zRlvFN2dS7YY(`rR{<+$>Pwex)Mu^Yk68f+MJkuhe83WCX zV)!L*_AlfumNz|pr@IXVTku&L3=shJV+B~_IT>^}zax9!95}VT-l@$0o?O1xu?x8s z+>8c$@03%WQ-TfkWQlZ{47m>DNmQKIlC1cSjns#5-S-(UL#b4|j+cgMv6Pks6}i0?D{}&JaLNM>3%Vp;dwVj4mkGJ& zNc6zSt*tH02zaSq=Q+VVQle*d{b9MY6OVyUuVGk#@=?qb#zR)Zko*w-XZ)k!#0drIod+-I!QBmYG@K64L&RdX2(+E2f#%-aWl1<B9qC7Nj}9b06)9?r~V zeX+X^dpevJsSR=~9H>Ti70D@m)}tJ%qQ;nXT#C;hh;?{;HYadpT$ z`G=EfhXp5*D0%gHfXbeJ_<7^$yhUSEm{$B2AchH=+B>;8aVKO>ED1RaJ4x|x#AQ-n z*uQq%+k2kRA8y;V5$~Pdl`KjC3sH4u|@PZ+Mtk?2n?F;u&?Jk`quQZRGfR3e- zN}d+U2%r9k7IXWy%I4w$hV|7lRC?zYhk}ny4ck9)z3j&~QxGe0rJ%$gqR{>NksMey z=hiO<$p>iKvS+qVQrsYE*)(j;r^Cd8p@WX;Z(vRfxw1vK-)e1k{KD)9V|8xME4DS# z)F+)+vTRGI;KJY=~>+(as z&R~q4+PNcBz9y%t$$7jk(t$=~=%6C~g&MAzk^V0Ysot*ziG1w3{+&BDE8NF}VW9kz zeFd{iFcg@2&?g!eD3Qo9jNaZ+4@w5hsUr6F4!C0!JYf2!KHR zf~;s8u>e7)_zpqwVBXZu?A&s1%>8Ef>GTLcu<>J*=U~BVAWu{gmpTO&tMms~2C+d% zsti=DKUj*57-!vxhn_kK^&O5OK#PumN`fY33r03*GOe3zA}_(u&P-Sk;wj-2dmj+j zeQADXLYxq!l6ZlIpw6;HejambwKbwYA@DM2Xy0{R?i_W|D7nFh{Z zK^~BsXB;3wx+58f4S9TJ``U{^HH5x*4^bkP2Ifm8jCfPnACKO1ygL$kx4W*_yBP?? 
zE#UcHQXWcUhGRGbIpNtJ(gI-^Wej17FL#b8goTEXS;%`x^XTKa6;BxMu&y*2OcFgP z?$VG#MBqj#g4KnQvF=T05>caT=``82uD8?Vo*=rb(aTT0KGINbCB6EH@d;g2OVtdf zlJZQHb-o9CgaPUjMFYlG1^5Uf!zLYcEujNpi7etp^eR!`6^w={!Q>S^b2@IYQ$`YibSv);(B-tS&{%z_*F zUZg$yBt4P8;fCKj=fvi8W10)h;ml>iDbcZo_)t-lVP8T2mbqWsjeAS!6FT7mSP!JV z=Ob~QkTS)BA(K&@B@G>T_Yl{nx4=w`*^!D=9&KZpI9P>#fIW{4tz zQQ2!TdN-f@11F5#o9X`%-z&TlesRAobIt<20^c1Q_W?(&xu4Y)KE!2@(8(%F+FE2u5Z~c)d``ripADM^BMM;ex-IvX zm)<3{1SmMHsDEQSq{3B1F0>Tmw4jXjk+w2oYBQT(O^aAahUdf^pnZRPLzRZ9iy~v5 z*BC2>LFqa|um;_T(@*%@m``@sRSLFWW)r2_#+vDn0fDE;2f+WKPd-5XO=c4vsf=y) z--?gSfO-OI-VZ23w1%}qi=@X8F?qje- zcMqDU)KEb|G9W#Os#I3{rOlGyzm2Q4O3G|3e#%Vz2|IDBrr~WFPIhkbHf~@{2Rx-~ zs5CYk5Z7VuRyJzG1fCwuS*F+TjW9~`< zjF}dMErW06S8Bf~{P@DqQ+a2#=?#tyKB*xlSe;wo;AZuGPL99D2e5sO+P`V=O~Hhy z(%O=_Juo9!gFY1_y>ENilc9zZr^Iea|6Nmp*!S}*sbv^8qy)pp zGh?B@xU(q26ns?;+sePJl36YodT;X0vVkiEm!;=)$kqhrD#mU`8U1H~ ztAGToC97fdvS2M)HNP-7P||hmAV$UeusjGsUWW;m67xE0y%Rp6mF|ryDV`sRCnlX95&7cK$&id85+kS1wCmO}>G!;&=4H1Gb+RimIlj z;3^}(rc+Z(b@FTc*feFujJLC6`Voe1O@<+EJXYF~VDA-9= zW>cbupne1fMv#kCfLx@a;>m!Xao>F+I%{hd;Qv*H4-YW|{8?1Or*htv!Ik`9&qL@xukZ%6&HH=&{J;OG0rrV;t079=6+4aY;r4q>gopGvmPpSa_;9 z?egeieEpIH^fw6D*{e7Xw-tpn~EsAJoP9G6|j_y|HOv%h7y8`BWOBE-j!gSrgFArUOrjx=`|#2^7|(m z#@ ze{t>WIqH61G&=(+JQzFKXRVLK~`a@g2{Ale~2 zFg=23zuE<}wzC!py{QgTxY>q?9fg>Oq8-@Eu!gEyTs;aO_U8VE^ychq!m51$^?Kw3 zL0(WGQmDSW{tQ}`r^|agPyK4rg5;5BMnGGKw4QaUOLVe>_f%kyvTtFIYcV2tY-SQ8 zKDE%5`8hBhls4_HCHU|T;q9$2`0)1KkFv%-HAX+qVhO&7i2YSf`A4u*8?EcNHC~yy zmMb_9m02@UZ~m(4fQoxOO;T=BkT@uomZPThVu)acBh?`G@Spj06*40u0LT@H=TnKQ z!iRUz{L|~y?@ijje)Ucx`^bdfRbTIwFG?xuIyXHVG4NS!0HZafu%Z;DAT0<}b0DmO zX8tR^x6dS>e!VVgFky);kZ zbq*e))EJO&2a@!bqb-pJ6$W#T9>VlfF&nua2~R|N zx*`lHc0W+ID{K%X$5jc`;Y{WF5vpt?2(S*}t8Cmp9s+twO2R&D>REjYYi}q~Ds=H0 z_ECp4kC%eHCWvSD~wJ~V&Aio2nv@2`PcnbU(M7^t~8t>cmt&3;4v79IT!IwHVr`v!sgKlMOrXe!(ISh_(+xxnK* z!f*Sr?qUzsb z3#@CtYi^qE@ctxx{%GQXzvSGrZx}F|a=p&GmfZK`a7u}+CdG}vSzO<&_PVf{Vvz)>mdC+|)~TVr9#)$Wtp=eqi3#>EqGTj#%NZN6RD$f@4wnn2Ed zUAzecy4l=CT6pa52d0COxL(^LwTZn 
zJG-*e1^h_HS3mjQ;VH=tCYdg3Q;KyNGf&rOe z>6$eFmb3v28$~y`Z*?Ao8L98+DP+uZ0$E{zd*j`A>@ugMO1+cL9u|L- zQDf|Cjm=L7UFXf&u|jf8KBY^=99C7ec9ew(Asz(*L$y8ReChtCY8({r1zd?#y~5uM z{H*S#=cjh^^L%}^I2E>PXQ*KhFkS_Ya@uGm>U>?QF`gVz%yJM zSY@MKB;LLPn_DhkAGfoWmHazmUv;-fnu)cJ1xdRY;wjAbKF_x(2Sl&E_W9jv;+=9k z&6gAJo!kg}p>In3@NSB-YBi|0vD-Pk6lK}xk7 z@~g8&z)bf|vB#-{7qlO6+vc(7IG-*S-LT4+z(+&`b=-twnhqQE3T)YBVVFkV2NPux z*G2rqno}jUPJbOj$qOz!2?4n5!<0-e7TLZFE3^x74A&%jOG?>KKr?OSNtUx>%_Vc< zOyN~eb}XCfY>~N=ZkCVg)wOhXTty}_i77wqFC)h6OtTm0OF%QRSCO_vPF?;H?PEj| zW9-I1PYt`zzaH-9@t>{>O{HnQyX5HNe7e`X8Aw=HKB_(aWPr&=`fE0Qsp8P=8%9ie zsctv3>3I-l%2JrF5U7PMJ-Y64=Vx?ZM0G>GoZ8S~!s$w)WLzRf&Lp+;@UoOY#_3#I zYt=U{0qJEoS(x-k7`|)D5vlc+7Zux;(UF`t0rb7-UZ`5u|(LIkc11Kz2 z^fF6dSwSv<5HF8b>gtcMgqD5?y#}~QtN3V|eCTty0Ly>FCIQ=libe-@z)mg8$?NW` z^z5cJR~7>YZJW|id!u?E6PchEw50)Oj%6_`E32M4KE-HQHBpd6n82#_!Tg9WY5PSQ zSchFqDDJ@+oXAB5atODvC%viD@gD9e2Yh4Cd{J{mOSd||$4WtOmy*AifNC4!==RRAF9mjt7j8MV|P5%2{~exeB%S^eVBkKSs<%eOE6a z0~kOi@|bVON|)~5^wk-$0lz-V@S`(YZia$49m8UN5D~1%wC`bc zN(dvhxP{4-fN)^;bt6qR;tdyoBSL%J^oQ&paTMUCN+?Q7>qnwPdFR&d<)*qQNAxz9 zJ@>iuErkppC>3;)Bg+1f3WUx#Z;D#lTzfsvejOeiomL7y%iM$tbA0dET}a3Gzr@@@ zjrRM_m);Ez$pXm3KyCd3;X9WGja@qrQyg#kWm#Dx153ap9&$q~G~TLhy7GFE*9Z1{ zKa$AD9C29uomiZQ!HT;s((J4S^PBJMOzycWj|6-n^3W{_azy<|w5Zd84(V`~D4Rd} zY(^kN@vKm$b4CfkZZ=`4063Hwm($M6xCe?y%bm0 z*)o`w9E@Ah?7HlZlI<7L#+zBbm@8~CXsYB6?qAW>G{KKj%+Nu(M=|F}-Y<9Q*oMgw zg_zRV0xEr(P%?Zr1+SSeQ0t*^14{>!nYYv6xIJLU^(nVBDGT+uBW+P zuR(y~KK85!a(~Wlzw4l+}l0Uhb?&2G1r~ zkUhg1`2QpTDx$>kI7YyzQqWabqn=9EszMPezuIvxt`<@_?`d zg+zSnFs`CNQ`S)w27Jq;udqTD>>5%X5O7fJzyDBB51NDGPJ6v0^bV!y$%u8s4XB;p zZT3~l*#YrPH_P#xRNSMOGj-zeqX>IZIqIlSaAyGQttgz0atWe%A|{x=*dgRpwl8hz zh^6RB)-ZceSK1lYf7L8L$=EgDe7;4WrsJHCKZFcTvlU+(9zT9O{=}zt#zG(O_TzS! 
z-Z{!*QjNp{%{8_kfzerlgz6BU(b-EbI$PP;&4=0|4gLI2OxfJQm?qyPT{iM;6S43a z=YM%{(r9gEFlz25E={-^lmB9=nDlKe-5(iJoD_T-_I$fP?CN0-oN1YQYyvgXn;G`K z$z70yJ3;907uLYP|_n%Aeo855-_s~hL1Mpu@!mMFQ@VI=-tQjIuWw=ujyY< zASKYv@{KMlhZlB5bC7LvOJmYWQ6U9@u}%?gryJdmJ!J#Vp+T-A`&jfg;+p)sYNZRJ z?W0cB;y1ed?IC2K@{nW8=1%nt&YEg1w3W&W{OF=Pa@3Zqnpf=!Fl_Bt-%b>7|i1uRO6n(rds?)3+1Vb^>izzqK@CeV+IPu9yUH=q%qz zvbR^4Y?d@?S?0In9^)yo3$WMclvP?2*nDZWx^p#Gp0(y`9$%K4u-5tY3BBZ9DDp0Z zJGL~2at3r*qvStI#SV^5qQJG_I3(jvA3+em4_WY__zr&YGlD;e7~Injh+7fho_>%V zL>2FIDSZ(7&W7HB2?g-NbW^Y)#!Gj!+>K0T&-kHEI1`lBa+kcy(*9jl5&#v$GJFm* z8_Z@+KXEiHq958!%@IOw3hKO;Fyg%!E%;q{a>9tyCmuV5Rpo5qe-F*Z5lp|_qx8F9 z&KWxRd!<)qz1R+x_k~D_=@6Rb~T9ux5`HLKJ`dw8j^W1j+GS&CEo1$6To(*Cw z@#wu_>X@)tS?Vp%{0yPHimKwl&=my*!nFlp3e)$-HnKuZPbv3Av|!&#tsrJ$ugL$` z9Bu^QRMF);>c8+OeC6LGZB&r|o$!?VvdG#U(u8k}QANn%E%7;mam~F7>@#XMJ7VP%8ZZ3h3wOelj ze~FIyv9Cvp*2v552aj#Hy>!|2+bd-4nx|g@H@lMF&K-VdFpK_B!K+*^{7GaIn-@4x z>F@|7Y#~Yc7*HU3Oae!tcFL2I{aJ6cA*|wl)yrNn@aVzF9BiloJ!)Bu@~LT7*;y+D zlYtu9<_H6h_0y%bnaTEA$wc-*bt!L0Q*$;7{`5-9toJ+<`HF{Bw;`Ny!?rW5D~NI3 zu+iAC6q3B5=^(?^Gra$HC8H30s8$WAyvB_TFK?`k&@q_;MFDUwA=GcT?Ds8#wjz|@ zDwijpJ>lT`1?@pGg{_UzV&f(K33vbh`u;BcXzQWPAAwKD##a zNk>22VS9DIAKlkg#a7=|EI{A2tc#3)};*KO>LvdoK{Eyw&f zAVF!q9rq1YVBe|e?GV3*&&mF`sfkET;*yZ(2O z%ni3P0Uy!xJAU6heJNgw&p-Gsy4LXukhTndekdZ)LZ=|3Y`VbtF#K9m5bP7s(@I%_ z0G8f5k|O%sj?{-3Gpp%1ipQ%8ghASv2KuBN#WvpeJvSCdXLQvvEQcNZOf_M8(x^CX zS{dl2FZ)s9Y8W+8Cse1Hume@?@eJycSmjX0g+xd{H2-l_ESJlsPW^S@;nPBC%vAF0 zCxfAdT`~b)bXj34Y=VU{pMej97&58lY*j-26(Hyz3}a&iLH{5)xXPBEgLAQ5>4=F@ zdAn3h1Ocv}x?B@eM^X#0Xs#$u3-|a25QJalI#i#S&VEF0&5p%P{FKvl`q}Tp^6%o? 
zKm>Ulm{Ku6euL33IKZ9OMQT-lT8kX&dQ+6TJQ$ z<9;shzVe^T?oHsUC47NlYPE}ZwPYVs*k#aWMAv07g|T9%&_=p~wqh4W(VL{Bi!l_7 zHZk(0J3qp#=o_PCj-d`ndM*F3qU_>f&}{AWZYY~gGZ}cu_DbG@jLy)R^qN~53`SOf zht;D43CW^rqNnewDJ3HH7VRdELIK-%ssAb#NJwN(vQuFJh7TsJY`2NW#=xu65SVW* zb_%YVY*IYC3{4E^qqL?lT5J?eh*!{B4Qwd5%+%!}LOBfiP*&RK^|Gc<;1VJwJxUjH zMFF8)X3Jhi4je?Kg&}c)l?@PAO!P;un)l2lZei4DrbW!Lp-sb6U1A`mz^A^2`Uy3K z%u*u%raa|V-1I=R>)s%lL%T|YyYlO==I^QK^OS$&Qn^t?(W!_F`9PdQP|>>~iVJ}a zW4B>5IYV~-_dk*ZrB4;^bBF3>sUHE*m7x!bQjIUMJnBnm_MUK;nZ84;Ut z8os#uMryie(a~qYxKVX>8bnu~Qi(&5U}jyy$md8Ey&p=Tb_=qh4;-XM`>Qa84HI{T z(yfEj3V_G&sbS*N_rD!HCf0OJEO4nYa+Ed9o00G&VBQ2%x?TkJLjp2#u;L!z?TIIH zT&a@{1z7QN1GNEFozRmB}2eRD0@C7?;kQkNtsX)|slKFO^QS;$P~2!2u>Tbkc%|iyuvD#BU!ZG3UpXKxw&bO_bn{F8As|J5wNcP1`^Js1# zu)z;Fp4__K{QNDthlb7?^sw_Y-y?EuZsToQ{ex^SHuxEnskbH~m)j>fqFdd!p{a3K zvpP339~M$SgK#=991r+LAZLQ3*y_gPo}?93%Bae$1=*n4k3d@;rpq{=FlnU64Td;b z3;dd?CQodZe!3U;lijHGI9jVc@lR%8RG;0hs%4hovTpyVhsn`H{Zr2HLCj}e3Ub_> zxSWV@F++wKxtDhowfq;ubW~}=221{9VDt4BH;H?-rJOs)?O@02pg;fJj(75hjSRI} zh2nu2-&i_oaZ_ZY;F+13On)L_N<=6Fa4eu6NBwsRFKJAkIk5ISksunJ!|njyOU;0j zu2KA*>B_U<3`I}O>)I!_uU@};nm5<`cBfu>eI+q5>iq%!^Lcz^XFmJ2HuQ88VYaPM z($%;SwQ#YfY4TCpl24ZHw&H)`bg`K zirhxI91Czvj@!r(F$S9!#oq1dErgIX069}m;_b=Cb7$fA6nOz&8XoqSP7+!fG*HJt zqZ$+cqhoN{Fj1bpGU$40eX;WY#MFIyKzF+PBXE0=EE+5GYtADs6f!c7?r0w;nWPLG zKm&sZGQ;Qb=hi!>Z2#x&qA1%!{&@VVeQ4%!My8T|)MHIhi8XUE=nj~B@&e$Mvr&T+4U(4HVEL2{jc^9SM?%8EDN3F%_*-Bc&tDUrbLv2^IzE~nzRhzl4 z?Tm*}(13uuA*NoKP%_Yd+%UYno=QLlrn$ zqiIsnx{_jjl`NsmjV@QFCQR<574hh;qpirzqMXISUCZ!lzKp9Yo$WrGzIoebJ&mXa z+4@UWiO-exw>x?6xJ(Xw;le9hX}o92PT^W!;bvMUlKMrOS|3&3IBP)CVI@#5bg8!L ze$dD&e2@8Sy!0cWo3xPk%lZe8==>^my=%t}Py7?if-`}(gFa`gQx6K;z~fEs;gxS! 
z-`*Zir9HtFW}8wkEuogOQxnX{VB;6`Tr#q>(G_b^DdS`7YGlK`S9e$<3bb`nupgds}U zWCKEg#(W$)PQ$S4+JDLm#9aR$WXr~;usf6_2b&rbKF6YsIL(BBL|%s zL-W_Z+L$&l57kxirrD%dI-*)99yTMFr&-TU!Z#6;x~o_2NXqft8ajH_P4f6fUkNu{ z)X*FNEtc-bY-iXRX~!@Vc`_=CodyvrymC-r&2=l8v|FuSU8%!QG0K)6CrXS5<`%mM zLg7anT&$z9AD(YJW1sM`9?KoFQG(pUSM=|TyACg789i5~H!pRW>U zbJeh2w3~RIJNS9=x67{rXcr+g_DCiJLWCZ5PBx->WXQPY^!9klYuYu@wAe1R@AFXl znd@LKjZyd7nAes^t@dIq)BB=gJ7c1`Pl+7s5MDoh^=~Kyzrckc-v$WGnQM$v-OXL(!E)JQ7m6# zS@d1fZy)PphU@n-?STyYIC>6mP{z5`^6&t3M0fXAQt==9F4&8n;pe1N>^e{;pi~G2)zML|dt7Y%Xy={x5Y4 zfyI>#XZ6fHFJQ{Cg=`b&J95oOcyj_4_3@%ZWLH z9v|HPXX{4Pvr-3fP8E~`TzAmz8Bb-(X=11us+g2$dix-LRh5jbzH+L^Omy4S4ko<`)}gdEWPX+4ny;uh_a_{FdEjt2@$f<2u{>bm zUyLWsoG&K^?kiS%oj>Gdo6kP?c_+n(^zE%0I&f9asxf6#H*T9&&aLrd?!U-+Czq|b zS}p~tYlcSKf7RK1!J<-D6Z2!x-GT#nLrK=%g0(`I*4f}QrwFDmXqx_hzLWL|#Vey< z`YV+=E}1zV*lImoSucoV?CzB$55HQZSzR*yHNNNoDtFXD9pcp{8m=ng@`-^Y?LA`o zOU0%Owk;Shosfo!e9PcSBgdan`MkqLy9qsstPSsj1k!jyIhic#`oS|(rd91Dcs)`D zJaBzDd%{#Gm7Uvn;uWpebx|d!S`HbQEQ~g$RLAOkYj_=ug8kl=_^4Ol@;e_ z`cQG()4b7uEMlqy4qJCT=wx}EdB#?;r|Pk01Do5z)Cc_u4%xcXP5JHQipD|{#@H9X z8D0AE!Dbr6rvuq7kGu~}sv)uK$d2eE#h3fsciv>HpX~U7y1wh;XHshLzyCBsbl)ZL zCKGhdb3*>K5#it|3e2!+KT(YA!akiapU&_e)O7h1x6r(z^stJ8n*E`&W`)P-32c_D z(BO$^xT0fTidjweD(*bHAS3s5k%LQ_Z1mym>!*~Zv%?~-y??=T%Vdirz}*Edw;2Y5 zDZ=DJdwkm|h5L5_aK*L0jV4#6(q0$P0F*8F@3k~HGqc!lddBQ0b1U$y9)MoIrgK$u zrrHe5(gXBJc5ZJ?w;)EI|2&j3^X};Ju5u(Y?5n)^n^nqGUQ6r_I}NzgsM3wg?w7ZC z8^+Ad`Xnqal}V&46@5p0PJd7t9q~&$>*kJdOE9mZITjWOwv`W@7Y4>}t4ouZHiefq zHL5Kd+!Sqb>bo*-CfZf?e%s8Iaf1o_i}7FPTWns-?;xQze%n~i`)eatRoXfEN_)3$ z{H4+nHoDRBX<71(Qn<2Kf!gQyNg>OpH?(S>NE4PO`=~ZhY96H@&)Az}?mpGmWK{bUBEC`le!Y_KCKa7iuI^QUzZKKhO+)?>+sECLo{Cg9 zkxGQ{|JnJJtcOE;8WI$nv;;aE0zbha5&XjtLO=(k8UmW%<=Qnl0_-&ecETYD zKrtk*yNAf8Bh)|>4DiGDHC$S|%?|d;!(iUKV6ZI^y8k={$wmrbJ_bO6kr+S~y{5Wp zlm5@~8vs~`gTnu*T7wTUfZlf_tr*D21l-*r`b(`V;aXIkT9zX;BY&!q~2Eb-DcNclY+Hgux%UlP8 KE$x5|!2Sle=+Bh^ delta 72934 zcmZU(V{}+i*DV~|Y;4&)D7h}BgHKuH!H0s{mD1O|k`e_jvdluhvdt0V#j0)qUNdf72~I=a}JI6B%gdfD5x 
z&P~gJvtU4OYr=Y7lHdN5<6~zCCU#iFjvSY#Viyui`t$T)f?3|-GQY7;k3^Ky&4*z* zjyGZ!e5j;}0ONy`axIA@yTpaZedcNIhkSj9I|xK!So-6_N6;L4m#~k+=1Qq&{K2+# zk5t%G3-cbS*kxhg*1ep2h**CProJ4~eMHReZ{KCItHbtWh&7=yx#iuf=Ni z0jC82PbRq-Dh2jGs|T&L8?uxv49x~`&e9y)tzL|N_sj@5Gw@JDX)AA=njqad^t6NH zp%Q>rs{Wf4cj^`U7m>>{Z$ zE2?_$s#d$yoOsQn|IylK959*Z;rdtaV?{5IE5Cf1GoK&P6#wy(GGiy|k}!-&)b!WY zRXaWEJ832rDSeXQ(=`+F<0W8!b?>wC`!s7-kWV!@Av0%EodDItJHrAcg?wP z*17kTE3wQXejhF5uRCRo>&cBEEwdHQ&EC+jB<@U0c!S&kvrCNI?8>%(OY=;eaOG4meF0{+F=oask z3E3cIz}AI1zw1hEMc`qNpWXS}zMV%WAM^&#>U-bC>sqSNTD!aH@k>sHB2zkcK@aid z7UzgRdKoNJhyX;u#c+MPqR4X!{pYN}gUNbEQwv_>c@NA;x<^MwwZCo7)k9%h{GL}? zzhEBDx8FbdLLUv+cgz4cg$~A9Y_{h8W73b&&|V_Owa>>kBEvL=9=;lTwmhxI3q9sm zJcV}a;BWEI%+B)&K`xi}pP~!$&6goWOqU;F%nq(SoG;yz=s6*a?4uV!0VRi;+Mu6o1#% zI8I^A*;IlVp0M&tQ|B#)x8%|*RZ6g%!JIiBe6IhkBR?-kA!YAC*=kPKbq_fC&4v}^`Clq3E7aF#4tFSVs2}42E>W~)M8M@mXwM-tM0tX zkfc~Vd1f;HPtiD_6y;qp`4o-_=(Q?fk4~1Dn=A0MBT12Po|DqSO^FwXP{JLSCbPCy z5^O{HzXl}@*l}fwd8{$Mj7#Q;A(sBhVUJFO+BbzSWY0>L`157rj3jK%N&)x#*GL-K z#OWmd+rbj%|KInm@@j%N0|@`W{mTa}xu%7w;C* z*JK#P>E`e`vdX?jicItWb=QTd9OtnC*yX<_(|6@cEk7e&{Ga)Hgezs`q*Q37ybWcYZi(+Vtli+M9s<&Sf@l8q75&y`ClhSW>Ow{ug%YWWdE$# zYmpysu%muF)h1s%A9?&M=0`Q`M(KLEt7)<@eZP{UFj;rsdE0v`JbspCecbwgVOlTq z$WgTm6BOiJ5jq_d5A++r`X3Lr*K6&%F4e^OqKOiRemEm8)z+b!#1ap&p1k!O-N($eEI3)``S`ebByfYPE5 z(y_KYT5^dlr9bt`9GXB1s`)+$g4BmkFneDCtaPRBRB3FC&6c$Eg)|NzfD(z1f+X@Q zG;8Vn6AlN567Uuti+tbf_jj3-dcPMY_7avFA1^6+yBD4WB}tq|G)iwXNCSc`> ztkKo}bVS(H9r5>Kpd4HaO%q2)hE5OqzY2wZ^ap5AefUuL`BrWr;sj7jO z(A4ux$t+~?jl28#Pf7*{xjZ*HzB;B@a<_}HqE&hUkQoM3c0dlLWxXQ!@Qgz!$_)e{hFAxwAY`ED6<@ zd?$3O%sqVh*sH$=kGFbV+oRpoMM4GSnVBzFtV5a^zWSH9N5n5{F{|_!#6sQ0P}$D9 zA2s0eOEB08?;5FuI~gTZ?5nFOF)H=+_ppa*A>-vl36Qd|>gb4_nlFkP^)Y7#oVBvy zR(n83PS?;Wm30l{`57aCKJTJJZW~(4Qt58gFQc94Cfz~pE}K-Q+2bF2R@Jvb?LW*bHG(4-ONbl1 zY$dvuiV6y>$aB}d)^J@-YLgN)e;%)jSxj{%aMj_7KwCp0w#x;BN(bN0(&xDLPCKJ)CaBSyhPTEp$Y)D z7A2!nP&;I&p_1exItQt&>ME1;BO3!zS!!}a#e_I9Eh|)TRSMSHS$NKkG`y>jBC~pv zoP5D8>qL&#K|>Wiscuq6F*#vYP2W*r4?#@BCp0tzvwN?_ZY*h&+-#vF1#76~Z?PHq 
zuvKtn$v(usaQqsbb#w)J38hFn8V?}P2{m>E%aJTU`4CZ)k}%xzrU+XR4J#Jjzvdd4 z3MWm&@F=tP({MY2`kPAsA4<3LO+*r!7=f$|)Vq{5=vy=AKzaiiI2`41Wf9t=@H8@T zcGy7k8B6yC;tNjkecj;3$@!l|sB~#;f@=~?Ng}hzSs}L{-SbpL&MD#1 z<9sRMx&es=;n6NjVQzTcN-k#{EgsgCbomxYAFmt@!8jBFVuct`5fv3Dr7mr*{Mj4{a!z9u z5OHl(8_ZKfh)N;&>3xSCH2NyZISBa|NrgE?eUz{knUAEqpJGq3;I)8(_U|Zbk>okG zh4MXm<|?9Bt=e;=12nljZ)Q_tsvBk-p)Ssa}v z2D*$hf-H^>nOy0$5sCoZ>GR-=1k0zjrR$&`9t%~()bsQ1$#TLGLrVBP^M{dX|?Xbz4uLj%)vjrgOkylB?N@Le1;iD-G0DfcI059!`pr}2ZxRU z@$<`oR=>W%n^he@qraEsW>x_@^-_@K~|EsQNzbLfE z<@cTM&d(#4AzU8LNK5Z83fJCw@?BE5OXA1pHrJzyDMro?843eAJ|=MjPhV>)%vHtBWyesHonM!> z$%sXz5xBIF_3X5olh0K1X}0mM6>yglb+`&$97I>HD~&C;J}babqxi?ZRcMD#S1Z3M z^ZKr_Gqe1>*Vq1YxOpS&zOw3^67h_z{bmKX`}Z0MH}Hc4PwC04*zc0z>U}MHHyiOI zmvXOjKGe%Ii}+Rn{l>N2n2WM3M>yN5PjjvUMm=eerc@VQA9JARq(PD|%VeuB&UCuc zpK4ZO0X#?vyVQU~{+yrn2vwT9E()Z9I;{vK+8Y|q`BFIW6+8m1GEJE3XmmzgM$Ko| zS1HmN8GsWPoytbZb^z&oYS3LT%NKSnkmq(iuw$#~giKXiUfDH8;eVSB0TA!otPDie+(;GvE0hk@%Y1x z(U{#fiE$t>*iOpLWfKpYUVlzpW+;YbOuV(K6lhLRB&|JBY09J**Fcay#Txf#DE7e; zU!En8JSJ36j#Udc!u`&1cD$x)zvdo8>S|<&{$Q?{{Q*M9N3e!&GQWlSEbYx%lZ>#6 z6FE$Jtt4B)fe{ske9~)oMsO3>yIO1d#|xmxwIp@)KsMt%T&?|lqDn?g7vmmzCE0`a z0>b^k6p?m3X4;WB30wpCX7N{6bE*q|_+G-}{qE5!W@lISd9C5+)u!Tx{Knd2keGEU7eRK)#xpa5X?s&d>aHM(nuaot$hV`3f zG~&dg0cN>I^^WnotEUKBZaQD_7IZ{39q?Gg<+;rSU%ZrU9vJ;har=PslMKxKh+glT zD3ytPFLQZ53#k6DZyy|#`XL2_gbtyM;}d5xBJaNpvCw9dC1kyc<0Qs0^>vy^poe`&W__Bi?^9db;wl0VYd9dOK-avfPoPbq!2om?I>b@OUm^{vwPXr@)Tu z)QE{O5*GaHfQfOw)(^3c+}m2RA7UZI!r&&W&as{cE57wKqkbJ4y}M%vlbHJ>`TSqwr1jI}+q4tkKu)3-df-hXiMg8@z5YuY8|<(xw;z@x__M?bPA z4zr`7tbxTG(IG|S#t8R@hsKllU?t0wcerzBHyOKhF>!aq{`8s|y@10AyKfcO=1ONM zWSNw|XT(pO2krVRi#Tk1_9w1vrLFTHDx(2BS~AG6Ev7mo+cODO5>@BFUK3u){4E75 z6A7?>BTB271?aWl+|L4l%+rJKm+>saAZtJ)*B*Jc4+=}bcdsgbnDC@#;;8K8xc-C; ztTXF*O?nf9n?AtZ)&^?RLXXgb0&$aU;NF7i5Oi)?YP{<(iZ*>(CMyALej^wmUXVsT z`LmKXTuNQJ*&z^IO=CvJT03z~IF_ByNBk){4`qI?ki4hlw21|9qON^_9nYy2|M*L; z$NPuz4R*6)yJCPG4dix+@EGO(@Zl&;Y{O(Q(2{~Bam?DpWc#=L;4mANB0jIj4*#q> zza?#b7eE?sQ-lLV{kG-#`gVqI*bJNeRb53){fp@3Rh_4zx 
zmTQab(X{@~OP`g*w|5g!9N!2hMgyQG=yH)H0u0RbFBIjN7X+1M;lk6{R5m)t~o3ri0TQuu_BJ;i3SrJ+% zXbz+wKRN>YK`7U*&yi0(e&XEe$p06Hq|TBI5_ENNX&5$WYqkO{mk(WsW7{o0%#Y7q zyxp?;a{0pb?-*6I`W+Mum<;tfMoI(_jFKTFrXhJas_>4Ps`*2ih?zpeqq@L z?o@Siah7FAkiFkvY|F$8gxj-ggy>=d)6st}C$j+lfpO8tt@X=4tYj3GFZJCey6UdI z4Ci!8YnmW34wTQ61!&wuKx3I5 zv4fv-PIKf{z>P=-W5q7orD394Do&F&DXFV!WaWb?%_+pnjtXT}&?SG%)DV9VS>!*`!SX`0X3T$Xpl-A1&3Q01 zPWS3^cstt4WA?1?8GNGwy~I2#aKZ=C5S1xEZVsb%n>=nw!9Z(*qPkJ+Dg;eQV$)^; z^%QhbbQQEHDan~s6RgT2Hg6gUTrW^WW8%vPeMg1*^9d~|B@+)M-WXIDP3wbCqpmY~ zFOv`l+?a9Ec3;@R_SDZ3s3ysAihyUQ0)+)=`mT$(w##M}!3YZPMI`42QRkCPaMqQQ z!BHzx@CH8QB1B_umDa&FmGRZa6*vdO6PNWvpnvmU*(w9hDkSFglm0+oBen{Gv(BaC z3m#C=>0DC8OqBx3bp@*E$wdd74WflhN(jBi-~BfliuxS2lnuG)Jx)KJ`@tUw=w5_g z;xO-BN26bQOuzGZ($BQVQ6q_s&x^LoIy$o%23&#m*a|c>B>)Zz@#~twR6=Q&{r~iu ztwQE@A^30geAWGWMMfcXkONrr2k^jqg_R|ZT-}CzdaVF~PDDX+N6u(G#cjd6ppKM; zQ%PqwM?nBFj$%(jRY#Am*T1ouM-evD^l^JGDW8EUphn`|^)AW$z!1&Q_Vr+Ic<75} z|DnUR|JRHAF6cF!gTqO6nXz1_1D|~r+Iy*Sco%Ut}%KhANdTtKe;n%rjsvT=z zRm1MDcjwHm)6);`pj{6R2S=R`%qs@U8zwNvU4gNmt7E_GLVVjTm;HG>M)|Rezg?OP zxkd7z-MX?wbKIF=+6x=^5qL>yIyR!|5X@|xLd*jPbM`oh^A3KW6T%4v2Z z0*6~Dv=V9r_yp@&Rb}(~Cc8}_paCe&``kf~GRL4@2*2&6WrAhoWWHX!DM=3t-#ZA^ zpJ}5EXP|$p4jwXvNB!-e;JjGBQydjM3=5fcYxLwMjQY32cL47@s&qzx*hofc+SkC< zeNVI>*hu31;~y%9X)6=K7;$zjKR}9MHKZnMFp)=s+w5jr$SCS zs@OKUljuciR+Nyid^m(MC{z$dE3k~(Xoed5lf@}^dE+mP3785QySFQOg$^mC2yYz0 zveLkh?>dt*DkoG91*yxq#mtfVgX4fv`O6cNy2cN7esMlqviYSU!AOta z2W|}w3F)qlbn?1KbByiwTRGK;FH~JW1k(Z2IA+kr0DL0KyQpPYfAQ5@Ti~QdhCd`I zuY+k8{n6g($19k?+0tik92y?)FrJy=^!^Ws!N}H!*Ry#I>}9v z|5J*Qf&k5opmV~K7KxBL}BOKIu3XEkHw_?F+rhY}a-i{}zJ8^{$pzPmWI)c%1O=S?en_uNS;yae!{4YE4@@%j zSp7uRXcT-VUd>eeV^?oz@#k^31aOL6D-mUHit{fhn8ah}kW`sD~9sZyW zJh~BzHpvjB#r!i}`ekV46FgV`XZI%7=4sF{<`HL_PuI(C*aF6a{5JxLt;T)L)Omii zYPH~Sq97+oj z34tt=Q9lAnU|n?q&7rvwPI=rX*@^o)KBs2A&K;xM44OUi-lz!6%!|nm7R(NIy&)`V zAW(ME3dtp^iH(OO9)-a(JaV@E#Z9R)ljXWUM@2?t7s#syv<4z$5Y@;PhvH;Jq~u2~8KdRa3y~!x;-y zlS{CkOl~AAA{yW9TPqHZ!{x_SDt1wc?YEzLKx&&yPmYIqXC<@fITU&@>7BGbd9ov; 
zn14yYlP62^`{WK~MGb3$RJAsZbPhYZ4I2Lvnu*H97%T~$D7gxHW;IeQqV%Lq6XTyL zei;gBAS4OGl2-mGt%TfrI;fwGL;?Ia82a&500vf)3XbLS?y zczSgLIMoBLiaTE#8~Q*=NK-IG%t{q+ONU`y&FdOCV3l_Mz3z^H3mYsiO^>4JD)(_( z*Un_kYs!RZPDlAYZ|WjUs~1y$4ASOFWrr-Cc{rb1cwZB&$ZFgi6hGhertVg2YfXoo z1G{tbd?RK-;ye95)&hDAwR^SQ30Rma9(_>t9fJob5ZOQU=nPxsHBnU9y6s~9naimr znMUoHDm#r&i1y30Xzk>&+BGs}PbVK7kVDnf>&SnKNCjbWio!jKN(K3gNP*_V&@}YJ z(99}@Vi>J?tW;*9%oz`Lgs^)qn#>-E1NZ1+8mhh&KB@90;@Uhgt62oIV!1!iptK~q zU@=G>@ljOe^a=#LBEy^JhTh3R)X1|$Lw3KwpFBM&6dap&ZwACH$Fro0#I!K#;A_NJoQ_uds3P2k84AkAk*xvn)ahMWVqAa1M>@ zvoO|#pV!bRFUFaOXba{(i0hJ|50;xcK!TWAvwcLljl2QzR-7FucV*V}RBJ7&GnXGgAO4c@DCzBQ;siNX>1++^MDa7z7|?sN6by>owT6j3<6ohpBQiV4Xp+_T%4C6 zb3PM7h8o3KohX_)-4m^lU_g#TBo8iz-PiC%p=7a79L_`boRW3P7$b&d5Ryt6O3x5! zUe3h>3Gwg&)}ROs;1)d`a`OQ>qfba2JjExc3#TO)r-0>YCSeNqXFamC!Jn|9#rQ|O ztie}b{%@^7ny}>$oybIXQ2*>5Hg0HfQk2bhHQZ`HNhG|ByORBm#oSTaJk<0qm<6>! zX<4$5eVOqw5sx-mn91GmQU0o5Cl?zv9!PNVH%&2%3JOUl0F2}boAFrpUs}7|iv2ki z*Ypb6@d>cpoc-+`*$*yl{|}}dUJ5IfB%h1HWqpyK%v|gHn!l?gTuCV?7|CX>w|*1g zX`Jihj()iF3du#Z=gw?yM@|@4;+ODXD?`4M(0bf{rbQrtTEx;u-)64u=<{lg5L7W!0rfseq7yk=aVQ~h-{p<6GS>1oF=GA%NZagG-jYa4dYAK1NP?d3*pEfnoZ zm{EU$T`nu)~-V4?j2u% zQXHju6y{xku$OaA6dS>Y7%EvmwYo!>Y2w^1LEM|Q*Ex!~WOwe|m~2idx+XRHT|1nE z3iYZR6cn)|39XtNBnW>SU`c3>LmI83jmg+1_%HD4$-*1qb9Yh}=%>&^r@}2!T}8 zGm3e*&>1@q2Ga4G(l6IA86`n;qZ7V@q}SO3kl$H1fs!@7FZ6HKn+`yQ-DA4K61tFC z!Hz3TSoZzKVC95^d&hPb=_)8l*myI1#~^u%*W8EPfq+VSTk_ud<8bJICM<>8D$5@L zb63Fpb1kRsA{eerCztoVgZTK@Bvo?a1DdN!J_up>pMYJ#if$}tmW@iKd6_=teaZcHF*932|_hiOcZ2F!kgzg zh84UwFZdP+FH9Y~fI4Up zg3F6EbXIbA!Q!MYQac!oI%Kf#e2B=vzzjP98l-5=8Uu-k_ImOdMo|VyobEIk98{vm ziD>RL4tXxho+3B^t9_=droSZc=>2q!UT#M^Cf&8e2svj}9(Kv-2E!xvz`S@}m*hhd zm3BTx`z>KeYH5In3>G=qI(1K2X5z(sXQA*m59dgJ@DDt}w&$+5am@?DtH%y-L}Svg zdgmZr{UMiAi&j-8?W0lp=+IC?Vb14SM`GUpv&>>)y0uD)DFRvhkJParA($_IAG6nh zPJhkaV2B1X+;0D)Ct;kLhR2kCrY)Cgao&qfB?R)gjb)f~F5^GmAZPB2YfULg>zQGJ z51C5h>CdNfa1qX)ESfPL7PEgqyA52t2j8VB^#9z#xhxyD`032d-Sfx&a?{eq)1?hf 
zj_~}!5saM%o(EHB51cdBY|W{A_ht1Ses0Ydb~xCy8`*`RRhUeRvNuWWRq5R%qMm?63`VS;YR# ze(0;%e(2Xxj--b0TjCFpTMckIp}Hty$i8sOQ?tKni&5;`*i1TK$>@laH|$>Hl5N#j zILE-V1ePAeNv}6N@_>LpQmYjO(fd9elTnb7FD~o%Uf;72MwP}C4IX(L3N_UZb|?X!T#72rP>|!C@jW-ayR?P&-E#57^zB0 z%t!X~sxf<+GK%L>@K9rCw`sx~KlK7( z9oGc>dML>MY+>{^6mvo#eS7{|n`8E}0fr{lq*!~e_p?CY3_c!BZPHZPv$g9PAqx?Y z;0cjnisZm=;5C>beY&xABe?U;rG}py!i!$W);ArsoEgp+L^BaT$AcsCP0IM)+xVU8 zbjRrV@O8hOtB@XafNMdz&ujnR%s~b;9atN$PvO7zL~UXTP^@uPo%AsH9g$RMe%I%2 ztdH021j|+C@`*XRO*7xVqw_}loG*xd?7>6^>q&wJKt0C6OOT-+i(bJVXLq5e5cI!p zu-e>^-Q$)F?QhD*#a(I|UxiKbiZPzck^t2CK^m&Dyx8}md*l9Z?k@FvDI&&(3zFo- zNi_J)HqOaCqd%iXwD#%!h{!j?0lB&iuP`4#SUwE+o)r1x~`6vp$ zNfLg|NFIW_#0)~>GKkmXlHj8mHq_i)TjePn8ue=0Q3);eRmH+}V@z4cdCPBYhDh2P z8mh@HOL@0Y{}3H5YYMC%i(}4-5zkQBY0B9E|GK`_XyKkk_l3AZSzd^zFxp9^O;k0NHg8ry&@Bd}mKX?a}JYAEv%iqqNt!;--%ft<~ z&#qqcFV{&PU4NB&64hujO-1#*o>&|fkVhd+cnWGG=XW6Mg-vsK>FYdM0W(Y)resV1 za9n-ndUCle%OVGCM|AQFXRaKwWTQzYgtppUgXVeq@EG^fqh+gy+pF&&r{YcD>M%9< zcaGV1&yqz6=U}`n5kF5eIh^C`dqxKS;b)?fV`b<+TysD2@HIjET^~>&U!f&KuSAU} z3V!p8I*0d*dJbi3bObXr0b;|Mru_Duhit;kbVq75{;Dmra1Bmo*Y=Je9y8NWipunA zWY&+7fposCsq>Q{a_`j@GNS2QXO{4hV(W|D%j{M`gH>Ry&WqKU>MKBEh^Q_6$Vl5s zTCFWBe4>B05#GTU7Hlz4RSy>7Bf3p5L`Oz4>55jXrgVwU8utwM09>AwOXF>pBd$xM z8hPU{r5NE!AR|3rupRjD+K1k@(<)>d))!xDZ=<(>=Y=8J-fAYG@okWcEh3 z+npvO+@?T7Za|MB^Jlgjl%^nTiqW@RD9NZpPbuge%A7mk2G;U|lf1H>;LOa~hNX&X z7CBtsv3ro(p(WfHgjS}XY%bSKLlar&&XI)pUCm~lwspIRgy%Lyfj&Pf(69cY2{zU z8zO{{#o}ELfj?~Jj4)j(sF>7DUinRap9yoqDPROz5oaf0wa#RBa!3sdJ2k%?%Xs4AoU z;}$okBaUgtiqwj37aRzS^(j2g-n2zdB6I@Pb`k_d0hM#rZGyJS@F2prMsBF@Tx0XW zuNvCFxHvPeJ+rv!On+s)xI{-Fc?W1g?3jK3zBu$FQVb8E8BRRVbN$g@vMY59L=^`S z<0wF#5+&@LNR2p}*b|<6Y>py@rttI=lq=BvKCIdQZtFCbv(VA_t*yt`7-Gc;1%N>tsg(Di zJ@h{;<>vB=jbF>@M>xbLZz>1;UcA?L4-^+CefJL2;$~)uLsjV-G259+NnEUO#x~<1z zmwU6qC3%iMV@44IsS=7o%|Uz2zv8faywX~R;L@s$ohD;!=!xQF4~~9aboFYFW@B_B zGR~iCwwKo&XQ0hJovq+KduyN`F7{vG-mHTM->l<5ShMsVq+N>`p*;n$j!F)Tp6!PZ zG+#K~GL7zJDR%r!==3<&=F)=lGYc*C`de`lxI9l%QWLR-IM|IM5?nR>8KG 
zE6^(5guP*yb4urS*eGm~Pi)44(#0!u^Ay)YIt_C!OTg)>9t=s25lhpKV;W9p7w=8m z+7B_l%JyNkNqoUHlbv7SHy<#Xi4d`ahXN;Mn};!$F}==*wv114Hot5()q^z{d!{#` zwr}WCnBC?efN3#EhD8}rKswrr?NKzc4DW7QD8QozD}Ngr!c*}Z7kUeaX$`bzFBgaW zu;oxZS-kTp;;>;OFn1rkuz<_mGa?43v;%G`@<5DF_kTZZ|8lK)iyOiUL)1 zOez*-*|xk2xe;Uxb{%gKuY~C|xQI6(b)!lm5XLM74f36cM>!b@t{h_{VY`h!`9Qi; z6AEP>(mEEB1=gh)n;$UiKJ14K{j?v$k;WGNq@0y8+*}n7)y=9CD;ke$E<&ic8QRjt z@R_qo&%pYj#yAg@MzUHVv2N&b(pgV0oKhFsz=gI;Z~O#FdRXzEIVv?$R3TPxf-)Z@ zC-?!yuEqF0Q}vp(y!kjy+>5qkzL9F1TCUgOO8s)PQ%A$CO~S2HhFfd0C^YLh2{T7p zE@koOzmZU_9wI}`A+%Ib`8hs&+T07Ak)N2X%usa+2OxET==HWp8QC6B2|83gGwAzy zTTX1T&B5`o@)#)!ept(hGsb`Ed9;--HK%R!s>bQ>!=I_1*+X`*v04B6zoM?%ju*o6 zjhTtK;`=$*(UlfgiSGA^M^2-->OMfN-(m7744Vg93B(;KDhglrzp#60SF@(OJ4;tK z$GBdAtK0#D3;5D=Q6nm7J!;A-0*tXAYJ@#tptibPwPDk_(~ZC4tlkuolfS-IvNpFX zX~?TjOGs!;Rugl@hMeK4ho00-rx{}=+E`!3=MzI3gY*&h;2Z_Tn3mIe+{ET`$YwJ& z+SBQG*m#$M%R|KC0_Zn`f|>ZTO}GAp%mWwr_)S?N6+)6t7H~i@+AVi#hLPvGm-5qH zh1Vgd``I=l&bsRM~PyvisT;ggcUK4Fc1i6MY!rBV+5tCd>w+dII3JTIJ(@Pvb^> zsahZOFiTa&zK%iGhz~DiUR9KhMq9 z@UTfFMJcAVrJci2gK5O6dyK&HZLH#3K6~@Er6=CJMe@8C+*wciaJ4k^c#J28a@JW# z4A}kLa|Y$20_rl^gyt>VDs_jiS(q_nKjAcThRtn_o1gYZZ~Vu$)IOdCYj0HkcGwFL zwneG!ku_?0Mb74&m5wGKOnuwy{OhHn6~Ugc6oK4u^qf5``Fnfy?hn<=4^7L|bX)}m z#lS+E�yr-VpRaq9Yao{TGNnt=WORk2?f^74itoO#rGs(Wi5BJkeVHZO9qf`E%Wn z?O4DlnakBDA)QBHY$hk}Zoo*jp|OGH2%Qu15t|bd4=GyI!4~q50 ziR2Zc$C3f+C`J-Ho(8TrL$A^92>+K@hFd6A>1xOa+r~WM>t?qUP(%f%??`2NpQSWMC@KE z{>lRjD-^8|h4DJW1khZ^yt}V}UV57va0_kbtH_32^ku(-6BtlGKzJD3>QHs@_UoBAM#8 z^LbA6^mzhhC2hH!&|AftD(sJ^ew@ngByx)DL~Ng;M(onm`00W8=GPB!L9LY)Fqj5r`RYmSWy1`vdR;oMiI__d&cn)dpXYu3+qG}g zOceOebFv^hG+O|)^DFsyN-MKYfDaMiS7;zmjCt}?sYOaJx9;7>V~O7ZK{>0_=< z?G@1^@i#uwrY9TxFWIfNN$kR$D-%*^(&c1m@ELH}(-Tb<)viGDY&uwi1o-66 zA`C7FZ4Lg+B~z}D=ZLdoGh5(XhRV%{QDi3lks1tN=$UrF+td^OBeg>Xn|Su3Q+rgL z$t!$kxWt51R&$I+2(kZ>$@T`0B^=%a#~pKHqn`ixv{=ND0bci_vDt0<0>1SOr1MA8 z7tnn+M1Tgl6hf7ERRNN?K=qWymKSd<3o4#AIJw0_$P>&Lhl7F_oi?EZNnE z)Zc2o^0(ay(r*bc^b|b~Wj&r#VJORv`4PWnyel|Jl3q9->T*Gkkpo=f665B=EJv@g 
zhtu)G+}BImnt3EB`fV@4x9{wx{qE5y|G@-2k~KPH5pAVoQ2|FaI|CU%%<7nwWpkcE zewex;Q1iXJ{PJtCn-LCxT?_2%p2NpjXtC-IR|&ludp6D-G5m6?>6g8&&n@1gT>`Dm zJQ|abuly=(=FShWJr#BRCAqbYT>HNDKcO-JYXKiHrNyX=0;Z~kW|TB*TO4Ev(*EHt zSNFxn#u<3#T7Y%$EZp5RRIosmFs<$9V*T45qTdGCo??)VwQXiuVyS1O&cSTyI$CI5 z45DrSQ{)Axb#Lc!NYCFp5f2BSuajr}D>kgfI6r~2M=^o(uawlJjKFV=JSx<|AgrmP zad{4;jWvj6OL4_Y2{sm{=r!TtA!F#=RVI9E=^nYH2EayzskT-uHgZUPCZM^su$(bc zKp+dElsI{CTo$lZC=pljA}=gcuhsq?4LioWwZULrAV;k4C!?S$2WQP-6CN_b3;Xlu zHfLVytF#WeKsMHSJG+V!t1LaFPu*l7v~g;ecqQ*!>kYk~R5a4%0_N@lc5a3Di^?HJ zXed7ZHqcG;MzCvU?C_a7X%?l~MgF$_DM(q0c)bNtNp$1Cf-@z~u$nU#B-lPmEYij) zTtFP!CRajo2wLeY*HO8Avx;Q5h@M3H{By$hyfP$C!Qdr30$zTR{vm+a9lRQ+xRRVZ z9TDgs954;pZ9rDaQHfQEaIloQ95!z3&w2cMJbHCNMht(rY?4j)AM}w)rKk52W^#lh zyj#>C{x`O++$!5-$y?{K&xbsH=qS5R6yr%*;SQhYINt5_{87J@{Ux^&yU|^7`m>rS zOM@BBv%=>xwUG#5v#s{ST)^Mpyk;0W#dl&$sYHqzHdMO)eULrfh`rgCuzo$50S@$x z-+9f9DX8PlSTI!abLfya7-K(e9Nx9wSPc6f+t}HmFMNqKrdXI6u$CMC+&?ka%!nVSQH{n=>l$^^|o>K1N#3BmuC5*wvle&5B{F zBV$v}K=%LE!WyMUC@zS~Z6HlI9)88Y-Ryh?GzFWK_doD21q#aA<%AP7DFgK_4!^Vo zAz6tdqSKLz6*#L@6*$)93ufw|kRDi9^n#jc4UF{Im=AI$wQv7rL`0Q~aHuUhR?rA^Z`F*-;0 z9mvZ@MM(Ntnluh+z!-I=5bYx)17}owg{rTzMX2uLd%i~7ge{M$;q#&-I1p_2p!VYz zW5mjs3owI&C%2@jL+mEC&^ja7HSwV!rAubhR7cC5cc-ZeVm-sr{8CL5IDY8p3xX0k zK5p;{-h_f+0=&pq|1RE`d0h!gDB}I*_c?tiTUu&t>z(KnH^?G|Bx@&KTHDS}{ZmJa zT~cytF@EWU{(Ht-9u{vfZ)Hew;>!Avi63HvDibl^Mu7*Ub(qL`4Gj%HRu~ zt52(&JABgAj37;)J_BQ|PKaz9f#WUQaFit7?n4Hp6d+HVW>mLRBj1 zqBbd)makYi6_5T#BE#8C=d!gu)NeLUGtU-95Gsg>D8Q4++1T!TSJ6g&I2VF}lHlyr z#kVKM`6wxv1$5*nwEka?zszRo4(q6D=~z&VISzt;cD1jCJ_plHl&yYZpz&Dy1dA8^ zUF17+irmgA!$E24Gw{&zlOW}_9*qGR@fAJh*(@4%2E{`A3gxFtL-)>Rl%{sI-@pSF zeCn#>=Oi;kTjnu+ugufHw2w)dtIfyH)dkq#AQCp%?3M}|jj9oZ_-CJ))r!U*CJ13l zU;f471|cf`m5GpY`b*NBDLic1BgHpD%7$d!QYd{?1d^9!2H@MjD6340u#oUwdU;A! zGaco!vlPXQkt%1~F{w=U<;`0D4=UJk~|2<`>y6Mc)0Ww1{+KHRNM~Jw>@^c4?H*TV@2!n%j*eTXD*0? 
z1%&6{1A4L3eObq27NB%+{r7|OSwzd==C8U;dWXABZ%g2B9vi;!TyK0E-1}OsHfDZN zduNAtMPf`XPo{L9EZK`DQr3qd#PvP0yZV>&6_%Jv;+wV?b^@8W`nEU{wZX+e`*Mt7 z7q|~@8GK3VV<-5-f?9V;M&;}&?^GNleWudE)Gt^*o z$v8R47*j+(Ngvf_Mt~-k3S-P*dkkHoP#Xj-JnWD1&#RQ0 zk(nmg2;uKxP)e#r?x1Ot=NA?1B@ts`y5P_41t<3hlbdufm7Vv+=%t&B9OMP#Zf&8v zi4(s#Y)z(rI=x%Jj2$amW2;NiWEKUtD)d#BP*HBL$N~^w!JIJ_M=X2n zab<&<^=TzYs`Xp`GLWE}P^N?bKu`@3ij2)bQGANp?|u`Errh z(Y=EGNi2VxOmkuT%lV?K;paG4o<3~J6d1{GaLW6#p-%Fvqoea@(w~I^q5b&_0a*WC zi>9g`N{@=Y2qD(`qA9*WsylSg@on3$e22piUU}E~su6c*koN2HmU_<9YEFmsQF;G# zGJ=&4f(J~%W^w=w5j~;_9w&2Rz|0evwYd`%Yb|tBF+{r1PONg^9>(K`vAFjZ3F{y| z9Mq&J0hydG&ab9mj=2GR4l$QFkcHu3ju#1P5T;e7@`P-Xsjv$NaIDixv1N7FmW+_s z4r$F}fBzy*&V}Sy8zopK^ASn?9PWDQ$AxziGCUJv1j;vKM0DH~nd^&WL=C&-#{5I0 zQi3mL+QCTU(r@fi3KUaGgVgBPi{b9+Ev8Bw8@hjX^wG$~0`4hZk6drye|P8A)QMm! zM1HLx#(~S!#Uw=5L3mb9zm93Y(9SB=)j`PM=1p%RLmZ6cDs0`S8Afa;Wcp@+^x-X$ zpd%-*UfH$*dYVGH;hU{Q(nbtIWar;_USc_(#Rb_ZmK^^qVmJCF zr33apW0p^imn_b?D~pE3rMIi44#NOrSAL%P{2bMfOA9xp3X?O{(#&IzYQ!}cXN3tN zH{o+qi?vG@%UADEf3NxmVT>%hz@BJs733+S4O7j7+`1j08V-qUgc zeNL`v!HiXjg_($5bUN59V<6xfzt5(Xi%Cdqm_UW5qvA_!`#h)Kpxo|Nl26raspZ!X zXmu@gNnJ-Zu+MM9lMMtXhvxh7CPrs*lUgXXK=@3AAd5&+mx-X`WjfOPj4dIXNtf1K zF-$X+z^PxaXUPK7`<=2lQ5gNMjH^U_@9KK}pyY1c81OGJBKM}Keb7(B9gGj?Pjfw^ zb)1~N2(7n7i(T=RPO8zN=W|NWeBXfppTCsBgV~g`tB*L48q&^upPw7zG{KvIxFm+^ zCC^T$-^a$mWxKQ_UihHsOc>EIm5>j&;#|n{9qPpJMC3~hcF&P^xK>^O@kfUwg`d)G zUy&r(F{OU2%00(sAtKOdXvl=q%gIX2J@>y)u~@18B*GsPp2jVguqKQG2LOK(jth=1 zK$0tP>c(NH;U~mpzI?bY7Z~u8PR3zg172wJzfso4bc8^)e`aD4^*Ks>l9t=|Do?&? 
z3;j3js{+fNQWzvf3YCRr9;xCipZkAqV>Nv_pZwOUFqWGmIHWjDW>J15^JJ#L(9o*@ zQtT!(BbND4dT~)GQ{l`D=yRmUX;4VLGfwfxWuL|sq)$qA#8s={Ji`{d!(~5-&xCUIRB(y9dw;l=}wMt8;>9(h-P2 z+MMS{yLyUcqO%h!z7UOw@G7G4sMpoV3Oo}8n{YO|;1?USqd6DgKl)B`_s0lCAy7A|AEJcb^{6e$GEfuAT%Yi)0<6{ir`id&2GR z55(#2M~F4iN!&3*P40FY2*LD*|8}Go;nG>emvCR&gMPKX_gE>}p1N4${l0nUeGhr% zg3vuD4IAR|^CYkr%Z2fI$l|464W)C6W*#ZM@BHYMKMJ&KR@xY7+*{%rc@{{UC~e@m zh|KE#=0B1HGVm;2%-3u2Hh$gU+uYLp-X6JIPl)Am6}GG2EGDZk!}EYY+(}qv%Is?u zHkUC2XTW#9_3c-z@Z$O0l67tamNIGBg5P&^yHSJp!1S^=K4tHR8~taGM7vD2z;)Wq zM+~ebrkcg#VnV?uGB6>?GPWJc-hb$Z_6b)tRW+AlCu_m+D2gDWHYzCqet|B5tWBrbC-o-6XZ%s({2)M%La8|X zLAN+OkSXnIMT=n9u5GB1RSF7p(&3)=XK3h@(sZ%I?w`pdoYz@ z&^hprS5i3-e1smBlXpw6otz{ly0ByF<4PulV1<-cZv=WdkH-)m;@@8K08Fp6_OINs z@huOS=}b=;@IiVNR#H{63R{ZzX?O+8Sw*<@VKuVUTEraIUI++b?J*{tD!5|J3C!c! zW3<)VsCtuip8N@mN|{=y=m#qj*1QmD?8sBt+gc^MLP>$*`LwMrbR956mVgNajvQC( ztqlFgQ??RioXG=-nn!G$IDbVE94)2`pPwcWH|Qk#rM89p$Qw$WmqEy7N{KQgOtPgS zhjzD)<8yXlhYZXg6vN3ACa ztBK~*!jxYQzl-HWNrvbFY<)dadH=IY#03nKo(V|77Dpjz^~!VoGfV3=`FFW%=49!@ zq5s*Te;LofZ_Bf-S@N+Kh>OQ+yZ#8pWPL9lo!<#{5*D+hCppZN(-F+S0_pl0*2rvx zFT}2AW~2yz6Rsig$T6rM+Z9H+W9WWhR{At@np!zGQx*lsq^|#nB4K_IgDr0cU=AlS ziY^KxA14n*aId`S`8+XKYdn)Qa3b{c>CP_y$XPt0CgD1%x>wAv%(tBu6N(q>AbH-D zL3sW*r(P7u#KcWVf)uAKC~W^9E5!Oqtx<=s2 zB;rQnFINi#mHFOtXM$CLxZL-Rt(-o3$sNH$92Z7Z=(qTl39iz7$g9=BkFl;Tw2t;V zjM$)(Vm~?M-OP~f%ztQv1xwR{LyA5xPqLO*PI^3+$IPG3k^|nMGH80t2HhcfU=vE) zXtJ@G+kPoWM@eaB;?Lt+un;m*n_a@vRNt2rGKkQN=0c~mla4LIU7;9gNp3~&yC?&+ zRzuSOpQO`UNVX)7Hh0UQFeI`hlw|$O`3wDIBQ57%#5ACf6e~OK7s+bUi~n&!Xl4bp zMu;)-gt8|EFJMHz+4 z=@h^9R*f25xOq5)6BsK1hmX2)d5rZ{N^wl&qU4T|+$O9h6abr3 z?%UaqIn1{$jW8g^{`XJ~KZk+uJx3vfO1jwOBhx;KHnqCqeY}Y=8qDJOmXQdKe{)KO zuiscs^c@r5VxZ%jZoS?1?MEDbR{W@A_7ef-`R$}Re^Ddbytf;)cy-sKDhGvp;3|&aIQbXM}2t-tWoEUnX_?}LS zORC!ebLA~G&W4+ci!hFWi;+NA5e+Ri#o| zqx=;_L?FB+xjZvA>O{_!WkyvT4085_7-_c{mhVXeye?w##fvzi5DEAZq`s9ASt=@7wRF&&s zF`BTZt<|L?!u=X;j4jA=`}{)ho-=A2sk3VY^?O)^YI?BP#}Qx~Le~|J@{(vp?(}^j zmDEC_$Gk)gr=xRxkpW%*i_}syM<=SK!-pip|4qftPl-3L`XV8ouHW4lVs7GOmPJMq 
zecMk1INBA`FcagBaS@RH7o1y_?R1yCJfmFv$q#XnBNAgzTB!Bb2Dr2e*_BA9H;;;ZA8yk6L*dhJ)DM zFIOZ=7~ilU3(i*?pK$q+=^7nyC{_Lv45*D9RPQwK|EsOmj6(U0k9@>!e?IwFtlY4D zP*@v5H)M9~Hup3)SvATcLo$tsVls$Ms~_|Zl)MF4Qjs)YRt4pb|S z%^UagiICGU!6lVIPb~{7{r3Nb%tHZ!Pnk~V~`byEC1EJV;BBvyeY`Cnk1Ol=>2dxE>k>F?b78mQyj zd_%KO=|~Vr4I_E#VOG;S#t#A1tHh9_R(cNk8;9xL+O%jU5RG{>JP+pV1Hv12^aTFx zK3+Br_25XPFBugZX(d0ek%2y^Xaat@}X$k8LcUg~yY=iw40{ifnuYLi)1Qj+IMW!>x{1I1%#i$Sk zxEsqC7_}miZ2%oNV>ouw21dljy*@|)QA*)xvg$?$1z--|WitxoBiDmmU-@6L zutQn#%xEU0l$VyH+oI9=)BI-CT3#VAR`o@7D9W7Yk*zQYR3jhHe3`mwcZti99oT zz|DY2y9Qn_mYuhMeXl+{B=UKio940I=U3-Z9*g%!+TA7+*=$WK)E+=`uiV#vNR5`1 z;gZz~z_cfi31dQPyh!*9SeeXuUEx@<@CM^(rEJ3F?Rp!-|Idf~3q`N^s9T$hc^(&W zJWnFO+~HOkf2hS$H&dy{x6O=GHA&Nvof;QBYOPii$-~{|D!TGMFUEP3;f8E)%D+|2 zvs<< zAu8-Vfa~wG%-ZliO!G-ee184h1NSSoSjxat3zao8<)P8?vM>^Dd9oQpp&F=ev1`0L zqZfQiro7wRSlYw(z!0-!6!kwQj==gkqABC^GtJyvh;|-Polx$KRf}+}qGCX7<5`<< zOsuey4RR4?`r*7PpC0S4;W$-~ZQcm|S&onJ*PM38N zfUHKMy<`pc=kxzVKbDuU`qr_H2$>4HpP_0+eXWI`thR_aZ-C~3!dS>v%hKxQUqyFr z5b1{*M9%j-quCD7*#z1s@22cABEBUo$6u9jQ=i)@L%b0a=D7qDMg&iyNF5nGs}|9R zJc-)YwBz~T8@S~%#!9EooZsD_f@RV`BY1lT89b%V z6bB~HfD7mO7a2}S9V)yaKr=FVX!*ZKdBhC*dw@C{d3HlVy_Y)(uJjrn0*qsjYur&g zZ7D5rUH5h+DT;Ado%jIP_^eeW0VDAGZX5#k;Uwus-LlVF4Y2LWK_e0`>}4QY@efD#sTMO&uQJeGUMBBlGd!87ES3F zzoL`S5puFsm&%jw(^H9Ean_%;*nIBg(kZHE=}8#D+_N9Y<;620x`0%@uFCULFny~j z56swt(;IM+n+f|KE2fa+DukIJ^R&5hH84hcFqleYt{ILG9{G* zK%-zGO`|MH(S<-_adFPSSlm{X$qU-nCu`f@`UR0y%PV-(OUhiMipSF*_5mWss%R_Z zcQXlERIqVnvkfPhA~HwxqzYskC*2|#kB{`=sAc+F)>uUbOSA)r)(}28-S;e-ma-m2 z0$<%joUvK%CV|y#cSg)cW#Li*=J)bS0i4X_9KbL7^YVLz;-(gez4-+JIf8CflB4sOW5Yhgt z#*+X6-wKLX3c4K~)+U9l&JZqPw%Me!P}@!ONG;yVw%H7bHH&iq{RjfW_xwyl`IFV% z-N`mYA&36#Tj;GjDQ^$b+Co>i#h8;&1|ktCeN8TTaydf}D59M}N0AmN=E&XpJ=q>D z=WNm~ZAG57cu!(G{$Gc=@gcjJ+JXFaZ$cN8!{myRrug)F|LE-oQtgffVE`WOw z|C`U)VP;B2`r-ERB#8BiV<52yVt!MAX$dRDe0}mAQOI)DpFy5OrSkogvV_qp%X9wyOq#dkK&6R~91F=UJ+doS=G7dxS%Tw?be0T&`;HU|7iJ zs7ZBV_SaOsJPc8~w;*$_PCm`XD+vW-X^8;Ud;4z-LP4Cj2=oBysOc&_nMLlhysaLm~Z^OWVme^rzONOzv9E)#C3`m 
z6@NVdrapI*gOtCD$;+k;iplEg`mt?)JoIBxxA^o8q>J|ar_qzeZE#Sjz#MYV6LSfx zbf+yqm7@GEs!&LB9?2*;&ydfFFlRHJy4O*z6KpN5kAt3Shtd{t=!hdUy+ILqJ}haG z6vDO!X*s}27QZqG?%k2jBjR++-F&sCFr-<*{26Oyg%B+#W3W$^{l5rf3E}_PB4)U5 zaeOD3YHS@S$Gdqf^9}scaM!@cJLL-uNy(HCJT}YJ;Vo=a4l}UIu}`3DK(^lk;=Q=Z zkXy*1N;E#C&aQDMYPoJE;Fx`t)vvd=;b9FT7e|l9Yr<;-9W&pC)yfr)y##(h<+~6U z&yW(4r!Ut8%g|i%a^#wUe21jg1)~JuE@SAwjRy5}=DLlCb)cpjIXmAQrvXD;2iKPh zifT`W??1vA+5jXSkjh`1xk|IabVOPzl(z%n zWBtzV=cYl*>JhEf9m?q(bnU7bg0e3U^)Xfx`4I9O!;z@EnQLM#-c2F$*-9KnVHWd; z$p@VY)l13w&AVzR_y|P%AYkrStHT6I8POrUz08_;4y50ve4&^`$H7$hDfGD(vg$tR zLFYraQ7_6{3G`!L`#@S3c~_LfzdEy}c+RB!idbpo?Wy2rq76PQ2Iwd&*k2w|>BhNA z+;h1$6B-}`UD5~=t+*O|T(iXIAE+vIl0+&pEO`HL&`~MG60F7mQ?RLrL9l}V(wWDQ zMzhsh$z@MYQIu?q?~&6{lf%F1PZ#XUoBi(FTUrM)4t=z~6C{{2Pj-Xz)lKAV)5zxk zrh~_xa4)LTy(XcS9mu+D%ENmlif>G~81o7RAN<6Jq~l#w1hPx`2ME1e;H`G-CsQrY zk%D@l-86mOetVgXhW9lu4mDiCh2|G0Hv0Xj!CE>;vPi{&4ey^f+h7>MThVKsUFHfr z`{_BUt|Fc7Q3!XNzlc%DuQ;h!ebvYGKiSV_B+cP`PXO=MuPrP=T3Ce>kG#kJ}t zcFdi2Q-5oTuzT7TJjh8E-+fsY-wj*=#B11A)kqut!R8I%gMI>tog2Gt#3EcsJNA3Q zZt4n(B`kAx1#u1n%IyzixXoIWNJW~nDYXebXj=WEk@Mf*gcS&4VCEw{$Ns=+gnm?r&XR+5jyNb*_nG4HDaF+B7^ zxUpOuf)yfOVn~f{^EZbn{42YzH=aj-^II^+0_49=)0w%HDvYr?2YhQ~way8p+ProI zqj}Khg*TCW+m?Wu?KNkd6D}I|uvjk}qy;DoDv1*N!BtHJ(Pg&taG7`2{L|>UnSM)L zfnTO#WstiDzvl5PYDNumoEu;wQlgj{ILs*I)G@(C08h{jMHFM}&l`D*2PQH2%tD1r z7!45s8D}_BQkYh=`nT#IGY_G5Wvih*J$f5_Shwceuu(v-o?)3Us)e&;?Rf{%-?F95 zmiLcCZtO9fi4_ZnwG#cWgsH=SJwQ$IsmDI9PO16d3)e675eb#BM+f;5BBsOz5YbtW zK2k$Lv!DP=m3oN6kTH2krCrdrif;qI;S-nXQ-gy`GtR~$C(TsxgBT0{0lc5Yh^Zju9) z549L2J-5)`8T(DoSs)SL4((316o{@*>76w$dwjhFmmvB;_ly~_3xDJ7%ly}~m3B)i zMl?#yS;?qHtUmgE?;6(Ji#C&y;gUWAyg;x|j-R=T=yRZ3vzb|k|ydD%kyTd!XgJHlZu|MwZcRVgfdU8D#vFeUIHuv-( z)|y@OQ`OXK>i%8UoT6Had7lFMsYyd#2>N23Ll#@HB!?HRK`Ef z+OOiO;SD0IyGV1PVi%M4*Qo!V9RscUu|&sK#Srm>lMq!}J2k${m?R}t+L&6qhc@#G z0*%q- z8&`#j6Iq50hg_722gbXpw2U~_97WwtI9N7>TzTqK#bM!ZV_k?y{Cq#KSqT17mUF{W zQ@$vYq_tJGOb?-~_;avHR)LIs^u@@3on=w2O>y2&C6R@%oW8jpS~yiy_E7SBmFP`A 
z3Xx;jrm!BR&X;^DGQfVM4QLgXUL=FB)LEw_j6iF_PIh3Y5}+0V!;Q&kQ?KUvb*(@x7q73 zL`^b=Qk89P+e(7kmt$EO6~vp|KO$BQV6x@eVvV|!?tw&5qK zr5#S{biNe*bYEe6>aUVk$>p=5U&>XI*NMl0`H3s(XjhJAmBLxv!;Lzhtu}<~&%OWD zG9CBAodbYl^){ME++cq_{9nXLoOFqTKE-SB9X2xk47)75jcdBqitnNkX9{_PUB2D7 zr`Gl{d_Y32J_<$g+Juq0V@d}mL>T-3)DlSFHGR*^@?4dS^C>;B>|*fcIp7by?QfAY zqiB`uE5}FTGKqt`*Nkg`*&lS5iUTN>$Ki`MRDR$p6e@wWEA*L4dgBt$vfn8noU1iCL)!f5fO-TV> z!#jn(OqG6CsxdLh_c%gNsQbkLp^+}jLWqwr!B?XGmR)y~>f{pQD*V&@M+Z3Kv&zpZ zxh07=T_{#izYhJ0rJ=0A zGc0eV&`;=Hxfew{`prgTjc2!H5msJ`iNF0Tvy@Fp@MC9{@eh@1C7|`PG0^Re*dI~< z%k|BXWb3%s^w`80lE6J=u8@IL9A1~`CayGw$tE}-5vo|RUt5{iI zq}R9JGw!ro?~Iy}bJ8}0oix1rs44_a0z;v7l-fWV`%7Va;=)+jG2z2+2+d%jQP>uj zOpt=p)at<>H|p(I`T%KpGN?OY1cGdV;B#~2c8mSi!7v{M;ej)qx^d0z6Vfe(dm#uI+O(mZE%(z64WYX@Z2*xTL)F& z@e3SbASD#F7Ccl_a*=#7p@Hl8+DPMDMEa}X1I&sl!83P#Qvg{^qNz%~^eSwRT^M`d zGiETBRE&$ijoijrgJmkr@I^*ugDfYPO8#7IMz%RhZi;&>_0JaeYJU8f6k_-V7QsjT z>9XfVAvCXKC%(=UF>osNrrBT~bVn}p+F_+pvr~bNJh~x&dVHv)A_lq%&QY0hNm;oJ zoh-NYg(m#yTVT#vyHpxa#&aU=WfANdH^ctGbyIfzb<=5j-Zt}7C5Pnx3iVWtjg<|j zFY`VH*UR00_n&J~D6xqYoiGtuy^@K`%tw_8@=w9ciL`H{*cCmOR(5=Cfn9nri{F^K zOJ723`Q|2;Ka8(FS#YPQn}xYiZNAR~G4DyW7xH0|g8&s!_OnGqA^U8Z8miG2!eK6W zq8It%s~?78DqY~nC+|&eBhBaVVbq^!ylSmwZ8Y$7)%&du`kzrK;Z3U%@Wkb?2tLRT z{3H@4#hM&|q6RTe>(Z$4(7Mi3fRaohfn#q13<<6yb>N3q ztYJNp6Wf{uRGQ^jhPz~QOh@RCA_!URP}N0cJOByGUYtn7a`Kh&7>9s;`=-Qxpr#RMpY!Leq?cmx=c1O3@>vkr(n zY}YR35J_m5P}c+1&vT_iUlBjjMpNy7pZxDVGnAESe7PAudRX}-y>Z(8XZ80FU4>5% z_~)0w>pyCpYuC)CIJGx$WE9^QqSl#NzSZY z{y-N+#)D|slYGxkl7MGt=Q1f&tE+fQI}p8n6b_9no7I66O*`PUW^pfUr^%2wgK>cS z;Isd`&$T;)9hn~OCz+v|ZsyT`(~nIUB=9R@y43j(&3K@&*~RH z#Y+1)m)nAgHR6jWyNpTE)4V_hLF>aCtyo+$m7G8IgnL$R0%!d}Xj=MD96R!BgSdd( zFoNA=+n+z^e=m;y*=}AABe@;~yG(MxIxL z_qy4(+VrH%P`Q|JWuG5CCj7b@3r$@=9px-|{Xuzy&j$EAqU9aSSn<70wSC|ySMdG< z`Ua6JK}>lG;+_#?AeTqQ_!@p^-e>{fvxT}9%{xl2&oxa}Tn0q`3A(he6SnT)Zyvgz zu-7TD-?v_5hI2$A`a;%%18<`3(fQAx`!+VIAIO9Bjs2y@Z+>*)AexNFRUpD9Y~Zk4 zUijTT`W?<_9De`YV%YNf?$8CXvRaRsR+z4|Y;-8i%PdU7qw5mPUk4DT><6Na|AdaP 
zGGBIT$%o~GT*rOKV(T`lFL9Qqp2e}2>D-8b%TVtfeCh*59$ehxf~v7?u59}VkgKcGtg|o6{)W{(^K?eQ^rg<@EaLrr z>%%SHmIdzimMfetpk~PX&IzroR_GurK8h6QCMg6Bxh$p!aX#FFQu&rUi4Fl*PFm<7 ziO+J8&`ygX@qYM_W{2-0?s9)}E5$~c0BJKJG`aBIn}*U5pZcv8c9t`J@!7O3JLzs8HMsmnmXnTt4KG=^<+ATpGMBCCQU4W^ zwNO`#)iT&r&&OEzq&7t{ev2J-YNLMHkhmDGUf%emiseyaH(vt z$?Y{W1Wm%Xirs+I;~=#K8@EJz9W3ExIGu2T*l4j#XeYw}eV;4-=ohWRb)Wlq;l8XE zQv(b6@W3y0KB8u`v^Sl10)A1*qt;s;v z1n(a($uwphyyK?IN|lLq3aIQx7&z|J6o2ML;%C9T{#+$i7xxsCl1_-i(moPo^v{B+ zTP4}?$lmaG`{);#ooYVEEu^;VpQ+k~$lJZ)IE3s_*(90eOglVcU8d(AlmSTt15OQ> z<~$XIfKAv7dUJnIgg%y{4h4R*l{lvaQo|(51n=M*%>(z%zHogu`nS5dHY=Dz?D%fC z27Dr0OI$2xO5$$Ah8;yzghs=|rHyIemPB?nt&}Aw%gepzr0p;M3>#E>LnE9?4itQ?r z6Q}^eM7H|~6-3WSVVm8+5sLRg?QA}iaSla+bFCN1W&6F?0x`lCU0=JMaE;beyzO17 zEisW~b9vCBw&02raOzCdm;{INz(F(e&2X4f8JR>a`0xhWM<4%%2{A>(OBKmUP47am5Zh-pb0K2k(@TR8M)lJb=<8uWEvJ#> z^AJNm+ZT(YpLyb%Q7XLcA#o3|)gpOq0jfd+Dx9isx&$^g2?)MyCDObf}M}UO@N`*9hY;TTttR?gt1OD z9;<4)A+1!KX*IlsdR zj=d_xWlC!=DQbaZvNBSKqz<>?;?RrLH!NB|0(OOstBibT=bWJLyc)KVQEO>t+vDMg z2~x#E&|nadkhqRN)C6qjP)!h%7(JtC>hyE1RXB)Ao>4pi8v4Ol(L0qW-^ZWLeSw42 zCzt993dIx~KKd5*C%yO->n35X2s&~o7DIMwKBXcoB@!*g4rh<(2JtLgemH!bJ^W+w zmaL={iE$?>L6>Q`tN_^!ql~~+rxM4u9<5dWsNm}DMG=;>Cww?0bihh4==%!m!N)Up z-s$bB1}jFOM1}-@IBYI^RgFRX5B+9k6A(j4`r;BAeU@ zOS-TUv~IX~e@3e9<>BDzHSKmx&inhAhu>aN+HvhB!|QDpTFkv%Ec=Hf?C8dO77y)oE2;U>UBNebq2V5@FX5llV&n1b3`3t*KNZ z*y%Y*yL>QhQp#F%29sVYKI2`tsBu0}MoKOOPFwzMJl`6*YJ4daKe-*H;DsT-LgGN9 zr6$A&@k3GsL<$Go#e#e0=*vbER^GNdtRL%4k}*Ok3W}IYG6ny)D`wyz29Glx>x=cH z$D7>GGmmGXr%&ApOZ+`7n6UU&EzebLdz#F?JS@~-M}`md&BCr2MdSj2Smr&=0JC4X zm!cvu$1UPNSt(#IAeJVBKQ5#Yee|(GTbjy_9~XVhQy)9Zc5u$$j8Rqm#<%93zu%;y z3kM#@mQ3(QiH(YklHe3?6cAwxp=@a`fNP*G-G=$Vz)-E>`Y#tES{vvrlK;{B;y?X0 z@9vw@-8z3EKQq7|i@8#%wy>PEsW6x@Y1i(~jz+Lrl$1wkC8xp=c`(!`;ZgDjtS>i( z1}hNSF6;-|Vob;OggCpR0o|5{SQ!%gdaYy#0PP4H!`l#IB1^r$b%J|^ zxvq`r8`pwf1ON9uO|N~1e7eT|Mvw1$uFs|4Dc6+1GM%^OyroRFv&NveI^S*25=M1` z8VjC(fhPLqWU7*9{T|{e0QOU~_i9#Ay_>@a?|`;%M@pmD^^O;=5TUPcFhWl71j6|b 
z!XNWHfKo3`U08{Xwx?W;F$pth=ec(cMDP;pp!H}wTDfU4xgWUY)nk}x@G(69kpxWV!iPA=j_5YWj z8Mw}VL;ZSvoM4@7{cPJE_eXt4)!f&Y7tjH!>Uyu1$&skJDs&s$YC}5GfxM)jA8;&Q z*&D<|1o+0p$h1%w`+E>i%?e>#qmxNdL*(AN#Mgxx6RGtORZ3C^)nnGjEEuoFO&vIo zP9a>k8K<$r2S<@<3$UDd4ucXNtb=g~i^BwCXC1nZ^6#0VXS!{es{>6sTWO#~0g~NB zCS%_?agrZn#}j>cqhuNV_vf8^qz5 zHzUEG!tqBxJ!U+zj((1w2Lug_T)wRd+>(N zu6v?;X6}sCDNm%2O+s=Li`9Ykf4ONlRC&U6tF~t6d`vFvPOkRzfH<=@Ke&{Men`vO zEbJSlkQ^U~c>oY^|KNuCgPjyaG`@$*X4&_J9D%;GC;$ij7VDy-7q5ulCWs@6GlAuD ztq7~3`*g(mr3!@k%Zq?71PW^#ORQop^Ycz26ypHEQ>M1Z5;l)rmgJskjiK3xCySi% zy{)HSL-3F0;grN^u4K`GOZ8eTS*_Dic>Hu8W^;B3+5nI!-jp5E9LR4h=eL0|`Ai{h z1xaHz6SR~0TMIcO>6g||c_hoXfwe6p>=|B)mWgF#JGLFGZsR|&xJktVuo`oQPlYUt zg#F;0Psik^vDO#)m8SGMnoJ6JI9{;=uL1~)AN~7I>Q+b5tI0Cobt7*|m2C(+aM`GJ z8Mdoaa{`xLrqJhdge`^8ApWSBsgE}CegMyp?;kZs* zVaV8xMD&ptLUw}m1u78bPvB?ViW1S;)MODcE2I~<&E)d@;ko2K6a|9$TX`j(CGJ!U z=Ogg#j~-k_;usuNa^OeP-oOvFx^@n@1Ns27jP1BfGVw}ilZq(~X z#HWAkO;akjn;c{(jM=DlGg{t>-PLR2Egk8HpoIQna}B-dIVb%Oqx*Y;W3s{gs8fQ+ z8kkwu*~TG)xLMlcmH$^&yA8u~I$Qq|;8tbzvCZUNW^u0hb5v_wKP6l?rRBI{y{o@G zh|HJ}dr0PB{!sq@fY9 z+5~A3-VgV}bnHuo`g*V33AyS0*0_FZ-Js_;JVhIT(lrew+vtUHG7Qpx))~*c=au&j z6<)4%o_eV{(bF6-_mXS0uTc)^;va{n&3&n|cIr%PMecdBrf0WbQ*?&NuhC%=D%W2k-Z#i?(u;ic$JHm#3{VVM zmv;bN&wy>OBgvz}8#4&v4dTFW_z&?G!t{a{ui3Q1>YYd`UU@w+>%dl>e z!Wmt9zK(U0@RQUKm+sT_o64ld=M$v}rw=-BGGp5;ukp+qt!$N|kkAO7nk7+FI}dz} z?&qb%h*ThVx#I2hyL-C$-)}viHmP3qws-wJ@;>~LJu%RFI?dUmMyQBGE3L|12XHWu z$D?C>r3rSzY6v+-;u{tWfQ2H#0c4!CO5Q@kiBsBK#(&`ik+RKth2s?{%+-B@8%pA3 z`(Rka#6H~y4h?R(39tm3QnAIC z@h%fQuITFrjPYz2YG-72b!IxK7HX~ZQf4sc*0}_A8U=FgB+ZA*MH>VcVWwwVfH@@0 zK)tCK?|TVBp9LJY6s4x2l0srFr;<8U6$(Ek@sbbOOS=10OM?hu-2uA3SRG=X7ZeM4 z%nzi}GMLPvfrGI{OgeDS33&beW)abWWQm}!10!msw{~Jazz<3eyT}W-n_7NpPD^Ej zY@30FlKjsc&Fg9UY=%+=&|;`#3+=bl(Q4wV6l5w++g&<=LZ6%vH9^9z2SVniLTKsdzs4hPem95b0oHX_|X6S ze6ryUgO++K(0vA>&&ioPo*SUSr|w-7dcv2Z-Cz_q3c#)`$4AbuOKs2ORha%{ zo6uMMRY`tY_12kJrp*ZY<=1h4x6oi)pcgCX3EWylp`iH@a=Nr=zrNcu?wM=HVnn)g z&~nh~8H_hgkVu$#ctxXTOTwyGKcgh9(5V0TBXz!<@gPC=T+Wj>O1ZE4|8M;E=!%+~ 
z(=!6R-EoRby~gWCeur;`HuSo+jQw@HeLs1>(qHa1y?>NBoG+aiP=);F_QJ*iT3(FJ{;?&zldSAQQ>wDY_`c#*&X2A9=p9c7_<-KX&d62X4n#|o7zvVjAej6Ce zMw@dldg)5xciD3&fbzNp_I8Z!0Ww$%s@KPKaUIe;w0dBtD%8h!2^KpELawAGqF+(D zQTF1PbPfM)ozRDQXy3`K$3T|?jNF~?M~DB9rmKvKYw4D_1b6q~5`w!04estP0|fVj zJHg!@f&~xm9^As<5IlHrdvou*->+F~e)VbDUAuPGZW_eE5vUwwmC{rgrMIMl=nMe% zYb@or(5G-FQ%w!zI}*+Q5OF*#3L)yL8Q$=J#~<<%GFm`1V@`9E(J7_uC)8%~6yZZ>Mt>`b(n#sKd?<0f+s#2fuhs_*$4;>CU&^O@ z@~4iT>Fx?6?LIbDv-eJKovdmt{mox_@mW2Q~T~jY8Cv(IPP}?D~ z6koy6m*S~ZZ0`9fB8pT_LV(){qnnd`HB=5^ASbj@6ZX#|W9&{$&Yy``gE2On7cki4 zVXnI`zj{Yvenoop{+!Fq$y`LwmX0d@XiCi2nHW?|JO5Fh{%Bf`Gnx69x9>PMu!5M- zQFDkTue`KgcHONXP9}B6W9aZhv4hp|V?kuazG*0ZpdC4E39*Gr{2!oRY(n3%u;jst z)5Da*$?{jtPlYj$9H{l}rZo;Je40ANY+Yva-w#rmJjC=+nHZ&vyK}8wc;045ChfN_ zZ7#lXj{C+&bsP6fn{VAN?z@wxaf|eBmWjEhbDzUDPet$Uj+L+1#S(}w$x`o>$FN{3 zKW>_m9^R~qe}BI_Ry+)d>!?|d%dgXyDyxr@kVwZq)(-^1UDzfV!HX?^?kWf50zd!k zl99%&F!P7~mcgVlgn*(^ZwSkTg)ADbqyvR4iXh88S`dYv^0%KgHLA?|H+T9$$1I(~ zU5YKfmjh!5{zUt*H;Nw|t~}3|Cw8XO=*=%jzPREl6;rxx?GnkrWZMYjaKFktohj|@ zgh1}5#hbb#gqOA5<1YP0eWwm+3hpdXrt|Z!R2Qn|cg0tNzQ{k?1 z^J7ogmqHOEsA1IMd?-UbZrDR(Y&WbJ1WKAnwwEL{F>d)8S`i($sB_}>$R_wfA1d>T zOdr%aQQ$Y~E8)kFStPOGl3YeXaFhSsU%5S&ey$QJgAQ>Rn`f+oiQ0Rz^lllf_CrN% zr1ZNB{Jcxw+vj{pVTVu7`}!I0e4U`#^BMv>Tx?X;am4lntL+#5!l^vuc0t3jHmrdw zXfgFPyl>R*2s{SIY71le#EK-1L$WXGvs;T$Gn!KDVM9;!}atH`0^ zYhZUEFj41q>0p|oa;wLS=SAk6lhfP|n-(Xyb4brttJj{6J^tY`!FQ3S8W@|t5)tAO zGv1xMqlJlg&Qku{Npa>=x6DLvz|^YU**KAUt}QUXpU4@DgB*)KC!T|ORAu~YPG#^=9^!w<`H%U(nlH< z2z{#iq4Q@8i6i(Be%y1p7^=QyVWlMtnp_oBQo$()d0a5s_plqyv8_Om48)#2W@&If7{oPsmjauo$)p67#)& zpx~u}D*JZV(DKE$Jcinh>*aEG_wDNATPUA;b<8f>7QBo#GORG0(pdl;(e@##&3bS* z$y^e@fKh%P9k$sEa&a(mW{wqqKzwlqt7UPBM{h>WPwDiw{}6{!RO7a1y0chdc8$pO=3CieTsGJI}B zwuKu;j+R|J9NKxl#AanY8_|Nx7(oVgw5jDEf5W28{D0=hU1QAliLWy@Nqr-ja ziW?(K`PeqJLMfAo6?@`u?h=v(c{oYv*AMcI<{xZDJ|{aUmgW_?J}i_%8vKy^f0p-S z(Tn`ps9p)8DIS9)yGQy7aV8FB!5z>({>An#hT5Cj@WFIQ-}kp+mzuYKmy7?pV9$o) zq1JoLkSnrVOI*2?0--Q%#VMaPeC0!XZvcVuMgs70wA|Nc8N2ti( 
zXO4yF!EfU8TBf^fUb#9#`6vHcz~Ip;RI||uQ$>0k9p)V`doaTyFYu1HLN{0~q|_PKGenFLD@;Egg&+hnK(1dB4@ct3@Mg{|16tZ`!=hNqfTx#$Ro-ifV8Y>s|8BE;IU z6fQ3UMsn7ts-6Xmy>E-!Z)^QGy=L`0l(NK(J1^NbjwL2Bo8Zmu05`#mZ==uBn}2#Y zNMNkKbi5k$R!jRK4q5iH&~{JcLEt)tRJI((L&V>wwP)YWxpUX+bw4Gje$T!(s6VSV z7ve-{B1kqDy-30Sn>H5(R>;3F(hxA#&|v;ZaovB|4M@fxvYZ&TR~lv^q@6&3OE&x>$mSm}A%tZD}prixMUEacWuy=KK$WwfQo({ROmPhBL$C>gpD5y`vd_QfcLH z<+(fdP-t9BGnpKPr3ENN`I$_o~0%Y@x9MW>XydxfOPoQ$#$2!Wfu+YmQ zLTukgO3P~`rkq`liY!YjUz>Cj3eu?C$Wt)TB6d0ZOeFbm_=Bt29SUn-Kd^gYihP!P z-a`E3r$Ki)vG)I9CMeBj#y93Mq9t%dWX1RWA7^#9s1~PyKCug+{zMfmRY|T%XWpsDsC4GfAQ+h0gdlVv zfC{9#LR)wU$XrW<5RSf-G$Wv9Wfa;|tUlO&vbS{!43J)#{h6GC6$5aExrS>V3bdR~ z9!&OcgamZStxn~CH(<559-p`BOSMtEtF1A!N$#3toxv}*eu+os_u<4`2=F&HOKK_@PSiJ=;~Ad6RWXD^J7G&saaR z&cM6<;b!%$&;iIcUN{i6F`$RCpO-`$FD5)$zhLqFle%b#PRoN83VBS&%*FXy&$LrEX%<4^H(Cl z-G9V9NRgtn_-|Q0MVY*Js%E=-B56*INhML%&~_4h_$zX*Xf7yjlKoRsmW~q(h6AS; zNy5bD0u$+mK6GLZk}yQ{4n}$;^s5CzZ)?G=S>rZnEjcTmR)7wg{XP0j)vi$=^#Wv2 z0Fk8+vol@9pf@=h<8aO**b0Ar=#Pn3+w&1aK2>v4t5fB+I%ShqjURcUC7H~Sh=Zi1 zU~=(bLIir>!-#K~2PsT3;3~8SCUBOcc|f0CY=q>KBTGti-!jAgkuciTwxi`VPjC3D zX5+9h%4={~^IddNbSF@qixZ0R>E#oY2M~_?UA_FqWTW6^($+3!Glk13np6M}o?Yxg zX{NFw5PBT2p+TUstytDpK_tlZIGtiT7MI18Jans_m z+sS+PvcDDvp10n}c4Fw*49U)R<>fu-xs`Fa!-)e$)J*%X?kF|J@dzg;C``YH1O6%m zV&#AiL-Ij~-x|?gU?IP-xuD1em0;xFEJV;e&S45JA(yJaZ%+?FF1SM(8;H_KM}o3( z);yp=qlg2JFtyMEAFZlM9nQ8h8@VxIOp&A%2UoU+vFSi4fdYP?q8 z^RoCli1{Man)0#5iZc3~}ko<-Q8$cgwXbB?x->#*^w;QM(zkX^an8-e; zyGmASH<=hEBstIX!KnR%mh3})>?5@#bsmzK_*x^?F@3o!80wHEO^QdN;1RAHgd zhihUUaEJyH|A6-x7_b#7R9c=^n`3KJ4>U+rQDld^r!O$vHvQ3`{%t&AJ#v!~dKsNm z;PnU#1ZKdm90LE~uTs>r5}6o5d@?1@-$=L&&8#n93GOdK{Mlk+9i^IXdt0Z)QI1w6 zcBQjaGW5+xAMka4NF|*`2GywK@yM6hO?AR_uHSxuAq1Doeg#H9l2(8C)%f=*rp3IT zS>DRDg2CepVO#GOBO66DVlk(@C;g5Wy>iI|#7$L7DD6D6g`Qe~7d%DgE942>=6AhHuktC*b{?@{MFCW@Vrh z^jK;s>bkeinHpxlv6u?peSoBeox$Lt`NjFVS1qEG{PuvK&8 zBt`zN$Xb{*VkL^H%jW3Wpf)oLq4GkNJjz*UpLeS==VSa4+%ZSoRI!D$q_Q31hZlEs z4X=X8c#LSWoTLS9f7>?8gvpobv*97h#7Y2t`<@0(N!$Ah?+T>FIZ^y%!o%j%xDxs! 
zJu0+8h{*|&;P>wbjVjIO{A0?J-PWW0c#0Ta;l@3);v{jcX?CtUM2TlQiV|&Xp?x6OB;$QLI3r{GQRM||vW*pEo;uMQVSjz!c8v$e(nsKW z8Vm$Hs8k7g%g!edOr%mA@bnngie%C64X%oUrq-*w1bg^fjeMW>hK~(SpJT??cE?{z zC-d3n(Jr4pFt-95UjFkP?yXlRovjzuE^mH&>3fss0aZdTsrRbaybo)k(^}tJjWqBn zmNwR}rbPa#k`4NWm;!#7t&q7sua}3f8FiEtRQQfXnNm->%wl1sPZ?@`{w3B{J-g_d zJO;nf5}fceZDrI3mVSAO?#g37)C!Myu}yCygZBT~4Aje)@qnx+ zJFU{!&4?5~d)ycII=uhJLoz#m;}1oX@AxIfQ6Onr7Pqt!5)nC*Dt2Ne4PIf%X>7hx ztNJ{mz9=A#jdN&l{FB^k5bS0rRYy}M+k65tbgqFbsqkI zftE@J4fxb|@nnL;9*9Ymk7q&=1yvhW=Dt;4?Vx=!?KkLON##s6{^0-HI{h07gzBzi z`QYvOVR0kiG}pMw;J$s9$!U9yqK=@GxyR=1;ER_R!{->!)BEm$AoG33_Uuud)QdeK zr;=M`#5aPtqPG`iw~iU7s{xXYp#!ie*^{wD_*2i@QT#GN00a;j`ri59T&DJ1?E7j5 zY}Y)YbiD5ch);DKIy=<$GY=GNH+=NkfV_RG>I%;rne$Vk4V?xvPJ(|0D5c|J@~P!L z!I`kQ$5+N^Z^GwY#IM%^iUi<=Rtb)q8cdZ<7-{ErOp<3esxkq?NdLq6uwT7pDAe1v zqwFSbAs&F74y>41m4&kXvtIqu+~*pbw8N};`&_QO@9s$hOC>iaPPZiIdA}#g>#s_kl9F)IJ2KJByo+EV}OY<`0+ntjpTZt(D>*5#Z^bRUWkV{wk5Ga=`8&Sme_9sta4 zBmjs`^kyl2QBc&eQ~#>A`!5EAz`K$#Fx;F3M-bO#k~1!mcqc`t7_l>B-_w1>*SpK0 z?Jij!R_eT`xI&Mx2-tq)esPN#$+=MoJIR&~^VMD~z=`PNMvcDT@=F(*GXo!3h608Xgd>bHCOJLW`coQ$*yj3T@}SeP zDHZ(=#j*$q6QWCIsK1zwKt_3V8HnkPK@AHgZ05}L%lsg4^tq0o`=l-$CZ7wKDKC%1CQ%FD{g9nQ#mZMgW?2_l zq_vBE`Q*81px;Pu&>kT@^!c_D%CkZ@teU@3W+GF1iN&AG)Y`h)*43wZoq+{yBqzGK(&6OIBgwq0_#jStvaX@)WIh)>~MgX4=ziX-fv zpaLm&G)@G73I?I+aVcYa9*FMCU3n*EAyGiWGrgSD{Xg@%`m_lh1(ZyS))*Ll6KPX@ z**D$mz;oJO9nbV#^u?&+oqiVa%*vne($?l#qR=AGtu=UD@pbbtPqnZ;`cxtOTJLg~ z)#^3v>lfw3a)A~K*^0Lt0rIWnTD6YVufGi+h6SJCg(0<+7Pl4s`Iuq~zv4=!FJs^3{3ddfZ24Rhzm|JS|Jmq}FLQp?;?*tlr!ms#Gc*lXChX7?e_CXUZJS$F{Zie3qByHv$jCY3?_}6x9lm5gxWzy09Af;@k zT?gYIO(JbUE2W7&*f{fX;e168S;77u+(@euz#9&q5g3SnW&$&Jp3z|6h(4ofvjiP2 zZ$+lh0=6P+D6eGeb{b#0+A8xr0fB72ch}V{^Ed!sP5g8@n(hGAFV_*S8(qH5G6&yU z9lcNvIu}12P7RI^NJF3hnTX>xzJ^Uw@iV_EKYrwQ>TAWBh83HzA^(1=<(`GwBu975 zF(yeSO9w;4M?~0ZLG8?#i3%q=Iwt09k{nbJECKPN@W#UT7vElhR=awTp$^8Pp$ zZW~6?$glJD{NRq@m&=0-j)U6&q&iC0Tm#_zX&sY1=E#&K8FkAh>P{Z8>n zX)4Oa{)+(l(=gVbm~1?B>IOJ=n}DcxLRcLrbI;XQG}!r}q@k;Dc=sArLz#v1UmIAb 
zjY}==u+LQ7Tg&q|``^rZirQ%yD53^$GM%}g8`5ByV_zVE>1G4#@15SzWzWaIRki?c z@D9gex*hL$URnEnU`Ockq*K1*>xE(Z48`BVo<$@vQ4PB_)l~s;Als^#ZQfxa$clzN zct+G~QhG}b=H(F)`Q@>)0v_fI+- zQ@{UaU;lRF_4W74ZPLOt@&TZ7VGaD@^-;bp;jvAZpQJ5N&oki`oWv`dKtfZrF`<-9 z2^eYSn2JnmB=u4K%RzR9Yf>aZ-xO=e_M=g6j3KAM&M)CJ9?Q5XuvU8gN4T1rGl5q* z$>XCveXC=jUf%j5FABKYHs$$?+5OY1bfMV5)To7xMw_(JL{@mVYiM_`LK$Fw9&aw|SoaTn+-A8!;(&>J z#7%u0aXuSzmd4?#{7&md(QdwDf>l;Pn)Hle8ck{b0&R*;q56W6Q8h$G_un-Dj6I@4EK2>(Q+A z)~&Kx&*?JP^W4btvI;!6H(tEjGd+2I)$D0(*JnEEd(+?HG3@Kzx>$Dg>}ToN)gDlH z>vnY4=6(K$W2Mt)d!65y-*wT~x%kS$-yCFX4yw7D zDUwy#Ku!<_ac49g@!c$r*N7P?u%8CbxdzC*;Kfg|J+eH%@7hb-3^a?%Oo?RDuB-H%-NSrULCe z*AvNzY*`O$u^v~3?I=j!gEm{Prgj6Pv3lL&!Nx{n`zE}ZPR=vq39 zh<^hXEIpNXrpwkKPM^(>sltlxShmERSZYA0SO;f@)9yJ1_p8eTi=mz+ zuLeo8CPBb$?M8)2TQk3B(7pzSr}t!sr;|Bo()VFT|9)Jb?+!x9LW;0m%i0fC zm>^hT(Mj%po7>ypK9;sY>2+(yv}8DCe2|6S6ZTR_tu};D{>Vb!e$95sC2XIJw`kc= zB#?jy4@f)v@ObH}ZmDqy$M_}8H;(p}SyQvlI_vyU<_`x$j(r{$ke~e+f z`9d=DQ4L{eSovpl{>D(tbw1FNe$Atl(Wp1u@{Vm*;9X1AW#G4!(rR6Bv0{oLt98Cv zP_yvtM|%AX{@oJG^kaZ2Q?K4a!0PDzr3j%kFx%kw?hMNPa$v)W_eJPlzAr^i+=Yv@ z#Fus6YH9l8r`@gJJtQDbx(oq0zJ5-+0$yTb@;qepnDRTwBrA{wcTZvX4i>b)V~%kg|BrwiVck)8AUXzRWydS|o(JqXL?g6?@-ZCLX(;zxxpz#jRj<7Q-mbQj z`W;}6^V;&tTL@UxZ&z4|uiM1N*_-6uIJG!#```2-_7c_z8;ks%>aG@kD)RAP@pW#z-M2lRbU9sTo;rQ@KWj4Hy0mrL zBggvWBt(NZuJ6?Kp*AUHeiil`_JhN5#HyOB-y_F!>ShTI;n2>TD^4e&hajf(c7;7P7JI?y zSjr*{qdn{&DU?2cLL~qG>cuo<01@HcE)m*k^ATb?vQbDPC^E#v+Z31Jn#|Uk(QM6? 
z)n-m#`OoASPR;j+m4eE4&33j)aII~hQeGxDU8{jQPpE2-*);Q}ke|h@`mCz+#8nJ$ z43GCgB@AeeI9ld*--+b}lO2WU2|%XhV<2#kHkFK)6q>w>2remko?N7*E(E}%R_|PO z0h37U?*$_A3zx**)bUMI{;QmQh5BOl%sXskIP`EUe12qzZVain`wd><9h|$CuK+zZ z$m5|I-U0_&H`eD@Xl&E6ywJ6VwSG^l!&SZ;;2SW3lbz^u&Fz_>)8)qSah9k15G0B1 zcf-@cu5#VjBUwWhYcP8Ks`O*2Y`qp9#-gC%c^L?+g;`?nv zManst1ZPx)(9Z-AB?yi@wj_)RG7u=&v&p*Ry`BL(ukbJkcE&^;X`ku<&vohi%lwbe5{escnd(LNmi75AoK>g3r%VocYeN zP{$0tctv+8)YFx7gkMD315?g4JzRByitZCs>Z%IZkP5;CLGc%G zw&*8e=u2JQKGpv!H+;3UvIQBIPPHr|xA+08I);Ch%_$5DVYSkt1Mv&em%HfW`4*aM z51;2}S6!QBu1RCt%s50VXxpd|BH|1!^uFZ1jdlF^sZQ2;Fs0vd*7XpSR%ma$m^R~a z^%YR2cXm{dT(EqXC;+HL!bxN1Al6|bLrOeR7?gMBuoOAooRbP)xgDJN?N?x&MPzM% z|5zi;6;TWPRZQk-eO=~}LF;j;IT9;3pSvi1#1=&rwC^ZI^~pA(nH8kcF$b~bygOu> z3yUpWb=ye``=p{N%Vt4n_F34HM)|Jq9ejJuPwOJL{r!x2={~@+T}Ww`jqlZvbg3!; z_m=Z~0cDB#vJT1?;Fqd|X0-kKvE$%_L27=s6QfNV|Eb2*`0u9V7O4)e_c?r^WzJi^ z;cBZk-`Dj zHD#PZHnKia#R%YWJ>P_`pa6vky_CL&aa==AUAhPF1`gAUuAl-P5-L7jLcFZFig1vG z3c8@_E`FPdu+}Dc)>Urw72)@KheLT8Ow@3hMJ=snP>L`_(Zj`y{Bf}uJf0ka1JBX{j}hzh z!CEdPm)h6t)DE8m)pKmbPxG9=NJO!8^XVJDRkmCO>wAleT;~bxL^1pT?d=T3HAC3P z#>JZtUFE!8@EmK;+Y{q)*CZ!BFCx5~0{z_QRS*h!TKlYLO~|+O5#hAr?{{8-pDXxUq8E1dg>=3A0w0^N<_jV4!eyK zhXf7y08X&F^|?FQfQ4c%33rIoW4lDx<^DL8+6FzI}_#9La$lr@ap zxDXB^>PWEU9VSb&sN5<}-CK~Vn`F^EHI@f>k)J<%cr%TNfB&=Zz&2>WK|hHtD7SU5 z+jMk#&HM-6!(&xp`}g-MT^6x~Q}W5pgDvf;BMcnr4F-!tKow1j#T=o3*0}-UI6RqKMWiAk+}h1;4mx(r0xjijg#DCHq3+0PQJRZX@I`I!}G` z^az@&271lUofoPnE`D(Sj`me`#B=)8VHWufb+dkMaD??UK%{w69 zc&T7~I-kB$fk(aAm!kEc(>`YL2B>QGJUHQOp5V11`C6(@?=rN)yuqA>=`+1=FPev^ zyymk=luHK!1?QS(nw0Z7jWKWpC%5#Wc)+A5$?0Ff<}|@0^4sDtdK{}JwBd5Gl1hLL zdkgsMO!Rb6v5-0xMr=#eO%nJr^MzA6Cp*)|R#VyqoK}aZKy_QcO)<#$fQ%GX)ALL7 z5^k}s&FZ8c=&4J4NBjj0&R=_cgHOV| zE8f~9x}mT@C^vI5bf~CL21vll3yPX}s~jfHl9&7YLG%+DB0c8#PR_rKCPc3EQQe7D zeo<@6p);VR{@9vf{rS@IbaEb&C20_7+YFTNImCJ;Y1erzcbD+WjZW@kt!@0HI1cW- zFT2{!@w{#GJmnA-*zJ^Kdtd*sF6^|eFOq!$Idsng%Bungb%570J`>7v#niPnUr^HP z-JLd*XLkk5+69}z=JP9b2GO5;ZI3LRrb_Q{OS`-s`}f78NK?oNb^HH>%doJFW=KWc 
z6~@10@3TUKg@vRXfW8{WNL@9r7V;ZTF@mS{&IXb3odrw!OS8e7+sdB;ON<lM7wcn1S zcdd9YQxH^?lgnXPrl&#IS2!Fzk^fUQsXj3ApbFGp14y@G%j@$*y~!BIAH!$YO^t3HOaedqY7N3`f@%?u!vdTP?WP+VLHyQp-yn| z4l%E(TEId3Tx4bUmckX>U=SDFK*Ki4260k%GINI$6D)04#u#`im4`sLi}ZQ~FsA7l z@I=VdsYc7n#L(Ri_9s4uUx-5Lz3SOKfUIglu+37BnGQ=P(F7H~6zpFZp%0eA4|tVt zi8GX|G~7*{s#zSFNtBdsZu9pw#*8G)Bd0D=CPBpe^ruD8rh+VkP^eln4X7A6b;Y%} zDYja1i+N%e#g3 zUR&D#Hn}`JLtfiBzIN}f_*=VHS8Y(&1$lMNX%m|7&jJrL=@rPKhcQM7QPcfJ!E#%V z-oRnr1Tuw$LzEm4Fv*5)1cdwplg#czhKC%R34Oz2B&3G<^x9ygq{+@4CQ7^AzzwC8 zHd8;W`s$S$V1;93?IuejrU~2#moCDPa)$+mI;A2~YHQlc9Yo`xkuUYs;b}yvGp>>} zGnc4pYZCFvKpKzH?X0fGukDA^r_AZ$++ixN!@zV2l8~)s*776pAl$Rm;REH~aY#fg zDLv-mwG^&=;$61FVYlf!!E`kl65+w+Wbrd6cN(T<Xi5o>6uZS@dZ8BBTWXPp`A}euoCpm}V@vi2ic7-6*Pxs*0PyTKW~Lcd$bTO3 zpJI|5mH7>3duSW6;@h8N%Hye>@XOZL@*W+_D1=V{P{6`Xk3#*4a+W?*N82z(rYxv& zLRM%Pd${|P;Y)P)5V#;m^U;`7)q@>nTY67ovC@rKejQx(?RB4StZxmNe6$OC&_Vj_O645=Hos zh-C12pJEon=M{_W;`!72jEa~iN7QDv>f%nU-h!yL#DDxkc@TAk@~cAj`upD&fZ4Bl z<~27lX?acox@LLZDw7n5m+0xOtzO4w&2DYppyT1ha~R`yU0=zEHeH}fS-gaj{bQA7 zjhsj0+Ty)1oe0kur-0V^&Oz7PPV;W;?r%LJ_AwPK;e~nDi5aeDjRx<63Kyht#DS0g z3>-M}a1lH8Ti+8*+~J6Jb73+0UzuakxQ-H{34t+*YB-+V1=$Wat}au=aFnZ+M>U%?RW*+$aW2B{BB)OD^r#bmkq5-`F$u-_Qjc(s zX~5-DjhGkZJP{Z`1D-&9CQ%H}q)i1m_h;#)tStP1Q-FfT7}9N`Ch8xF`RoOf{rqea zkqv1d$hJT;q5lc`cPSc%dvxfwTfGH`jIBT;a~ArrP-00z2BSbYF(&!|v!z+c8}s?A z>ykiaeGKp0v@r*r{|BJ3M`>Z3`ls8wikN%fdGwU1C?!jgER(m)W+fQPBzeT(ENCcq zMvOyGz&DqA#8+5&%r1?I6dI8*(u|$L%#Cn;UTXpF&$a$1Oe#$nI8ZQVVKp4XztkS6 zcz^*eVrBCEThYY|b1F$4Q(c{wqhHmFyFJEh{1oh)>jJ8!ajqGV9d4{swoQOQ9MFq2 z{CW>%4j6FXjv!(`?fOT)P_J|$?L@*Zb^+|2HMpArxX2Hx?{zueDM4CDpRnk&ca&an ztuFBYJWC_xd8clvrur0W(Vkm7BhD7x3fhDc-LbpJ+PU9uxu-KDx@H~0k}4Ha$nsV*d0OKrd;BQ;RYYdNLS=2N5GlmJC8+F&0M4Mg>u-6=6iaPs)lZ zfEA;rxq2)yX&!IrcBfw0anT{=v=O@;ZX^xuFI~bw4GZE<9Qf z7;Y&4pi$fNtrpOR*gqjYkb%XXbqM{TnZEz45ycRiRO{xr44#ag1(>7zjhBe*9Tt>8 zz%D_E5Pb(f5cnb1bf%!35TWxA(g66l+FZxr2PXL^G^B`GT6~ik&A@l@Hcu}xfzP9= zPlH*ejLE-*F=U?p(Qv3v{7wC>e-?JO*M^YXmQ>lW1QU0v9y6#z=Q~8rf>N$KPET*x 
zq@YI6+BWA-P(j{L2ry8Tl#rX2$iIu->OX-cdkD0`Y!=LtwTkRu;6T2Z~;$H(=ZvB6F$jT!X zM&vrn|G8$Z5D_C+#5I4Sqr)|Z`1d*_Ua2v_I0gjbhYlP9`-tp^Oyn}sO#|NMV5CFv zhD0A%hU4UH&i1|b7rl%Gy}2waFf`csE>)hES$!(Kym_EZ)yz=QleJ;5b7fHihpVmb z6b6c11NK5zK>2iTUb;(H!5~LyG|@nn133i)ed6m*AeQtMhWBqup>#W!Vb zp;p$f2qG<}{2PXVMsdoxJGWfpFZE1Knmq7#185nLTi&;NP zC`SMqImJeo68NOL&A&-#C(akl76ujKvlYz+oM9^L+kS7Js^&Hamx=pOw#odGc|HS| z!{d+3zRWVEoHi8u>$%m~o&cZctlZ`+hApMiI`~H-i1g3kzIM1F3>i5!y!z}>i#HIo z!Y@&~H9}gM8eC<1j9L@;i1*?k@Dvc>#)Vm7o1NCvj3U93VD=ns`pVeQrv8PV47Di* zSdM7^lc2-FVR>xs-a`*>W%nEl`wiT<@n{irtxha9d!|lYFW%6T)b(7mZT|9dw}nhj z$??D)E0BA~#^!4cRK7N#V^$=aKe9uTIzL`F^PB#C&mPUZS1{)4(QTS(iJ}WVb@CdP zyiF`M4HCyAl(@7uJduB^m^5&XI2zgH61Z_mXeYrJT$>M3w2Rekva_Q&+G;U_wSwIG z{chYnsFvJZJ(hYfPA6)YK1f_Y`>j>f6AboqO!FBKAS}k3)mxBc=J5T380RwGDZS$> zmD!Tf(nm4@L$GTLM%env!Df{7Q&i8C zwoxa@BJ>dW`b=({JQ_9n-eu5N36fmx=y(P4;cE@mUpQ{-{Gi~Yn`=R)($ty>b%rz4 zgGNvcoYB{o7b+yJkpdq~Yl+Lh0(IAWfm2Q-2`RhuOQ;7S6fYXtEmpGhzkh8{$3y2F z7ZJH*Ksu=|HNoDLkxdhNH{u-0-=<0S7MHKPbXNndogBySkdGPOjRanismQ3ZBBY!j zWlL31r&>Xyg;W%=Oj7^PqR=#-8tb#{Q2k-acOUXOK0RJ2(>fzIHTD#0qUIBLNAD`^ zN8*N*>l~YU7Hjm&4UjrrZxVRmW%Jp(*)kUD*IcPx)SE~|$dTIjz0@V< z^uU-R9b|@0Xanl#t>D4W2>dXZkFtZTBA-|z4vVWhss49%U{qVs_H%uY4D~PB*z8}G z)%_j#GwMtSlf9wh9HF$=DXK+i_#T4iFQRHa`{VNn%EgC8T`g>SF0q_K&TBxIhi-qmgFK7eagNllc8s}qdRjSnUpw+N;m8o!#mx;tFC}m zB;ArMoB|N0ZVEU3|Ae^_uHXjnoHnEs=yb{q?$|j>>MWOUE|*8sNtBSI->gB}5$@bJ z^qU%DvE3fjWau%bJBseG7J&Yo9VsbEM@_X46-N!wAThw=CPH<;55!)D7mK=qRguiX zE~Lavg;GS4+5Vbguu1hZhUNFp<2WVYj7{fc40J!673rfR7kr`=l5Jt528t2w2o8? 
z&lGTI#yM}e|4sL1lOc#ms_h-*qkl3SYZi>wKbb>OC2JhRJd;K$jH?m+kC2%S;^`(P z`{U%b4zyj~CV6fT@8?p;c4bszLOOdx+z~Y|BTD+j>oOhIizC{4$!N3wXUp2qE~f>c z+=6e4*VNSWosm$JC?;@)c7a(0K<{#Qj!us`M5Iy3#8c#NYZK2G*oLWYZ#G5{-t#CbJMByu({T?1y0;TNqrc;?m4qO!xk zrqyS2ggXoRQ6uxL)@K6mW4j|J6fGWnEaB-7MP9#4UfsVZBL-$$u*GVAfS`ey34Gfi z#Xg}}!u!QRC{cKcqz1b1{~%icon+)CSvw*x^MrhAn>WCJeP!zYkmTvncJ(r@HRtR{ zQeryu!%=?*ISm~8ftq6*AxoP@IUflbOO+OR0g{le1=g zchrqMbTi$%!2}a8Sx3~l`l{v`KVDT<1sy#>srIdk$~Ox-sxZn2WpPM@TmLf=8ds~ZQ_A4RV#Pc4iEyXl`tSxgf87GNu0l2#`4P|q^ zfZ0X|qcf&Z>-J1iMV{luUHb^qFNnjX78MkP7}-)FRJl8x@Qvzwkj)cxX8_bdfhE91 zh~OV+LI~id_g>(~@DQd6D}u}qo!^&qO$`6QLL)&>Jm36$ju0fC?D1Rs$zDG3(+-pM!0zyzRRODgrlX{()Q}}6v*ty@KZ9#>u0Ux84K;+GX5>Lz|HPBbVA3r}PkMgzky%?>_KJ~NzK(DTfnVJM(6Ya(z73bY zw8CHRxVgx#kH(`4MwSsaMURN-S-W8#CjQkJ47oQ}Rzo#JG3#Da%NX&ryE zLkn6jeVMeC;cU>QJGS_J4H0m#b(8_X^pI&=$FDAeTmG|I)cG&2bc|ViL>(4qvm6LA z@5VoL%eekq<}sLHIh|$5C#TWHh91cIvijD4ZZ5Eg+|InaIqTrBg#2)|A3Kch2^+va zIhG-biKs(z%)NBGZ@($LrHMlW4Y5-(4bp-HqzhVcDSrZ0)pIhYXk1ANRWSpouaJ7O&O6jB73smSnFGT@ zv$}7@>9_evc%K%+42z43C=nSbkfqEy#f7PI@=3+D2=RvYTHStODsLKbC}&CMaX69G zT73_&ELhI4)c>mW{Rr!20${KO#(pani$S*+tXQ_{c)S(i$$JdDuQTvK{Xs8`rk+upMW(S5p_kr7^&H%MtxpTZx<=xWi$P{4Ra;pX$L^X*WqZ z|6gRar%iOwwav&el9v6{s+24+|CA+Fbl~ojw)DIRzOb$d8sIPj(T94Z!^GP(pnSs|*812B3fqx~ovpysa*Nh5MvxPVDq8Orn>TrSHyM`Q z@iRVCx=v2D;Bit6VvZ>;UixwfvcEJ&L}8%839{OceQMq*Jl>utRfx1b_ag;cm;y{q)P}`os8R$oTr^#=#T! zmpY78l!EVy475x*pQB(3aUl_0g)Rk~ymI0fi8q9Vv09~u+7}ZZ=nw3nNIOt!+KS}S zYq|PE!|^|j9GLQPkVz-VfBTM&D5qg9$Kz)PJn(%f>|XA^nA&i)^_`Xg&Ts{tnh^4_ z*g2NxyS2d-yD0VntQq+Hyb_kfea1k&TxNuV`O;rVC!$Lcla3B7OLHxoe8?LNc3%Up z+~_q4n0ZOXc?dy+dwVbO-`~&Cu&f->H~4N9(uKf8PxO9b3ZeGRI^KV}-@TdineJ}w ztptA7ER%nL+6pY93$9ZNL<0HHs%9F=;U1V~W2Q!p5HV z9C@D$!+TNUL-Er&bg2NnT%(OY;NrE;H~zF3_D)q$u!?q)Xq(=(A{z|NBAIGRl_Fx9 z!eP6tq|T5`8C3KRi9uNwlStZHFPfV2z8E-MI7dP+X(#U`LrooTC;xvTH?R4A{4%eu z3#ru?H&GPQ=wagiO@S3t88)^jj|D{4GV~?kKJV0 z2bbME8k&=5G?3^-w-@)F{rj09#6P${IN#%yn}pMVGj!SNKUsq)K-rDLT^s-z8-+s! 
zL&19z!vzcpD+PXwW80Q0CCyp3hWTQK)X^cNJ7i&~ zWztH4tR*0*@L7HeBc;0xf}IJ-%v)pcm<*6;NVoT)IF3Qy(7FscJt#}@{TW+3jj#HG z{{D)Oi{oiA_w9=#M(@wbq{b%aLahn3)AVEvvp91M1kQG-AvM$-Ayd=8bNq~k z)G+v(z5ez>y)p{osn@%rI*R;nrs~1z`a8O_)TwjuX5vbl!OaV7%c?Eb8m0~x()&k3 zJy^9&Epd=F&L0V}0SOTYoip*xTNSSX0ZZYxtwV7=kw!z1nDk_5Xad;)XlVPIl=T;L zI-)DPbWO6_@Xo&QbPb9EQ=4l!5gxfYZK|Xtb?)6tmS=fQvUf!}0mSIo&(b1G2)Yf2 zyW$Lb6L29o!oM%0LNV#TS?zWL0dVJ6U40*k5cBj&ALKo{^yR*`(Hoqz-ps=$H*#x< zhz?^uN7Sr39#>yQ8RY~hR{=H&acqz_T2?tJh)eDMD~!^nJ&|}rO_}dhEKDykwwJtE zYm{gdlU{^{yVT>j3g4Vi;p^hK4w{my04sR3?9{VM*r4*Z(wXLRV8>!PVzfe0+21s2Zep{r{N5DUMer^# z<1F{F^kFB$UKj1KI&$3lQ{s{gb^J)w0J|79oD7>eoF9lIG@62wZ#a?QSa%o%z6haW z7qjfk#lp}dmcnH=1 zEd}VGSnP=uO!^MQdPHY9q4&Q5BJ8 zkm%hqaTZ6?*=nVe3wjA;MYjnS6Fb`WI5*eDEoGtSCcSu>K1N)qMFLx<$ri9&m`A^q ze;RK{&EUyzQQ@be+m`=P5=sV5^_he7KSs!*C6~GuvL|cB!^GWs%GrOQC$cHC`5Pp- zX7c%Fc1$x4+M?e~E+B$lovJ&!`NQZsEi&`!IX_%G>5{$-zIVMDCBTQAWc>LME?T;- zAeF+xi574GFw_t15cxCIUnuRrrzbuz_mRqxGPjO|S6Iy{iTaXZ22uPzKE_*Ggo2Ls zz#RlCy>AUB!Uuh{aroz~pd)ZkyYnil4FQ@;N$g4s`17i;hHfj|Hl=wBVnjYUOEdeD zDF>}S3foc;W^BwB>EK|ksR0T)2TOz>7#B~vq%i}(c!)X|3jwKnJ9N0@hVX)|3uj#^ zpU8)CY~e$sLkB{UDc-w~T6`_knBJ}H5t-~6pA{DQ?IKEXl+?4tFAzFgwRB_|XE(nS z=hw2Q(bShoWO{h?tqX2OtR|hgIui45J}fMkNH1j4LE+g?Fz`u>`h0K993g-m181%m z>-vvoxr`}pQQIjg>t(vGmv$7#vsdMgW;#wX-S4idXRox@Eh?2Uo3Uh?3$3oAtMsugb+x7(xLPQ+wLzFD zKZQ+A3Vn_7$*7T$`Bs$i@#<=~!HDigG$tmHgt%-rB#F#I)moOn-;`xHG z1Cs~IwY+E^yHH^s7BX6F8F`M)j3?)wauOYsc*v>gPQ47Sbz=7{8<5eNGo2HF>1U3H z(fnunRTn?{bhhbr8F-D}1>Tq4y4P^D7#Y42hJ8niEmck})giK=2h$ca4MJET;`g2} zQH^3%TIukKuL67cwo+!)GiZMCB{l)mdOa$LI%uvkm4fyN2U+l{6Z)nWbh*5rV*Qsh z-CY_%pJd>)i=4^GCmCw3V+Sx?w9qv;p~zGQF?14IAfp;PFAgI9F9KQ)4hPFJ7DI!E|f%3&w>7oj|~#F&OVWZm#~ zVs&_Np6u{SxKXAm{Mh2HCvycecU5rz0)aCy&NHvBIHZTQ$Mq)!THZ!Ag&hX7bI!M9 z5FD9|gOXDRl7)fgOJdc^#?u^xl7Zu^4{@XoS7pr$h2Y~;_tokEdjbs_$qb@D{ldA3JdrEH{K=rBA#nyPlc73DzpJ~ikAm8$N4mD#6;$$imbS@){vunzmK;@aB zyenh-4-j8NDU(52!xFYj=&m&U-O=J4|8j4n-eZaXu`Y9+tSW?UdB`$)WKnUuN&o=6 
zBBsHlb(yNdY_UU+xH3P5QHmullQU#_Xb+$VZeVbkp_4h18ou1_e}iufg?!_q>JKwl?68KRht?tRHzguJ%Us-^6G~Wx=lPrO-4my09p~Cp@dmkk{TA z`%M2SC?8o|Jo8i2U=azZM5u=4Yb-6@RLmE!f4A1-HtJr>C1WE#nz?<=ohUyfabsPR zDMvG4woO5pVWFgTO{&5eXjlQEp;G8uVp5YcTgouU&;u5*xXhC5LPaO}7alt&at^Q8 zVVM(I)Z6?Pwr@v&U|Y%wS^^x9I3ZE-?@8-<70*?Mqi~CI?VjUIgj$FNQhRU^sj^_Y zVyV^-7-g?1)Ch&c>rlI;mk>R0RAH_DPV4AUX!I?}jR-@!7LMuvBF!S@bI^NMKPxid zFJBy4p7pv!yX)Fp>mI@oYlSLm43@m_W&wI&ePiv#Eml-3N9Dp}zn9%WT#H)QHb$%IS(d3JSgp$!9g<}Ng552>vNwhs}q_Z10_Pog7_O4z<~OCY*U$yUcg@WS>)@saGO@!Ydt_KiUXsY)%33tYQ9gn z&Xw)D;EB*P7*fS$;Z==#84uefNW?R?@4oxyZbyJkv8(e=3cLgwu;zaL`>tLSGKc=M3}m4n>)~iaTLKdW#x<|ch`9i8K&y?|GCzo?dSg4 z_b8&}veNgKJpzTGgMoZ@g!sT(J|mKgdg>~$Hjn?&GhV{LCG9d6#S=9OsVYecxT$D{%U?#jlM@6 z%U{xcVAh1dUtM1JN#WBtzX1sYlVL{ zjtqmZe>6A+f5dUQf>#HN73YIjL8p)r!_29ePpNiLQCaE&9C3en#kgGx_%BPhoj|0K zrmj)-ew(R+ZM{n?;Opd&gU9pH=o)8kNLlRO?aLGhW`ZY6-$2GK20Dv~EkwNGEf=pQ zI!ad3SJ(+z^w6@XIb2kv?kRRTGBH;UHk&ReQN?=H37mW$+xAc6uiae@M~=(O56{}u z`x9*x4m)0tv3s%NbVA@-8+zCdt5~<U76)oUZU1o2G;=}q`WFp7@v=RkNvol~8xg*~6p*(>m#0hhB|Czvd zyaBCEcjvxhRfLwP^wuc(ViFBSSPTgaHN8H!S`dB;uhfO-S{&cpheXy3=Ka}5yRn84} zQUg=$S*y3TjxrF=YA^;AQaw0yP@f?X_>5sZI4PjeZlFEDZq_iwfQaoQRGY5lLs0R( zQ79OE##qGh70K84w=e#dZ`W!Ym!q`f!?i~7NuWzXmg_sImtNdOs%ZN-M^tdYmblMD zXOcWRWcMc{)>P8eK0KQU3`K0;W!hL%-ZIw7jAnlf|K6f7*{^>(A>F-?k|suIw8D1C zhlO9XfCK_}X+c~QvV~w}V01-11Y_}W`J!HTawG0c+qF>!1-YQe{QiFKmni4_R4R

)Oh|a1Wiu7#Z(fJutICcwmzmTni_ERS5ysuU1;&C;C1!!kp%ukO1 zwvBsAUP_EwH8?@GcB2uttl!fD(36;_`{r7KsyiHXc%d z1A5qq@k5Go%WID16PR86rYbfsOc7m#j}mtCC>jh$StBCb4;m!C90dJD7%D*5c@j*g zaomX0Rk5g{H`}O=aR7C4@Xb-Y1i2Bf8(&sR&L7!3l9D0TN$qrOXIj2GctDGUnYnEt zJ;!Y?7ukxS^X4$C!F{C7zq3VtE<0iykq*_#0ly-`OXXc^4aYL zzPzB|C^$q$O>GbZ=Uqv8Ib}gn5T@o-SUKG+y5>}&npy#n`p_jrGnDjD3H>qAS~|c) zW7@DD!8Va5w$)q}723=PW^-ADyqYO~%ix$f#zc7&4Ef2r?b_q&RK3?~gi=sgAz%;m zuqJf=OJY+=)XC@A=TJ0YHT=M$)%20#k2=u5CPtx<5wpdz95eX)+3OSH`A zi4~70jM)4{2`hT>ej}EGsyqU*SEZb+JVG)wU+lZSh9v2e<_;KjNol}X3ho-44H2#1 zUv4x${yO7YEc5xf%!KG&F)4dw>=pzHbKiYu7H;jc}Z=g%VkXXd8X@rb;G46 zz_Y~d6=hzY@L`FKiC>SPC@>myqX9FE&K9DViE$J95Uj_~bqEhUshS-%(MvjT(u5-= z0u{$*V33?MHSQC*;wN6#LBRV(euvor(u<(Nc|wLoc3S0cV$|V2WNihCk5-=+eW{9+ zDfUUnsJgpyCiko^`A?k|BV%iY`WPdB#+xU>_y_Rl(o0V&h zn;$JbJ+fQdOAt`_Qb#wV^F6|37;_L~L8GKKk8;yD2op324r?WyMF$NQx(EYz{DCsz zDCzAqoE{$n)f1NHpLb8}YptU6Xm+}NyPn-8`iq5hN-x4jK~QJHW^6udZfUbIB2Qz; z8F5EF<1#6x7qz)kqf!~;s=$vaT^}MK2vth~=E43Tgq(yXiOd;E^=)_(**}ZS5Ndsj zoCHr$fyWK?4BHp@Z+?)urLX?*%Kf79g=Nx1MaX0C^xjRGshaDd!KBAEjCqP5wWcs4F@J{bv5L zuTNYxh!ZFd6^)@4T45za&T!Q4G9Ki>aSZP~=bvq8Oy1M67yQ{j+XB)12 z@vT-W)hrtyIxumax4_6pL%pNZa~N=%C2R$hJwZX3vvXl=F!|8wTKmE<|Fcp8eUx`(vJR-u-PKf99V*~+?T~Go6sbjX zNQ#t-MRG{Pl+i^Q$jB7TB^t5@>90VhR%9g;bhu9zl&KkyPlm`%8QB|VH_YtP}2c?@q`Eo2|}F=bwd)Gies zS>}Ux5J^U&w6XIq`R9ps5vQtkyJlgfs?vMR3wcSxn}=3C;hm3nCR#~zsoUPLSd7RD z_R9Ai;B~Ht`1T+`&|ke)(0^0q83r8qk*%hd~8i zevh70aGPZ#d_kIxNfQ$c-u7EX-a&9J&1_ztW3-xrT0bY{lPn8;AYxmXEBz>I(%)E} zQ+Q=*VRAAbGuagRviwj^Koid5KpaI}h?1;u%ql^5zk|gQB9V#G2J^TD_E|?H6zyzYnkIWAVw1`_2gOpU^BZ?PRZnS`H~;Q*V3FyjPws99_U_VbuQ#C@D+OM( zExp^r$>4lJ;J*DBEM%Y);Dg4yMyUdPR?_`2Hb+Qw4$39WV{mj1Nj;@i zWZ|#@lx^x`p))TgR&6Ut7e>Z5y7xA-_xW*&?@=^B@5<>m^j)zuUTe%xsC@@59Vd;^ zlEV4t@Df}>ZV9_V=Qp!?^Lp&dA6f(JwiCiqLvj*T=t_#fiP#5!!wrETX<&Ymf6cLh z-~PH#Z!?eYO_AHZ{j7$lw4eiV8u0;2pE4ksz_X4wG}=hZnN`#< zz-RjT)a$OJ*!`X8tc@svRCT0M92_F^EnX7J?=W1BkP&hM3FKSo6!E~}5pvO3^Pcd= z8GqRr1VkOI7$zu54vpm})lLfm#(2?0{L4P@2Z{z90c8F@0ttWc8}m?3nC|Da&&Z>= 
zppW6o5TliJAEn#>B;YPXApJ-TZ*ffcA&IryaBkOUdyFAdFlOIqWL+XAXx`kIX%QVX z-sEY+fnO42RwGQcOvrN{kkj?2-9gfv|J z8xP`ucF>xlMU_{Yfcc-Pd9PP9xu52`ks9c91kCZ@w_c4+oF1@tpgZxu={{g0_V}{y zPD(ZiC7+~6^p^ImSZE1Aj@Q9Z6{!gS6XhgrdjkuCa3h%t22Bv?(C6i^6p~J9C7>3* z$*8guG>fEtyd*wgokalMu6{3)Qjsd|fFIQbX-(!7DG_;;0!!V)?4w=u%DZyLK3PT+ zaHV)`P{X)Q8XctMZpeVkYS2rcWptNKxb!M3^wqu}vJ3oDpvI9QPx2|XT^wyJZA5_@ zgd&lQgE_H@gOO3C6896>79$aV|Jt>&6ham!MaK0;3Y;P>dtFidk&m}G`y;Gpoi^TH zSImi!@0LhA%pj@zBHm*;O@}K2K?=ROBHS=OTZm~s1_w|LnX-b>b&q#VqJ$F&6+)HD zJ|q^r>*%3mBSxk<5N#@(_d96mzOzw$&}n#=yB<%YyeD>UVbaS=mlE0#Tb>iw{;mEy z5C@~%GX0|kS8pk(m>!>Qr|aI%uy!ZRunx5Z|4O}1@h9?V#M<*TK6}d&aF*9u3#1b( zLMHdEZnsEjfkm3}649`x5zze5H_dN}n5i#ewLG)vsV}pMrJ~#@2YX2l1hxDW zn8}zgMo54QCyqhEcbv!nPD&2vIcYuD>LhLY8-jqGCI>P)dN5fTiFJ^#uu6E2;(&Aj&g9N}M1B0q^AYpzx}wA7KpR=9Zh)d?xL z%%j6nAQ9nX3sPyN`hZd~6)iIi`nkI_^-{K#>470qlA{kCvIx{%Kh;^}LLE-Lc&cp2 zY#+Ujf2|z{hYg)WQfuOSq*yc$#)7@-qt^j{EeR{fJC$d&3K3fyb-D7b3qe*$E*|xF z*f#xm-9Moq2W~Tbeit-6eZ2ZDdlzYT`eeBGgs#IwW!*$)poF$%%Mo5R0-+5pqsoKb zH>WD<&t?9`ej`Px`Ov0os4w{BmHx#1XPw1i8tutUwrxXY1N2Lv8+e7ec`avoCqBt= zc!3sif*~%#v@%79r>#|wG@v+?LL7#@79VApi7M(b=6P<@^fe2dAU_m#*It7ezcxT_ zTt>$B5ovK_I|m*^3$l#)NZMcX))}SANIOI{Dp5cpNyPgH=wl30PW)ABij^g5fX$X<5`4$Z7P2l=ZSR;z8hkXykGXs{0so!b0z zZ(trqK>Kf?r_k|xOHwfQBOik0-rS+3D6>?Q^g}jLOYBn)N;hX<%p@Kp zthi0zdLEQn`1xKg#RC_l$NWRJO2eQ3MW5J-ylE#)Ljf$Gd8?etjL%-_?Ib(PJ4w%< zqp_p*vcD2p?X14lve8#6RzY?5^%EL93I^9Fb@&gE>VV5VO$E}ot|QScLzm{}FhlU` zmvx>K%p*lQ7U!Q6!y_aVEUIO#Vg$~?JqW+z!&=2=(Z@;dy)pzZ<#y3X@iHb0;BoP> zU_L;{^nmd#68Tn2dRIPwV?CTQ1@o@NxlE)I3fC9$(!=pnEh7&T_-y4QX)PYycOa{= zj=)}sPKj*OOC`RBw#Xd+E|wBk2lp^*s+k-n{H-CkV@bY9tu{0fwLG7MYvJuJI0~CJ zMGHC76b-}ecYn!OGiW02do@RTO@Vv?>^Bfh8}%sC%WEEYljX$5PQ=Z;bWS1gt4=PufLOci4_VxK2s!hQ}HIk#7;|O&)0|?@e{|@TUs8~i=oG^RP;oA6|C4bQazMqE^ zqakF1duZ8P0(-B!29J&&E_v_P{C#{*<{M6ruum^@%L!P*NgRp0vgdel`p`5$o^;U1 z82A)7uoK~PcsHT5cuiyx4^Z#74~Y(~Yq7#!vAwXt6yp9owFoFb3|LvH^2+OMG|rl~ z!LYaEYT5H96UP%3nq4+Oiir9d-@8||LQPN5=W)@mn`mLQP1nnvz={;yhTJyZLQ$1> 
z6+G)35RkWJ6Ngjd<-jEy0MLxVe}yn-R7k=zF1h{*N25V0mjrcvyUSrYXj`TO zE}9XU(b34{QyGeKW}==rHAZ8??^^h3fF1-Ta>C4tf@R=(#OKVS>Kr~T@K5gY#qD2Y zzXVjAfb@kQJnwU9aW}1Yw?uF^?{e3%QMasKvNnV{rnZ7%lU^j-0U|#EAf=TsSwR3& zTA3BpT+vjw9iuuy@^`fY5>psm;I-{D*+<7r*Wbv1-x~fh z^->?hJPQH_kRike&>A-` z2gS0o0>3Zh1Vi*RQp93A)+2mE#d;I#@}(B>?j!c;4fvkl%ta(e>dAHl6FDt7I}q!` zcB2MGE1Pt0I6VX&6jg>4RtU11=u_6cpvmL<;`e(T@CurB<=l#^V$(Z?hEkGVr|P7>8-VbqARLRyp(CMPNC1~lt3h*6P}3wMGF{f>TZ@8<+z)}gUTV|i(&Rh5qJU3l z)yPXMy!?;c)|x`|Qz zKZi=q0~z4#b+RXzTk03+K;E1+Zq`WU=>?a2*Tfa=Asf!u1{xeGg?IzF(=^(F5x|?K z{aVjW`-L^FPD+(NS#XETMjYh^fg5a+$s%VlCqPZ2NmSzt7T)ho)0zXnxvm70!P|;9adPtjq_$Ntr z9NOXkaHB~!--i0y_Riy+uT7rYVSZO_Ei8BCer|)T+6N+nGi_kFBl^L32jXI4VbnS^ z`e+gbNKX`eabYEJ6p7@snD9impzy?-qEAAOO#SckTe1=0w@HJfjm_)Z<>uChj-J&K zcj+~LES_1ZnW3ldF5zy+nt4NDW#Oewp;V!2&VjOX(I^T+#__H5N3AqCU3n%*>>d+w zv{VBhGkV_^WvcvUKAU$;>OQ6Uf{z8CrC}2+wOtOR%PnwV1>~4d0xD@8tY=>DyU$|= zrK_>w)@~cbL8VDZNzU}qVG58~X!yPMi5Q4L$X_ZrKdl$1NV4xK-sJERv)*(jmw?uR zfirL3dDgl0UsXUpT7ujMyXT9&BKJMy-@KR8Str1JU`1#RzI7Ey$sGA49^h*nwo7Kf z4053U9exKkI3L6pp31@UPo6H0*VNz;eg>)5bq)LR2?&BxN^QR&~6@*6T!-IHX;RFi^q?TxFV1W0mq8L5MVJi|#SU16 zNK%GQJLp$m8`MpxE3B<(m;s~#;;(2`)=&dAhy*x3;wWoK+~B0lv)yQwS(lnqyw)Wr z!gt__dKnSBd8^XU2Xl>@x56v~=lv=;n)u?C!;cBb9%o@L(GbG@Ajst(WGum$ikJmm zQ#cR=Q0~v`<_PDHY7r0C{ro^T<6_4p!zwx=Y?&50GMncHzT_gJy!F zYE;#GpWOV_MAq74uwY+!7;9QYEG;p-K}y`ZA954o4RWS;9$gT#Q!u77?k43>7XqDH zq8ODkl7%L|^}nDEU4BPf)*>>s;%;2~mB#cAa?0<$Q1N1pJA##>2A>(>COEbSA=*lU zkLnOU+UjuagB1hj#ZF=DJrl$?K#X)ijGy&7NME5Azv!j8i#2gOogJc*!9qL12F1$w zL31z!@!Xm>;`_7yFur#GnB|5X5(oI}9Q{N)n3EP~{=|Z^^{dPl{1L<~4f;@O{=X$6 zGNbWf*GJuKeE{N}24{-)azxziHeZ=9-Jix@?Mv@13g__BmAU;%B#9`>IF;CQZF<}R z-_46 zN+YDQ50hXAyOn?Hi=z66_)9ry;buMe_z&=Go0zzKeO|iXs`LB#xx*b&fnJ)&kuaZ_ z8MQFZ2FYPZdze^KMpXe@f(VE+N~vCbx5CU5=_MyAdOy-eBh<($bo3BtptA_DO~x4+ zj`YBFsKx8eT-A9(8FdIeas%CDb+J9BY*<&Z7Stga0C!-YG^!M0GUKj~pCOWp0fGn0 zN=KMeUrj~47IV#u{WC0cWZEkZmMsY4FA<^&dC z9T5Hc;)g>4XBF~Gla_o-@z9!P(xHwYyP74>HP4?w4Fl$SRRvnj+)_XaY*|Ln!HGJo 
zXbkg1Ov0S@5WXzf;L?UX-I;LnOgkzqtOtS|7h4BL#-yG4WX`fQcS5eK;@)D@tKwbr zvs!}1g&7DZt9#G|_EJol54{BEN4Z^E5szJTg)Vck&B0G!!4 z4spL}yAX9qM02PORk843u;RaDD-RBeFh?BddFyu!ymYv|jxWcpsM*ocF}eLr+>Nq9 z7*o{Hq0JWuyo{Untj>~?$)(PpAhS?oy+Lvpt(9NnN8$v4I|s9OeUg&14V`DQ>DND4af zlGE=b`YtDF7~f78@!>;AX+g#GkdLR8KnHhx`NCwc^$TwrVSeHq?+$JO&pJ-auxSgv z!}^XK83Yb%5Np={R#N)zS*~$Sl$R0{0V}hhE60jP={GkRfK+l2Fy!xVwGnrbT_44r zMN}=qr)B=BtPhSx`Dv6kv=p=2`XFvWWI28?t%#Tx?S1@W1l>0-Y5p?EU4+)NORTsR1qI3RHWbR=`5x*z|} zDzrSU-MMPG%KSQi-MV2(X#%WL*T7$sEz{5| z(cmUi)!YR?N#fBT?QetGPx4*`C{ctb7`0wIaccNUQb1R1V9t$4R!E)QnU(i#pH4lT za^vXCkp#}{tU)yCh-$6`(aB>F@@V+ts7g`EKpuknF=auO-vo8n#bt@oyFUgxsFe?> zG@@c-h$a-^Z!$3xex7K%Mq!NY2L|hm_L=J9YtEie*=&Cxb&n5MUzadMH4Y2MGU6h3 zFNwN*ys*UsKPi`=^no^LV``gIK<{GvKDIOeBzASw@L!1f5$VM2KrI?P)h<a+uuK&q);mZClC-Y1`lyM#sr4>4oYL*42g&`2Fn$-RjD6&fZ> z?{Pn6@cJzjcqIkxh7AjFb5tBbQ$TMRf0U+Trg0WAMfHXi5?D@tLxcR(G8xJ^Ir8(g zfhwi;h7ati+`#8=swm%FXVf>k;wyW_Ag-fl)=A2p5W9smT*Br|`zbZ+eVT-cvEL0w z+X41InVp3vQfoDP6Y=U~O$upaKcbEM7ufbl( zuiPz05gyFMU!3vMhA~PPWgL8q`RCX=f-R172PcY!io6XhKtpm<_{T&lx$bs+pK-<{ z>a5M;@&5gh2v{m`v#hWbH8i0(gu<3yQcVGP+K^P`A&6=U<@su-ml=QhzOcwNHq|u? zT}{Nc8p|WLG%bo>--((>yXT&*;jJN+0E26x;i`THf-xTn%2D(j(-rAX)+?TMG#Cc)76_&(TkLXL} zfRKceQ5DASOH>tA6^`kD2mWqqdN>lAH1kw*>WOV4Hdr4d?L=NCKf_Y&5brJhY5r`? 
z_=cXmpnupH^j;w-n@ajqQS2sTTj zB6y$*NJOKMPu525l~U8ua_kn0Vl0Ev9_1(wE$n%u)Gg?fqr2|_Rt$%P93&T>GEzBN zDoFJ)EE#G)5Lu35_6IqAfs-B|O1V33?LKO+JNd0Evhxmh&Aeq8c`?b z>aUT=I}uG&JHGoRsx^#jZiRp#yc@Tfhq{9jCbp6 zAK?zHejtYZI^Vob1gnPr3RBhi+enl;?p%rKLb$C&bpMOcpMP7W$=^Br`gN+y<2Ip9 z;ly&nc3XYW9-!3`?Cmx{t0TKx3~f0X$yeo{QM!lv z9Vt=D9r0@St> z^8MvD9ug_A>}&vp2OPE3RMDF0tAwaI5SoT2<>~v{VMuGbL5e&G5Zz!25cO5pgDgM7X5g@e2xMbehCc)gWD|lv zPnKc*&?UA1I6;{kZXyQT2;9G>rNRurRbd{fKEl@XXx3XO>2C+Z-8-eGr%2l$PjNod|mQ6kb_?av=P#ari3pw0OdtEa?3i zk6d#%i+d>N!KYB@Iv)_>s(UsZ zY=P`iUMLlVie0%IeAd(J6hSY8@4I4-smxA&E%P3D;){akksCG7{>^R=US6jNRkY2& zarB+;j$HTpT=uS?=h7Ekd7JpF%?-63!Q-S%7~+1i8XUz z1{pnQY`2Y!+7laWB1SU=s9fq7JZHbdao{Q951x_PYR|QG+&sMcg@Hrr-_8$?PscP= z=ri(02RG%4S+;i3vUDMv#6TX|ga@;ijLsxaORWa43_bdpP3`S}T4ujIjqe?Q0gU$R z%L}{LNY&fG+~*kHuCw*5qiTb}ZG|llA}6u<-x1*bb|&eC3*K51AfNQdE;<4%*uSA4 zXT#ofd(7f^bCom#a*h@nx64JBP7eN|rOe?l#s2Hg+L!0{;(^GaE%(m!Smwk9sFlTU zZn$b|aMy9W9*or_Y<-sxw@FgI@*$o*tg{*>Q~zN&lp))w-aNpsHLlhZChxy2!Q6s} z9e@~H&Ke9dX}J*Dwzfv79a9f7#D(zeL9n%!2qHOzx3zX`CCpM#l=R6k{E860y2)1H zYTQ%AiyKqj8Ru~L&MsS%mv+g`+>q1Z_H`nwq}i^Wxo)OGOeS*v&WNK=OfDvx-O=V( zpUg08gi-VeBoIPv6ug)DpFEU;%8ls z7NZA@hlkrwhcg6sm+XKxC!4A!>o(fovY7CfmVZ@h0lig`)O?>-^iUsk2sEWsu!|mm zTULowxCuT5w*XjE<(J_NZXaGuI2~t{jB6*2HtLh}`LtWS^@lN~CCApZ8t$rwnMS<@ zyY+NT@|u2++H@2iRb5@(Gp@;GQ}J2!#)K(t@mbd1R>}(I&dB_u0c#lmHX)~;0ALd- z1J_p;s|Hu6xU1zsZGfSs1ErgDl;?doxu*YkZx6o1Wcdc?OJ<%xSIJIPMA!hD8-9<$ z6A^2CD;s4i^&IP(e*u)PT_%%<0of0YdbuPvnS_24meA`}@zWbx5G8RLd}eAk_s?JK z@W#o6KC4ZJA`$RmL?OZBhEnJfUljb@CZ~K9mn!*Q#;sAaP zBmun-Zu#ElerkI9ty67Y<~KK}ch@A~;>TW6g45N@!?75V2ZT$wL1D+vl!jMkuo4W^ z);UmNaB0xkeBog{u~$A+QZj34yk^&D>X}E=u`f3Cyn<_tiJjiq=G)mrVQ3|{Dp|yn znb^2loUO*qcJuo(>|uLlwuwNCIweAB1HqWVCH7cjNSMeTl<=BI0R5zs$}Z+cX=Y!K zV1e?d#5Kek#Xl0HCwX)@nUMtV-LFkPbK3Db_FJKB7UVM);oloXRgbHgTc9O=1o~pp zVZ-u3Ukv9m(KS5B|MOj;qd3^_VFJX~#}Dlg zoxVdO(H2I{(wixMfkVskuQWOb5YtWMFDe~=j{@(HgmjJmQUo<%91tJQ?3_SzdC1dg z{L#(hkjq&3;wMT`%~H=|CrYStei|cotO+fOcuBW?V&tDmbYOIkeu5JU!=^e4YX)LT3 zMm|sv!~8vdzK5sIqh>BsAU 
zZ6qB6wn5%NWAmCPJoAmvFaL0k&2fPnOh4ebmL`H`>?y`uYzVQYh84m8=0wl3jZf+Lrm9rd%B4_Pwk3ZsRNP~lA#k2B)}!jfvEtpj z+TKn}Z0_sB?%_6czVkp^kOU#`x=W4$h6JG-XeI+%7Wm&q8#zS@<_)x)xTS0xd5S;F zrg&T?L***ZeNEH}#@hswmVt zb0gx@2>@{XKHq#=dw9}~T4(QFmLgq>*7a41e-2%YsjXss%P8g>BHqLg#?)L5*a&ESJ!frY^bHj@#pX&Xtr0%9&_KrUSHCXOg`V2HAKdOK=(R#BYF$li257)V;Ot0{QxT-LLO$fpJcU~VC>ZRWIp!S2<6G2jn5iYYw+ z29k*g5NUezk4Q03O~@;LId$KL?^`eOM992)(g$E5CD1YR4X-N(fpRQ2W-q5yAsq)6 zQULs+U4-A_PN(&`thY>DjL*y+F|~obG6#SS5A*eRn&%4qs&n*v(StFyl)3QRFke*( ze^lD8%uwfEtYLe2YRgv6s&)k!H1()&r-ARRX>9UM6v#48%D4)%)4JjUT5RP5M=%{$ z;AgynuHrJH7Mwj{zbvfaAQZN@g)L002oC)F|@ z7?>D-wHmwJ8ptY{u(teI_aSZK^JXNO>;+sp7Y7GrKqZe=DFfp>houujKff(-7|{ZC zv2Om2NUyE~Sb1Ijq{2b{$9@2b=8eEr-wRGDwyUU&vx%_)CN2Jd4{cuf))H)C;_M__ zcWVdPg@%wycPflWiRT7t%YfdYWDB9Erf>z!RAV@CSbFMAzZ8SCEh7>ENt^}B9Ww*w zTam%C(6_4rlWvnG)c(e%h(?7Uwnn{8D#mMNe0gG)n6$VSK3$HocFt*1u9)UH;*U`~ zlu-;Jx26kF6>oaS1LB1wZ@?h9hks;XK2xWN=V*%cRX-0GBRw)Wbtz%mO=I#BHKz9> z1DCt>X{_)%BF151ATB&y^;vW*_1PDPaM1N5j|AbNtExl^gck3nSnM#wFKF6clgvws zbCo3+r5a#LE6rwh`CUNgXCQxVtbDH+_${?Ew8_!M|C}Z@8cdfsm0EQ3U5=nSECdOO zSPT~S#0C~tMpgX6BGv;l$kXwVyCk>g&5ZI?&Xn?hP#AZRuYI;(z@(4=I{#}Fj$sz~ z*om?gU>c<=P2Gv{6xTNUBiM~;=;)IgI{?{=r3A1&B=WVS6TA6rgx0Tmd8s;XacFqb zQt4qQ1*{SROz67-L}7S4 zTFIWp`heTjO8w`6eQLW=>jk?RKK;Q8l6l)?TRCidk8b!FcJI-8?xrEF*Q!8F4-Hd~ zGMQ`3Du;$|Mot}HF^e4Y~Hy-Kua?yT4U)gboe6)O?&F z%|Kfa&t|&oNQCgQwM?(wzRyXUDSRZqxmC}@UH)~f@jKRqN28BlFNgnf+^C$M9-3I^ z1Al*hyet%KS?%(|vJ(;i;n4odpG;PCa0yaMhvy<;3rWt$Py%{e$@zp4JD-xlfcKNo zhQLWa|MIq!g$H0$9&D(ByVSB6oQgYy2UFx88xFW#94DGcel-0r1 zjw$qV9oIGv1CQ3sO^21}G~CzimP;}k$k_Q}sGNPa8EQh2NN%#Sh2(p%%YNz^W6s#m zBIe+*Qnp@iN;T>&7H23`Q1}+he>NT!&l}ans+>Om`Q=o^|JUTk-Cd!Ppi8sctKyy~ zXchM1EXGvRCfpz4bX3C`ulv^t5Ppj?Fjj$rwP%q9i5o=Q@SlsIq+Z{*_+M970uE*S^=FF6o+a6r$xeF9z9dVQ5Rqj{4YIrjsgNw= zDIrTm$Yf7;LnF&5+a$?SLndQiB8*+K%rItrkKX_P`>yw!>zZq>d*KUbD#73 zokymjiSig@iO!Zo?Am`K%}VHKCYk8N_)G?hX;H8kDQP8}WAM=P9#EKk%+X`;QVIE( z_ipBHr?eGj+?G6P*H8v`Em5_8iMM`O{c}kx=vzDWZFwxiG8zL2lN5c}OmKAN>oI)R5hr$5)#G`Ib 
z;3N}w+&bOQX~So#9ruh#jq!-2m;0d9;3o~!BP?{j-3p`4WUBkN!6x42XO2zQqQPWQ zpPJlpSuMx8Tx@mZ;*qg%9lIAAZc{|Ko`?G5I7?2YNB~icBJB zTYnf=hyM&tW8+QkFTMsuf$lbQMH@rAP>97JcblOdNCk?DaVvF|(iWI`etCz#bSMGP zci+^6M>{0XsInCqiWc5!!!2D+eW=1Iuq8W7Eu8>1@-AR19A4v|1uD3_(E;kuvjnrJ+`p$wihjN#wqxBCF1L0$Xf@@N)DG$euJ~i|65_2XBhuH#b+Bf8Nz?%fmDqpY;9h z?(pkHQS5c-C&v)XvzPPkI33Zy2?#JX8w-GFTLBPl%bdz8%VW5x82L;Tc8rN|^w>Xf z-MCV!aY&VzKV?2DoQ?qPpB8zY5d5k#oz;OSNZih}QpDWT zwkj?3t=Z$NCfjoCyKsp-0c847osQk^0L0g4JsxK7>wC_5JepO%^aFP{CwZ>>=-^@! zfTnxO&_KsehRI8k{IJKdo|7jrd_QrhnrfbB<$>O<*BcJ_MwF>XJ-?kq>MLb5Do|f& zXe<}>1keoLLnm3vQi9H>gGe6F`}JO*{B&j8Q|+waf6;P2zYN<*7QYg~_820Y!RjjA zo?;bxF||CnjfjK=xBYG52pHC>wYuWVFJs?ch$&t%P91roY1#jDPNQr}K}-p9k~6Z_ zqYMm~1zft73La(izNy)79_dL->sEB-O z$`BuQ=QNr1rp-T>hPO-2i^?w>J?rG27_*)QS~Gv(Zc!kE4HJCdJD;}qTo=Ftfm76x zc)nEv8R!uca)D5~2nGiPq?QHc$zMr2auV26IV&v@^X;SWZxch;bus6cimuP}eGgTh``@Vb}T8k`C7BwZS6$*UmB0CvOw zp;ZMK-z!X|;)KFWxLtRcPm_kW8hTa8qf=eIs5b z+&6&}Vq!U6C4I0I$tvcNfc1+~>Na8AHS^b&vEkyp#(n1l6*Yn9^1>j#`%coTx8?#! z#GPnI4@AE7&Vo+keJO2tX>2JaJ98h;uZice+3i zC@KLDtt@tpus*Syc}s})p|+x;VVN$ay(x*+=HV*Nvg}>=PyPKPe%VoOALzw?X_t%D zGDJM-qoO?qdM%t&{#q`lp%)}C2yIhqYqa5ml?n=4J7{Esj!!wK zb7Frrc;~)zxnw%mJg z`_FEc)!ZVH;gzzX1L*P%reZDYxCQbPc3p5KeuqQVvhcV%@+410Sq^I~oc;nKM(1lZ0k}fsb$h~YMd7YdB}rBtlgE!N4>dpYbWy4c zb7ltLSbeqJ$A?&0=40)1HLHL5#>e@!j;{%Mp^br6r(UXKQszk4Jjrq-HLq@$&FFyw z0XUU=X*+FV^J`|sk>;!D+PT_W!UQ08pq8F|1bfY4E2goVtrwmvmPp;NI*HL=9e+By zXB&{(^l^4#RBM~>N~4=|r2?TJx8?ic>^`a_ta)NJK%&9AT-X#9t&)nhFm*^x#O`+HEx*uxnSdP@yIs}adH%Sz1~}w*i;qb@*2ohRoIh_&zL|Kx zWc~DdN% zx&IhP5V?vBtnQH>_-5tsn=vGtvC`mWHDs>?J5+93;OTrLhJoJV18s@nM)rsj#+s++ z>d3NH*fkvT_$capT_JG7)HvW)>ZTWA)H}I(+k=_9RX#V~R*6$rf34Zf;o<=2Ue@&S z+x}sFdMJ*Xr!QBbaWRmXyPMR<&E6gXdOSXVHV?=SulI6LM~OdKk^7Ln+0#dq%R-K0 z)!UDxu1a|FMc-Xa+yb25;-ub6eG~ok1^BF27muzRmosGz|VSTFS zEhwI2OEyCEU-+wk4=BZ%Jlf8%a+7_*OqHmDk~fF(G5d5r2DgDpFYIJHmnmCz2D|<8 zpA9mqDL7z!jdOY{bf=7OwavRHhuM2QH+;7!d?S?HLtMlFXHx=KNTAz=+~v7VA+D2( z!h7On07gp#$RoHN6ukbd=6>(~uPM~mcMS3vdfY4|T%U_WkUaLd*AYzy)A^3<_>_gO 
z-KR`wu4V+T4w`{`=D2rn4nk1Z603_&AG|f@<=d5OuyCWPPk#$TTYgWYE-X%KQ@Rh6 zso0))z>~btw6XM}CkK|Z287q#9YAddQa1U>yQq+6aY z&aN4}=4bD)X>1byZk|4(@lF;h{HoquByDJ4JEur6hrMIiM->bx=Xv#ogZ~uRVkU2* zyw}YrpReOWmltTuEBPV0%Ly(mt7D-fsLfa5Ko3Ymr?0dCV!DI9O>pf|RM4OeZG048 zMfb8{R5!fi&*XnMdG78d#&7c2^DTwe5ml!~O3R(}$W827n!-duPWWt0&E4E!o^07F z-zmBd5Pt36yO54EZC`Ah5Wo!&!R=X_o-rlz$H8T-EH!Y1B|ML((i@4U-;DQpG)6Y9 zI?%tbxn*WQh{8}Ie>TYJ=*YQ=r-aBN771KUlhpoZoyo>-?l;x!^hHc-)w<>QJ2|XY z5<0s`B4@)Xr^acb!sVw8dqe$dqZ4^B;K9w_@ud zc!{Ka`u)9JJ@Jip0*&B$Soq?{g^rsU_w|k|CmkO7p46e;+VK-$->1p$QP*Eqcrlr> zMd%%W<7M(ECN-JNwi#e) zTkL*LYL8hhV8N(fs-$n^hwr=%+?AvB3vKWI?n2Kr82oC)O{mLp$FkW7lzz02n0E*7 zqk1oGrxZZXJn$$6FUfs8HkT!IwtPl;NRPjR<4sXqsdSo{#f!q;u-^Al@MS{Mpc$qt z)P~@?MV(B>Y;kcs6G`r3VnZK!3%f-1g-8$R{5 zJwP=q-Zo*H!VoW(e@jIC?Z*eQH0c)(Ft0vs3P@n*bxly;z6W1juIi#}QP@;rp6%tT zLk|2Ekc24EK#Sy6WCJ;L3nf&+!o2Bk-?_1A&c|1-KfQSb?_kn@r~6r9vwJaa;#*;S z!=3vz#s2FHiECP+PT^e`e#%=6)^XD)xb=|Nt7ey9;&pa$*}N658foSQn-?6_4tpO` zzy1z*E%Xm+NH?6aOsk@!l z^i*?qLn}%^C5|3c5xC{~Mip%*0hI%+FSVWKVg7+Jnc&myn&DPoiRBz6vcKEbiWeSz zzG4)naa=TBNs`UzsC%odoo3B_m_ACpYYiKaHD365UMOd+(#M?lEN}iX$L(VOPVLgh zgk@ZoJ{M>@Qr}3R)D~K)u|ZN#C~0AR`E%!RuuKc?ZXLp-q)Oj~&i43vb9>-3+v1#< z$FPvtDHx-qPm#nPQ8iuB7j2MG#*K#R$8(L;8J=A1E=UCMbjZI1UtcE)wSoSTC}ipExh*uOG*nIEK)`$A3ox-m5XfH~5C|u@ zPXB!sK9okgN<&2tgO$*LG`L7ebe1$!`cL`8K{*vYCJlxC>4e30`q2P01mX;XKu-L( z2k^7-p(0uw29=XM7%5{uz`j8M3R$sX`G1u(IMEU?r~o<)20iwt|7S3$$YEI+6fFsZ zf{C`lumALLpfB=J2;|_+{;y$_Wzp*}sF2kEstj1;G!q0O{I6OlIVjpi0xF_xYY9EX kfB65@5|zb~C;>L{1e@^vtI3Q!`tC`n%n>z3@Mp;X0GbqhTmS$7 diff --git a/publishing/final_review/README.md b/publishing/final_review/README.md index 335df20f..e461d6ef 100644 --- a/publishing/final_review/README.md +++ b/publishing/final_review/README.md @@ -1,6 +1,6 @@ # Springer 终稿人工复核报告包 -生成时间:2026-06-19T07:41:15.302814+00:00 +生成时间:2026-06-21T14:01:38.410779+00:00 - `chapter_style_checklist.md`:全书逐章统稿签核矩阵。 - `style_report.md`:逐章统稿与风格候选表达。 diff --git a/publishing/final_review/figure_rights_report.md 
b/publishing/final_review/figure_rights_report.md index 93c167c2..dbcfdce6 100644 --- a/publishing/final_review/figure_rights_report.md +++ b/publishing/final_review/figure_rights_report.md @@ -81,12 +81,12 @@ | `docs/zh/part13/ch44_pretrain_recipes.md` | 56 | `docs/images/part13/ch44_03_pretrain_data_source_map.png` | 1024x1024 | 1204.9 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | | `docs/zh/part13/ch44_pretrain_recipes.md` | 86 | `docs/images/part13/ch44_04_models_pie_chart_en.svg` | unknown | 7.2 | missing-register | unknown | missing-figure-register, needs-high-res-confirmation | | `docs/zh/part13/ch44_pretrain_recipes.md` | 251 | `docs/images/part13/ch44_05_llama3_annealing_schedule_en.svg` | unknown | 6.8 | missing-register | unknown | missing-figure-register, needs-high-res-confirmation | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 62 | `docs/images/part13/ch45_01_posttrain_three_stage_pipeline.svg` | unknown | 142.5 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 140 | `docs/images/part13/ch45_02_sft_synthesis_pipelines.svg` | unknown | 94.0 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 233 | `docs/images/part13/ch45_03_tulu3_posttrain_flow.svg` | unknown | 64.5 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | -| `docs/zh/part13/ch46_rl_reasoning_data.md` | 71 | `docs/images/part13/ch46_01_r1_reasoning_flywheel.svg` | unknown | 308.0 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | -| `docs/zh/part13/ch46_rl_reasoning_data.md` | 176 | `docs/images/part13/ch46_02_reward_verifier_architecture.svg` | unknown | 50.9 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | -| `docs/zh/part13/ch46_rl_reasoning_data.md` | 219 | 
`docs/images/part13/ch46_03_long_cot_trace_patterns.svg` | unknown | 36.4 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | +| `docs/zh/part13/ch45_posttrain_recipes.md` | 64 | `docs/images/part13/ch45_01_posttrain_three_stage_pipeline.svg` | unknown | 142.5 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | +| `docs/zh/part13/ch45_posttrain_recipes.md` | 142 | `docs/images/part13/ch45_02_sft_synthesis_pipelines.svg` | unknown | 94.0 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | +| `docs/zh/part13/ch45_posttrain_recipes.md` | 237 | `docs/images/part13/ch45_03_tulu3_posttrain_flow.svg` | unknown | 64.5 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | +| `docs/zh/part13/ch46_rl_reasoning_data.md` | 73 | `docs/images/part13/ch46_01_r1_reasoning_flywheel.svg` | unknown | 308.0 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | +| `docs/zh/part13/ch46_rl_reasoning_data.md` | 180 | `docs/images/part13/ch46_02_reward_verifier_architecture.svg` | unknown | 50.9 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | +| `docs/zh/part13/ch46_rl_reasoning_data.md` | 225 | `docs/images/part13/ch46_03_long_cot_trace_patterns.svg` | unknown | 36.4 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | | `docs/zh/part13/ch47_vlm_data_recipes.md` | 37 | `docs/images/part13/ch47_01_multimodal_data_panorama.png` | 1024x1024 | 1328.2 | 内部自绘/改绘资源;终稿复核高清源与 AI 使用声明 | 是 | needs-rights-ai-review, needs-high-res-confirmation | | `docs/zh/part13/ch47_vlm_data_recipes.md` | 47 | `docs/images/part13/ch47_02_vlm_three_stages_en.svg` | unknown | 4.3 | missing-register | unknown | missing-figure-register, needs-high-res-confirmation | | `docs/zh/part13/ch47_vlm_data_recipes.md` | 114 | 
`docs/images/part13/ch47_03_resolution_handling_en.svg` | unknown | 4.7 | missing-register | unknown | missing-figure-register, needs-high-res-confirmation | diff --git a/publishing/final_review/final_publication_audit.json b/publishing/final_review/final_publication_audit.json index 9fcf174d..43b88d77 100644 --- a/publishing/final_review/final_publication_audit.json +++ b/publishing/final_review/final_publication_audit.json @@ -1,5 +1,5 @@ { - "generated_at_utc": "2026-06-19T07:41:15.302814+00:00", + "generated_at_utc": "2026-06-21T14:01:38.410779+00:00", "scanned_files": 72, "style_hits": [ { @@ -2853,126 +2853,126 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 56, + "line": 58, "kind": "rhetorical-not-but", "phrase": "不是后训练的终点,而是", "context": "模型发布上线并不是后训练的终点,而是起点。第三层决定了模型能否随着真实业务演进而自我修复。" }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 65, + "line": 67, "kind": "weak-booster", "phrase": "很容易", "context": "从工程落地角度看,三段论还意味着三类完全不同的数据资产管理方式。SFT 数据更像“行为模板库”,它需要稳定、干净、覆盖常见任务,并且字段结构尽量简单。团队通常会围绕 `messages`、`instruction`、`input`、`..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 77, + "line": 79, "kind": "rhetorical-not-but", "phrase": "不是评价谁的榜单得分更高,而是", "context": "在构建自己的后训练管线前,我们需要横向比较当前主流开源模型的公开路线。本节选取 Tülu-3、Llama-3、Qwen2.5 与 Nemotron-4 作为四类代表路线进行剖析。我们的核心不是评价谁的榜单得分更高,而是建立“公开信息如..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 145, + "line": 147, "kind": "weak-booster", "phrase": "真正", "context": "这四道门禁最好以“自动过滤 + 人工抽检”组合实现。自动过滤适合处理格式错误、重复、长度异常、低质量模板、敏感词和明显安全问题;人工抽检适合判断指令是否自然、答案是否真正有帮助、复杂任务是否保持了原始意图。尤其在 Evol-Instr..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 147, + "line": 149, "kind": "rhetorical-not-but", "phrase": "不是完全复制某个比例,而是", "context": "SFT 数据还需要分层配比,而不是简单混合。建议至少拆成六类:通用问答、知识解释、复杂指令遵循、代码与工具、数学与推理、安全与拒答。每一类都要单独统计数量、平均长度、来源、过滤率和抽检通过率。对于开源模型 recipe 的复现来说,最..." 
}, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 149, + "line": 153, "kind": "weak-booster", "phrase": "很容易", "context": "还要注意,SFT 阶段的好数据不一定适合所有训练轮次。第一轮 SFT 更适合使用结构清晰、回答稳定、覆盖面广的数据,帮助模型建立基础助手行为。后续增量 SFT 则更适合加入 hard cases、领域任务、工具调用和安全边界修复数据。..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 157, + "line": 161, "kind": "rhetorical-not-but", "phrase": "不是“生成样本”,而是", "context": "这一层的任务已经不是“生成样本”,而是“构造偏好/奖励信号”。也就是说,上一节的 SFT 合成方法与本节的 RLHF、DPO、GRPO、RLVR 属于不同层级,不能写成同一种数据工程动作。" }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 203, + "line": 207, "kind": "weak-booster", "phrase": "真正", "context": "第四条原则是把可验证任务单独管理。数学、代码、结构化输出、工具调用这类任务不必完全依赖主观偏好,因为它们可以引入 verifier。对于这些任务,偏好数据应额外记录 `verifier_name`、`verifier_version`..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 269, + "line": 273, "kind": "rhetorical-not-but", "phrase": "不是重复训练,而是", "context": "从数据工程视角看,Llama-3 路线最值得借鉴的是“每轮后训练都重新定义数据分布”。第一轮 SFT 之后,模型的失败样本通常集中在基础指令遵循和安全边界;经过几轮偏好优化后,失败样本会逐渐转移到更难的问题,例如长上下文一致性、多约束..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 271, + "line": 275, "kind": "rhetorical-not-but", "phrase": "不是简单地“让 RM 挑最好答案”,而是", "context": "这也解释了为什么拒绝采样微调在工业系统中很重要。RSFT 并不是简单地“让 RM 挑最好答案”,而是在当前模型的候选空间中寻找高质量轨迹,把模型已经能偶尔做到的行为固化成更稳定的行为。它比人工从零写 SFT 样本更贴近当前模型的能力边..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 275, + "line": 279, "kind": "weak-booster", "phrase": "很容易", "context": "还需要强调隐私和合规边界。线上失败样本并不自动等于可训练样本。用户输入可能包含个人信息、商业秘密、医疗或金融敏感内容,也可能来自不允许用于训练的授权场景。因此,多轮 RLHF 或 RSFT 的真实系统需要先经过脱敏、用途授权检查、数据..." 
}, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 285, + "line": 289, "kind": "weak-booster", "phrase": "真正", "context": "奖励漏洞利用(Reward Hacking)是偏好对齐中最常见也最容易被低估的风险之一。当模型的优化目标完全受控于一个不完美的 RM 时,它会寻找最便捷的方式获取高分,而非真正解决用户问题。" }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 316, + "line": 322, "kind": "rhetorical-not-but", "phrase": "不是单点错误,而是", "context": "数据污染的难点在于,它往往不是单点错误,而是多个环节叠加后的结果。例如,一个公开数学题库先被用于构造 SFT 数据,随后模型基于这批数据生成候选回答,RM 又根据这些候选的 benchmark 风格进行偏好学习,最后拒绝采样阶段再用相..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 320, + "line": 326, "kind": "rhetorical-not-but", "phrase": "不是完全禁止 judge,而是", "context": "另一个常被忽略的问题是“评审模型污染”。很多团队用强模型做 LLM-as-a-Judge,随后又用 judge 输出构造偏好数据。如果 judge 曾经见过某些 benchmark 或训练偏好,它的评分可能会把这些偏好带入新模型。解决..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 326, + "line": 334, "kind": "rhetorical-not-but", "phrase": "不是完整回答,而是", "context": "过程奖励数据的核心单位不是完整回答,而是 step。一个可用的过程奖励样本,至少要保存题目、完整轨迹、步骤切分、每一步的局部判断、最终答案、最终验证结果和错误类型。如果只保存整段 CoT,后续 PRM 无法学习哪一步开始偏离。对于数学..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 357, + "line": 365, "kind": "weak-booster", "phrase": "真正", "context": "成本优化不能只看 GPU 小时,也要看有效样本率。某条合成流水线如果生成 100 万条样本,但经过格式、质量、安全、污染和人工抽检后只剩 5 万条,那么真正成本应按这 5 万条计算。类似地,拒绝采样如果每题生成 32 个候选但只保留 ..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 359, + "line": 367, "kind": "rhetorical-not-but", "phrase": "不是把所有任务塞进同一种范式,而是", "context": "适用边界也必须写清楚。通用聊天助手可以更依赖偏好数据和人工评审;数学、代码和结构化任务更适合引入 verifier;专业领域助手则必须强化领域专家复核和合规审计。没有标准答案的开放式任务,不应强行套用 RLVR;高度敏感的医疗和金融任..." }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 367, + "line": 375, "kind": "rhetorical-not-but", "phrase": "不是针对单条样本的精雕细琢,而是", "context": "1. 
后训练数据工程的核心从来不是针对单条样本的精雕细琢,而是建立 SFT、偏好优化与在线反馈三阶段的严密系统协同。" @@ -2986,154 +2986,154 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 55, + "line": 57, "kind": "rhetorical-not-but", "phrase": "不是强化学习算法本身,而是", "context": "因此,本章讨论的不是强化学习算法本身,而是 RL 范式下的数据工程问题:任务从哪里来,验证器如何写,采样轨迹如何存,哪些轨迹进入二轮 SFT,哪些失败轨迹进入 hard case 池,以及如何防止模型在奖励信号上产生投机行为。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 57, + "line": 59, "kind": "blog-transition", "phrase": "换句话说", "context": "在这个范式中,数据工程师面对的对象也发生了变化。过去一条样本的主要边界是 prompt 和 answer;现在一条样本可能对应一个任务族、一组采样参数、若干条候选轨迹、多个验证器输出、一次人工审计结论和后续训练去向。换句话说,推理数据..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 57, + "line": 59, "kind": "rhetorical-not-but", "phrase": "不是“更长的回答”,而是", "context": "在这个范式中,数据工程师面对的对象也发生了变化。过去一条样本的主要边界是 prompt 和 answer;现在一条样本可能对应一个任务族、一组采样参数、若干条候选轨迹、多个验证器输出、一次人工审计结论和后续训练去向。换句话说,推理数据..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 69, + "line": 71, "kind": "rhetorical-not-but", "phrase": "不是线性的一次性流程,而是", "context": "R1 风格推理数据飞轮可以拆成四个阶段:冷启动 SFT、大规模 RL、拒绝采样、二轮 SFT。这四个阶段不是线性的一次性流程,而是可以反复运行的闭环。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 76, + "line": 78, "kind": "rhetorical-not-but", "phrase": "不是把模型训练成高性能推理模型,而是", "context": "冷启动 SFT 的目标不是把模型训练成高性能推理模型,而是让模型具备可读、稳定、可解析的推理输出格式。这个阶段通常需要少量高质量 Long-CoT 样本,覆盖数学、代码、逻辑题、格式遵循和必要的通用问答。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 90, + "line": 94, "kind": "rhetorical-not-but", "phrase": "不是为了制造冗长,而是", "context": "冷启动阶段最容易出现的误区,是把样本写得过于“完美”。真实 RL 后的推理轨迹通常包含试探、检查、回看条件和修正,而人工冷启动样本如果只呈现线性推导,模型会学到一种过分整齐的解释风格。这样的风格在简单题上可读性很好,但在复杂题上可能缺..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 92, + "line": 96, "kind": "weak-booster", "phrase": "真正", "context": "冷启动数据还要控制“答案泄漏”。在很多合成数据中,生成器先知道标准答案,再倒写推理过程,容易出现步骤与结论强绑定的问题。模型学到这类样本后,可能在没有真正推理的情况下直接靠模式猜答案。更稳妥的做法是保留题目、标准答案和推理过程之间的检..." 
}, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 144, + "line": 148, "kind": "rhetorical-not-but", "phrase": "不是为了让答案花哨,而是", "context": "拒绝采样还要避免样本同质化。若每道题只保留最高分的一条轨迹,训练集可能过分偏向某种表达模板。更好的方式是按任务保留有限数量的多样化成功轨迹,同时对近似重复文本做去重。数学题可以保留不同解法,例如代数法、几何法和枚举法;代码题可以保留不..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 166, + "line": 170, "kind": "rhetorical-not-but", "phrase": "不是“不会做难题”,而是", "context": "二轮 SFT 后的评估也应分成两类。第一类是能力评估,关注数学、代码、逻辑、长上下文和结构化输出是否提升。第二类是行为评估,关注回答长度、格式稳定、语言一致、安全边界和普通对话体验是否退化。很多推理模型的工程问题不是“不会做难题”,而..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 189, + "line": 195, "kind": "rhetorical-not-but", "phrase": "不是“一写就可靠”,而是", "context": "Rule-based reward 的最大优势是可复现,但它也会诱导模型寻找规则漏洞。只要验证器只看最终答案,模型就可能忽略过程;只要测试用例覆盖不足,模型就可能写出过拟合测试的代码;只要 JSON schema 只检查字段存在,模型..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 203, + "line": 209, "kind": "rhetorical-not-but", "phrase": "不是一个单独脚本,而是", "context": "Verifier 池是推理数据飞轮的基础设施。它不是一个单独脚本,而是一组可版本化、可测试、可回滚的验证器。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 209, + "line": 215, "kind": "rhetorical-not-but", "phrase": "不是答案错,而是", "context": "格式 verifier 用于检查 JSON、XML、工具调用参数和 `/` 标签。很多推理模型的失败不是答案错,而是答案无法被系统解析。格式 reward 可以减少这类工程故障。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 211, + "line": 217, "kind": "weak-booster", "phrase": "真正", "context": "一个可维护的 verifier 池通常需要四类接口。第一类是 `extract`,负责从模型输出中抽取最终答案、代码块、JSON 字段或工具参数。第二类是 `normalize`,负责做单位换算、空白清理、大小写处理、数学表达式标准化..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 215, + "line": 221, "kind": "rhetorical-not-but", "phrase": "不是 RL 算法问题,而是", "context": "代码任务的 verifier 更复杂。它需要隔离文件系统、限制网络、控制 CPU 与内存、设置运行超时,并记录依赖版本。对数据工程来说,测试用例本身也是数据资产。公开 benchmark 的测试可能不足以覆盖业务逻辑,自有代码任务还需..." 
}, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 228, + "line": 234, "kind": "rhetorical-not-but", "phrase": "不是为了限制模型的跨语种知识,而是", "context": "语言一致性不是为了限制模型的跨语种知识,而是为了提高工程可控性。推理模型可以利用跨语种知识,但最终训练数据应当保持面向用户的表达稳定。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 274, + "line": 280, "kind": "rhetorical-not-but", "phrase": "不是大规模堆叠,而是", "context": "第一,冷启动数据强调可读性。它不是大规模堆叠,而是为模型提供标准推理格式和回答风格。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 280, + "line": 286, "kind": "rhetorical-not-but", "phrase": "不是某个不可见的比例,而是", "context": "从复现角度看,DeepSeek-R1 最值得学习的不是某个不可见的比例,而是阶段之间的职责划分。冷启动阶段负责让模型“说得清楚”,RL 阶段负责让模型“试得更多”,拒绝采样负责让数据“留下好的”,二轮 SFT 负责让行为“稳定复现”。..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 288, + "line": 294, "kind": "rhetorical-not-but", "phrase": "不是把“Wait”这类词硬编码进样本,而是", "context": "从数据形态上看,QwQ 类模型的重要启发在于:推理轨迹中常出现等待、检查、反思和回溯等模式。这类模式可能来自模型自身采样,也可能来自训练数据和 RL 目标共同作用 [I]。对数据工程师来说,重点不是把“Wait”这类词硬编码进样本,而..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 292, + "line": 298, "kind": "rhetorical-not-but", "phrase": "不是“复制 QwQ recipe”,而是", "context": "在项目中使用 QwQ 类模型时,常见路径是让它生成候选 Long-CoT,再用自有 verifier 过滤。这样得到的数据并不是“复制 QwQ recipe”,而是“借助强推理模型生成候选,再用本地任务约束筛选”。这种方式适合冷启动阶..." }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 300, + "line": 306, "kind": "weak-booster", "phrase": "很容易", "context": "Kimi k1.5 这类路线把推理数据工程从“题目-答案”扩展到“上下文-证据-推理-答案”。长上下文任务的 verifier 也更复杂。对于文档问答,验证器不仅要看答案是否正确,还要看引用是否来自给定材料;对于多轮任务,系统要判断模..." 
}, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 512, + "line": 530, "kind": "rhetorical-not-but", "phrase": "不是把 RL 算法接到模型后面就结束,而是", "context": "这些问题共同说明,推理数据工程不是把 RL 算法接到模型后面就结束,而是要持续维护任务、验证器、采样和训练数据之间的契约。任何一个环节的定义变了,后续数据都要重新解释。" }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 563, + "line": 583, "kind": "rhetorical-not-but", "phrase": "不是一个“推理模型”,而是", "context": "如果把本章落到项目实施,最小产物不是一个“推理模型”,而是一套数据资产:任务池、verifier 池、采样轨迹库、拒绝采样训练集、失败样本库和评估报告。模型只是这套资产的一次消费结果。只要这些资产持续更新,团队就可以在不同基座模型、不..." @@ -10887,7 +10887,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 62, + "line": 64, "alt": "图45-1:LLM 后训练三阶段流水线示意图", "src": "../../images/part13/ch45_01_posttrain_three_stage_pipeline.svg", "resolved": "docs/images/part13/ch45_01_posttrain_three_stage_pipeline.svg", @@ -10903,7 +10903,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 140, + "line": 142, "alt": "图45-2:Self-Instruct、Evol-Instruct 与 Magpie 三流派 pipeline 对比", "src": "../../images/part13/ch45_02_sft_synthesis_pipelines.svg", "resolved": "docs/images/part13/ch45_02_sft_synthesis_pipelines.svg", @@ -10919,7 +10919,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 233, + "line": 237, "alt": "图45-3:Tülu-3 三阶段数据流与规模示意", "src": "../../images/part13/ch45_03_tulu3_posttrain_flow.svg", "resolved": "docs/images/part13/ch45_03_tulu3_posttrain_flow.svg", @@ -10935,7 +10935,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 71, + "line": 73, "alt": "图46-1:R1 风格推理数据飞轮四阶段", "src": "../../images/part13/ch46_01_r1_reasoning_flywheel.svg", "resolved": "docs/images/part13/ch46_01_r1_reasoning_flywheel.svg", @@ -10951,7 +10951,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 176, + "line": 180, "alt": "图46-2:推理数据奖励信号与验证器结构", "src": "../../images/part13/ch46_02_reward_verifier_architecture.svg", "resolved": "docs/images/part13/ch46_02_reward_verifier_architecture.svg", @@ -10967,7 +10967,7 @@ }, 
{ "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 219, + "line": 225, "alt": "图46-3:Long-CoT 数据样例剖面", "src": "../../images/part13/ch46_03_long_cot_trace_patterns.svg", "resolved": "docs/images/part13/ch46_03_long_cot_trace_patterns.svg", @@ -18144,13 +18144,12 @@ { "file": "docs/zh/part13/ch45_posttrain_recipes.md", "entry_no": 7, - "entry": "Lambert N, Morrison J, Pyatkin V, Huang S, Ivison H, Brahman F, Miranda L J V, Liu A, Dziri N, Lyu X, Gu Y, Malik S, Graf V, Hwang J D, Yang J, Le Bras R, Tafjord O, Wilhelm C, Soldaini L, Smith N A, Wang Y, Dasigi P, Hajishirzi H (2025) Tülu 3: Pushing Frontiers in Open Language Model Post-Training. Second Conference on Language Modeling.", + "entry": "Lambert N, Morrison J, Pyatkin V, Huang S, Ivison H, Brahman F, Miranda L J V, Liu A, Dziri N, Lyu X, Gu Y, Malik S, Graf V, Hwang J D, Yang J, Le Bras R, Tafjord O, Wilhelm C, Soldaini L, Smith N A, Wang Y, Dasigi P, Hajishirzi H (2025) Tülu 3: Pushing Frontiers in Open Language Model Post-Training. Second Conference on Language Modeling. arXiv preprint arXiv:2411.15124.", "year": "2025", "doi": "", "url": "", "issues": [ - "needs-authenticity-review", - "missing-doi-url-arxiv" + "needs-authenticity-review" ] }, { @@ -19320,13 +19319,12 @@ { "file": "docs/zh/part14/p07_agent_tooluse.md", "entry_no": 4, - "entry": "4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications.", + "entry": "4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/.", "year": "2025", "doi": "", - "url": "", + "url": "https://genai.owasp.org/llm-top-10/.", "issues": [ - "needs-authenticity-review", - "missing-doi-url-arxiv" + "needs-authenticity-review" ] }, { @@ -19399,13 +19397,12 @@ { "file": "docs/zh/part14/p09_privacy_pipeline.md", "entry_no": 1, - "entry": "1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation.", + "entry": "1. European Union. (2016). 
Regulation (EU) 2016/679: General Data Protection Regulation. https://eur-lex.europa.eu/eli/reg/2016/679/oj.", "year": "2016", "doi": "", - "url": "", + "url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj.", "issues": [ - "needs-authenticity-review", - "missing-doi-url-arxiv" + "needs-authenticity-review" ] }, { @@ -19447,13 +19444,12 @@ { "file": "docs/zh/part14/p09_privacy_pipeline.md", "entry_no": 5, - "entry": "5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications.", + "entry": "5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/.", "year": "2025", "doi": "", - "url": "", + "url": "https://genai.owasp.org/llm-top-10/.", "issues": [ - "needs-authenticity-review", - "missing-doi-url-arxiv" + "needs-authenticity-review" ] }, { @@ -19685,13 +19681,12 @@ { "file": "docs/zh/part14/p12_r1_reasoning_flywheel.md", "entry_no": 7, - "entry": "Qwen Team (2025) QwQ-32B: Embracing the Power of Reinforcement Learning for Reasoning Models. Qwen Blog.", + "entry": "Qwen Team (2025) QwQ-32B: Embracing the Power of Reinforcement Learning for Reasoning Models. Qwen Blog. 
https://qwenlm.github.io/blog/qwq-32b/.", "year": "2025", "doi": "", - "url": "", + "url": "https://qwenlm.github.io/blog/qwq-32b/.", "issues": [ - "needs-authenticity-review", - "missing-doi-url-arxiv" + "needs-authenticity-review" ] }, { @@ -25635,7 +25630,7 @@ "figures_needing_review": 288, "broken_figures": 0, "references": 972, - "reference_issue_rows": 632, + "reference_issue_rows": 627, "missing_reference_files": 0 } } diff --git a/publishing/final_review/reference_audit_report.md b/publishing/final_review/reference_audit_report.md index fb5ec704..739ba7c5 100644 --- a/publishing/final_review/reference_audit_report.md +++ b/publishing/final_review/reference_audit_report.md @@ -1,7 +1,7 @@ # 参考文献 DOI / Springer 样式 / 真实性终审报告 - 参考文献条目:972 -- DOI/URL/年份/样式候选问题:632 +- DOI/URL/年份/样式候选问题:627 - 所有条目默认需要人工真实性终审;脚本列出 DOI、URL、年份和样式候选问题,但不替代逐条查证。 | 文件 | 序号 | 年份 | DOI | URL | 问题 | 条目 | diff --git a/publishing/final_review/reference_integrity_audit.json b/publishing/final_review/reference_integrity_audit.json index 04f16284..73b3db76 100644 --- a/publishing/final_review/reference_integrity_audit.json +++ b/publishing/final_review/reference_integrity_audit.json @@ -1,21 +1,20 @@ { - "generated_at_utc": "2026-06-17T02:33:34.120105+00:00", + "generated_at_utc": "2026-06-22T08:46:49.390824+00:00", "scope_roots": [ "docs/zh" ], "summary": { "files": 71, - "references": 972, - "body_author_year_citations": 1327, + "references": 980, + "body_author_year_citations": 1358, "missing_same_chapter_references": 0, - "uncited_references": 227, + "uncited_references": 205, "duplicate_reference_groups": 0, "external_status_counts": { - "not-checked": 972 + "not-checked": 980 }, "format_issue_counts": { - "missing-doi-arxiv-url": 614, - "missing-first-author": 2 + "missing-doi-arxiv-url": 379 } }, "references": [ @@ -23,35 +22,31 @@ "file": "docs/zh/appendix_a_tools_and_frameworks_quick_reference.md", "line": 290, "entry_no": 1, - "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J 
W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12): 86-92.", + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12): 86-92. https://doi.org/10.1145/3458723.", "first_author": "Gebru", "year": "2021", "title": "Datasheets for Datasets", - "doi": "", + "doi": "10.1145/3458723", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3458723", "key": "gebru:2021", "fingerprint": "datasheets for datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_a_tools_and_frameworks_quick_reference.md", "line": 292, "entry_no": 2, - "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229.", + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596.", "first_author": "Mitchell", "year": "2019", "title": "Model Cards for Model Reporting", - "doi": "", + "doi": "10.1145/3287560.3287596", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3287560.3287596", "key": "mitchell:2019", "fingerprint": "model cards for model reporting", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_a_tools_and_frameworks_quick_reference.md", @@ -194,18 +189,16 @@ "file": "docs/zh/appendix_b_compliance_and_release_checklist.md", "line": 322, "entry_no": 6, - "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. 
In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229.", + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596.", "first_author": "Mitchell", "year": "2019", "title": "Model Cards for Model Reporting", - "doi": "", + "doi": "10.1145/3287560.3287596", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3287560.3287596", "key": "mitchell:2019", "fingerprint": "model cards for model reporting", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_c_cost_estimation_and_resource_templates.md", @@ -226,35 +219,31 @@ "file": "docs/zh/appendix_c_cost_estimation_and_resource_templates.md", "line": 318, "entry_no": 2, - "entry": "Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Catanzaro B (2021) Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis.", + "entry": "Narayanan D, Shoeybi M, Casper J, LeGresley P, Patwary M, Catanzaro B (2021) Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 
arXiv:2104.04473.", "first_author": "Narayanan", "year": "2021", "title": "Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM", "doi": "", - "arxiv": "", + "arxiv": "2104.04473", "url": "", "key": "narayanan:2021", "fingerprint": "efficient large scale language model training on gpu clusters using megatron lm", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_c_cost_estimation_and_resource_templates.md", "line": 320, "entry_no": 3, - "entry": "Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, pp 611-626.", + "entry": "Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, pp 611-626. https://doi.org/10.1145/3600006.3613165.", "first_author": "Kwon", "year": "2023", "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention", - "doi": "", + "doi": "10.1145/3600006.3613165", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3600006.3613165", "key": "kwon:2023", "fingerprint": "efficient memory management for large language model serving with pagedattention", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_c_cost_estimation_and_resource_templates.md", @@ -290,23 +279,36 @@ "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", "line": 397, "entry_no": 1, - "entry": "Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. 
Patterns 4(9):100804.", + "entry": "Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. https://doi.org/10.1016/j.patter.2023.100804.", "first_author": "Kapoor", "year": "2023", "title": "Leakage and the reproducibility crisis in machine-learning-based science", - "doi": "", + "doi": "10.1016/j.patter.2023.100804", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1016/j.patter.2023.100804", "key": "kapoor:2023", "fingerprint": "leakage and the reproducibility crisis in machine learning based science", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", "line": 399, "entry_no": 2, + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723.", + "first_author": "Gebru", + "year": "2021", + "title": "Datasheets for Datasets", + "doi": "10.1145/3458723", + "arxiv": "", + "url": "https://doi.org/10.1145/3458723", + "key": "gebru:2021", + "fingerprint": "datasheets for datasets", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 401, + "entry_no": 3, "entry": "Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879.", "first_author": "Kreuzberger", "year": "2023", @@ -322,8 +324,8 @@ }, { "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", - "line": 401, - "entry_no": 3, + "line": 403, + "entry_no": 4, "entry": "Longpre S, Mahari R, Chen A, Obeng-Marnu N, Sileo D, Brannon W, Muennighoff N, Khazam N, Kabbara J, Perisetla K, Wu X, Shippole E, Bollacker K, Wu T, Villa L, Pentland S, Hooker S (2024) A large-scale audit of dataset licensing and attribution in AI. 
Nature Machine Intelligence 6(8):975-987.", "first_author": "Longpre", "year": "2024", @@ -339,25 +341,70 @@ }, { "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", - "line": 403, - "entry_no": 4, - "entry": "Mazumder M, Banbury C, Yao X, et al. (2023) DataPerf: Benchmarks for Data-Centric AI Development. In: Advances in Neural Information Processing Systems 36, Datasets and Benchmarks Track.", + "line": 405, + "entry_no": 5, + "entry": "Mazumder M, Banbury C, Yao X, et al. (2023) DataPerf: Benchmarks for Data-Centric AI Development. In: Advances in Neural Information Processing Systems 36, Datasets and Benchmarks Track. https://doi.org/10.52202/075280-0235.", "first_author": "Mazumder", "year": "2023", "title": "DataPerf: Benchmarks for Data-Centric AI Development", - "doi": "", + "doi": "10.52202/075280-0235", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-0235", "key": "mazumder:2023", "fingerprint": "dataperf benchmarks for data centric ai development", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 407, + "entry_no": 6, + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. https://doi.org/10.1145/3287560.3287596.", + "first_author": "Mitchell", + "year": "2019", + "title": "Model Cards for Model Reporting", + "doi": "10.1145/3287560.3287596", + "arxiv": "", + "url": "https://doi.org/10.1145/3287560.3287596", + "key": "mitchell:2019", + "fingerprint": "model cards for model reporting", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 409, + "entry_no": 7, + "entry": "Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. 
In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231.", + "first_author": "Pushkarna", + "year": "2022", + "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI", + "doi": "10.1145/3531146.3533231", + "arxiv": "", + "url": "https://doi.org/10.1145/3531146.3533231", + "key": "pushkarna:2022", + "fingerprint": "data cards purposeful and transparent dataset documentation for responsible ai", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 411, + "entry_no": 8, + "entry": "Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing Systems 28.", + "first_author": "Sculley", + "year": "2015", + "title": "Hidden Technical Debt in Machine Learning Systems", + "doi": "", + "arxiv": "", + "url": "", + "key": "sculley:2015", + "fingerprint": "hidden technical debt in machine learning systems", "format_issues": [ "missing-doi-arxiv-url" ] }, { "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", - "line": 405, - "entry_no": 5, + "line": 413, + "entry_no": 9, "entry": "Zha D, Bhat Z P, Lai K-H, Yang F, Jiang Z, Zhong S, Hu X (2023) Data-centric Artificial Intelligence: A Survey. arXiv preprint arXiv:2303.10158.", "first_author": "Zha", "year": "2023", @@ -388,15 +435,15 @@ "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", "line": 449, "entry_no": 2, - "entry": "Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. 
In: Companion of the 2024 International Conference on Management of Data, pp 120-134.", - "first_author": "Chen", - "year": "2024", - "title": "Data-Juicer: A One-Stop Data Processing System for Large Language Models", + "entry": "Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction. In: IEEE International Conference on Big Data, pp 1123-1132.", + "first_author": "Breck", + "year": "2017", + "title": "The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction", "doi": "", "arxiv": "", "url": "", - "key": "chen:2024", - "fingerprint": "data juicer a one stop data processing system for large language models", + "key": "breck:2017", + "fingerprint": "the ml test score a rubric for ml production readiness and technical debt reduction", "format_issues": [ "missing-doi-arxiv-url" ] @@ -405,6 +452,21 @@ "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", "line": 451, "entry_no": 3, + "entry": "Chen D, Huang Y, Ma Z, Chen H, Pan X, Ge C, Gao D, Xie Y, Liu Z, Gao J, Li Y, Ding B, Zhou J (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Companion of the 2024 International Conference on Management of Data, pp 120-134. https://doi.org/10.1145/3626246.3653385.", + "first_author": "Chen", + "year": "2024", + "title": "Data-Juicer: A One-Stop Data Processing System for Large Language Models", + "doi": "10.1145/3626246.3653385", + "arxiv": "", + "url": "https://doi.org/10.1145/3626246.3653385", + "key": "chen:2024", + "fingerprint": "data juicer a one stop data processing system for large language models", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", + "line": 453, + "entry_no": 4, "entry": "Chen Y, Shetty M, Somashekar G, Ma M, Simmhan Y, Mace J, Bansal C, Wang R, Rajmohan S (2025) AIOpsLab: A Holistic Framework to Evaluate AI Agents for Enabling Autonomous Clouds. 
arXiv preprint arXiv:2501.06706.", "first_author": "Chen", "year": "2025", @@ -418,25 +480,23 @@ }, { "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", - "line": 453, - "entry_no": 4, - "entry": "Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804.", + "line": 455, + "entry_no": 5, + "entry": "Kapoor S, Narayanan A (2023) Leakage and the reproducibility crisis in machine-learning-based science. Patterns 4(9):100804. https://doi.org/10.1016/j.patter.2023.100804.", "first_author": "Kapoor", "year": "2023", "title": "Leakage and the reproducibility crisis in machine-learning-based science", - "doi": "", + "doi": "10.1016/j.patter.2023.100804", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1016/j.patter.2023.100804", "key": "kapoor:2023", "fingerprint": "leakage and the reproducibility crisis in machine learning based science", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", - "line": 455, - "entry_no": 5, + "line": 457, + "entry_no": 6, "entry": "Pfitzmann B, Auer C, Dolfi M, Nassar A S, Staar P (2022) DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3743-3751.", "first_author": "Pfitzmann", "year": "2022", @@ -469,52 +529,91 @@ "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", "line": 403, "entry_no": 2, - "entry": "Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research.", + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. 
https://doi.org/10.1145/3458723.", + "first_author": "Gebru", + "year": "2021", + "title": "Datasheets for Datasets", + "doi": "10.1145/3458723", + "arxiv": "", + "url": "https://doi.org/10.1145/3458723", + "key": "gebru:2021", + "fingerprint": "datasheets for datasets", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 405, + "entry_no": 3, + "entry": "Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. arXiv:2211.09110.", "first_author": "Liang", "year": "2023", "title": "Holistic Evaluation of Language Models", "doi": "", - "arxiv": "", + "arxiv": "2211.09110", "url": "", "key": "liang:2023", "fingerprint": "holistic evaluation of language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", - "line": 405, - "entry_no": 3, - "entry": "Wang B, Chen W, Pei H, et al. (2023) DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. In: Advances in Neural Information Processing Systems 36.", + "line": 407, + "entry_no": 4, + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. 
https://doi.org/10.1145/3287560.3287596.", + "first_author": "Mitchell", + "year": "2019", + "title": "Model Cards for Model Reporting", + "doi": "10.1145/3287560.3287596", + "arxiv": "", + "url": "https://doi.org/10.1145/3287560.3287596", + "key": "mitchell:2019", + "fingerprint": "model cards for model reporting", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 409, + "entry_no": 5, + "entry": "Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231.", + "first_author": "Pushkarna", + "year": "2022", + "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI", + "doi": "10.1145/3531146.3533231", + "arxiv": "", + "url": "https://doi.org/10.1145/3531146.3533231", + "key": "pushkarna:2022", + "fingerprint": "data cards purposeful and transparent dataset documentation for responsible ai", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 411, + "entry_no": 6, + "entry": "Wang B, Chen W, Pei H, et al. (2023) DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models. In: Advances in Neural Information Processing Systems 36. 
https://doi.org/10.52202/075280-1361.", "first_author": "Wang", "year": "2023", "title": "DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models", - "doi": "", + "doi": "10.52202/075280-1361", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-1361", "key": "wang:2023", "fingerprint": "decodingtrust a comprehensive assessment of trustworthiness in gpt models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", - "line": 407, - "entry_no": 4, - "entry": "Weidinger L, Uesato J, Rauh M, Griffin C, Huang P-S, Mellor J, Glaese A, Cheng M, Balle B, Kasirzadeh A, Kenton Z, Brown S, Hawkins W, Stepleton T, Birhane A, Haas J, Rimell L, Hendricks L A, Isaac W, Legassick S, Irving G, Gabriel I (2022) Taxonomy of Risks posed by Language Models. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 214-229.", + "line": 413, + "entry_no": 7, + "entry": "Weidinger L, Uesato J, Rauh M, Griffin C, Huang P-S, Mellor J, Glaese A, Cheng M, Balle B, Kasirzadeh A, Kenton Z, Brown S, Hawkins W, Stepleton T, Birhane A, Haas J, Rimell L, Hendricks L A, Isaac W, Legassick S, Irving G, Gabriel I (2022) Taxonomy of Risks posed by Language Models. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 214-229. 
https://doi.org/10.1145/3531146.3533088.", "first_author": "Weidinger", "year": "2022", "title": "Taxonomy of Risks posed by Language Models", - "doi": "", + "doi": "10.1145/3531146.3533088", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3531146.3533088", "key": "weidinger:2022", "fingerprint": "taxonomy of risks posed by language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/appendix_g_datagallery_note.md", @@ -1007,52 +1106,46 @@ "file": "docs/zh/part1/ch02_quality_framework.md", "line": 507, "entry_no": 1, - "entry": "Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46.", + "entry": "Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. https://doi.org/10.1177/001316446002000104.", "first_author": "Cohen", "year": "1960", "title": "A Coefficient of Agreement for Nominal Scales", - "doi": "", + "doi": "10.1177/001316446002000104", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1177/001316446002000104", "key": "cohen:1960", "fingerprint": "a coefficient of agreement for nominal scales", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch02_quality_framework.md", "line": 509, "entry_no": 2, - "entry": "Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A New Generation of Perspective API: Efficient Multilingual Character-level Transformers. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3197-3207.", + "entry": "Lees A, Tran V Q, Tay Y, Sorensen J, Gupta J, Metzler D, Vasserman L (2022) A New Generation of Perspective API: Efficient Multilingual Character-level Transformers. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 3197-3207. 
https://doi.org/10.1145/3534678.3539147.", "first_author": "Lees", "year": "2022", "title": "A New Generation of Perspective API: Efficient Multilingual Character-level Transformers", - "doi": "", + "doi": "10.1145/3534678.3539147", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3534678.3539147", "key": "lees:2022", "fingerprint": "a new generation of perspective api efficient multilingual character level transformers", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch02_quality_framework.md", "line": 511, "entry_no": 3, - "entry": "Nadeem M, Bethke A, Reddy S (2021) StereoSet: Measuring Stereotypical Bias in Pretrained Language Models. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 5356-5371.", + "entry": "Nadeem M, Bethke A, Reddy S (2021) StereoSet: Measuring Stereotypical Bias in Pretrained Language Models. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 5356-5371. https://doi.org/10.18653/v1/2021.acl-long.416.", "first_author": "Nadeem", "year": "2021", "title": "StereoSet: Measuring Stereotypical Bias in Pretrained Language Models", - "doi": "", + "doi": "10.18653/v1/2021.acl-long.416", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2021.acl-long.416", "key": "nadeem:2021", "fingerprint": "stereoset measuring stereotypical bias in pretrained language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch02_quality_framework.md", @@ -1075,35 +1168,31 @@ "file": "docs/zh/part1/ch02_quality_framework.md", "line": 515, "entry_no": 5, - "entry": "Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. 
Advances in Neural Information Processing Systems 35:27730-27744.", + "entry": "Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems 35:27730-27744. arXiv:2203.02155.", "first_author": "Ouyang", "year": "2022", "title": "Training Language Models to Follow Instructions with Human Feedback", "doi": "", - "arxiv": "", + "arxiv": "2203.02155", "url": "", "key": "ouyang:2022", "fingerprint": "training language models to follow instructions with human feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch02_quality_framework.md", "line": 517, "entry_no": 6, - "entry": "Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728-53741.", + "entry": "Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Advances in Neural Information Processing Systems 36:53728-53741. 
arXiv:2305.18290.", "first_author": "Rafailov", "year": "2023", "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model", "doi": "", - "arxiv": "", + "arxiv": "2305.18290", "url": "", "key": "rafailov:2023", "fingerprint": "direct preference optimization your language model is secretly a reward model", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch02_quality_framework.md", @@ -1218,18 +1307,16 @@ "file": "docs/zh/part1/ch02_quality_framework.md", "line": 534, "entry_no": 14, - "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92.", + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723.", "first_author": "Gebru", "year": "2021", "title": "Datasheets for Datasets", - "doi": "", + "doi": "10.1145/3458723", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3458723", "key": "gebru:2021", "fingerprint": "datasheets for datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch02_quality_framework.md", @@ -1299,18 +1386,16 @@ "file": "docs/zh/part1/ch03_data_stack.md", "line": 345, "entry_no": 3, - "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29.", + "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. 
https://doi.org/10.1109/sequen.1997.666900.", "first_author": "Broder", "year": "1997", "title": "On the Resemblance and Containment of Documents", - "doi": "", + "doi": "10.1109/sequen.1997.666900", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/sequen.1997.666900", "key": "broder:1997", "fingerprint": "on the resemblance and containment of documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch03_data_stack.md", @@ -1350,18 +1435,16 @@ "file": "docs/zh/part1/ch03_data_stack.md", "line": 351, "entry_no": 6, - "entry": "Malkov Y A, Yashunin D A (2020) Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs (HNSW). IEEE Transactions on Pattern Analysis and Machine Intelligence 42(4):824-836.", + "entry": "Malkov Y A, Yashunin D A (2020) Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs (HNSW). IEEE Transactions on Pattern Analysis and Machine Intelligence 42(4):824-836. 
https://doi.org/10.1109/tpami.2018.2889473.", "first_author": "Malkov", "year": "2020", "title": "Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs (HNSW)", - "doi": "", + "doi": "10.1109/tpami.2018.2889473", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/tpami.2018.2889473", "key": "malkov:2020", "fingerprint": "efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs hnsw", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch03_data_stack.md", @@ -1476,35 +1559,31 @@ "file": "docs/zh/part10/ch31_agent_architecture.md", "line": 581, "entry_no": 1, - "entry": "Besta M, Blach N, Kubicek A, Gerstenberger R, Podstawski M, Gianinazzi L, Gajda J, Lehmann T, Niewiadomski H, Nyczyk P, Hoefler T (2024) Graph of Thoughts: Solving Elaborate Problems with Large Language Models. In: Proceedings of the AAAI Conference on Artificial Intelligence 38(16):17682-17690.", + "entry": "Besta M, Blach N, Kubicek A, Gerstenberger R, Podstawski M, Gianinazzi L, Gajda J, Lehmann T, Niewiadomski H, Nyczyk P, Hoefler T (2024) Graph of Thoughts: Solving Elaborate Problems with Large Language Models. In: Proceedings of the AAAI Conference on Artificial Intelligence 38(16):17682-17690. 
https://doi.org/10.1609/aaai.v38i16.29720.", "first_author": "Besta", "year": "2024", "title": "Graph of Thoughts: Solving Elaborate Problems with Large Language Models", - "doi": "", + "doi": "10.1609/aaai.v38i16.29720", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1609/aaai.v38i16.29720", "key": "besta:2024", "fingerprint": "graph of thoughts solving elaborate problems with large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch31_agent_architecture.md", "line": 583, "entry_no": 2, - "entry": "Gao L, Madaan A, Zhou S, Alon U, Liu P, Yang Y, Callan J, Neubig G (2023) PAL: Program-aided Language Models. In: Proceedings of the 40th International Conference on Machine Learning, pp 10764-10799.", + "entry": "Gao L, Madaan A, Zhou S, Alon U, Liu P, Yang Y, Callan J, Neubig G (2023) PAL: Program-aided Language Models. In: Proceedings of the 40th International Conference on Machine Learning, pp 10764-10799. arXiv:2211.10435.", "first_author": "Gao", "year": "2023", "title": "PAL: Program-aided Language Models", "doi": "", - "arxiv": "", + "arxiv": "2211.10435", "url": "", "key": "gao:2023", "fingerprint": "pal program aided language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch31_agent_architecture.md", @@ -1542,18 +1621,16 @@ "file": "docs/zh/part10/ch31_agent_architecture.md", "line": 589, "entry_no": 5, - "entry": "Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. 
In: Advances in Neural Information Processing Systems 36.", + "entry": "Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. In: Advances in Neural Information Processing Systems 36. arXiv:2303.17651.", "first_author": "Madaan", "year": "2023", "title": "Self-Refine: Iterative Refinement with Self-Feedback", "doi": "", - "arxiv": "", + "arxiv": "2303.17651", "url": "", "key": "madaan:2023", "fingerprint": "self refine iterative refinement with self feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch31_agent_architecture.md", @@ -1591,18 +1668,16 @@ "file": "docs/zh/part10/ch31_agent_architecture.md", "line": 595, "entry_no": 8, - "entry": "Park J S, O'Brien J C, Cai C J, Morris M R, Liang P, Bernstein M S (2023) Generative Agents: Interactive Simulacra of Human Behavior. In: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, Article 2.", + "entry": "Park J S, O'Brien J C, Cai C J, Morris M R, Liang P, Bernstein M S (2023) Generative Agents: Interactive Simulacra of Human Behavior. In: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, Article 2. 
https://doi.org/10.1145/3586183.3606763.", "first_author": "Park", "year": "2023", "title": "Generative Agents: Interactive Simulacra of Human Behavior", - "doi": "", + "doi": "10.1145/3586183.3606763", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3586183.3606763", "key": "park:2023", "fingerprint": "generative agents interactive simulacra of human behavior", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch31_agent_architecture.md", @@ -1657,18 +1732,16 @@ "file": "docs/zh/part10/ch31_agent_architecture.md", "line": 603, "entry_no": 12, - "entry": "Shinn N, Cassano F, Gopinath A, Narasimhan K, Yao S (2023) Reflexion: Language Agents with Verbal Reinforcement Learning. In: Advances in Neural Information Processing Systems 36.", + "entry": "Shinn N, Cassano F, Gopinath A, Narasimhan K, Yao S (2023) Reflexion: Language Agents with Verbal Reinforcement Learning. In: Advances in Neural Information Processing Systems 36. arXiv:2303.11366.", "first_author": "Shinn", "year": "2023", "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", "doi": "", - "arxiv": "", + "arxiv": "2303.11366", "url": "", "key": "shinn:2023", "fingerprint": "reflexion language agents with verbal reinforcement learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch31_agent_architecture.md", @@ -1719,52 +1792,46 @@ "file": "docs/zh/part10/ch31_agent_architecture.md", "line": 611, "entry_no": 16, - "entry": "Yao S, Zhao J, Yu D, Du N, Shafran I, Narasimhan K, Cao Y (2023) ReAct: Synergizing Reasoning and Acting in Language Models. In: International Conference on Learning Representations.", + "entry": "Yao S, Zhao J, Yu D, Du N, Shafran I, Narasimhan K, Cao Y (2023) ReAct: Synergizing Reasoning and Acting in Language Models. In: International Conference on Learning Representations. 
arXiv:2210.03629.", "first_author": "Yao", "year": "2023", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "doi": "", - "arxiv": "", + "arxiv": "2210.03629", "url": "", "key": "yao:2023", "fingerprint": "react synergizing reasoning and acting in language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch31_agent_architecture.md", "line": 613, "entry_no": 17, - "entry": "Yao S, Yu D, Zhao J, Shafran I, Griffiths T L, Cao Y, Narasimhan K (2023) Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In: Advances in Neural Information Processing Systems 36.", + "entry": "Yao S, Yu D, Zhao J, Shafran I, Griffiths T L, Cao Y, Narasimhan K (2023) Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In: Advances in Neural Information Processing Systems 36. arXiv:2305.10601.", "first_author": "Yao", "year": "2023", "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", "doi": "", - "arxiv": "", + "arxiv": "2305.10601", "url": "", "key": "yao:2023", "fingerprint": "tree of thoughts deliberate problem solving with large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", "line": 420, "entry_no": 1, - "entry": "Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131.", + "entry": "Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. 
https://doi.org/10.18653/v1/2021.acl-demo.15.", "first_author": "Barbaresi", "year": "2021", "title": "Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction", - "doi": "", + "doi": "10.18653/v1/2021.acl-demo.15", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2021.acl-demo.15", "key": "barbaresi:2021", "fingerprint": "trafilatura a web scraping library and command line tool for text discovery and extraction", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", @@ -1802,18 +1869,16 @@ "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", "line": 426, "entry_no": 4, - "entry": "Chen J, Yan X, Lin D, Qu X, Wang Y, Huang X, Zhao Z, Yu T, Zhang Z, Li H, Zheng Y, Xu R, Zhu J, Qiu X (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Proceedings of the ACM SIGMOD International Conference on Management of Data, pp 4436-4449.", + "entry": "Chen J, Yan X, Lin D, Qu X, Wang Y, Huang X, Zhao Z, Yu T, Zhang Z, Li H, Zheng Y, Xu R, Zhu J, Qiu X (2024) Data-Juicer: A One-Stop Data Processing System for Large Language Models. In: Proceedings of the ACM SIGMOD International Conference on Management of Data, pp 4436-4449. 
https://doi.org/10.1145/3626246.3653385.", "first_author": "Chen", "year": "2024", "title": "Data-Juicer: A One-Stop Data Processing System for Large Language Models", - "doi": "", + "doi": "10.1145/3626246.3653385", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3626246.3653385", "key": "chen:2024", "fingerprint": "data juicer a one stop data processing system for large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", @@ -1868,35 +1933,31 @@ "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", "line": 434, "entry_no": 8, - "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091.", + "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. https://doi.org/10.1145/3503161.3548112.", "first_author": "Huang", "year": "2022", "title": "LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking", - "doi": "", + "doi": "10.1145/3503161.3548112", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3503161.3548112", "key": "huang:2022", "fingerprint": "layoutlmv3 pre training for document ai with unified text and image masking", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", "line": 436, "entry_no": 9, - "entry": "Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. 
In: European Conference on Computer Vision, pp 498-517.", + "entry": "Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision, pp 498-517. https://doi.org/10.1007/978-3-031-19815-1_29.", "first_author": "Kim", "year": "2022", "title": "OCR-free Document Understanding Transformer", - "doi": "", + "doi": "10.1007/978-3-031-19815-1_29", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-031-19815-1_29", "key": "kim:2022", "fingerprint": "ocr free document understanding transformer", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", @@ -1953,18 +2014,16 @@ "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", "line": 444, "entry_no": 13, - "entry": "Nguyen T, et al. (2024) CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation.", + "entry": "Nguyen T, et al. (2024) CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation. 
https://doi.org/10.63317/5iz6z5g7eit3.", "first_author": "Nguyen", "year": "2024", "title": "CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages", - "doi": "", + "doi": "10.63317/5iz6z5g7eit3", "arxiv": "", - "url": "", + "url": "https://doi.org/10.63317/5iz6z5g7eit3", "key": "nguyen:2024", "fingerprint": "culturax a cleaned enormous and multilingual dataset for large language models in 167 languages", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", @@ -2087,18 +2146,16 @@ "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 435, "entry_no": 1, - "entry": "Alemohammad S, Casco-Rodriguez J, Luzi L, et al. (2024) Self-Consuming Generative Models Go MAD. In: International Conference on Learning Representations.", + "entry": "Alemohammad S, Casco-Rodriguez J, Luzi L, et al. (2024) Self-Consuming Generative Models Go MAD. In: International Conference on Learning Representations. arXiv:2307.01850.", "first_author": "Alemohammad", "year": "2024", "title": "Self-Consuming Generative Models Go MAD", "doi": "", - "arxiv": "", + "arxiv": "2307.01850", "url": "", "key": "alemohammad:2024", "fingerprint": "self consuming generative models go mad", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", @@ -2134,18 +2191,16 @@ "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 441, "entry_no": 4, - "entry": "Dubois Y, Li X, Taori R, Zhang T, Gulrajani I, Ba J, Guestrin C, Liang P, Hashimoto T B (2023) AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback. 
In: Advances in Neural Information Processing Systems 36.", + "entry": "Dubois Y, Li X, Taori R, Zhang T, Gulrajani I, Ba J, Guestrin C, Liang P, Hashimoto T B (2023) AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-1308.", "first_author": "Dubois", "year": "2023", "title": "AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback", - "doi": "", + "doi": "10.52202/075280-1308", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-1308", "key": "dubois:2023", "fingerprint": "alpacafarm a simulation framework for methods that learn from human feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", @@ -2230,35 +2285,31 @@ "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 453, "entry_no": 10, - "entry": "Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research.", + "entry": "Liang P, Bommasani R, Lee T, et al. (2023) Holistic Evaluation of Language Models. Transactions on Machine Learning Research. arXiv:2211.09110.", "first_author": "Liang", "year": "2023", "title": "Holistic Evaluation of Language Models", "doi": "", - "arxiv": "", + "arxiv": "2211.09110", "url": "", "key": "liang:2023", "fingerprint": "holistic evaluation of language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 455, "entry_no": 11, - "entry": "Liu Y, Iter D, Xu Y, et al. (2023) G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp 2511-2522.", + "entry": "Liu Y, Iter D, Xu Y, et al. 
(2023) G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp 2511-2522. arXiv:2303.16634.", "first_author": "Liu", "year": "2023", "title": "G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment", "doi": "", - "arxiv": "", + "arxiv": "2303.16634", "url": "", "key": "liu:2023", "fingerprint": "g eval nlg evaluation using gpt 4 with better human alignment", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", @@ -2279,120 +2330,106 @@ "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 459, "entry_no": 13, - "entry": "Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P, Leike J, Lowe R (2022) Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems 35, pp 27730-27744.", + "entry": "Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P, Leike J, Lowe R (2022) Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems 35, pp 27730-27744. 
arXiv:2203.02155.", "first_author": "Ouyang", "year": "2022", "title": "Training language models to follow instructions with human feedback", "doi": "", - "arxiv": "", + "arxiv": "2203.02155", "url": "", "key": "ouyang:2022", "fingerprint": "training language models to follow instructions with human feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 461, "entry_no": 14, - "entry": "Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448.", + "entry": "Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. arXiv:2202.03286.", "first_author": "Perez", "year": "2022", "title": "Red Teaming Language Models with Language Models", "doi": "", - "arxiv": "", + "arxiv": "2202.03286", "url": "", "key": "perez:2022", "fingerprint": "red teaming language models with language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 463, "entry_no": 15, - "entry": "Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In: Advances in Neural Information Processing Systems 36.", + "entry": "Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In: Advances in Neural Information Processing Systems 36. 
arXiv:2305.18290.", "first_author": "Rafailov", "year": "2023", "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", "doi": "", - "arxiv": "", + "arxiv": "2305.18290", "url": "", "key": "rafailov:2023", "fingerprint": "direct preference optimization your language model is secretly a reward model", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 465, "entry_no": 16, - "entry": "Ribeiro M T, Wu T, Guestrin C, Singh S (2020) Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp 4902-4912.", + "entry": "Ribeiro M T, Wu T, Guestrin C, Singh S (2020) Beyond Accuracy: Behavioral Testing of NLP Models with CheckList. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp 4902-4912. https://doi.org/10.18653/v1/2020.acl-main.442.", "first_author": "Ribeiro", "year": "2020", "title": "Beyond Accuracy: Behavioral Testing of NLP Models with CheckList", - "doi": "", + "doi": "10.18653/v1/2020.acl-main.442", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2020.acl-main.442", "key": "ribeiro:2020", "fingerprint": "beyond accuracy behavioral testing of nlp models with checklist", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 467, "entry_no": 17, - "entry": "Shumailov I, Shumaylov Z, Zhao Y, et al. (2024) AI models collapse when trained on recursively generated data. Nature 631:755-759.", + "entry": "Shumailov I, Shumaylov Z, Zhao Y, et al. (2024) AI models collapse when trained on recursively generated data. Nature 631:755-759. 
https://doi.org/10.1038/s41586-024-07566-y.", "first_author": "Shumailov", "year": "2024", "title": "AI models collapse when trained on recursively generated data", - "doi": "", + "doi": "10.1038/s41586-024-07566-y", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1038/s41586-024-07566-y", "key": "shumailov:2024", "fingerprint": "ai models collapse when trained on recursively generated data", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 469, "entry_no": 18, - "entry": "Wang Y, Kordi Y, Mishra S, et al. (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484-13508.", + "entry": "Wang Y, Kordi Y, Mishra S, et al. (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484-13508. https://doi.org/10.18653/v1/2023.acl-long.754.", "first_author": "Wang", "year": "2023", "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions", - "doi": "", + "doi": "10.18653/v1/2023.acl-long.754", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2023.acl-long.754", "key": "wang:2023", "fingerprint": "self instruct aligning language models with self generated instructions", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 471, "entry_no": 19, - "entry": "Zheng L, Chiang W-L, Sheng Y, et al. (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36.", + "entry": "Zheng L, Chiang W-L, Sheng Y, et al. (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36. 
arXiv:2306.05685.", "first_author": "Zheng", "year": "2023", "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", "doi": "", - "arxiv": "", + "arxiv": "2306.05685", "url": "", "key": "zheng:2023", "fingerprint": "judging llm as a judge with mt bench and chatbot arena", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", @@ -2413,35 +2450,31 @@ "file": "docs/zh/part10/ch33_labeling_synthesis_evaluation.md", "line": 475, "entry_no": 21, - "entry": "Zhou C, Liu P, Xu P, et al. (2023) LIMA: Less Is More for Alignment. In: Advances in Neural Information Processing Systems 36.", + "entry": "Zhou C, Liu P, Xu P, et al. (2023) LIMA: Less Is More for Alignment. In: Advances in Neural Information Processing Systems 36. arXiv:2305.11206.", "first_author": "Zhou", "year": "2023", "title": "LIMA: Less Is More for Alignment", "doi": "", - "arxiv": "", + "arxiv": "2305.11206", "url": "", "key": "zhou:2023", "fingerprint": "lima less is more for alignment", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 471, "entry_no": 1, - "entry": "Amershi S, Begel A, Bird C, Devanbu P, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291-300.", + "entry": "Amershi S, Begel A, Bird C, Devanbu P, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291-300. 
https://doi.org/10.1109/icse-seip.2019.00042.", "first_author": "Amershi", "year": "2019", "title": "Software Engineering for Machine Learning: A Case Study", - "doi": "", + "doi": "10.1109/icse-seip.2019.00042", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icse-seip.2019.00042", "key": "amershi:2019", "fingerprint": "software engineering for machine learning a case study", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", @@ -2464,18 +2497,16 @@ "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 475, "entry_no": 3, - "entry": "Dang Y, Lin Q, Huang P (2019) AIOps: Real-World Challenges and Research Innovations. In: Proceedings of the 41st International Conference on Software Engineering: Companion Proceedings, pp 4-5.", + "entry": "Dang Y, Lin Q, Huang P (2019) AIOps: Real-World Challenges and Research Innovations. In: Proceedings of the 41st International Conference on Software Engineering: Companion Proceedings, pp 4-5. https://doi.org/10.1109/icse-companion.2019.00023.", "first_author": "Dang", "year": "2019", "title": "AIOps: Real-World Challenges and Research Innovations", - "doi": "", + "doi": "10.1109/icse-companion.2019.00023", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icse-companion.2019.00023", "key": "dang:2019", "fingerprint": "aiops real world challenges and research innovations", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", @@ -2549,18 +2580,16 @@ "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 485, "entry_no": 8, - "entry": "Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale Machine Learning Systems in Real-world Industrial Settings: A Review of Challenges and Solutions. 
Information and Software Technology 127:106368.", + "entry": "Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale Machine Learning Systems in Real-world Industrial Settings: A Review of Challenges and Solutions. Information and Software Technology 127:106368. https://doi.org/10.1016/j.infsof.2020.106368.", "first_author": "Lwakatare", "year": "2020", "title": "Large-scale Machine Learning Systems in Real-world Industrial Settings: A Review of Challenges and Solutions", - "doi": "", + "doi": "10.1016/j.infsof.2020.106368", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1016/j.infsof.2020.106368", "key": "lwakatare:2020", "fingerprint": "large scale machine learning systems in real world industrial settings a review of challenges and solutions", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", @@ -2600,69 +2629,61 @@ "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 491, "entry_no": 11, - "entry": "Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29.", + "entry": "Paleyes A, Urma R-G, Lawrence N D (2022) Challenges in Deploying Machine Learning: A Survey of Case Studies. ACM Computing Surveys 55(6):1-29. 
https://doi.org/10.1145/3533378.", "first_author": "Paleyes", "year": "2022", "title": "Challenges in Deploying Machine Learning: A Survey of Case Studies", - "doi": "", + "doi": "10.1145/3533378", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3533378", "key": "paleyes:2022", "fingerprint": "challenges in deploying machine learning a survey of case studies", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 493, "entry_no": 12, - "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15.", + "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518.", "first_author": "Sambasivan", "year": "2021", "title": "\"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI", - "doi": "", + "doi": "10.1145/3411764.3445518", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3411764.3445518", "key": "sambasivan:2021", "fingerprint": "everyone wants to do the model work not the data work data cascades in high stakes ai", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 495, "entry_no": 13, - "entry": "Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23.", + "entry": "Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. 
In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. https://doi.org/10.1109/synasc51798.2020.00015.", "first_author": "Tamburri", "year": "2020", "title": "Sustainable MLOps: Trends and Challenges", - "doi": "", + "doi": "10.1109/synasc51798.2020.00015", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/synasc51798.2020.00015", "key": "tamburri:2020", "fingerprint": "sustainable mlops trends and challenges", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 497, "entry_no": 14, - "entry": "Testi M, Ballabio M, Frontoni E, Iannello G, Moccia S, Soda P, Vessio G (2022) MLOps: A Taxonomy and a Methodology. IEEE Access 10:63606-63618.", + "entry": "Testi M, Ballabio M, Frontoni E, Iannello G, Moccia S, Soda P, Vessio G (2022) MLOps: A Taxonomy and a Methodology. IEEE Access 10:63606-63618. https://doi.org/10.1109/access.2022.3181730.", "first_author": "Testi", "year": "2022", "title": "MLOps: A Taxonomy and a Methodology", - "doi": "", + "doi": "10.1109/access.2022.3181730", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/access.2022.3181730", "key": "testi:2022", "fingerprint": "mlops a taxonomy and a methodology", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", @@ -2685,18 +2706,16 @@ "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 501, "entry_no": 16, - "entry": "Vela D, Sharp A, Zhang R, Nguyen T, Hoang A, Pianykh O S (2022) Temporal quality degradation in AI models. Scientific Reports 12:11654.", + "entry": "Vela D, Sharp A, Zhang R, Nguyen T, Hoang A, Pianykh O S (2022) Temporal quality degradation in AI models. Scientific Reports 12:11654. 
https://doi.org/10.1038/s41598-022-15245-z.", "first_author": "Vela", "year": "2022", "title": "Temporal quality degradation in AI models", - "doi": "", + "doi": "10.1038/s41598-022-15245-z", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1038/s41598-022-15245-z", "key": "vela:2022", "fingerprint": "temporal quality degradation in ai models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch34_dataops_agent.md", @@ -2781,35 +2800,31 @@ "file": "docs/zh/part10/ch35_security_permission_collaboration.md", "line": 488, "entry_no": 5, - "entry": "Greshake K, Abdelnabi S, Mishra S, et al. (2023) Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. In: Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, pp 79-90.", + "entry": "Greshake K, Abdelnabi S, Mishra S, et al. (2023) Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. In: Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, pp 79-90. 
https://doi.org/10.1145/3605764.3623985.", "first_author": "Greshake", "year": "2023", "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", - "doi": "", + "doi": "10.1145/3605764.3623985", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3605764.3623985", "key": "greshake:2023", "fingerprint": "not what you ve signed up for compromising real world llm integrated applications with indirect prompt injection", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch35_security_permission_collaboration.md", "line": 490, "entry_no": 6, - "entry": "Hendrycks D, Mazeika M, Zou A, Patel S, Zhu C, Navarro J, Mu J, Song D, Li B, Steinhardt J (2021) The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 8340-8349.", + "entry": "Hendrycks D, Mazeika M, Zou A, Patel S, Zhu C, Navarro J, Mu J, Song D, Li B, Steinhardt J (2021) The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 8340-8349. 
https://doi.org/10.1109/iccv48922.2021.00823.", "first_author": "Hendrycks", "year": "2021", "title": "The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization", - "doi": "", + "doi": "10.1109/iccv48922.2021.00823", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/iccv48922.2021.00823", "key": "hendrycks:2021", "fingerprint": "the many faces of robustness a critical analysis of out of distribution generalization", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch35_security_permission_collaboration.md", @@ -2862,18 +2877,16 @@ "file": "docs/zh/part10/ch35_security_permission_collaboration.md", "line": 498, "entry_no": 10, - "entry": "Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448.", + "entry": "Perez E, Huang S, Song F, Cai T, Ring R, Aslanides J, Glaese A, McAleese N, Irving G (2022) Red Teaming Language Models with Language Models. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp 3419-3448. arXiv:2202.03286.", "first_author": "Perez", "year": "2022", "title": "Red Teaming Language Models with Language Models", "doi": "", - "arxiv": "", + "arxiv": "2202.03286", "url": "", "key": "perez:2022", "fingerprint": "red teaming language models with language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch35_security_permission_collaboration.md", @@ -2973,18 +2986,16 @@ "file": "docs/zh/part10/ch35_security_permission_collaboration.md", "line": 512, "entry_no": 17, - "entry": "Zhan Q, Liang Z, Ying Z, Kang D (2024) InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. 
In: Findings of the Association for Computational Linguistics: ACL 2024, pp 10471-10506.", + "entry": "Zhan Q, Liang Z, Ying Z, Kang D (2024) InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents. In: Findings of the Association for Computational Linguistics: ACL 2024, pp 10471-10506. https://doi.org/10.18653/v1/2024.findings-acl.624.", "first_author": "Zhan", "year": "2024", "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", - "doi": "", + "doi": "10.18653/v1/2024.findings-acl.624", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2024.findings-acl.624", "key": "zhan:2024", "fingerprint": "injecagent benchmarking indirect prompt injections in tool integrated large language model agents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch35_security_permission_collaboration.md", @@ -3056,18 +3067,16 @@ "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1186, "entry_no": 4, - "entry": "Spiekermann S, Cranor L F (2009) Engineering Privacy. IEEE Transactions on Software Engineering, 35(1), 67-82.", + "entry": "Spiekermann S, Cranor L F (2009) Engineering Privacy. IEEE Transactions on Software Engineering, 35(1), 67-82. 
https://doi.org/10.1109/tse.2008.88.", "first_author": "Spiekermann", "year": "2009", "title": "Engineering Privacy", - "doi": "", + "doi": "10.1109/tse.2008.88", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/tse.2008.88", "key": "spiekermann:2009", "fingerprint": "engineering privacy", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", @@ -3107,35 +3116,31 @@ "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1192, "entry_no": 7, - "entry": "Kosenkov O, Zabardast E, Fucci D, Mendez D, Unterkalmsteiner M (2026) Privacy by Design: Aligning GDPR and Software Engineering Specifications with a Requirements Engineering Approach. Information and Software Technology, 190, 107946.", + "entry": "Kosenkov O, Zabardast E, Fucci D, Mendez D, Unterkalmsteiner M (2026) Privacy by Design: Aligning GDPR and Software Engineering Specifications with a Requirements Engineering Approach. Information and Software Technology, 190, 107946. https://doi.org/10.1016/j.infsof.2025.107946.", "first_author": "Kosenkov", "year": "2026", "title": "Privacy by Design: Aligning GDPR and Software Engineering Specifications with a Requirements Engineering Approach", - "doi": "", + "doi": "10.1016/j.infsof.2025.107946", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1016/j.infsof.2025.107946", "key": "kosenkov:2026", "fingerprint": "privacy by design aligning gdpr and software engineering specifications with a requirements engineering approach", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1194, "entry_no": 8, - "entry": "Hoepman J-H (2014) Privacy Design Strategies. In IFIP International Information Security Conference, pp 446-459.", + "entry": "Hoepman J-H (2014) Privacy Design Strategies. 
In IFIP International Information Security Conference, pp 446-459. https://doi.org/10.1007/978-3-642-55415-5_38.", "first_author": "Hoepman", "year": "2014", "title": "Privacy Design Strategies", - "doi": "", + "doi": "10.1007/978-3-642-55415-5_38", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-642-55415-5_38", "key": "hoepman:2014", "fingerprint": "privacy design strategies", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", @@ -3158,18 +3163,16 @@ "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1198, "entry_no": 10, - "entry": "Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19.", + "entry": "Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. https://doi.org/10.1007/978-3-540-79228-4_1.", "first_author": "Dwork", "year": "2008", "title": "Differential Privacy: A Survey of Results", - "doi": "", + "doi": "10.1007/978-3-540-79228-4_1", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-540-79228-4_1", "key": "dwork:2008", "fingerprint": "differential privacy a survey of results", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", @@ -3192,35 +3195,31 @@ "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1202, "entry_no": 12, - "entry": "Anthonysamy P, Rashid A, Chitchyan R (2017) Privacy Requirements: Present & Future. In 2017 IEEE/ACM 39th International Conference on Software Engineering: Software Engineering in Society Track (ICSE-SEIS), pp 13-22.", + "entry": "Anthonysamy P, Rashid A, Chitchyan R (2017) Privacy Requirements: Present & Future. 
In 2017 IEEE/ACM 39th International Conference on Software Engineering: Software Engineering in Society Track (ICSE-SEIS), pp 13-22. https://doi.org/10.1109/icse-seis.2017.3.", "first_author": "Anthonysamy", "year": "2017", "title": "Privacy Requirements: Present & Future", - "doi": "", + "doi": "10.1109/icse-seis.2017.3", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icse-seis.2017.3", "key": "anthonysamy:2017", "fingerprint": "privacy requirements present future", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1204, "entry_no": 13, - "entry": "Oetzel M C, Spiekermann S (2014) A Systematic Methodology for Privacy Impact Assessments: A Design Science Approach. European Journal of Information Systems, 23(2), 126-150.", + "entry": "Oetzel M C, Spiekermann S (2014) A Systematic Methodology for Privacy Impact Assessments: A Design Science Approach. European Journal of Information Systems, 23(2), 126-150. https://doi.org/10.1057/ejis.2013.18.", "first_author": "Oetzel", "year": "2014", "title": "A Systematic Methodology for Privacy Impact Assessments: A Design Science Approach", - "doi": "", + "doi": "10.1057/ejis.2013.18", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1057/ejis.2013.18", "key": "oetzel:2014", "fingerprint": "a systematic methodology for privacy impact assessments a design science approach", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", @@ -3375,18 +3374,16 @@ "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 444, "entry_no": 2, - "entry": "Zhu L, Liu Z, Han S (2019) Deep Leakage from Gradients. Advances in Neural Information Processing Systems, 32.", + "entry": "Zhu L, Liu Z, Han S (2019) Deep Leakage from Gradients. Advances in Neural Information Processing Systems, 32. 
arXiv:1906.08935.", "first_author": "Zhu", "year": "2019", "title": "Deep Leakage from Gradients", "doi": "", - "arxiv": "", + "arxiv": "1906.08935", "url": "", "key": "zhu:2019", "fingerprint": "deep leakage from gradients", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", @@ -3460,35 +3457,31 @@ "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 454, "entry_no": 7, - "entry": "McMahan H B, Ramage D, Talwar K, Zhang L (2018) Learning Differentially Private Recurrent Language Models. International Conference on Learning Representations.", + "entry": "McMahan H B, Ramage D, Talwar K, Zhang L (2018) Learning Differentially Private Recurrent Language Models. International Conference on Learning Representations. arXiv:1710.06963.", "first_author": "McMahan", "year": "2018", "title": "Learning Differentially Private Recurrent Language Models", "doi": "", - "arxiv": "", + "arxiv": "1710.06963", "url": "", "key": "mcmahan:2018", "fingerprint": "learning differentially private recurrent language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 456, "entry_no": 8, - "entry": "Kairouz P, McMahan H B (2021) Advances and Open Problems in Federated Learning. Foundations and Trends in Machine Learning, 14(1-2), 1-210.", + "entry": "Kairouz P, McMahan H B (2021) Advances and Open Problems in Federated Learning. Foundations and Trends in Machine Learning, 14(1-2), 1-210. 
https://doi.org/10.1561/2200000083.", "first_author": "Kairouz", "year": "2021", "title": "Advances and Open Problems in Federated Learning", - "doi": "", + "doi": "10.1561/2200000083", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1561/2200000083", "key": "kairouz:2021", "fingerprint": "advances and open problems in federated learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", @@ -3528,18 +3521,16 @@ "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 462, "entry_no": 11, - "entry": "Bonawitz K, Ivanov V, Kreuter B, Marcedone A, McMahan H B, Patel S, Ramage D, Segal A, Seth K (2017) Practical Secure Aggregation for Privacy-Preserving Machine Learning. In Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, pp 1175-1191.", + "entry": "Bonawitz K, Ivanov V, Kreuter B, Marcedone A, McMahan H B, Patel S, Ramage D, Segal A, Seth K (2017) Practical Secure Aggregation for Privacy-Preserving Machine Learning. In Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, pp 1175-1191. 
https://doi.org/10.1145/3133956.3133982.", "first_author": "Bonawitz", "year": "2017", "title": "Practical Secure Aggregation for Privacy-Preserving Machine Learning", - "doi": "", + "doi": "10.1145/3133956.3133982", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3133956.3133982", "key": "bonawitz:2017", "fingerprint": "practical secure aggregation for privacy preserving machine learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", @@ -3594,18 +3585,16 @@ "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 470, "entry_no": 15, - "entry": "Mohassel P, Zhang Y (2017) SecureML: A System for Scalable Privacy-Preserving Machine Learning. In 2017 IEEE Symposium on Security and Privacy (SP), pp 19-38.", + "entry": "Mohassel P, Zhang Y (2017) SecureML: A System for Scalable Privacy-Preserving Machine Learning. In 2017 IEEE Symposium on Security and Privacy (SP), pp 19-38. https://doi.org/10.1109/sp.2017.12.", "first_author": "Mohassel", "year": "2017", "title": "SecureML: A System for Scalable Privacy-Preserving Machine Learning", - "doi": "", + "doi": "10.1109/sp.2017.12", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/sp.2017.12", "key": "mohassel:2017", "fingerprint": "secureml a system for scalable privacy preserving machine learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", @@ -3645,35 +3634,31 @@ "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 476, "entry_no": 18, - "entry": "Hu E J, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W, others (2022) LoRA: Low-Rank Adaptation of Large Language Models. 
International Conference on Learning Representations.", + "entry": "Hu E J, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, Wang L, Chen W, others (2022) LoRA: Low-Rank Adaptation of Large Language Models. International Conference on Learning Representations. arXiv:2106.09685.", "first_author": "Hu", "year": "2022", "title": "LoRA: Low-Rank Adaptation of Large Language Models", "doi": "", - "arxiv": "", + "arxiv": "2106.09685", "url": "", "key": "hu:2022", "fingerprint": "lora low rank adaptation of large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 478, "entry_no": 19, - "entry": "Kuang W, Qian B, Li Z, Chen D, Gao D, Pan X, Xie Y, Li Y, Ding B, Zhou J (2024) FederatedScope-LLM: A Comprehensive Package for Fine-Tuning Large Language Models in Federated Learning. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 5260-5271.", + "entry": "Kuang W, Qian B, Li Z, Chen D, Gao D, Pan X, Xie Y, Li Y, Ding B, Zhou J (2024) FederatedScope-LLM: A Comprehensive Package for Fine-Tuning Large Language Models in Federated Learning. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp 5260-5271. 
https://doi.org/10.1145/3637528.3671573.", "first_author": "Kuang", "year": "2024", "title": "FederatedScope-LLM: A Comprehensive Package for Fine-Tuning Large Language Models in Federated Learning", - "doi": "", + "doi": "10.1145/3637528.3671573", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3637528.3671573", "key": "kuang:2024", "fingerprint": "federatedscope llm a comprehensive package for fine tuning large language models in federated learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", @@ -3696,39 +3681,35 @@ "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", "line": 482, "entry_no": 21, - "entry": "Sheller M J, Edwards B, Reina G A, others (2020) Federated Learning in Medicine: Facilitating Multi-Institutional Collaborations without Sharing Patient Data. Scientific Reports, 10(1), 12598.", + "entry": "Sheller M J, Edwards B, Reina G A, others (2020) Federated Learning in Medicine: Facilitating Multi-Institutional Collaborations without Sharing Patient Data. Scientific Reports, 10(1), 12598. https://doi.org/10.1038/s41598-020-69250-1.", "first_author": "Sheller", "year": "2020", "title": "Federated Learning in Medicine: Facilitating Multi-Institutional Collaborations without Sharing Patient Data", - "doi": "", + "doi": "10.1038/s41598-020-69250-1", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1038/s41598-020-69250-1", "key": "sheller:2020", "fingerprint": "federated learning in medicine facilitating multi institutional collaborations without sharing patient data", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", "line": 597, "entry_no": 1, - "entry": "1. - Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). 
The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Benchmarks Track. https://arxiv.org/abs/2406.17557.", - "first_author": "", + "entry": "Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Benchmarks Track. https://arxiv.org/abs/2406.17557.", + "first_author": "Penedo", "year": "2024", "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale", "doi": "", "arxiv": "", "url": "https://arxiv.org/abs/2406.17557", - "key": "", + "key": "penedo:2024", "fingerprint": "the fineweb datasets decanting the web for the finest text data at scale", - "format_issues": [ - "missing-first-author" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 598, + "line": 599, "entry_no": 2, "entry": "Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb.", "first_author": "Hugging", @@ -3743,7 +3724,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 599, + "line": 601, "entry_no": 3, "entry": "Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py.", "first_author": "Hugging", @@ -3758,7 +3739,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 600, + "line": 603, "entry_no": 4, "entry": "Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove.", "first_author": "Penedo", @@ -3773,7 +3754,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 601, + "line": 605, "entry_no": 5, "entry": "Luccioni, S., & Viviano, J. (2021). What's in the Box? 
A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732.", "first_author": "Luccioni", @@ -3788,24 +3769,22 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 603, + "line": 607, "entry_no": 6, - "entry": "2. - Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. https://arxiv.org/abs/2402.00159.", - "first_author": "", + "entry": "Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. https://arxiv.org/abs/2402.00159.", + "first_author": "Soldaini", "year": "2024", "title": "Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research", "doi": "", "arxiv": "", "url": "https://arxiv.org/abs/2402.00159", - "key": "", + "key": "soldaini:2024", "fingerprint": "dolma an open corpus of three trillion tokens for language model pretraining research", - "format_issues": [ - "missing-first-author" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 604, + "line": 609, "entry_no": 7, "entry": "Allen Institute for AI. (2023). Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64.", "first_author": "Allen", @@ -3820,7 +3799,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 605, + "line": 611, "entry_no": 8, "entry": "AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma.", "first_author": "AllenAI.", @@ -3835,7 +3814,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 606, + "line": 613, "entry_no": 9, "entry": "AllenAI. (2026). 
Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma.", "first_author": "AllenAI.", @@ -3850,7 +3829,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 607, + "line": 615, "entry_no": 10, "entry": "AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md.", "first_author": "AllenAI.", @@ -3865,7 +3844,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 608, + "line": 617, "entry_no": 11, "entry": "Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). OLMo: Accelerating the Science of Language Models. https://arxiv.org/abs/2402.00838.", "first_author": "Groeneveld", @@ -3895,7 +3874,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 265, + "line": 266, "entry_no": 2, "entry": "LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. https://laion.ai/blog/laion-5b/.", "first_author": "LAION.", @@ -3910,7 +3889,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 266, + "line": 268, "entry_no": 3, "entry": "LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec.", "first_author": "LAION-AI.", @@ -3925,7 +3904,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 267, + "line": 270, "entry_no": 4, "entry": "Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org/abs/2304.14108.", "first_author": "Gadre", @@ -3940,7 +3919,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 268, + "line": 272, "entry_no": 5, "entry": "DataComp Team. (2026). DataComp Benchmark Documentation. 
https://www.datacomp.ai/dcclip/.", "first_author": "DataComp", @@ -3955,7 +3934,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 269, + "line": 274, "entry_no": 6, "entry": "ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp.", "first_author": "ML", @@ -4117,18 +4096,16 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 746, "entry_no": 10, - "entry": "Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*.", + "entry": "Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. https://doi.org/10.1109/wacv48630.2021.00225.", "first_author": "Mathew", "year": "2021", "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", + "doi": "10.1109/wacv48630.2021.00225", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv48630.2021.00225", "key": "mathew:2021", "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", @@ -4168,18 +4145,16 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 752, "entry_no": 13, - "entry": "Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*.", + "entry": "Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. arXiv:2305.18290.", "first_author": "Rafailov", "year": "2024", - "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model", + "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. 
Proc. NeurIPS", "doi": "", - "arxiv": "", + "arxiv": "2305.18290", "url": "", "key": "rafailov:2024", - "fingerprint": "direct preference optimization your language model is secretly a reward model", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "direct preference optimization your language model is secretly a reward model proc neurips", + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", @@ -4379,7 +4354,7 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 778, "entry_no": 26, - "entry": "Hunyuan Vision Team, Lyu, P., Wan, X., et al. (2025). HunyuanOCR Technical Report. *arXiv preprint*.", + "entry": "Hunyuan Vision Team (2025). HunyuanOCR Technical Report. *arXiv preprint*.", "first_author": "Hunyuan", "year": "2025", "title": "HunyuanOCR Technical Report", @@ -4475,35 +4450,31 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 790, "entry_no": 32, - "entry": "Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*.", + "entry": "Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. 
https://doi.org/10.1109/cvpr52688.2022.00459.", "first_author": "Smock", "year": "2022", "title": "PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents", - "doi": "", + "doi": "10.1109/cvpr52688.2022.00459", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52688.2022.00459", "key": "smock:2022", "fingerprint": "pubtables 1m towards comprehensive table extraction from unstructured documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 792, "entry_no": 33, - "entry": "Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*.", + "entry": "Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. https://doi.org/10.18653/v1/2021.acl-long.254.", "first_author": "Zhu", "year": "2021", "title": "TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance", - "doi": "", + "doi": "10.18653/v1/2021.acl-long.254", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2021.acl-long.254", "key": "zhu:2021", "fingerprint": "tat qa a question answering benchmark on a hybrid of tabular and textual content in finance", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", @@ -4539,35 +4510,31 @@ "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 827, "entry_no": 1, - "entry": "Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022.", + "entry": "Masry, A., Long, D. 
X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. https://doi.org/10.18653/v1/2022.findings-acl.177.", "first_author": "Masry", "year": "2022", "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning", - "doi": "", + "doi": "10.18653/v1/2022.findings-acl.177", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2022.findings-acl.177", "key": "masry:2022", "fingerprint": "chartqa a benchmark for question answering about charts with visual and logical reasoning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 829, "entry_no": 2, - "entry": "Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. WACV 2020.", + "entry": "Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. WACV 2020. https://doi.org/10.1109/wacv45572.2020.9093523.", "first_author": "Methani", "year": "2020", "title": "PlotQA: Reasoning over Scientific Plots", - "doi": "", + "doi": "10.1109/wacv45572.2020.9093523", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv45572.2020.9093523", "key": "methani:2020", "fingerprint": "plotqa reasoning over scientific plots", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", @@ -4588,52 +4555,46 @@ "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 833, "entry_no": 4, - "entry": "Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018.", + "entry": "Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018. 
https://doi.org/10.1109/cvpr.2018.00592.", "first_author": "Kafle", "year": "2018", "title": "DVQA: Understanding Data Visualizations via Question Answering", - "doi": "", + "doi": "10.1109/cvpr.2018.00592", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr.2018.00592", "key": "kafle:2018", "fingerprint": "dvqa understanding data visualizations via question answering", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 835, "entry_no": 5, - "entry": "Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021.", + "entry": "Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225.", "first_author": "Mathew", "year": "2021", "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", + "doi": "10.1109/wacv48630.2021.00225", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv48630.2021.00225", "key": "mathew:2021", "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 837, "entry_no": 6, - "entry": "Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). Chartqapro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 19123-19151).", + "entry": "Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). Chartqapro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 19123-19151). 
https://doi.org/10.18653/v1/2025.findings-acl.978.", "first_author": "Masry", "year": "2025", "title": "Chartqapro: A more diverse and challenging benchmark for chart question answering", - "doi": "", + "doi": "10.18653/v1/2025.findings-acl.978", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2025.findings-acl.978", "key": "masry:2025", "fingerprint": "chartqapro a more diverse and challenging benchmark for chart question answering", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", @@ -4750,18 +4711,16 @@ "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 853, "entry_no": 14, - "entry": "Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations.", + "entry": "Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. arXiv:2210.03629.", "first_author": "Yao", "year": "2023", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "doi": "", - "arxiv": "", + "arxiv": "2210.03629", "url": "", "key": "yao:2023", "fingerprint": "react synergizing reasoning and acting in language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", @@ -4985,18 +4944,16 @@ "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", "line": 452, "entry_no": 1, - "entry": "1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022.", + "entry": "1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). 
Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. arXiv:2201.11903.", "first_author": "Wei", "year": "2022", - "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", + "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022", "doi": "", - "arxiv": "", + "arxiv": "2201.11903", "url": "", "key": "wei:2022", - "fingerprint": "chain of thought prompting elicits reasoning in large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "chain of thought prompting elicits reasoning in large language models neurips 2022", + "format_issues": [] }, { "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", @@ -5032,35 +4989,31 @@ "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", "line": 455, "entry_no": 4, - "entry": "4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.", - "first_author": "DeepSeek-AI.", + "entry": "4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948.", + "first_author": "DeepSeek-AI", "year": "2025", "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", "doi": "", - "arxiv": "", + "arxiv": "2501.12948", "url": "", "key": "deepseekai:2025", "fingerprint": "deepseek r1 incentivizing reasoning capability in llms via reinforcement learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", "line": 456, "entry_no": 5, - "entry": "5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021.", + "entry": "5. 
Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874.", "first_author": "Hendrycks", "year": "2021", - "title": "Measuring Mathematical Problem Solving With the MATH Dataset", + "title": "Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021", "doi": "", - "arxiv": "", + "arxiv": "2103.03874", "url": "", "key": "hendrycks:2021", - "fingerprint": "measuring mathematical problem solving with the math dataset", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "measuring mathematical problem solving with the math dataset neurips 2021", + "format_issues": [] }, { "file": "docs/zh/part13/ch44_pretrain_recipes.md", @@ -5254,39 +5207,35 @@ "file": "docs/zh/part13/ch44_pretrain_recipes.md", "line": 308, "entry_no": 13, - "entry": "Su J, Lu Y, Pan S, Murtadha A, Wen B, Liu Y (2024) RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE). Neurocomputing 568:127063.", + "entry": "Su J, Lu Y, Pan S, Murtadha A, Wen B, Liu Y (2024) RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE). Neurocomputing 568:127063. https://doi.org/10.1016/j.neucom.2023.127063.", "first_author": "Su", "year": "2024", "title": "RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE)", - "doi": "", + "doi": "10.1016/j.neucom.2023.127063", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1016/j.neucom.2023.127063", "key": "su:2024", "fingerprint": "roformer enhanced transformer with rotary position embedding rope", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch44_pretrain_recipes.md", "line": 310, "entry_no": 14, - "entry": "Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. 
In: International Conference on Learning Representations.", + "entry": "Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. arXiv:2203.11171.", "first_author": "Wang", "year": "2023", "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", "doi": "", - "arxiv": "", + "arxiv": "2203.11171", "url": "", "key": "wang:2023", "fingerprint": "self consistency improves chain of thought reasoning in language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 375, + "line": 383, "entry_no": 1, "entry": "Wang Y, Kordi Y, Mishra S, Liu A, Smith N A, Khashabi D, Hajishirzi H (2023) Self-Instruct: Aligning Language Models with Self-Generated Instructions. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 13484-13508. https://doi.org/10.18653/v1/2023.acl-long.754.", "first_author": "Wang", @@ -5301,7 +5250,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 377, + "line": 385, "entry_no": 2, "entry": "Ouyang L, Wu J, Jiang X, Almeida D, Wainwright C, Mishkin P, Zhang C, Agarwal S, Slama K, Ray A, Schulman J, Hilton J, Kelton F, Miller L, Simens M, Askell A, Welinder P, Christiano P F, Leike J, Lowe R (2022) Training Language Models to Follow Instructions with Human Feedback. Advances in Neural Information Processing Systems, 35, 27730-27744.", "first_author": "Ouyang", @@ -5318,7 +5267,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 379, + "line": 387, "entry_no": 3, "entry": "Rafailov R, Sharma A, Mitchell E, Manning C D, Ermon S, Finn C (2023) Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. 
Advances in Neural Information Processing Systems, 36, 53728-53741.", "first_author": "Rafailov", @@ -5335,7 +5284,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 381, + "line": 389, "entry_no": 4, "entry": "Ethayarajh K, Xu W, Muennighoff N, Jurafsky D, Kiela D (2024) Model Alignment as Prospect Theoretic Optimization. Proceedings of the 41st International Conference on Machine Learning, pp 12634-12651.", "first_author": "Ethayarajh", @@ -5352,16 +5301,16 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 383, + "line": 391, "entry_no": 5, "entry": "Gheshlaghi Azar M, Guo Z D, Piot B, Munos R, Rowland M, Valko M, Calandriello D (2024) A General Theoretical Paradigm to Understand Learning from Human Preferences. Proceedings of the 27th International Conference on Artificial Intelligence and Statistics, pp 4447-4455.", - "first_author": "Gheshlaghi", + "first_author": "Gheshlaghi Azar", "year": "2024", "title": "A General Theoretical Paradigm to Understand Learning from Human Preferences", "doi": "", "arxiv": "", "url": "", - "key": "gheshlaghi:2024", + "key": "gheshlaghiazar:2024", "fingerprint": "a general theoretical paradigm to understand learning from human preferences", "format_issues": [ "missing-doi-arxiv-url" @@ -5369,7 +5318,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 385, + "line": 393, "entry_no": 6, "entry": "Grattafiori A, Dubey A, Jauhri A, Pandey A, Kadian A, Al-Dahle A, Letman A, Mathur A, Schelten A, Vaughan A, et al. (2024) The Llama 3 Herd of Models. 
arXiv preprint arXiv:2407.21783.", "first_author": "Grattafiori", @@ -5384,24 +5333,22 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 387, + "line": 395, "entry_no": 7, - "entry": "Lambert N, Morrison J, Pyatkin V, Huang S, Ivison H, Brahman F, Miranda L J V, Liu A, Dziri N, Lyu X, Gu Y, Malik S, Graf V, Hwang J D, Yang J, Le Bras R, Tafjord O, Wilhelm C, Soldaini L, Smith N A, Wang Y, Dasigi P, Hajishirzi H (2025) Tülu 3: Pushing Frontiers in Open Language Model Post-Training. Second Conference on Language Modeling.", + "entry": "Lambert N, Morrison J, Pyatkin V, Huang S, Ivison H, Brahman F, Miranda L J V, Liu A, Dziri N, Lyu X, Gu Y, Malik S, Graf V, Hwang J D, Yang J, Le Bras R, Tafjord O, Wilhelm C, Soldaini L, Smith N A, Wang Y, Dasigi P, Hajishirzi H (2025) Tülu 3: Pushing Frontiers in Open Language Model Post-Training. Second Conference on Language Modeling. arXiv preprint arXiv:2411.15124.", "first_author": "Lambert", "year": "2025", "title": "Tülu 3: Pushing Frontiers in Open Language Model Post-Training", "doi": "", - "arxiv": "", + "arxiv": "2411.15124", "url": "", "key": "lambert:2025", "fingerprint": "tulu 3 pushing frontiers in open language model post training", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 389, + "line": 397, "entry_no": 8, "entry": "Yang A, Li A, Yang B, Zhang B, Hui B, Zheng B, Yu B, Gao C, Huang C, Lv C, others (2025) Qwen3 Technical Report. arXiv preprint arXiv:2505.09388.", "first_author": "Yang", @@ -5416,7 +5363,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 391, + "line": 399, "entry_no": 9, "entry": "Wang Z, Dong Y, Delalleau O, Zeng J, Shen G, Egert D, Zhang J J, Sreedhar M N, Kuchaiev O (2024) HelpSteer 2: Open-Source Dataset for Training Top-Performing Reward Models. 
Advances in Neural Information Processing Systems, 37, 1474-1501.", "first_author": "Wang", @@ -5433,7 +5380,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 393, + "line": 401, "entry_no": 10, "entry": "Xu C, Sun Q, Zheng K, Geng X, Zhao P, Feng J, Tao C, Lin Q, Jiang D (2024) WizardLM: Empowering Large Pre-Trained Language Models to Follow Complex Instructions. In: International Conference on Learning Representations. arXiv:2304.12244.", "first_author": "Xu", @@ -5448,7 +5395,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 395, + "line": 403, "entry_no": 11, "entry": "Xu Z, Jiang F, Niu L, Deng Y, Poovendran R, Choi Y, Lin B Y (2025) Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing. International Conference on Learning Representations.", "first_author": "Xu", @@ -5465,7 +5412,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 397, + "line": 405, "entry_no": 12, "entry": "Liu A, Feng B, Xue B, Wang B, Wu B, Lu C, Zhao C, Deng C, Zhang C, Ruan C, others (2024a) DeepSeek-V3 Technical Report. arXiv preprint arXiv:2412.19437.", "first_author": "Liu", @@ -5480,7 +5427,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 399, + "line": 407, "entry_no": 13, "entry": "Liu C Y, Zeng L, Liu J, Yan R, He J, Wang C, Yan S, Liu Y, Zhou Y (2024b) Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs. arXiv preprint arXiv:2410.18451.", "first_author": "Liu", @@ -5495,7 +5442,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 401, + "line": 409, "entry_no": 14, "entry": "Singhal P, Goyal T, Xu J, Durrett G (2024) A Long Way to Go: Investigating Length Correlations in RLHF. 
First Conference on Language Modeling.", "first_author": "Singhal", @@ -5512,7 +5459,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 403, + "line": 411, "entry_no": 15, "entry": "Shao Z, Wang P, Zhu Q, Xu R, Song J, Bi X, Zhang H, Zhang M, Li Y, Wu Y, Guo D (2024) DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv preprint arXiv:2402.03300.", "first_author": "Shao", @@ -5527,7 +5474,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 405, + "line": 413, "entry_no": 16, "entry": "Zhou K, Zhu Y, Chen Z, Chen W, Zhao W X, Chen X, Lin Y, Wen J-R, Han J (2023) Don't Make Your LLM an Evaluation Benchmark Cheater. arXiv preprint arXiv:2311.01964.", "first_author": "Zhou", @@ -5542,41 +5489,37 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 407, + "line": 415, "entry_no": 17, - "entry": "Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623.", + "entry": "Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623. 
arXiv:2306.05685.", "first_author": "Zheng", "year": "2023", "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", "doi": "", - "arxiv": "", + "arxiv": "2306.05685", "url": "", "key": "zheng:2023", "fingerprint": "judging llm as a judge with mt bench and chatbot arena", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 409, + "line": 417, "entry_no": 18, - "entry": "Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations.", + "entry": "Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. arXiv:2305.20050.", "first_author": "Lightman", "year": "2024", "title": "Let's Verify Step by Step", "doi": "", - "arxiv": "", + "arxiv": "2305.20050", "url": "", "key": "lightman:2024", "fingerprint": "let s verify step by step", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 569, + "line": 589, "entry_no": 1, "entry": "Guo D, Yang D, Zhang H, Song J, Wang P, Zhu Q, Xu R, Zhang R, Ma S, Bi X, others (2025) DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv preprint arXiv:2501.12948.", "first_author": "Guo", @@ -5591,7 +5534,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 571, + "line": 591, "entry_no": 2, "entry": "Team Kimi, Du A, Gao B, Xing B, Jiang C, Chen C, Li C, Xiao C, Du C, Liao C, others (2025) Kimi k1.5: Scaling Reinforcement Learning with LLMs. 
arXiv preprint arXiv:2501.12599.", "first_author": "Kimi Team", @@ -5606,7 +5549,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 573, + "line": 593, "entry_no": 3, "entry": "Qwen Team (2025) QwQ-32B Model Card. Available at: https://huggingface.co/Qwen/QwQ-32B.", "first_author": "Qwen Team", @@ -5621,7 +5564,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 575, + "line": 595, "entry_no": 4, "entry": "Touvron H, Martin L, Stone K, Albert P, Almahairi A, Babaei Y, Bashlykov N, Batra S, Bhargava P, Bhosale S, others (2023) Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv preprint arXiv:2307.09288.", "first_author": "Touvron", @@ -5636,7 +5579,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 577, + "line": 597, "entry_no": 5, "entry": "Cobbe K, Kosaraju V, Bavarian M, Chen M, Jun H, Kaiser L, Plappert M, Tworek J, Hilton J, Nakano R, others (2021) Training Verifiers to Solve Math Word Problems. arXiv preprint arXiv:2110.14168.", "first_author": "Cobbe", @@ -5651,7 +5594,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 579, + "line": 599, "entry_no": 6, "entry": "Chen M, Tworek J, Jun H, Yuan Q, Pinto H P O, Kaplan J, Edwards H, Burda Y, Joseph N, Brockman G, others (2021) Evaluating Large Language Models Trained on Code. arXiv preprint arXiv:2107.03374.", "first_author": "Chen", @@ -5666,7 +5609,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 581, + "line": 601, "entry_no": 7, "entry": "Hendrycks D, Burns C, Kadavath S, Arora A, Basart S, Tang E, Song D, Steinhardt J (2021) Measuring Mathematical Problem Solving With the MATH Dataset. 
arXiv preprint arXiv:2103.03874.", "first_author": "Hendrycks", @@ -5681,7 +5624,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 583, + "line": 603, "entry_no": 8, "entry": "Meurer A, Smith C P, Paprocki M, Čertík O, Kirpichev S B, Rocklin M, Kumar A, Ivanov S, Moore J K, Singh S, Rathnayake T, Vig S, Granger B E, Muller R P, Bonazzi F, Gupta H, Vats S, Johansson F, Pedregosa F, Curry M J, Terrel A R, Roučka Š, Saboo A, Fernando I, Kulal S, Cimrman R, Scopatz A (2017) SymPy: symbolic computing in Python. PeerJ Computer Science 3:e103. https://doi.org/10.7717/peerj-cs.103.", "first_author": "Meurer", @@ -5696,7 +5639,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 585, + "line": 605, "entry_no": 9, "entry": "Guha E, Marten R, Keh S, Raoof N, Smyrnis G, Bansal H, Nezhurina M, Mercat J, Vu T, Sprague Z, others (2025) OpenThoughts: Data Recipes for Reasoning Models. arXiv preprint arXiv:2506.04178.", "first_author": "Guha", @@ -5711,7 +5654,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 587, + "line": 607, "entry_no": 10, "entry": "Zhou C, Liu P, Xu P, Iyer S, Sun J, Mao Y, Ma X, Efrat A, Yu P, Yu L, Zhang S, Ghosh G, Lewis M, Zettlemoyer L, Levy O (2023) LIMA: Less Is More for Alignment. Advances in Neural Information Processing Systems, 36, 55006-55021.", "first_author": "Zhou", @@ -5728,7 +5671,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 589, + "line": 609, "entry_no": 11, "entry": "Zelikman E, Wu Y, Mu J, Goodman N (2022) STaR: Bootstrapping Reasoning with Reasoning. 
Advances in Neural Information Processing Systems, 35, 15476-15488.", "first_author": "Zelikman", @@ -5745,7 +5688,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 591, + "line": 611, "entry_no": 12, "entry": "Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. Advances in Neural Information Processing Systems, 36, 46534-46594.", "first_author": "Madaan", @@ -5762,24 +5705,22 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 593, + "line": 613, "entry_no": 13, - "entry": "Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations.", + "entry": "Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. arXiv:2305.20050.", "first_author": "Lightman", "year": "2024", "title": "Let's Verify Step by Step", "doi": "", - "arxiv": "", + "arxiv": "2305.20050", "url": "", "key": "lightman:2024", "fingerprint": "let s verify step by step", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 595, + "line": 615, "entry_no": 14, "entry": "Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623.", "first_author": "Zheng", @@ -5796,7 +5737,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 597, + "line": 617, "entry_no": 15, "entry": "Gao L, Schulman J, Hilton J (2023) Scaling Laws for Reward Model Overoptimization. 
Proceedings of the 40th International Conference on Machine Learning, pp 10835-10866.", "first_author": "Gao", @@ -5813,7 +5754,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 599, + "line": 619, "entry_no": 16, "entry": "Hosseini A, Yuan X, Malkin N, Courville A, Sordoni A, Agarwal R (2024) V-STaR: Training Verifiers for Self-Taught Reasoners. arXiv preprint arXiv:2402.06457.", "first_author": "Hosseini", @@ -5828,7 +5769,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 601, + "line": 621, "entry_no": 17, "entry": "Shi F, Suzgun M, Freitag M, Wang X, Srivats S, Vosoughi S, Chung H W, Tay Y, Ruder S, Zhou D, others (2022) Language Models Are Multilingual Chain-of-Thought Reasoners. arXiv preprint arXiv:2210.03057.", "first_author": "Shi", @@ -5843,7 +5784,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 603, + "line": 623, "entry_no": 18, "entry": "Jaech A, Kalai A, Lerer A, Richardson A, El-Kishky A, Low A, Helyar A, Madry A, Beutel A, Carney A, others (2024) OpenAI o1 System Card. arXiv preprint arXiv:2412.16720.", "first_author": "Jaech", @@ -5858,24 +5799,22 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 605, + "line": 625, "entry_no": 19, - "entry": "Ott S, Hebenstreit K, Liévin V, others (2023) ThoughtSource: A Central Hub for Large Language Model Reasoning Data. Scientific Data, 10(1), 528.", + "entry": "Ott S, Hebenstreit K, Liévin V, others (2023) ThoughtSource: A Central Hub for Large Language Model Reasoning Data. Scientific Data, 10(1), 528. 
https://doi.org/10.1038/s41597-023-02433-3.", "first_author": "Ott", "year": "2023", "title": "ThoughtSource: A Central Hub for Large Language Model Reasoning Data", - "doi": "", + "doi": "10.1038/s41597-023-02433-3", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1038/s41597-023-02433-3", "key": "ott:2023", "fingerprint": "thoughtsource a central hub for large language model reasoning data", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 607, + "line": 627, "entry_no": 20, "entry": "Hsieh C-Y, Li C-L, Yeh C-K, Nakhost H, Fujii Y, Ratner A, Krishna R, Lee C-Y, Pfister T (2023) Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes. Findings of the Association for Computational Linguistics: ACL 2023, pp 8003-8017.", "first_author": "Hsieh", @@ -5892,20 +5831,18 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 609, + "line": 629, "entry_no": 21, - "entry": "Patil S G, Zhang T, Wang X, Gonzalez J E (2024) Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 38.", + "entry": "Patil S G, Zhang T, Wang X, Gonzalez J E (2024) Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 38. 
arXiv:2305.15334.", "first_author": "Patil", "year": "2024", "title": "Gorilla: Large Language Model Connected with Massive APIs", "doi": "", - "arxiv": "", + "arxiv": "2305.15334", "url": "", "key": "patil:2024", "fingerprint": "gorilla large language model connected with massive apis", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", @@ -5941,35 +5878,31 @@ "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 328, "entry_no": 3, - "entry": "Dao T, Fu D Y, Ermon S, Rudra A, Ré C (2022) FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In: Advances in Neural Information Processing Systems 35:16344-16359.", + "entry": "Dao T, Fu D Y, Ermon S, Rudra A, Ré C (2022) FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness. In: Advances in Neural Information Processing Systems 35:16344-16359. https://doi.org/10.52202/068431-1189.", "first_author": "Dao", "year": "2022", "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness", - "doi": "", + "doi": "10.52202/068431-1189", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/068431-1189", "key": "dao:2022", "fingerprint": "flashattention fast and memory efficient exact attention with io awareness", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 330, "entry_no": 4, - "entry": "Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36.", + "entry": "Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. 
Available at: https://arxiv.org/abs/2304.14108.", "first_author": "Gadre", "year": "2023", "title": "DataComp: In Search of the Next Generation of Multimodal Datasets", "doi": "", "arxiv": "", - "url": "", + "url": "https://arxiv.org/abs/2304.14108", "key": "gadre:2023", "fingerprint": "datacomp in search of the next generation of multimodal datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", @@ -6052,18 +5985,16 @@ "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 342, "entry_no": 10, - "entry": "Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pp 2200-2209.", + "entry": "Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, pp 2200-2209. https://doi.org/10.1109/wacv48630.2021.00225.", "first_author": "Mathew", "year": "2021", "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", + "doi": "10.1109/wacv48630.2021.00225", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv48630.2021.00225", "key": "mathew:2021", "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", @@ -6086,18 +6017,16 @@ "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 346, "entry_no": 12, - "entry": "Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. 
In: Advances in Neural Information Processing Systems 35:25278-25294.", + "entry": "Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. Available at: https://arxiv.org/abs/2210.08402.", "first_author": "Schuhmann", "year": "2022", "title": "LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models", "doi": "", "arxiv": "", - "url": "", + "url": "https://arxiv.org/abs/2210.08402", "key": "schuhmann:2022", "fingerprint": "laion 5b an open large scale dataset for training next generation image text models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", @@ -6118,18 +6047,16 @@ "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 350, "entry_no": 14, - "entry": "Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567.", + "entry": "Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567. 
https://doi.org/10.1109/cvpr52733.2024.00913.", "first_author": "Yue", "year": "2024", "title": "MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI", - "doi": "", + "doi": "10.1109/cvpr52733.2024.00913", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52733.2024.00913", "key": "yue:2024", "fingerprint": "mmmu a massive multi discipline multimodal understanding and reasoning benchmark for expert agi", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", @@ -6165,52 +6092,46 @@ "file": "docs/zh/part13/ch48_t2i_t2v.md", "line": 405, "entry_no": 2, - "entry": "Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36.", + "entry": "Gadre S Y, Ilharco G, Fang A, Hayase J, Ilharco G, Marten T, Wortsman M, Goyal S, Guha E, Jain H, others (2023) DataComp: In Search of the Next Generation of Multimodal Datasets. In: Advances in Neural Information Processing Systems 36. Available at: https://arxiv.org/abs/2304.14108.", "first_author": "Gadre", "year": "2023", "title": "DataComp: In Search of the Next Generation of Multimodal Datasets", "doi": "", "arxiv": "", - "url": "", + "url": "https://arxiv.org/abs/2304.14108", "key": "gadre:2023", "fingerprint": "datacomp in search of the next generation of multimodal datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch48_t2i_t2v.md", "line": 407, "entry_no": 3, - "entry": "Ghosh S, Bhatt U, Bhattacharya R, Parmar P, Patel S, Islam M, Reddy K K, others (2023) GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment. 
In: Advances in Neural Information Processing Systems 36.", + "entry": "Ghosh S, Bhatt U, Bhattacharya R, Parmar P, Patel S, Islam M, Reddy K K, others (2023) GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/075280-2270.", "first_author": "Ghosh", "year": "2023", "title": "GenEval: An Object-Focused Framework for Evaluating Text-to-Image Alignment", - "doi": "", + "doi": "10.52202/075280-2270", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-2270", "key": "ghosh:2023", "fingerprint": "geneval an object focused framework for evaluating text to image alignment", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch48_t2i_t2v.md", "line": 409, "entry_no": 4, - "entry": "Kirstain Y, Polyak A, Singer U, Matiana S, Penna J, Levy O (2023) Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation (PickScore). In: Advances in Neural Information Processing Systems 36.", + "entry": "Kirstain Y, Polyak A, Singer U, Matiana S, Penna J, Levy O (2023) Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation (PickScore). In: Advances in Neural Information Processing Systems 36. 
https://doi.org/10.52202/075280-1594.", "first_author": "Kirstain", "year": "2023", "title": "Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation (PickScore)", - "doi": "", + "doi": "10.52202/075280-1594", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-1594", "key": "kirstain:2023", "fingerprint": "pick a pic an open dataset of user preferences for text to image generation pickscore", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch48_t2i_t2v.md", @@ -6231,35 +6152,31 @@ "file": "docs/zh/part13/ch48_t2i_t2v.md", "line": 413, "entry_no": 6, - "entry": "Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294.", + "entry": "Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294. Available at: https://arxiv.org/abs/2210.08402.", "first_author": "Schuhmann", "year": "2022", "title": "LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models", "doi": "", "arxiv": "", - "url": "", + "url": "https://arxiv.org/abs/2210.08402", "key": "schuhmann:2022", "fingerprint": "laion 5b an open large scale dataset for training next generation image text models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch48_t2i_t2v.md", "line": 415, "entry_no": 7, - "entry": "Wang W, Lv Q, Yu W, Hong W, Qi J, Wang Y, Ji J, Yang Z, Zhao L, Song X, others (2023) CogVLM: Visual Expert for Pretrained Language Models. 
In: Advances in Neural Information Processing Systems 36.", + "entry": "Wang W, Lv Q, Yu W, Hong W, Qi J, Wang Y, Ji J, Yang Z, Zhao L, Song X, others (2023) CogVLM: Visual Expert for Pretrained Language Models. In: Advances in Neural Information Processing Systems 36. https://doi.org/10.52202/079017-3860.", "first_author": "Wang", "year": "2023", "title": "CogVLM: Visual Expert for Pretrained Language Models", - "doi": "", + "doi": "10.52202/079017-3860", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/079017-3860", "key": "wang:2023", "fingerprint": "cogvlm visual expert for pretrained language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch48_t2i_t2v.md", @@ -6639,35 +6556,31 @@ "file": "docs/zh/part14/p03_llava_instruct.md", "line": 1139, "entry_no": 1, - "entry": "1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023.", + "entry": "1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. https://doi.org/10.52202/075280-1516.", "first_author": "Liu", "year": "2023", "title": "Visual Instruction Tuning", - "doi": "", + "doi": "10.52202/075280-1516", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-1516", "key": "liu:2023", "fingerprint": "visual instruction tuning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p03_llava_instruct.md", "line": 1140, "entry_no": 2, - "entry": "2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014.", + "entry": "2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. 
https://doi.org/10.1007/978-3-319-10602-1_48.", "first_author": "Lin", "year": "2014", "title": "Microsoft COCO: Common Objects in Context", - "doi": "", + "doi": "10.1007/978-3-319-10602-1_48", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-319-10602-1_48", "key": "lin:2014", "fingerprint": "microsoft coco common objects in context", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p03_llava_instruct.md", @@ -6690,35 +6603,31 @@ "file": "docs/zh/part14/p03_llava_instruct.md", "line": 1142, "entry_no": 4, - "entry": "4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021.", + "entry": "4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225.", "first_author": "Mathew", "year": "2021", "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", + "doi": "10.1109/wacv48630.2021.00225", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv48630.2021.00225", "key": "mathew:2021", "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p03_llava_instruct.md", "line": 1143, "entry_no": 5, - "entry": "5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022.", + "entry": "5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. 
https://doi.org/10.18653/v1/2022.findings-acl.177.", "first_author": "Masry", "year": "2022", "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning", - "doi": "", + "doi": "10.18653/v1/2022.findings-acl.177", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2022.findings-acl.177", "key": "masry:2022", "fingerprint": "chartqa a benchmark for question answering about charts with visual and logical reasoning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p04_synthetic_textbook.md", @@ -6801,35 +6710,31 @@ "file": "docs/zh/part14/p05_mm_rag.md", "line": 1213, "entry_no": 1, - "entry": "1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023.", + "entry": "1. Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual Instruction Tuning. NeurIPS 2023. https://doi.org/10.52202/075280-1516.", "first_author": "Liu", "year": "2023", "title": "Visual Instruction Tuning", - "doi": "", + "doi": "10.52202/075280-1516", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-1516", "key": "liu:2023", "fingerprint": "visual instruction tuning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p05_mm_rag.md", "line": 1214, "entry_no": 2, - "entry": "2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014.", + "entry": "2. Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollár, P., & Zitnick, C. L. (2014). Microsoft COCO: Common Objects in Context. ECCV 2014. 
https://doi.org/10.1007/978-3-319-10602-1_48.", "first_author": "Lin", "year": "2014", "title": "Microsoft COCO: Common Objects in Context", - "doi": "", + "doi": "10.1007/978-3-319-10602-1_48", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-319-10602-1_48", "key": "lin:2014", "fingerprint": "microsoft coco common objects in context", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p05_mm_rag.md", @@ -6852,52 +6757,46 @@ "file": "docs/zh/part14/p05_mm_rag.md", "line": 1216, "entry_no": 4, - "entry": "4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021.", + "entry": "4. Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. https://doi.org/10.1109/wacv48630.2021.00225.", "first_author": "Mathew", "year": "2021", "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", + "doi": "10.1109/wacv48630.2021.00225", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv48630.2021.00225", "key": "mathew:2021", "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p05_mm_rag.md", "line": 1217, "entry_no": 5, - "entry": "5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022.", + "entry": "5. Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. 
https://doi.org/10.18653/v1/2022.findings-acl.177.", "first_author": "Masry", "year": "2022", "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning", - "doi": "", + "doi": "10.18653/v1/2022.findings-acl.177", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2022.findings-acl.177", "key": "masry:2022", "fingerprint": "chartqa a benchmark for question answering about charts with visual and logical reasoning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p06_prm.md", "line": 1140, "entry_no": 1, - "entry": "1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022.", + "entry": "1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. arXiv:2201.11903.", "first_author": "Wei", "year": "2022", - "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", + "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022", "doi": "", - "arxiv": "", + "arxiv": "2201.11903", "url": "", "key": "wei:2022", - "fingerprint": "chain of thought prompting elicits reasoning in large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "chain of thought prompting elicits reasoning in large language models neurips 2022", + "format_issues": [] }, { "file": "docs/zh/part14/p06_prm.md", @@ -6933,35 +6832,31 @@ "file": "docs/zh/part14/p06_prm.md", "line": 1143, "entry_no": 4, - "entry": "4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.", - "first_author": "DeepSeek-AI.", + "entry": "4. DeepSeek-AI. (2025). 
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948.", + "first_author": "DeepSeek-AI", "year": "2025", "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", "doi": "", - "arxiv": "", + "arxiv": "2501.12948", "url": "", "key": "deepseekai:2025", "fingerprint": "deepseek r1 incentivizing reasoning capability in llms via reinforcement learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p06_prm.md", "line": 1144, "entry_no": 5, - "entry": "5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021.", + "entry": "5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874.", "first_author": "Hendrycks", "year": "2021", - "title": "Measuring Mathematical Problem Solving With the MATH Dataset", + "title": "Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021", "doi": "", - "arxiv": "", + "arxiv": "2103.03874", "url": "", "key": "hendrycks:2021", - "fingerprint": "measuring mathematical problem solving with the math dataset", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "measuring mathematical problem solving with the math dataset neurips 2021", + "format_issues": [] }, { "file": "docs/zh/part14/p07_agent_tooluse.md", @@ -7014,18 +6909,16 @@ "file": "docs/zh/part14/p07_agent_tooluse.md", "line": 1169, "entry_no": 4, - "entry": "4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications.", + "entry": "4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. 
https://genai.owasp.org/llm-top-10/.", "first_author": "OWASP", "year": "2025", "title": "OWASP Top 10 for Large Language Model Applications", "doi": "", "arxiv": "", - "url": "", + "url": "https://genai.owasp.org/llm-top-10/", "key": "owasp:2025", "fingerprint": "owasp top 10 for large language model applications", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p07_agent_tooluse.md", @@ -7123,35 +7016,31 @@ "file": "docs/zh/part14/p09_privacy_pipeline.md", "line": 1129, "entry_no": 1, - "entry": "1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation.", + "entry": "1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation. https://eur-lex.europa.eu/eli/reg/2016/679/oj.", "first_author": "European", "year": "2016", "title": "Regulation (EU) 2016/679: General Data Protection Regulation", "doi": "", "arxiv": "", - "url": "", + "url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj", "key": "european:2016", "fingerprint": "regulation eu 2016 679 general data protection regulation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p09_privacy_pipeline.md", "line": 1130, "entry_no": 2, - "entry": "2. NIST. (2020). NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0.", + "entry": "2. NIST. (2020). NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. 
https://doi.org/10.6028/nist.cswp.10.", "first_author": "NIST.", "year": "2020", "title": "NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0", - "doi": "", + "doi": "10.6028/nist.cswp.10", "arxiv": "", - "url": "", + "url": "https://doi.org/10.6028/nist.cswp.10", "key": "nist:2020", "fingerprint": "nist privacy framework a tool for improving privacy through enterprise risk management version 1 0", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p09_privacy_pipeline.md", @@ -7191,18 +7080,16 @@ "file": "docs/zh/part14/p09_privacy_pipeline.md", "line": 1133, "entry_no": 5, - "entry": "5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications.", + "entry": "5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/.", "first_author": "OWASP", "year": "2025", "title": "OWASP Top 10 for Large Language Model Applications", "doi": "", "arxiv": "", - "url": "", + "url": "https://genai.owasp.org/llm-top-10/", "key": "owasp:2025", "fingerprint": "owasp top 10 for large language model applications", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p10_flywheel.md", @@ -7317,14 +7204,14 @@ "file": "docs/zh/part14/p11_mini_deepseek.md", "line": 528, "entry_no": 3, - "entry": "Liu A, Feng B, Xue B, Wang B, Wu B, Lu C, Zhao C, Deng C, Zhang C, Ruan C, others (2024) DeepSeek-V3 Technical Report. arXiv preprint arXiv:2412.19437.", - "first_author": "Liu", + "entry": "DeepSeek-AI, Liu A, Feng B, Xue B, Wang B, Wu B, Lu C, Zhao C, Deng C, Zhang C, Ruan C, et al. (2024) DeepSeek-V3 Technical Report. 
arXiv preprint arXiv:2412.19437.", + "first_author": "DeepSeek-AI", "year": "2024", "title": "DeepSeek-V3 Technical Report", "doi": "", "arxiv": "2412.19437", "url": "", - "key": "liu:2024", + "key": "deepseekai:2024", "fingerprint": "deepseek v3 technical report", "format_issues": [] }, @@ -7379,7 +7266,7 @@ "file": "docs/zh/part14/p11_mini_deepseek.md", "line": 536, "entry_no": 7, - "entry": "Penedo G, Kydlicek H, de Wiele T V, Lozhkov A, Mitchell M, Raffel C, von Werra L, Wolf T (2024) The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. arXiv preprint arXiv:2406.17557.", + "entry": "Penedo G, Kydlíček H, Ben Allal L, Lozhkov A, Mitchell M, Raffel C, von Werra L, Wolf T (2024) The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. arXiv preprint arXiv:2406.17557.", "first_author": "Penedo", "year": "2024", "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale", @@ -7488,18 +7375,16 @@ "file": "docs/zh/part14/p12_r1_reasoning_flywheel.md", "line": 507, "entry_no": 5, - "entry": "Hendrycks D, Burns C, Kadavath S, Arora A, Basart S, Tang E, Song D, Steinhardt J (2021) Measuring Mathematical Problem Solving with the MATH Dataset. In: Advances in Neural Information Processing Systems 34:24262-24273.", + "entry": "Hendrycks D, Burns C, Kadavath S, Arora A, Basart S, Tang E, Song D, Steinhardt J (2021) Measuring Mathematical Problem Solving with the MATH Dataset. In: Advances in Neural Information Processing Systems 34:24262-24273. 
arXiv:2103.03874.", "first_author": "Hendrycks", "year": "2021", "title": "Measuring Mathematical Problem Solving with the MATH Dataset", "doi": "", - "arxiv": "", + "arxiv": "2103.03874", "url": "", "key": "hendrycks:2021", "fingerprint": "measuring mathematical problem solving with the math dataset", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p12_r1_reasoning_flywheel.md", @@ -7520,24 +7405,22 @@ "file": "docs/zh/part14/p12_r1_reasoning_flywheel.md", "line": 511, "entry_no": 7, - "entry": "Qwen Team (2025) QwQ-32B: Embracing the Power of Reinforcement Learning for Reasoning Models. Qwen Blog.", + "entry": "Qwen Team (2025) QwQ-32B: Embracing the Power of Reinforcement Learning for Reasoning Models. Qwen Blog. https://qwenlm.github.io/blog/qwq-32b/.", "first_author": "Qwen Team", "year": "2025", "title": "QwQ-32B: Embracing the Power of Reinforcement Learning for Reasoning Models", "doi": "", "arxiv": "", - "url": "", + "url": "https://qwenlm.github.io/blog/qwq-32b/", "key": "qwenteam:2025", "fingerprint": "qwq 32b embracing the power of reinforcement learning for reasoning models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p13_multimodal_instruction_factory.md", "line": 546, "entry_no": 1, - "entry": "Bai S, Chen K, Liu X, Wang J, Ge W, Song S, Dang K, Wang P, Wang S, Tang J, others (2025) Qwen2.5-VL Technical Report. arXiv preprint arXiv:2502.13923.", + "entry": "Bai S, Chen K, Liu X, Wang J, Ge W, Song S, Dang K, Wang P, Wang S, Tang J, et al. (2025) Qwen2.5-VL Technical Report. 
arXiv preprint arXiv:2502.13923.", "first_author": "Bai", "year": "2025", "title": "Qwen2.5-VL Technical Report", @@ -7552,7 +7435,7 @@ "file": "docs/zh/part14/p13_multimodal_instruction_factory.md", "line": 548, "entry_no": 2, - "entry": "Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, others (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479.", + "entry": "Zhu J, Wang W, Chen Z, Liu Z, Ye S, Gu L, Duan Y, Tian H, Su W, Shao J, et al. (2025) InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models. arXiv preprint arXiv:2504.10479.", "first_author": "Zhu", "year": "2025", "title": "InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models", @@ -7567,69 +7450,61 @@ "file": "docs/zh/part14/p13_multimodal_instruction_factory.md", "line": 550, "entry_no": 3, - "entry": "Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention (vLLM). In: Proceedings of the 29th ACM Symposium on Operating Systems Principles, pp 611-626.", + "entry": "Kwon W, Li Z, Zhuang S, Sheng Y, Zheng L, Yu C H, Gonzalez J E, Zhang H, Stoica I (2023) Efficient Memory Management for Large Language Model Serving with PagedAttention. In: Proceedings of the 29th ACM Symposium on Operating Systems Principles, pp 611-626. 
https://doi.org/10.1145/3600006.3613165.", "first_author": "Kwon", "year": "2023", - "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention (vLLM)", - "doi": "", + "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention", + "doi": "10.1145/3600006.3613165", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3600006.3613165", "key": "kwon:2023", - "fingerprint": "efficient memory management for large language model serving with pagedattention vllm", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "efficient memory management for large language model serving with pagedattention", + "format_issues": [] }, { "file": "docs/zh/part14/p13_multimodal_instruction_factory.md", "line": 552, "entry_no": 4, - "entry": "Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, others (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35:25278-25294.", + "entry": "Schuhmann C, Beaumont R, Vencu R, Gordon C, Wightman R, Cherti M, Coombes T, Katta A, Mullis C, Wortsman M, et al. (2022) LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models. In: Advances in Neural Information Processing Systems 35, pp 25278-25294. 
Available at: https://arxiv.org/abs/2210.08402.", "first_author": "Schuhmann", "year": "2022", "title": "LAION-5B: An Open Large-Scale Dataset for Training Next Generation Image-Text Models", "doi": "", "arxiv": "", - "url": "", + "url": "https://arxiv.org/abs/2210.08402", "key": "schuhmann:2022", "fingerprint": "laion 5b an open large scale dataset for training next generation image text models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p13_multimodal_instruction_factory.md", "line": 554, "entry_no": 5, - "entry": "Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations.", + "entry": "Wang X, Wei J, Schuurmans D, Le Q, Chi E, Narang S, Chowdhery A, Zhou D (2023) Self-Consistency Improves Chain of Thought Reasoning in Language Models. In: International Conference on Learning Representations. arXiv:2203.11171.", "first_author": "Wang", "year": "2023", "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", "doi": "", - "arxiv": "", + "arxiv": "2203.11171", "url": "", "key": "wang:2023", "fingerprint": "self consistency improves chain of thought reasoning in language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p13_multimodal_instruction_factory.md", "line": 556, "entry_no": 6, - "entry": "Zheng L, Chiang W L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E P, Zhang H, Gonzalez J E, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In: Advances in Neural Information Processing Systems 36.", + "entry": "Zheng L, Chiang W L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E P, Zhang H, Gonzalez J E, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. 
In: Advances in Neural Information Processing Systems 36. arXiv:2306.05685.", "first_author": "Zheng", "year": "2023", "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", "doi": "", - "arxiv": "", + "arxiv": "2306.05685", "url": "", "key": "zheng:2023", "fingerprint": "judging llm as a judge with mt bench and chatbot arena", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p14_video_generation.md", @@ -7763,18 +7638,16 @@ "file": "docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md", "line": 905, "entry_no": 2, - "entry": "2. Wang, B., Shin, R., Liu, X., Polozov, O., & Richardson, M. (2020). RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers. ACL 2020.", + "entry": "2. Wang, B., Shin, R., Liu, X., Polozov, O., & Richardson, M. (2020). RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers. ACL 2020. https://doi.org/10.18653/v1/2020.acl-main.677.", "first_author": "Wang", "year": "2020", "title": "RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers", - "doi": "", + "doi": "10.18653/v1/2020.acl-main.677", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2020.acl-main.677", "key": "wang:2020", "fingerprint": "rat sql relation aware schema encoding and linking for text to sql parsers", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md", @@ -7979,18 +7852,16 @@ "file": "docs/zh/part2/ch05_cleaning_dedup.md", "line": 627, "entry_no": 1, - "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29.", + "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. 
https://doi.org/10.1109/sequen.1997.666900.", "first_author": "Broder", "year": "1997", "title": "On the Resemblance and Containment of Documents", - "doi": "", + "doi": "10.1109/sequen.1997.666900", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/sequen.1997.666900", "key": "broder:1997", "fingerprint": "on the resemblance and containment of documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part2/ch05_cleaning_dedup.md", @@ -8013,15 +7884,15 @@ "file": "docs/zh/part2/ch05_cleaning_dedup.md", "line": 631, "entry_no": 3, - "entry": "Honnibal M, Montani I, Van Landeghem S, Boyd A (2020) spaCy: Industrial-strength Natural Language Processing in Python. Available at: https://spacy.io/ (Accessed 2024-11).", + "entry": "Honnibal M, Montani I, Van Landeghem S, Boyd A (2023) explosion/spaCy: v3.7.2: Fixes for APIs and requirements. Zenodo. .", "first_author": "Honnibal", - "year": "2020", - "title": "spaCy: Industrial-strength Natural Language Processing in Python", - "doi": "", + "year": "2023", + "title": "explosion/spaCy: v3.7.2: Fixes for APIs and requirements", + "doi": "10.5281/zenodo.1212303", "arxiv": "", - "url": "https://spacy.io/", - "key": "honnibal:2020", - "fingerprint": "spacy industrial strength natural language processing in python", + "url": "https://doi.org/10.5281/zenodo.1212303", + "key": "honnibal:2023", + "fingerprint": "explosion spacy v3 7 2 fixes for apis and requirements", "format_issues": [] }, { @@ -8186,18 +8057,16 @@ "file": "docs/zh/part2/ch06_tokenization_loading.md", "line": 465, "entry_no": 2, - "entry": "Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. 
In: Advances in Neural Information Processing Systems 33, pp 1877-1901.", + "entry": "Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. In: Advances in Neural Information Processing Systems 33, pp 1877-1901. arXiv:2005.14165.", "first_author": "Brown", "year": "2020", "title": "Language Models are Few-Shot Learners", "doi": "", - "arxiv": "", + "arxiv": "2005.14165", "url": "", "key": "brown:2020", "fingerprint": "language models are few shot learners", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part2/ch06_tokenization_loading.md", @@ -8329,18 +8198,16 @@ "file": "docs/zh/part2/ch07_data_operations.md", "line": 391, "entry_no": 3, - "entry": "Covington M A, McFall J D (2010) Cutting the Gordian Knot: The Moving-Average Type–Token Ratio (MATTR). Journal of Quantitative Linguistics 17(2):94-100.", + "entry": "Covington M A, McFall J D (2010) Cutting the Gordian Knot: The Moving-Average Type–Token Ratio (MATTR). Journal of Quantitative Linguistics 17(2):94-100. 
https://doi.org/10.1080/09296171003643098.", "first_author": "Covington", "year": "2010", "title": "Cutting the Gordian Knot: The Moving-Average Type–Token Ratio (MATTR)", - "doi": "", + "doi": "10.1080/09296171003643098", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1080/09296171003643098", "key": "covington:2010", "fingerprint": "cutting the gordian knot the moving average type token ratio mattr", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part2/ch07_data_operations.md", @@ -8412,35 +8279,31 @@ "file": "docs/zh/part2/ch07_data_operations.md", "line": 401, "entry_no": 8, - "entry": "Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28.", + "entry": "Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17-28. https://doi.org/10.1145/3299887.3299891.", "first_author": "Polyzotis", "year": "2018", "title": "Data Lifecycle Challenges in Production Machine Learning: A Survey", - "doi": "", + "doi": "10.1145/3299887.3299891", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3299887.3299891", "key": "polyzotis:2018", "fingerprint": "data lifecycle challenges in production machine learning a survey", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part2/ch07_data_operations.md", "line": 403, "entry_no": 9, - "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In: Proceedings of the ACM CHI Conference on Human Factors in Computing Systems, pp 1-15.", + "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. 
In: Proceedings of the ACM CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518.", "first_author": "Sambasivan", "year": "2021", "title": "\"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI", - "doi": "", + "doi": "10.1145/3411764.3445518", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3411764.3445518", "key": "sambasivan:2021", "fingerprint": "everyone wants to do the model work not the data work data cascades in high stakes ai", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part2/ch07_data_operations.md", @@ -8593,18 +8456,16 @@ "file": "docs/zh/part3/ch08_multimodal_image.md", "line": 323, "entry_no": 6, - "entry": "Laurençon H, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, Cord M, Wolf T (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. Advances in Neural Information Processing Systems 36.", + "entry": "Laurençon H, Saulnier L, Tronchon L, Bekman S, Singh A, Lozhkov A, Wang T, Karamcheti S, Rush A M, Kiela D, Cord M, Wolf T (2023) OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. Advances in Neural Information Processing Systems 36. 
arXiv:2306.16527.", "first_author": "Laurençon", "year": "2023", "title": "OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents", "doi": "", - "arxiv": "", + "arxiv": "2306.16527", "url": "", "key": "laurencon:2023", "fingerprint": "obelics an open web scale filtered dataset of interleaved image text documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch08_multimodal_image.md", @@ -8627,18 +8488,16 @@ "file": "docs/zh/part3/ch08_multimodal_image.md", "line": 327, "entry_no": 8, - "entry": "Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 26296-26306.", + "entry": "Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 26296-26306. https://doi.org/10.1109/cvpr52733.2024.02484.", "first_author": "Liu", "year": "2024", "title": "Improved Baselines with Visual Instruction Tuning (LLaVA-1.5)", - "doi": "", + "doi": "10.1109/cvpr52733.2024.02484", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52733.2024.02484", "key": "liu:2024", "fingerprint": "improved baselines with visual instruction tuning llava 1 5", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch08_multimodal_image.md", @@ -8693,35 +8552,31 @@ "file": "docs/zh/part3/ch08_multimodal_image.md", "line": 335, "entry_no": 12, - "entry": "Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. 
Advances in Neural Information Processing Systems 36.", + "entry": "Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing Systems 36. arXiv:2304.06939.", "first_author": "Zhu", "year": "2023", "title": "Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text", "doi": "", - "arxiv": "", + "arxiv": "2304.06939", "url": "", "key": "zhu:2023", "fingerprint": "multimodal c4 an open billion scale corpus of images interleaved with text", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch08_multimodal_image.md", "line": 337, "entry_no": 13, - "entry": "Zhai X, Mustafa B, Kolesnikov A, Beyer L (2023) Sigmoid Loss for Language Image Pre-Training (SigLIP). In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 11975-11986.", + "entry": "Zhai X, Mustafa B, Kolesnikov A, Beyer L (2023) Sigmoid Loss for Language Image Pre-Training (SigLIP). In: Proceedings of the IEEE/CVF International Conference on Computer Vision, pp 11975-11986. https://doi.org/10.1109/iccv51070.2023.01100.", "first_author": "Zhai", "year": "2023", "title": "Sigmoid Loss for Language Image Pre-Training (SigLIP)", - "doi": "", + "doi": "10.1109/iccv51070.2023.01100", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/iccv51070.2023.01100", "key": "zhai:2023", "fingerprint": "sigmoid loss for language image pre training siglip", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch08_multimodal_image.md", @@ -8817,35 +8672,31 @@ "file": "docs/zh/part3/ch09_recaptioning_ocr.md", "line": 273, "entry_no": 5, - "entry": "Dou Z Y, Xu Y, Gan Z, Wang J, Wang S, Wang L, Zhu C, Zhang P, Yuan L, Peng N, Liu Z (2022) Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (FIBER). 
Advances in Neural Information Processing Systems 35:32942-32956.", + "entry": "Dou Z Y, Xu Y, Gan Z, Wang J, Wang S, Wang L, Zhu C, Zhang P, Yuan L, Peng N, Liu Z (2022) Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (FIBER). Advances in Neural Information Processing Systems 35:32942-32956. https://doi.org/10.52202/068431-2387.", "first_author": "Dou", "year": "2022", "title": "Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone (FIBER)", - "doi": "", + "doi": "10.52202/068431-2387", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/068431-2387", "key": "dou:2022", "fingerprint": "coarse to fine vision language pre training with fusion in the backbone fiber", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch09_recaptioning_ocr.md", "line": 275, "entry_no": 6, - "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091.", + "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083-4091. 
https://doi.org/10.1145/3503161.3548112.", "first_author": "Huang", "year": "2022", "title": "LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking", - "doi": "", + "doi": "10.1145/3503161.3548112", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3503161.3548112", "key": "huang:2022", "fingerprint": "layoutlmv3 pre training for document ai with unified text and image masking", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch09_recaptioning_ocr.md", @@ -8902,18 +8753,16 @@ "file": "docs/zh/part3/ch09_recaptioning_ocr.md", "line": 283, "entry_no": 10, - "entry": "Li L H, Zhang P, Zhang H, Yang J, Li C, Zhong Y, Wang L, Yuan L, Zhang L, Hwang J N, Chang K W, Gao J (2022) Grounded Language-Image Pre-training (GLIP). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 10965-10975.", + "entry": "Li L H, Zhang P, Zhang H, Yang J, Li C, Zhong Y, Wang L, Yuan L, Zhang L, Hwang J N, Chang K W, Gao J (2022) Grounded Language-Image Pre-training (GLIP). In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 10965-10975. https://doi.org/10.1109/cvpr52688.2022.01069.", "first_author": "Li", "year": "2022", "title": "Grounded Language-Image Pre-training (GLIP)", - "doi": "", + "doi": "10.1109/cvpr52688.2022.01069", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52688.2022.01069", "key": "li:2022", "fingerprint": "grounded language image pre training glip", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch09_recaptioning_ocr.md", @@ -8949,18 +8798,16 @@ "file": "docs/zh/part3/ch09_recaptioning_ocr.md", "line": 289, "entry_no": 13, - "entry": "Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). 
In: CVPR 2024, pp 26296-26306.", + "entry": "Liu H, Li C, Li Y, Lee Y J (2024) Improved Baselines with Visual Instruction Tuning (LLaVA-1.5). In: CVPR 2024, pp 26296-26306. https://doi.org/10.1109/cvpr52733.2024.02484.", "first_author": "Liu", "year": "2024", "title": "Improved Baselines with Visual Instruction Tuning (LLaVA-1.5)", - "doi": "", + "doi": "10.1109/cvpr52733.2024.02484", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52733.2024.02484", "key": "liu:2024", "fingerprint": "improved baselines with visual instruction tuning llava 1 5", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch09_recaptioning_ocr.md", @@ -8981,18 +8828,16 @@ "file": "docs/zh/part3/ch09_recaptioning_ocr.md", "line": 293, "entry_no": 15, - "entry": "Masry A, Long D, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263-2279.", + "entry": "Masry A, Long D, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263-2279. 
https://doi.org/10.18653/v1/2022.findings-acl.177.", "first_author": "Masry", "year": "2022", "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning", - "doi": "", + "doi": "10.18653/v1/2022.findings-acl.177", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2022.findings-acl.177", "key": "masry:2022", "fingerprint": "chartqa a benchmark for question answering about charts with visual and logical reasoning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch09_recaptioning_ocr.md", @@ -9062,18 +8907,16 @@ "file": "docs/zh/part3/ch10_video_audio.md", "line": 348, "entry_no": 2, - "entry": "Bredin H, Yin R, Coria J M, Gelly G, Korshunov P, Lavechin M, Fustes D, Titeux H, Bouaziz W, Gill M P (2020) pyannote.audio: Neural Building Blocks for Speaker Diarization. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 7124-7128.", + "entry": "Bredin H, Yin R, Coria J M, Gelly G, Korshunov P, Lavechin M, Fustes D, Titeux H, Bouaziz W, Gill M P (2020) pyannote.audio: Neural Building Blocks for Speaker Diarization. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 7124-7128. 
https://doi.org/10.1109/icassp40776.2020.9052974.", "first_author": "Bredin", "year": "2020", "title": "pyannote.audio: Neural Building Blocks for Speaker Diarization", - "doi": "", + "doi": "10.1109/icassp40776.2020.9052974", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icassp40776.2020.9052974", "key": "bredin:2020", "fingerprint": "pyannote audio neural building blocks for speaker diarization", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch10_video_audio.md", @@ -9273,18 +9116,16 @@ "file": "docs/zh/part3/ch11_cross_modal_alignment.md", "line": 370, "entry_no": 5, - "entry": "Salvador S, Chan P (2007) Toward Accurate Dynamic Time Warping in Linear Time and Space (FastDTW). Intelligent Data Analysis 11(5):561-580.", + "entry": "Salvador S, Chan P (2007) Toward Accurate Dynamic Time Warping in Linear Time and Space (FastDTW). Intelligent Data Analysis 11(5):561-580. https://doi.org/10.3233/ida-2007-11508.", "first_author": "Salvador", "year": "2007", "title": "Toward Accurate Dynamic Time Warping in Linear Time and Space (FastDTW)", - "doi": "", + "doi": "10.3233/ida-2007-11508", "arxiv": "", - "url": "", + "url": "https://doi.org/10.3233/ida-2007-11508", "key": "salvador:2007", "fingerprint": "toward accurate dynamic time warping in linear time and space fastdtw", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch11_cross_modal_alignment.md", @@ -9307,18 +9148,16 @@ "file": "docs/zh/part3/ch11_cross_modal_alignment.md", "line": 374, "entry_no": 7, - "entry": "Wu Y, Chen K, Zhang T, Hui Y, Berg-Kirkpatrick T, Dubnov S (2023) Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation (CLAP). 
In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 1-5.", + "entry": "Wu Y, Chen K, Zhang T, Hui Y, Berg-Kirkpatrick T, Dubnov S (2023) Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation (CLAP). In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp 1-5. https://doi.org/10.1109/icassp49357.2023.10095969.", "first_author": "Wu", "year": "2023", "title": "Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation (CLAP)", - "doi": "", + "doi": "10.1109/icassp49357.2023.10095969", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icassp49357.2023.10095969", "key": "wu:2023", "fingerprint": "large scale contrastive language audio pretraining with feature fusion and keyword to caption augmentation clap", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch11_cross_modal_alignment.md", @@ -9566,69 +9405,61 @@ "file": "docs/zh/part4/ch12_sft.md", "line": 710, "entry_no": 16, - "entry": "Singhal, K., Azizi, S., Tu, T., Mahdavi, S. S., Wei, J., Chung, H. W., et al. (2023). Large Language Models Encode Clinical Knowledge. Nature.", + "entry": "Singhal, K., Azizi, S., Tu, T., Mahdavi, S. S., Wei, J., Chung, H. W., et al. (2023). Large Language Models Encode Clinical Knowledge. Nature. https://doi.org/10.1038/s41586-023-06291-2.", "first_author": "Singhal", "year": "2023", "title": "Large Language Models Encode Clinical Knowledge", - "doi": "", + "doi": "10.1038/s41586-023-06291-2", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1038/s41586-023-06291-2", "key": "singhal:2023", "fingerprint": "large language models encode clinical knowledge", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch12_sft.md", "line": 712, "entry_no": 17, - "entry": "Gebru, T., Morgenstern, J., Vecchione, B., Vaughan, J. 
W., Wallach, H., Daumé III, H., & Crawford, K. (2021). *Datasheets for Datasets*. Communications of the ACM, 64(12), 86–92.", + "entry": "Gebru, T., Morgenstern, J., Vecchione, B., Vaughan, J. W., Wallach, H., Daumé III, H., & Crawford, K. (2021). *Datasheets for Datasets*. Communications of the ACM, 64(12), 86–92. https://doi.org/10.1145/3458723.", "first_author": "Gebru", "year": "2021", "title": "Datasheets for Datasets", - "doi": "", + "doi": "10.1145/3458723", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3458723", "key": "gebru:2021", "fingerprint": "datasheets for datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch12_sft.md", "line": 714, "entry_no": 18, - "entry": "Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). *Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI*. Proceedings of the ACM Conference on Fairness, Accountability, and Transparency, 1776–1826.", + "entry": "Pushkarna, M., Zaldivar, A., & Kjartansson, O. (2022). *Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI*. Proceedings of the ACM Conference on Fairness, Accountability, and Transparency, 1776–1826. https://doi.org/10.1145/3531146.3533231.", "first_author": "Pushkarna", "year": "2022", "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI", - "doi": "", + "doi": "10.1145/3531146.3533231", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3531146.3533231", "key": "pushkarna:2022", "fingerprint": "data cards purposeful and transparent dataset documentation for responsible ai", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch12_sft.md", "line": 716, "entry_no": 19, - "entry": "Mitchell, M., Wu, S., Zaldivar, A., Barnes, P., Vasserman, L., Hutchinson, B., et al. (2019). *Model Cards for Model Reporting*. 
Proceedings of the Conference on Fairness, Accountability, and Transparency, 220–229.", + "entry": "Mitchell, M., Wu, S., Zaldivar, A., Barnes, P., Vasserman, L., Hutchinson, B., et al. (2019). *Model Cards for Model Reporting*. Proceedings of the Conference on Fairness, Accountability, and Transparency, 220–229. https://doi.org/10.1145/3287560.3287596.", "first_author": "Mitchell", "year": "2019", "title": "Model Cards for Model Reporting", - "doi": "", + "doi": "10.1145/3287560.3287596", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3287560.3287596", "key": "mitchell:2019", "fingerprint": "model cards for model reporting", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch12_sft.md", @@ -9730,18 +9561,16 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 476, "entry_no": 1, - "entry": "Christiano, P. F., Leike, J., Brown, T. B., Martic, M., Legg, S., & Amodei, D. (2017). Deep reinforcement learning from human preferences. *Advances in Neural Information Processing Systems*, 30.", + "entry": "Christiano, P. F., Leike, J., Brown, T. B., Martic, M., Legg, S., & Amodei, D. (2017). Deep reinforcement learning from human preferences. *Advances in Neural Information Processing Systems*, 30. arXiv:1706.03741.", "first_author": "Christiano", "year": "2017", "title": "Deep reinforcement learning from human preferences", "doi": "", - "arxiv": "", + "arxiv": "1706.03741", "url": "", "key": "christiano:2017", "fingerprint": "deep reinforcement learning from human preferences", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", @@ -9762,18 +9591,16 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 480, "entry_no": 3, - "entry": "Stiennon, N., Ouyang, L., Wu, J., Ziegler, D. M., Lowe, R., Voss, C., Radford, A., Amodei, D., & Christiano, P. (2020). Learning to summarize from human feedback. 
*Advances in Neural Information Processing Systems*, 33, 3008–3021.", + "entry": "Stiennon, N., Ouyang, L., Wu, J., Ziegler, D. M., Lowe, R., Voss, C., Radford, A., Amodei, D., & Christiano, P. (2020). Learning to summarize from human feedback. *Advances in Neural Information Processing Systems*, 33, 3008–3021. arXiv:2009.01325.", "first_author": "Stiennon", "year": "2020", "title": "Learning to summarize from human feedback", "doi": "", - "arxiv": "", + "arxiv": "2009.01325", "url": "", "key": "stiennon:2020", "fingerprint": "learning to summarize from human feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", @@ -9794,18 +9621,16 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 484, "entry_no": 5, - "entry": "Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training language models to follow instructions with human feedback. *Advances in Neural Information Processing Systems*, 35, 27730–27744.", + "entry": "Ouyang, L., Wu, J., Jiang, X., et al. (2022). Training language models to follow instructions with human feedback. *Advances in Neural Information Processing Systems*, 35, 27730–27744. arXiv:2203.02155.", "first_author": "Ouyang", "year": "2022", "title": "Training language models to follow instructions with human feedback", "doi": "", - "arxiv": "", + "arxiv": "2203.02155", "url": "", "key": "ouyang:2022", "fingerprint": "training language models to follow instructions with human feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", @@ -9826,18 +9651,16 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 488, "entry_no": 7, - "entry": "Rafailov, R., Sharma, A., Mitchell, E., et al. (2023). Direct preference optimization: Your language model is secretly a reward model. *Advances in Neural Information Processing Systems*, 36, 53728–53741.", + "entry": "Rafailov, R., Sharma, A., Mitchell, E., et al. 
(2023). Direct preference optimization: Your language model is secretly a reward model. *Advances in Neural Information Processing Systems*, 36, 53728–53741. arXiv:2305.18290.", "first_author": "Rafailov", "year": "2023", "title": "Direct preference optimization: Your language model is secretly a reward model", "doi": "", - "arxiv": "", + "arxiv": "2305.18290", "url": "", "key": "rafailov:2023", "fingerprint": "direct preference optimization your language model is secretly a reward model", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", @@ -9903,18 +9726,16 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 498, "entry_no": 12, - "entry": "Bradley, R. A., & Terry, M. E. (1952). Rank analysis of incomplete block designs: I. The method of paired comparisons. *Biometrika*, 39(3/4), 324–345.", + "entry": "Bradley, R. A., & Terry, M. E. (1952). Rank analysis of incomplete block designs: I. The method of paired comparisons. *Biometrika*, 39(3/4), 324–345. https://doi.org/10.2307/2334029.", "first_author": "Bradley", "year": "1952", "title": "Rank analysis of incomplete block designs: I", - "doi": "", + "doi": "10.2307/2334029", "arxiv": "", - "url": "", + "url": "https://doi.org/10.2307/2334029", "key": "bradley:1952", "fingerprint": "rank analysis of incomplete block designs i", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", @@ -9937,52 +9758,46 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 502, "entry_no": 14, - "entry": "Deb, K., Pratap, A., Agarwal, S., et al. (2002). A fast and elitist multiobjective genetic algorithm: NSGA-II. *IEEE Transactions on Evolutionary Computation*, 6(2), 182–197.", + "entry": "Deb, K., Pratap, A., Agarwal, S., et al. (2002). A fast and elitist multiobjective genetic algorithm: NSGA-II. *IEEE Transactions on Evolutionary Computation*, 6(2), 182–197. 
https://doi.org/10.1109/4235.996017.", "first_author": "Deb", "year": "2002", "title": "A fast and elitist multiobjective genetic algorithm: NSGA-II", - "doi": "", + "doi": "10.1109/4235.996017", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/4235.996017", "key": "deb:2002", "fingerprint": "a fast and elitist multiobjective genetic algorithm nsga ii", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", "line": 504, "entry_no": 15, - "entry": "Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46.", + "entry": "Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46. https://doi.org/10.1177/001316446002000104.", "first_author": "Cohen", "year": "1960", "title": "A coefficient of agreement for nominal scales", - "doi": "", + "doi": "10.1177/001316446002000104", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1177/001316446002000104", "key": "cohen:1960", "fingerprint": "a coefficient of agreement for nominal scales", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", "line": 506, "entry_no": 16, - "entry": "Dawid, A. P., & Skene, A. M. (1979). Maximum likelihood estimation of observer error-rates using the EM algorithm. *Journal of the Royal Statistical Society: Series C (Applied Statistics)*, 28(1), 20–28.", + "entry": "Dawid, A. P., & Skene, A. M. (1979). Maximum likelihood estimation of observer error-rates using the EM algorithm. *Journal of the Royal Statistical Society: Series C (Applied Statistics)*, 28(1), 20–28. 
https://doi.org/10.2307/2346806.", "first_author": "Dawid", "year": "1979", "title": "Maximum likelihood estimation of observer error-rates using the EM algorithm", - "doi": "", + "doi": "10.2307/2346806", "arxiv": "", - "url": "", + "url": "https://doi.org/10.2307/2346806", "key": "dawid:1979", "fingerprint": "maximum likelihood estimation of observer error rates using the em algorithm", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", @@ -10005,86 +9820,76 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 510, "entry_no": 18, - "entry": "Aroyo, L., & Welty, C. (2015). Truth is a lie: Crowd truth and the seven myths of human annotation. *AI Magazine*, 36(1), 15–24.", + "entry": "Aroyo, L., & Welty, C. (2015). Truth is a lie: Crowd truth and the seven myths of human annotation. *AI Magazine*, 36(1), 15–24. https://doi.org/10.1609/aimag.v36i1.2564.", "first_author": "Aroyo", "year": "2015", "title": "Truth is a lie: Crowd truth and the seven myths of human annotation", - "doi": "", + "doi": "10.1609/aimag.v36i1.2564", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1609/aimag.v36i1.2564", "key": "aroyo:2015", "fingerprint": "truth is a lie crowd truth and the seven myths of human annotation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", "line": 512, "entry_no": 19, - "entry": "Northcutt, C. G., Jiang, L., & Chuang, I. L. (2021). Confident learning: Estimating uncertainty in dataset labels. *Journal of Artificial Intelligence Research*, 70, 1373–1411.", + "entry": "Northcutt, C. G., Jiang, L., & Chuang, I. L. (2021). Confident learning: Estimating uncertainty in dataset labels. *Journal of Artificial Intelligence Research*, 70, 1373–1411. 
https://doi.org/10.1613/jair.1.12125.", "first_author": "Northcutt", "year": "2021", "title": "Confident learning: Estimating uncertainty in dataset labels", - "doi": "", + "doi": "10.1613/jair.1.12125", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1613/jair.1.12125", "key": "northcutt:2021", "fingerprint": "confident learning estimating uncertainty in dataset labels", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", "line": 514, "entry_no": 20, - "entry": "Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for datasets. *Communications of the ACM*, 64(12), 86–92.", + "entry": "Gebru, T., Morgenstern, J., Vecchione, B., et al. (2021). Datasheets for datasets. *Communications of the ACM*, 64(12), 86–92. https://doi.org/10.1145/3458723.", "first_author": "Gebru", "year": "2021", "title": "Datasheets for datasets", - "doi": "", + "doi": "10.1145/3458723", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3458723", "key": "gebru:2021", "fingerprint": "datasheets for datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", "line": 516, "entry_no": 21, - "entry": "Bender, E. M., & Friedman, B. (2018). Data statements for natural language processing: Toward mitigating system bias and enabling better science. *Transactions of the Association for Computational Linguistics*, 6, 587–604.", + "entry": "Bender, E. M., & Friedman, B. (2018). Data statements for natural language processing: Toward mitigating system bias and enabling better science. *Transactions of the Association for Computational Linguistics*, 6, 587–604. 
https://doi.org/10.1162/tacl_a_00041.", "first_author": "Bender", "year": "2018", "title": "Data statements for natural language processing: Toward mitigating system bias and enabling better science", - "doi": "", + "doi": "10.1162/tacl_a_00041", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1162/tacl_a_00041", "key": "bender:2018", "fingerprint": "data statements for natural language processing toward mitigating system bias and enabling better science", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", "line": 518, "entry_no": 22, - "entry": "Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). Model cards for model reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229.", + "entry": "Mitchell, M., Wu, S., Zaldivar, A., et al. (2019). Model cards for model reporting. *Proceedings of the Conference on Fairness, Accountability, and Transparency*, 220–229. https://doi.org/10.1145/3287560.3287596.", "first_author": "Mitchell", "year": "2019", "title": "Model cards for model reporting", - "doi": "", + "doi": "10.1145/3287560.3287596", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3287560.3287596", "key": "mitchell:2019", "fingerprint": "model cards for model reporting", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch13_preference.md", @@ -10165,18 +9970,16 @@ "file": "docs/zh/part4/ch14_qa.md", "line": 650, "entry_no": 5, - "entry": "Christiano, P. F., Leike, J., Brown, T. B., et al. (2017). *Deep Reinforcement Learning from Human Preferences*. Advances in Neural Information Processing Systems, 30.", + "entry": "Christiano, P. F., Leike, J., Brown, T. B., et al. (2017). *Deep Reinforcement Learning from Human Preferences*. Advances in Neural Information Processing Systems, 30. 
arXiv:1706.03741.", "first_author": "Christiano", "year": "2017", "title": "Deep Reinforcement Learning from Human Preferences", "doi": "", - "arxiv": "", + "arxiv": "1706.03741", "url": "", "key": "christiano:2017", "fingerprint": "deep reinforcement learning from human preferences", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch14_qa.md", @@ -11963,52 +11766,46 @@ "file": "docs/zh/part6/ch19_tool.md", "line": 532, "entry_no": 4, - "entry": "Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations.", + "entry": "Yao, S., Zhao, J., Yu, D., et al. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. International Conference on Learning Representations. arXiv:2210.03629.", "first_author": "Yao", "year": "2023", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "doi": "", - "arxiv": "", + "arxiv": "2210.03629", "url": "", "key": "yao:2023", "fingerprint": "react synergizing reasoning and acting in language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", "line": 534, "entry_no": 5, - "entry": "Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36.", + "entry": "Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). Toolformer: Language Models Can Teach Themselves to Use Tools. Advances in Neural Information Processing Systems, 36. 
arXiv:2302.04761.", "first_author": "Schick", "year": "2023", "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", "doi": "", - "arxiv": "", + "arxiv": "2302.04761", "url": "", "key": "schick:2023", "fingerprint": "toolformer language models can teach themselves to use tools", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", "line": 536, "entry_no": 6, - "entry": "Li, M., Zhao, Y., Yu, B., et al. (2023). API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs. Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 3102–3116.", + "entry": "Li, M., Zhao, Y., Yu, B., et al. (2023). API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs. Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 3102–3116. https://doi.org/10.18653/v1/2023.emnlp-main.187.", "first_author": "Li", "year": "2023", "title": "API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs", - "doi": "", + "doi": "10.18653/v1/2023.emnlp-main.187", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2023.emnlp-main.187", "key": "li:2023", "fingerprint": "api bank a comprehensive benchmark for tool augmented llms", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", @@ -12031,35 +11828,31 @@ "file": "docs/zh/part6/ch19_tool.md", "line": 540, "entry_no": 8, - "entry": "Patil, S. G., Zhang, T., Wang, X., & Gonzalez, J. E. (2024). Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 37.", + "entry": "Patil, S. G., Zhang, T., Wang, X., & Gonzalez, J. E. (2024). Gorilla: Large Language Model Connected with Massive APIs. Advances in Neural Information Processing Systems, 37. 
arXiv:2305.15334.", "first_author": "Patil", "year": "2024", "title": "Gorilla: Large Language Model Connected with Massive APIs", "doi": "", - "arxiv": "", + "arxiv": "2305.15334", "url": "", "key": "patil:2024", "fingerprint": "gorilla large language model connected with massive apis", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", "line": 542, "entry_no": 9, - "entry": "Zhuang, Y., Yu, Y., Wang, K., et al. (2023). ToolQA: A Dataset for LLM Question Answering with External Tools. Advances in Neural Information Processing Systems, 36.", + "entry": "Zhuang, Y., Yu, Y., Wang, K., et al. (2023). ToolQA: A Dataset for LLM Question Answering with External Tools. Advances in Neural Information Processing Systems, 36. https://doi.org/10.52202/075280-2180.", "first_author": "Zhuang", "year": "2023", "title": "ToolQA: A Dataset for LLM Question Answering with External Tools", - "doi": "", + "doi": "10.52202/075280-2180", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-2180", "key": "zhuang:2023", "fingerprint": "toolqa a dataset for llm question answering with external tools", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", @@ -12114,35 +11907,31 @@ "file": "docs/zh/part6/ch19_tool.md", "line": 550, "entry_no": 13, - "entry": "Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. Advances in Neural Information Processing Systems, 36.", + "entry": "Shinn, N., Cassano, F., Gopinath, A., et al. (2023). Reflexion: Language Agents with Verbal Reinforcement Learning. Advances in Neural Information Processing Systems, 36. 
arXiv:2303.11366.", "first_author": "Shinn", "year": "2023", "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", "doi": "", - "arxiv": "", + "arxiv": "2303.11366", "url": "", "key": "shinn:2023", "fingerprint": "reflexion language agents with verbal reinforcement learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", "line": 552, "entry_no": 14, - "entry": "Yang, J., Jimenez, C. E., Wettig, A., et al. (2024). SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering. Advances in Neural Information Processing Systems, 37.", + "entry": "Yang, J., Jimenez, C. E., Wettig, A., et al. (2024). SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering. Advances in Neural Information Processing Systems, 37. https://doi.org/10.52202/079017-1601.", "first_author": "Yang", "year": "2024", "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", - "doi": "", + "doi": "10.52202/079017-1601", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/079017-1601", "key": "yang:2024", "fingerprint": "swe agent agent computer interfaces enable automated software engineering", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", @@ -12165,18 +11954,16 @@ "file": "docs/zh/part6/ch19_tool.md", "line": 556, "entry_no": 16, - "entry": "Greshake, K., Abdelnabi, S., Mishra, S., et al. (2023). Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, 79–90.", + "entry": "Greshake, K., Abdelnabi, S., Mishra, S., et al. (2023). Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection. Proceedings of the 16th ACM Workshop on Artificial Intelligence and Security, 79–90. 
https://doi.org/10.1145/3605764.3623985.", "first_author": "Greshake", "year": "2023", "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", - "doi": "", + "doi": "10.1145/3605764.3623985", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3605764.3623985", "key": "greshake:2023", "fingerprint": "not what you ve signed up for compromising real world llm integrated applications with indirect prompt injection", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch19_tool.md", @@ -12229,69 +12016,61 @@ "file": "docs/zh/part6/ch20_agent.md", "line": 486, "entry_no": 2, - "entry": "Williams, J. D., Raux, A., Ramachandran, D., & Black, A. (2013). *The Dialog State Tracking Challenge*. Proceedings of the SIGDIAL 2013 Conference, 404–413.", + "entry": "Williams, J. D., Raux, A., Ramachandran, D., & Black, A. (2013). *The Dialog State Tracking Challenge*. Proceedings of the SIGDIAL 2013 Conference, 404–413. https://doi.org/10.1109/slt.2014.7078595.", "first_author": "Williams", "year": "2013", "title": "The Dialog State Tracking Challenge", - "doi": "", + "doi": "10.1109/slt.2014.7078595", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/slt.2014.7078595", "key": "williams:2013", "fingerprint": "the dialog state tracking challenge", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch20_agent.md", "line": 488, "entry_no": 3, - "entry": "Budzianowski, P., Wen, T.-H., Tseng, B.-H., et al. (2018). *MultiWOZ - A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling*. Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 5016–5026.", + "entry": "Budzianowski, P., Wen, T.-H., Tseng, B.-H., et al. (2018). *MultiWOZ - A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling*. 
Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 5016–5026. https://doi.org/10.18653/v1/d18-1547.", "first_author": "Budzianowski", "year": "2018", "title": "MultiWOZ - A Large-Scale Multi-Domain Wizard-of-Oz Dataset for Task-Oriented Dialogue Modelling", - "doi": "", + "doi": "10.18653/v1/d18-1547", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/d18-1547", "key": "budzianowski:2018", "fingerprint": "multiwoz a large scale multi domain wizard of oz dataset for task oriented dialogue modelling", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch20_agent.md", "line": 490, "entry_no": 4, - "entry": "Yao, S., Zhao, J., Yu, D., et al. (2023). *ReAct: Synergizing Reasoning and Acting in Language Models*. International Conference on Learning Representations.", + "entry": "Yao, S., Zhao, J., Yu, D., et al. (2023). *ReAct: Synergizing Reasoning and Acting in Language Models*. International Conference on Learning Representations. arXiv:2210.03629.", "first_author": "Yao", "year": "2023", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "doi": "", - "arxiv": "", + "arxiv": "2210.03629", "url": "", "key": "yao:2023", "fingerprint": "react synergizing reasoning and acting in language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch20_agent.md", "line": 492, "entry_no": 5, - "entry": "Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). *Toolformer: Language Models Can Teach Themselves to Use Tools*. Advances in Neural Information Processing Systems, 36.", + "entry": "Schick, T., Dwivedi-Yu, J., Dessì, R., et al. (2023). *Toolformer: Language Models Can Teach Themselves to Use Tools*. Advances in Neural Information Processing Systems, 36. 
arXiv:2302.04761.", "first_author": "Schick", "year": "2023", "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", "doi": "", - "arxiv": "", + "arxiv": "2302.04761", "url": "", "key": "schick:2023", "fingerprint": "toolformer language models can teach themselves to use tools", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch20_agent.md", @@ -12327,18 +12106,16 @@ "file": "docs/zh/part6/ch20_agent.md", "line": 498, "entry_no": 8, - "entry": "Wang, W., Dong, L., Cheng, H., et al. (2023). *Augmenting Language Models with Long-Term Memory*. Advances in Neural Information Processing Systems, 36.", + "entry": "Wang, W., Dong, L., Cheng, H., et al. (2023). *Augmenting Language Models with Long-Term Memory*. Advances in Neural Information Processing Systems, 36. https://doi.org/10.52202/075280-3259.", "first_author": "Wang", "year": "2023", "title": "Augmenting Language Models with Long-Term Memory", - "doi": "", + "doi": "10.52202/075280-3259", "arxiv": "", - "url": "", + "url": "https://doi.org/10.52202/075280-3259", "key": "wang:2023", "fingerprint": "augmenting language models with long term memory", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch20_agent.md", @@ -12376,18 +12153,16 @@ "file": "docs/zh/part6/ch20_agent.md", "line": 504, "entry_no": 11, - "entry": "Lewis, P., Perez, E., Piktus, A., et al. (2020). *Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks*. Advances in Neural Information Processing Systems, 33, 9459–9474.", + "entry": "Lewis, P., Perez, E., Piktus, A., et al. (2020). *Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks*. Advances in Neural Information Processing Systems, 33, 9459–9474. 
arXiv:2005.11401.", "first_author": "Lewis", "year": "2020", "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "doi": "", - "arxiv": "", + "arxiv": "2005.11401", "url": "", "key": "lewis:2020", "fingerprint": "retrieval augmented generation for knowledge intensive nlp tasks", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch20_agent.md", @@ -12410,18 +12185,16 @@ "file": "docs/zh/part6/ch20_agent.md", "line": 508, "entry_no": 13, - "entry": "Shinn, N., Cassano, F., Gopinath, A., et al. (2023). *Reflexion: Language Agents with Verbal Reinforcement Learning*. Advances in Neural Information Processing Systems, 36.", + "entry": "Shinn, N., Cassano, F., Gopinath, A., et al. (2023). *Reflexion: Language Agents with Verbal Reinforcement Learning*. Advances in Neural Information Processing Systems, 36. arXiv:2303.11366.", "first_author": "Shinn", "year": "2023", "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", "doi": "", - "arxiv": "", + "arxiv": "2303.11366", "url": "", "key": "shinn:2023", "fingerprint": "reflexion language agents with verbal reinforcement learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part6/ch20_agent.md", @@ -12478,18 +12251,16 @@ "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 934, "entry_no": 1, - "entry": "Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474.", + "entry": "Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474. 
arXiv:2005.11401.", "first_author": "Lewis", "year": "2020", "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "doi": "", - "arxiv": "", + "arxiv": "2005.11401", "url": "", "key": "lewis:2020", "fingerprint": "retrieval augmented generation for knowledge intensive nlp tasks", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", @@ -12527,205 +12298,181 @@ "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 940, "entry_no": 4, - "entry": "Izacard G, Grave E (2021) Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics (EACL), pp 874–880.", + "entry": "Izacard G, Grave E (2021) Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics (EACL), pp 874–880. https://doi.org/10.18653/v1/2021.eacl-main.74.", "first_author": "Izacard", "year": "2021", "title": "Leveraging Passage Retrieval with Generative Models for Open Domain Question Answering", - "doi": "", + "doi": "10.18653/v1/2021.eacl-main.74", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2021.eacl-main.74", "key": "izacard:2021", "fingerprint": "leveraging passage retrieval with generative models for open domain question answering", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 942, "entry_no": 5, - "entry": "Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: European Conference on Computer Vision (ECCV).", + "entry": "Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. 
In: European Conference on Computer Vision (ECCV). https://doi.org/10.1007/978-3-031-19815-1_29.", "first_author": "Kim", "year": "2022", "title": "OCR-free Document Understanding Transformer", - "doi": "", + "doi": "10.1007/978-3-031-19815-1_29", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-031-19815-1_29", "key": "kim:2022", "fingerprint": "ocr free document understanding transformer", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 944, "entry_no": 6, - "entry": "Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200.", + "entry": "Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. https://doi.org/10.1145/3394486.3403172.", "first_author": "Xu", "year": "2020", "title": "LayoutLM: Pre-training of Text and Layout for Document Image Understanding", - "doi": "", + "doi": "10.1145/3394486.3403172", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3394486.3403172", "key": "xu:2020", "fingerprint": "layoutlm pre training of text and layout for document image understanding", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 946, "entry_no": 7, - "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091.", + "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. 
In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. https://doi.org/10.1145/3503161.3548112.", "first_author": "Huang", "year": "2022", "title": "LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking", - "doi": "", + "doi": "10.1145/3503161.3548112", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3503161.3548112", "key": "huang:2022", "fingerprint": "layoutlmv3 pre training for document ai with unified text and image masking", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 948, "entry_no": 8, - "entry": "Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003.", + "entry": "Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. https://doi.org/10.1109/iccv48922.2021.00103.", "first_author": "Appalaraju", "year": "2021", "title": "DocFormer: End-to-End Transformer for Document Understanding", - "doi": "", + "doi": "10.1109/iccv48922.2021.00103", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/iccv48922.2021.00103", "key": "appalaraju:2021", "fingerprint": "docformer end to end transformer for document understanding", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 950, "entry_no": 9, - "entry": "Smock B, Pesala R, Abraham R (2022) PubTables-1M: Towards Comprehensive Table Extraction from Unstructured Documents. 
In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4634–4642.", + "entry": "Smock B, Pesala R, Abraham R (2022) PubTables-1M: Towards Comprehensive Table Extraction from Unstructured Documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 4634–4642. https://doi.org/10.1109/cvpr52688.2022.00459.", "first_author": "Smock", "year": "2022", "title": "PubTables-1M: Towards Comprehensive Table Extraction from Unstructured Documents", - "doi": "", + "doi": "10.1109/cvpr52688.2022.00459", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52688.2022.00459", "key": "smock:2022", "fingerprint": "pubtables 1m towards comprehensive table extraction from unstructured documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 952, "entry_no": 10, - "entry": "Liu N F, Lin K, Hewitt J, Paranjape A, Bevilacqua M, Petroni F, Liang P (2024) Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12:157–173.", + "entry": "Liu N F, Lin K, Hewitt J, Paranjape A, Bevilacqua M, Petroni F, Liang P (2024) Lost in the Middle: How Language Models Use Long Contexts. Transactions of the Association for Computational Linguistics 12:157–173. 
https://doi.org/10.1162/tacl_a_00638.", "first_author": "Liu", "year": "2024", "title": "Lost in the Middle: How Language Models Use Long Contexts", - "doi": "", + "doi": "10.1162/tacl_a_00638", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1162/tacl_a_00638", "key": "liu:2024", "fingerprint": "lost in the middle how language models use long contexts", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 954, "entry_no": 11, - "entry": "Karpukhin V, Oğuz B, Min S, Lewis P, Wu L, Edunov S, Chen D, Yih W-t (2020) Dense Passage Retrieval for Open-Domain Question Answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 6769–6781.", + "entry": "Karpukhin V, Oğuz B, Min S, Lewis P, Wu L, Edunov S, Chen D, Yih W-t (2020) Dense Passage Retrieval for Open-Domain Question Answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 6769–6781. https://doi.org/10.18653/v1/2020.emnlp-main.550.", "first_author": "Karpukhin", "year": "2020", "title": "Dense Passage Retrieval for Open-Domain Question Answering", - "doi": "", + "doi": "10.18653/v1/2020.emnlp-main.550", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2020.emnlp-main.550", "key": "karpukhin:2020", "fingerprint": "dense passage retrieval for open domain question answering", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 956, "entry_no": 12, - "entry": "Reimers N, Gurevych I (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp 3982–3992.", + "entry": "Reimers N, Gurevych I (2019) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. 
In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp 3982–3992. https://doi.org/10.18653/v1/d19-1410.", "first_author": "Reimers", "year": "2019", "title": "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", - "doi": "", + "doi": "10.18653/v1/d19-1410", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/d19-1410", "key": "reimers:2019", "fingerprint": "sentence bert sentence embeddings using siamese bert networks", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 958, "entry_no": 13, - "entry": "Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150-158.", + "entry": "Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150-158. https://doi.org/10.18653/v1/2024.eacl-demo.16.", "first_author": "Es", "year": "2024", "title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation", - "doi": "", + "doi": "10.18653/v1/2024.eacl-demo.16", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2024.eacl-demo.16", "key": "es:2024", "fingerprint": "ragas automated evaluation of retrieval augmented generation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 960, "entry_no": 14, - "entry": "Niu C, Wu Y, Zhu J, Xu S, Shum K, Zhong R, Song J, Zhang T (2024) RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. 
In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (ACL), pp 10862-10878.", + "entry": "Niu C, Wu Y, Zhu J, Xu S, Shum K, Zhong R, Song J, Zhang T (2024) RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (ACL), pp 10862-10878. https://doi.org/10.18653/v1/2024.acl-long.585.", "first_author": "Niu", "year": "2024", "title": "RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models", - "doi": "", + "doi": "10.18653/v1/2024.acl-long.585", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2024.acl-long.585", "key": "niu:2024", "fingerprint": "ragtruth a hallucination corpus for developing trustworthy retrieval augmented language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", "line": 962, "entry_no": 15, - "entry": "Manning C D, Raghavan P, Schütze H (2008) Introduction to Information Retrieval. Cambridge University Press.", + "entry": "Manning C D, Raghavan P, Schütze H (2008) Introduction to Information Retrieval. Cambridge University Press. https://doi.org/10.5860/choice.46-2715.", "first_author": "Manning", "year": "2008", "title": "Introduction to Information Retrieval", - "doi": "", + "doi": "10.5860/choice.46-2715", "arxiv": "", - "url": "", + "url": "https://doi.org/10.5860/choice.46-2715", "key": "manning:2008", "fingerprint": "introduction to information retrieval", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch21_rag_pipeline.md", @@ -12880,103 +12627,91 @@ "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 545, "entry_no": 1, - "entry": "Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. 
In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200.", + "entry": "Xu Y, Li M, Cui L, Huang S, Wei F, Zhou M (2020) LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 1192–1200. https://doi.org/10.1145/3394486.3403172.", "first_author": "Xu", "year": "2020", "title": "LayoutLM: Pre-training of Text and Layout for Document Image Understanding", - "doi": "", + "doi": "10.1145/3394486.3403172", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3394486.3403172", "key": "xu:2020", "fingerprint": "layoutlm pre training of text and layout for document image understanding", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 547, "entry_no": 2, - "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091.", + "entry": "Huang Y, Lv T, Cui L, Lu Y, Wei F (2022) LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 4083–4091. 
https://doi.org/10.1145/3503161.3548112.", "first_author": "Huang", "year": "2022", "title": "LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking", - "doi": "", + "doi": "10.1145/3503161.3548112", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3503161.3548112", "key": "huang:2022", "fingerprint": "layoutlmv3 pre training for document ai with unified text and image masking", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 549, "entry_no": 3, - "entry": "Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003.", + "entry": "Appalaraju S, Jasani B, Kota B U, Xie Y, Manmatha R (2021) DocFormer: End-to-End Transformer for Document Understanding. In: Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp 993–1003. https://doi.org/10.1109/iccv48922.2021.00103.", "first_author": "Appalaraju", "year": "2021", "title": "DocFormer: End-to-End Transformer for Document Understanding", - "doi": "", + "doi": "10.1109/iccv48922.2021.00103", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/iccv48922.2021.00103", "key": "appalaraju:2021", "fingerprint": "docformer end to end transformer for document understanding", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 551, "entry_no": 4, - "entry": "Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 498–517.", + "entry": "Kim G, Hong T, Yim M, Nam J, Park J, Yim J, Hwang W, Yun S, Han D, Park S (2022) OCR-free Document Understanding Transformer. 
In: Proceedings of the European Conference on Computer Vision (ECCV), pp 498–517. https://doi.org/10.1007/978-3-031-19815-1_29.", "first_author": "Kim", "year": "2022", "title": "OCR-free Document Understanding Transformer", - "doi": "", + "doi": "10.1007/978-3-031-19815-1_29", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-031-19815-1_29", "key": "kim:2022", "fingerprint": "ocr free document understanding transformer", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 553, "entry_no": 5, - "entry": "Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 2200–2209.", + "entry": "Mathew M, Karatzas D, Jawahar C V (2021) DocVQA: A Dataset for VQA on Document Images. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 2200–2209. https://doi.org/10.1109/wacv48630.2021.00225.", "first_author": "Mathew", "year": "2021", "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", + "doi": "10.1109/wacv48630.2021.00225", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv48630.2021.00225", "key": "mathew:2021", "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 555, "entry_no": 6, - "entry": "Mathew M, Bagal V, Tito R, Karatzas D, Valveny E, Jawahar C V (2022) InfographicVQA. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 1697–1706.", + "entry": "Mathew M, Bagal V, Tito R, Karatzas D, Valveny E, Jawahar C V (2022) InfographicVQA. In: Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), pp 1697–1706. 
https://doi.org/10.1109/wacv51458.2022.00264.", "first_author": "Mathew", "year": "2022", "title": "InfographicVQA", - "doi": "", + "doi": "10.1109/wacv51458.2022.00264", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv51458.2022.00264", "key": "mathew:2022", "fingerprint": "infographicvqa", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", @@ -13050,18 +12785,16 @@ "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 565, "entry_no": 11, - "entry": "Masry A, Long D X, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263–2279.", + "entry": "Masry A, Long D X, Tan J Q, Joty S, Hoque E (2022) ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp 2263–2279. https://doi.org/10.18653/v1/2022.findings-acl.177.", "first_author": "Masry", "year": "2022", "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning", - "doi": "", + "doi": "10.18653/v1/2022.findings-acl.177", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2022.findings-acl.177", "key": "masry:2022", "fingerprint": "chartqa a benchmark for question answering about charts with visual and logical reasoning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", @@ -13084,18 +12817,16 @@ "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 569, "entry_no": 13, - "entry": "Liu F, Eisenschlos J M, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Chen W, Collier N, Altun Y (2023b) DePlot: One-shot Visual Language Reasoning by Plot-to-Table Translation. 
In: Findings of the Association for Computational Linguistics: ACL 2023, pp 10381–10399.", + "entry": "Liu F, Eisenschlos J M, Piccinno F, Krichene S, Pang C, Lee K, Joshi M, Chen W, Collier N, Altun Y (2023b) DePlot: One-shot Visual Language Reasoning by Plot-to-Table Translation. In: Findings of the Association for Computational Linguistics: ACL 2023, pp 10381–10399. https://doi.org/10.18653/v1/2023.findings-acl.660.", "first_author": "Liu", "year": "2023", "title": "DePlot: One-shot Visual Language Reasoning by Plot-to-Table Translation", - "doi": "", + "doi": "10.18653/v1/2023.findings-acl.660", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2023.findings-acl.660", "key": "liu:2023", "fingerprint": "deplot one shot visual language reasoning by plot to table translation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", @@ -13167,18 +12898,16 @@ "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", "line": 579, "entry_no": 18, - "entry": "Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158.", + "entry": "Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. 
https://doi.org/10.18653/v1/2024.eacl-demo.16.", "first_author": "Es", "year": "2024", "title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation", - "doi": "", + "doi": "10.18653/v1/2024.eacl-demo.16", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2024.eacl-demo.16", "key": "es:2024", "fingerprint": "ragas automated evaluation of retrieval augmented generation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md", @@ -13218,18 +12947,16 @@ "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 674, "entry_no": 1, - "entry": "Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291–300.", + "entry": "Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice, pp 291–300. https://doi.org/10.1109/icse-seip.2019.00042.", "first_author": "Amershi", "year": "2019", "title": "Software Engineering for Machine Learning: A Case Study", - "doi": "", + "doi": "10.1109/icse-seip.2019.00042", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icse-seip.2019.00042", "key": "amershi:2019", "fingerprint": "software engineering for machine learning a case study", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", @@ -13252,52 +12979,46 @@ "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 678, "entry_no": 3, - "entry": "Chapelle O, Zhang Y (2009) A Dynamic Bayesian Network Click Model for Web Search Ranking. 
In: Proceedings of the 18th International Conference on World Wide Web, pp 1–10.", + "entry": "Chapelle O, Zhang Y (2009) A Dynamic Bayesian Network Click Model for Web Search Ranking. In: Proceedings of the 18th International Conference on World Wide Web, pp 1–10. https://doi.org/10.1145/1526709.1526711.", "first_author": "Chapelle", "year": "2009", "title": "A Dynamic Bayesian Network Click Model for Web Search Ranking", - "doi": "", + "doi": "10.1145/1526709.1526711", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/1526709.1526711", "key": "chapelle:2009", "fingerprint": "a dynamic bayesian network click model for web search ranking", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 680, "entry_no": 4, - "entry": "Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158.", + "entry": "Es S, James J, Espinosa-Anke L, Schockaert S (2024) RAGAS: Automated Evaluation of Retrieval Augmented Generation. In: Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, pp 150–158. 
https://doi.org/10.18653/v1/2024.eacl-demo.16.", "first_author": "Es", "year": "2024", "title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation", - "doi": "", + "doi": "10.18653/v1/2024.eacl-demo.16", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2024.eacl-demo.16", "key": "es:2024", "fingerprint": "ragas automated evaluation of retrieval augmented generation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 682, "entry_no": 5, - "entry": "Gama J, Žliobaitė I, Bifet A, Pechenizkiy M, Bouchachia A (2014) A Survey on Concept Drift Adaptation. ACM Computing Surveys 46(4):1–37.", + "entry": "Gama J, Žliobaitė I, Bifet A, Pechenizkiy M, Bouchachia A (2014) A Survey on Concept Drift Adaptation. ACM Computing Surveys 46(4):1–37. https://doi.org/10.1145/2523813.", "first_author": "Gama", "year": "2014", "title": "A Survey on Concept Drift Adaptation", - "doi": "", + "doi": "10.1145/2523813", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/2523813", "key": "gama:2014", "fingerprint": "a survey on concept drift adaptation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", @@ -13318,18 +13039,16 @@ "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 686, "entry_no": 7, - "entry": "Hu Y, Koren Y, Volinsky C (2008) Collaborative Filtering for Implicit Feedback Datasets. In: Proceedings of the 2008 IEEE International Conference on Data Mining, pp 263–272.", + "entry": "Hu Y, Koren Y, Volinsky C (2008) Collaborative Filtering for Implicit Feedback Datasets. In: Proceedings of the 2008 IEEE International Conference on Data Mining, pp 263–272. 
https://doi.org/10.1109/icdm.2008.22.", "first_author": "Hu", "year": "2008", "title": "Collaborative Filtering for Implicit Feedback Datasets", - "doi": "", + "doi": "10.1109/icdm.2008.22", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icdm.2008.22", "key": "hu:2008", "fingerprint": "collaborative filtering for implicit feedback datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", @@ -13352,35 +13071,31 @@ "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 690, "entry_no": 9, - "entry": "Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142.", + "entry": "Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142. https://doi.org/10.1145/775066.775067.", "first_author": "Joachims", "year": "2002", "title": "Optimizing Search Engines Using Clickthrough Data", - "doi": "", + "doi": "10.1145/775066.775067", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/775066.775067", "key": "joachims:2002", "fingerprint": "optimizing search engines using clickthrough data", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 692, "entry_no": 10, - "entry": "Joachims T, Swaminathan A, Schnabel T (2017) Unbiased Learning-to-Rank with Biased Feedback. In: Proceedings of the Tenth ACM International Conference on Web Search and Data Mining, pp 781–789.", + "entry": "Joachims T, Swaminathan A, Schnabel T (2017) Unbiased Learning-to-Rank with Biased Feedback. In: Proceedings of the Tenth ACM International Conference on Web Search and Data Mining, pp 781–789. 
https://doi.org/10.1145/3018661.3018699.", "first_author": "Joachims", "year": "2017", "title": "Unbiased Learning-to-Rank with Biased Feedback", - "doi": "", + "doi": "10.1145/3018661.3018699", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3018661.3018699", "key": "joachims:2017", "fingerprint": "unbiased learning to rank with biased feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", @@ -13420,52 +13135,46 @@ "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 698, "entry_no": 13, - "entry": "Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879.", + "entry": "Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866–31879. arXiv:2205.02302.", "first_author": "Kreuzberger", "year": "2023", "title": "Machine Learning Operations (MLOps): Overview, Definition, and Architecture", "doi": "", - "arxiv": "", + "arxiv": "2205.02302", "url": "", "key": "kreuzberger:2023", "fingerprint": "machine learning operations mlops overview definition and architecture", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 700, "entry_no": 14, - "entry": "Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In: Advances in Neural Information Processing Systems 33, pp 9459–9474.", + "entry": "Lewis P, Perez E, Piktus A, Petroni F, Karpukhin V, Goyal N, Küttler H, Lewis M, Yih W-t, Rocktäschel T, Riedel S, Kiela D (2020) Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. 
In: Advances in Neural Information Processing Systems 33, pp 9459–9474. arXiv:2005.11401.", "first_author": "Lewis", "year": "2020", "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "doi": "", - "arxiv": "", + "arxiv": "2005.11401", "url": "", "key": "lewis:2020", "fingerprint": "retrieval augmented generation for knowledge intensive nlp tasks", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 702, "entry_no": 15, - "entry": "Mallen A, Asai A, Zhong V, Das R, Khashabi D, Hajishirzi H (2023) When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 9802–9822.", + "entry": "Mallen A, Asai A, Zhong V, Das R, Khashabi D, Hajishirzi H (2023) When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, pp 9802–9822. 
https://doi.org/10.18653/v1/2023.acl-long.546.", "first_author": "Mallen", "year": "2023", "title": "When Not to Trust Language Models: Investigating Effectiveness of Parametric and Non-Parametric Memories", - "doi": "", + "doi": "10.18653/v1/2023.acl-long.546", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2023.acl-long.546", "key": "mallen:2023", "fingerprint": "when not to trust language models investigating effectiveness of parametric and non parametric memories", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", @@ -13858,18 +13567,16 @@ "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 658, "entry_no": 1, - "entry": "Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300.", + "entry": "Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. 
https://doi.org/10.1109/icse-seip.2019.00042.", "first_author": "Amershi", "year": "2019", "title": "Software Engineering for Machine Learning: A Case Study", - "doi": "", + "doi": "10.1109/icse-seip.2019.00042", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icse-seip.2019.00042", "key": "amershi:2019", "fingerprint": "software engineering for machine learning a case study", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", @@ -13943,18 +13650,16 @@ "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 668, "entry_no": 6, - "entry": "Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316-330.", + "entry": "Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316-330. https://doi.org/10.1007/3-540-44503-x_20.", "first_author": "Buneman", "year": "2001", "title": "Why and Where: A Characterization of Data Provenance", - "doi": "", + "doi": "10.1007/3-540-44503-x_20", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/3-540-44503-x_20", "key": "buneman:2001", "fingerprint": "why and where a characterization of data provenance", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", @@ -13992,52 +13697,46 @@ "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 674, "entry_no": 9, - "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92.", + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. 
Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723.", "first_author": "Gebru", "year": "2021", "title": "Datasheets for Datasets", - "doi": "", + "doi": "10.1145/3458723", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3458723", "key": "gebru:2021", "fingerprint": "datasheets for datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 676, "entry_no": 10, - "entry": "Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879.", + "entry": "Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302.", "first_author": "Kreuzberger", "year": "2023", "title": "Machine Learning Operations (MLOps): Overview, Definition, and Architecture", "doi": "", - "arxiv": "", + "arxiv": "2205.02302", "url": "", "key": "kreuzberger:2023", "fingerprint": "machine learning operations mlops overview definition and architecture", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 678, "entry_no": 11, - "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229.", + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. 
https://doi.org/10.1145/3287560.3287596.", "first_author": "Mitchell", "year": "2019", "title": "Model Cards for Model Reporting", - "doi": "", + "doi": "10.1145/3287560.3287596", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3287560.3287596", "key": "mitchell:2019", "fingerprint": "model cards for model reporting", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", @@ -14060,18 +13759,16 @@ "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 682, "entry_no": 13, - "entry": "Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227.", + "entry": "Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227. https://doi.org/10.1126/science.1213847.", "first_author": "Peng", "year": "2011", "title": "Reproducible Research in Computational Science", - "doi": "", + "doi": "10.1126/science.1213847", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1126/science.1213847", "key": "peng:2011", "fingerprint": "reproducible research in computational science", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", @@ -14094,18 +13791,16 @@ "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 686, "entry_no": 15, - "entry": "Sandve G K, Nekrutenko A, Taylor J, Hovig E (2013) Ten Simple Rules for Reproducible Computational Research. PLOS Computational Biology 9(10):e1003285.", + "entry": "Sandve G K, Nekrutenko A, Taylor J, Hovig E (2013) Ten Simple Rules for Reproducible Computational Research. PLOS Computational Biology 9(10):e1003285. 
https://doi.org/10.1371/journal.pcbi.1003285.", "first_author": "Sandve", "year": "2013", "title": "Ten Simple Rules for Reproducible Computational Research", - "doi": "", + "doi": "10.1371/journal.pcbi.1003285", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1371/journal.pcbi.1003285", "key": "sandve:2013", "fingerprint": "ten simple rules for reproducible computational research", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", @@ -14128,35 +13823,31 @@ "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 690, "entry_no": 17, - "entry": "Simmhan Y L, Plale B, Gannon D (2005) A Survey of Data Provenance in e-Science. ACM SIGMOD Record 34(3):31-36.", + "entry": "Simmhan Y L, Plale B, Gannon D (2005) A Survey of Data Provenance in e-Science. ACM SIGMOD Record 34(3):31-36. https://doi.org/10.1145/1084805.1084812.", "first_author": "Simmhan", "year": "2005", "title": "A Survey of Data Provenance in e-Science", - "doi": "", + "doi": "10.1145/1084805.1084812", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/1084805.1084812", "key": "simmhan:2005", "fingerprint": "a survey of data provenance in e science", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 692, "entry_no": 18, - "entry": "Stodden V, Leisch F, Peng R D (eds.) (2014) Implementing Reproducible Research. CRC Press.", + "entry": "Stodden V, Leisch F, Peng R D (eds.) (2014) Implementing Reproducible Research. CRC Press. 
https://doi.org/10.1201/b16868.", "first_author": "Stodden", "year": "2014", "title": "Implementing Reproducible Research", - "doi": "", + "doi": "10.1201/b16868", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1201/b16868", "key": "stodden:2014", "fingerprint": "implementing reproducible research", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", @@ -14196,18 +13887,16 @@ "file": "docs/zh/part8/ch26_data_platform_observability.md", "line": 685, "entry_no": 1, - "entry": "Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300.", + "entry": "Amershi S, Begel A, Bird C, DeLine R, Gall H, Kamar E, Nagappan N, Nushi B, Zimmermann T (2019) Software Engineering for Machine Learning: A Case Study. In: Proceedings of the 41st International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP), pp 291-300. https://doi.org/10.1109/icse-seip.2019.00042.", "first_author": "Amershi", "year": "2019", "title": "Software Engineering for Machine Learning: A Case Study", - "doi": "", + "doi": "10.1109/icse-seip.2019.00042", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icse-seip.2019.00042", "key": "amershi:2019", "fingerprint": "software engineering for machine learning a case study", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch26_data_platform_observability.md", @@ -14298,18 +13987,16 @@ "file": "docs/zh/part8/ch26_data_platform_observability.md", "line": 697, "entry_no": 7, - "entry": "Dean J, Barroso L A (2013) The Tail at Scale. Communications of the ACM 56(2):74-80.", + "entry": "Dean J, Barroso L A (2013) The Tail at Scale. 
Communications of the ACM 56(2):74-80. https://doi.org/10.1145/2408776.2408794.", "first_author": "Dean", "year": "2013", "title": "The Tail at Scale", - "doi": "", + "doi": "10.1145/2408776.2408794", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/2408776.2408794", "key": "dean:2013", "fingerprint": "the tail at scale", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch26_data_platform_observability.md", @@ -14349,35 +14036,31 @@ "file": "docs/zh/part8/ch26_data_platform_observability.md", "line": 703, "entry_no": 10, - "entry": "Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879.", + "entry": "Kreuzberger D, Kühl N, Hirschl S (2023) Machine Learning Operations (MLOps): Overview, Definition, and Architecture. IEEE Access 11:31866-31879. arXiv:2205.02302.", "first_author": "Kreuzberger", "year": "2023", "title": "Machine Learning Operations (MLOps): Overview, Definition, and Architecture", "doi": "", - "arxiv": "", + "arxiv": "2205.02302", "url": "", "key": "kreuzberger:2023", "fingerprint": "machine learning operations mlops overview definition and architecture", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch26_data_platform_observability.md", "line": 705, "entry_no": 11, - "entry": "National Institute of Standards and Technology (2006) Guide to Computer Security Log Management. NIST Special Publication 800-92.", + "entry": "National Institute of Standards and Technology (2006) Guide to Computer Security Log Management. NIST Special Publication 800-92. 
https://doi.org/10.6028/nist.sp.800-92.", "first_author": "NIST", "year": "2006", "title": "Guide to Computer Security Log Management", - "doi": "", + "doi": "10.6028/nist.sp.800-92", "arxiv": "", - "url": "", + "url": "https://doi.org/10.6028/nist.sp.800-92", "key": "nist:2006", "fingerprint": "guide to computer security log management", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch26_data_platform_observability.md", @@ -14449,18 +14132,16 @@ "file": "docs/zh/part8/ch26_data_platform_observability.md", "line": 715, "entry_no": 16, - "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15.", + "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1-15. https://doi.org/10.1145/3411764.3445518.", "first_author": "Sambasivan", "year": "2021", "title": "\"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI", - "doi": "", + "doi": "10.1145/3411764.3445518", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3411764.3445518", "key": "sambasivan:2021", "fingerprint": "everyone wants to do the model work not the data work data cascades in high stakes ai", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch26_data_platform_observability.md", @@ -14517,18 +14198,16 @@ "file": "docs/zh/part8/ch26_data_platform_observability.md", "line": 723, "entry_no": 20, - "entry": "Xu W, Huang L, Fox A, Patterson D, Jordan M I (2009) Detecting Large-Scale System Problems by Mining Console Logs. 
In: Proceedings of the ACM SIGOPS 22nd Symposium on Operating Systems Principles (SOSP), pp 117-132.", + "entry": "Xu W, Huang L, Fox A, Patterson D, Jordan M I (2009) Detecting Large-Scale System Problems by Mining Console Logs. In: Proceedings of the ACM SIGOPS 22nd Symposium on Operating Systems Principles (SOSP), pp 117-132. https://doi.org/10.1145/1629575.1629587.", "first_author": "Xu", "year": "2009", "title": "Detecting Large-Scale System Problems by Mining Console Logs", - "doi": "", + "doi": "10.1145/1629575.1629587", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/1629575.1629587", "key": "xu:2009", "fingerprint": "detecting large scale system problems by mining console logs", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", @@ -14568,35 +14247,31 @@ "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 594, "entry_no": 3, - "entry": "Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316–330.", + "entry": "Buneman P, Khanna S, Tan W-C (2001) Why and Where: A Characterization of Data Provenance. In: Proceedings of the 8th International Conference on Database Theory (ICDT), pp 316–330. 
https://doi.org/10.1007/3-540-44503-x_20.", "first_author": "Buneman", "year": "2001", "title": "Why and Where: A Characterization of Data Provenance", - "doi": "", + "doi": "10.1007/3-540-44503-x_20", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/3-540-44503-x_20", "key": "buneman:2001", "fingerprint": "why and where a characterization of data provenance", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 596, "entry_no": 4, - "entry": "Cai L, Zhu Y (2015) The challenges of data quality and data quality assessment in the big data era. Data science journal, 2015, 14: 2-2.", + "entry": "Cai L, Zhu Y (2015) The challenges of data quality and data quality assessment in the big data era. Data science journal, 2015, 14: 2-2. https://doi.org/10.5334/dsj-2015-002.", "first_author": "Cai", "year": "2015", "title": "The challenges of data quality and data quality assessment in the big data era", - "doi": "", + "doi": "10.5334/dsj-2015-002", "arxiv": "", - "url": "", + "url": "https://doi.org/10.5334/dsj-2015-002", "key": "cai:2015", "fingerprint": "the challenges of data quality and data quality assessment in the big data era", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", @@ -14619,35 +14294,31 @@ "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 600, "entry_no": 6, - "entry": "Fernandez R C, Abedjan Z, Koko F, Yuan G, Madden S, Stonebraker M (2018) Aurum: A Data Discovery System. In: 2018 IEEE 34th International Conference on Data Engineering (ICDE), pp 1001–1012.", + "entry": "Fernandez R C, Abedjan Z, Koko F, Yuan G, Madden S, Stonebraker M (2018) Aurum: A Data Discovery System. In: 2018 IEEE 34th International Conference on Data Engineering (ICDE), pp 1001–1012. 
https://doi.org/10.1109/icde.2018.00094.", "first_author": "Fernandez", "year": "2018", "title": "Aurum: A Data Discovery System", - "doi": "", + "doi": "10.1109/icde.2018.00094", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/icde.2018.00094", "key": "fernandez:2018", "fingerprint": "aurum a data discovery system", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 602, "entry_no": 7, - "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92.", + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé III H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86–92. https://doi.org/10.1145/3458723.", "first_author": "Gebru", "year": "2021", "title": "Datasheets for Datasets", - "doi": "", + "doi": "10.1145/3458723", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3458723", "key": "gebru:2021", "fingerprint": "datasheets for datasets", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", @@ -14704,18 +14375,16 @@ "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 610, "entry_no": 11, - "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency (FAT*), pp 220–229.", + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency (FAT*), pp 220–229. 
https://doi.org/10.1145/3287560.3287596.", "first_author": "Mitchell", "year": "2019", "title": "Model Cards for Model Reporting", - "doi": "", + "doi": "10.1145/3287560.3287596", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3287560.3287596", "key": "mitchell:2019", "fingerprint": "model cards for model reporting", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", @@ -14738,18 +14407,16 @@ "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 614, "entry_no": 13, - "entry": "Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17–28.", + "entry": "Polyzotis N, Roy S, Whang S E, Zinkevich M (2018) Data Lifecycle Challenges in Production Machine Learning: A Survey. ACM SIGMOD Record 47(2):17–28. https://doi.org/10.1145/3299887.3299891.", "first_author": "Polyzotis", "year": "2018", "title": "Data Lifecycle Challenges in Production Machine Learning: A Survey", - "doi": "", + "doi": "10.1145/3299887.3299891", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3299887.3299891", "key": "polyzotis:2018", "fingerprint": "data lifecycle challenges in production machine learning a survey", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", @@ -14772,18 +14439,16 @@ "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 618, "entry_no": 15, - "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. 
In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15.", + "entry": "Sambasivan N, Kapania S, Highfill H, Akrong D, Paritosh P, Aroyo L M (2021) \"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp 1–15. https://doi.org/10.1145/3411764.3445518.", "first_author": "Sambasivan", "year": "2021", "title": "\"Everyone wants to do the model work, not the data work\": Data Cascades in High-Stakes AI", - "doi": "", + "doi": "10.1145/3411764.3445518", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/3411764.3445518", "key": "sambasivan:2021", "fingerprint": "everyone wants to do the model work not the data work data cascades in high stakes ai", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", @@ -14857,18 +14522,16 @@ "file": "docs/zh/part9/ch27_data_catalog_and_metadata_governance.md", "line": 628, "entry_no": 20, - "entry": "Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5–33.", + "entry": "Wang R Y, Strong D M (1996) Beyond Accuracy: What Data Quality Means to Data Consumers. Journal of Management Information Systems 12(4):5–33. 
https://doi.org/10.1080/07421222.1996.11518099.", "first_author": "Wang", "year": "1996", "title": "Beyond Accuracy: What Data Quality Means to Data Consumers", - "doi": "", + "doi": "10.1080/07421222.1996.11518099", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1080/07421222.1996.11518099", "key": "wang:1996", "fingerprint": "beyond accuracy what data quality means to data consumers", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch28_data_productization_and_data_contracts.md", @@ -14959,18 +14622,16 @@ "file": "docs/zh/part9/ch28_data_productization_and_data_contracts.md", "line": 318, "entry_no": 6, - "entry": "Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale machine learning systems in real-world industrial settings: A review of challenges and solutions. Information and Software Technology 127:106368.", + "entry": "Lwakatare L E, Raj A, Crnkovic I, Bosch J, Olsson H H (2020) Large-scale machine learning systems in real-world industrial settings: A review of challenges and solutions. Information and Software Technology 127:106368. 
https://doi.org/10.1016/j.infsof.2020.106368.", "first_author": "Lwakatare", "year": "2020", "title": "Large-scale machine learning systems in real-world industrial settings: A review of challenges and solutions", - "doi": "", + "doi": "10.1016/j.infsof.2020.106368", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1016/j.infsof.2020.106368", "key": "lwakatare:2020", "fingerprint": "large scale machine learning systems in real world industrial settings a review of challenges and solutions", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch28_data_productization_and_data_contracts.md", @@ -15178,18 +14839,16 @@ "file": "docs/zh/part9/ch29_data_valuation_and_reuse.md", "line": 671, "entry_no": 2, - "entry": "Fleckenstein M, Obaidi A, Tryfona N (2023) A Review of Data Valuation Approaches and Building and Scoring a Data Valuation Model. Harvard Data Science Review 5(1).", + "entry": "Fleckenstein M, Obaidi A, Tryfona N (2023) A Review of Data Valuation Approaches and Building and Scoring a Data Valuation Model. Harvard Data Science Review 5(1). 
https://doi.org/10.1162/99608f92.c18db966.", "first_author": "Fleckenstein", "year": "2023", "title": "A Review of Data Valuation Approaches and Building and Scoring a Data Valuation Model", - "doi": "", + "doi": "10.1162/99608f92.c18db966", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1162/99608f92.c18db966", "key": "fleckenstein:2023", "fingerprint": "a review of data valuation approaches and building and scoring a data valuation model", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch29_data_valuation_and_reuse.md", @@ -15244,18 +14903,16 @@ "file": "docs/zh/part9/ch29_data_valuation_and_reuse.md", "line": 679, "entry_no": 6, - "entry": "Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, Hennigan T, Noland E, Millican K, van den Driessche G, Damoc B, Guy A, Osindero S, Simonyan K, Elsen E, Rae J W, Vinyals O, Sifre L (2022) Training Compute-Optimal Large Language Models. In: Advances in Neural Information Processing Systems 35.", + "entry": "Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, Hennigan T, Noland E, Millican K, van den Driessche G, Damoc B, Guy A, Osindero S, Simonyan K, Elsen E, Rae J W, Vinyals O, Sifre L (2022) Training Compute-Optimal Large Language Models. In: Advances in Neural Information Processing Systems 35. 
arXiv:2203.15556.", "first_author": "Hoffmann", "year": "2022", "title": "Training Compute-Optimal Large Language Models", "doi": "", - "arxiv": "", + "arxiv": "2203.15556", "url": "", "key": "hoffmann:2022", "fingerprint": "training compute optimal large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch29_data_valuation_and_reuse.md", @@ -15512,18 +15169,16 @@ "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", "line": 537, "entry_no": 2, - "entry": "Alhassan I, Sammon D, Daly M (2016) Data governance activities: an analysis of the literature. Journal of Decision Systems 25(sup1):64-75.", + "entry": "Alhassan I, Sammon D, Daly M (2016) Data governance activities: an analysis of the literature. Journal of Decision Systems 25(sup1):64-75. https://doi.org/10.1080/12460125.2016.1187397.", "first_author": "Alhassan", "year": "2016", "title": "Data governance activities: an analysis of the literature", - "doi": "", + "doi": "10.1080/12460125.2016.1187397", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1080/12460125.2016.1187397", "key": "alhassan:2016", "fingerprint": "data governance activities an analysis of the literature", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", @@ -15597,35 +15252,31 @@ "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", "line": 547, "entry_no": 7, - "entry": "Hu V C, Ferraiolo D, Kuhn R, Schnitzer A, Sandlin K, Miller R, Scarfone K (2014) Guide to Attribute Based Access Control (ABAC) Definition and Considerations. NIST Special Publication 800-162.", + "entry": "Hu V C, Ferraiolo D, Kuhn R, Schnitzer A, Sandlin K, Miller R, Scarfone K (2014) Guide to Attribute Based Access Control (ABAC) Definition and Considerations. NIST Special Publication 800-162. 
https://doi.org/10.6028/nist.sp.800-162.", "first_author": "Hu", "year": "2014", "title": "Guide to Attribute Based Access Control (ABAC) Definition and Considerations", - "doi": "", + "doi": "10.6028/nist.sp.800-162", "arxiv": "", - "url": "", + "url": "https://doi.org/10.6028/nist.sp.800-162", "key": "hu:2014", "fingerprint": "guide to attribute based access control abac definition and considerations", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", "line": 549, "entry_no": 8, - "entry": "Khatri V, Brown C V (2010) Designing data governance. Communications of the ACM 53(1):148-152.", + "entry": "Khatri V, Brown C V (2010) Designing data governance. Communications of the ACM 53(1):148-152. https://doi.org/10.1145/1629175.1629210.", "first_author": "Khatri", "year": "2010", "title": "Designing data governance", - "doi": "", + "doi": "10.1145/1629175.1629210", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/1629175.1629210", "key": "khatri:2010", "fingerprint": "designing data governance", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", @@ -15699,35 +15350,31 @@ "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", "line": 559, "entry_no": 13, - "entry": "National Institute of Standards and Technology (2020b) NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0.", + "entry": "National Institute of Standards and Technology (2020b) NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0. 
https://doi.org/10.6028/nist.cswp.10.", "first_author": "NIST", "year": "2020", "title": "NIST Privacy Framework: A Tool for Improving Privacy through Enterprise Risk Management, Version 1.0", - "doi": "", + "doi": "10.6028/nist.cswp.10", "arxiv": "", - "url": "", + "url": "https://doi.org/10.6028/nist.cswp.10", "key": "nist:2020", "fingerprint": "nist privacy framework a tool for improving privacy through enterprise risk management version 1 0", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", "line": 561, "entry_no": 14, - "entry": "Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244.", + "entry": "Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244. https://doi.org/10.1002/9781118269053.ch4.", "first_author": "Otto", "year": "2011", "title": "Data Governance", - "doi": "", + "doi": "10.1002/9781118269053.ch4", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1002/9781118269053.ch4", "key": "otto:2011", "fingerprint": "data governance", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", @@ -15818,18 +15465,16 @@ "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", "line": 573, "entry_no": 20, - "entry": "Wilkinson M D, Dumontier M, Aalbersberg I J, Appleton G, Axton M, Baak A, Blomberg N, Boiten J-W, da Silva Santos L B, Bourne P E, Bouwman J, Brookes A J, Clark T, Crosas M, Dillo I, Dumon O, Edmunds S, Evelo C T, Finkers R, Gonzalez-Beltran A, Gray A J G, Groth P, Goble C, Grethe J S, Heringa J, 't Hoen P A C, Hooft R, Kuhn T, Kok R, Kok J, Lusher S J, Martone M E, Mons A, Packer A L, Persson B, Rocca-Serra P, Roos M, van Schaik R, Sansone S-A, Schultes E, Sengstag T, Slater T, Strawn G, Swertz M A, Thompson M, van der Lei J, van Mulligen E, Velterop J, 
Waagmeester A, Wittenburg P, Wolstencroft K, Zhao J, Mons B (2016) The FAIR Guiding Principles for scientific data management and stewardship. Scientific Data 3:160018.", + "entry": "Wilkinson M D, Dumontier M, Aalbersberg I J, Appleton G, Axton M, Baak A, Blomberg N, Boiten J-W, da Silva Santos L B, Bourne P E, Bouwman J, Brookes A J, Clark T, Crosas M, Dillo I, Dumon O, Edmunds S, Evelo C T, Finkers R, Gonzalez-Beltran A, Gray A J G, Groth P, Goble C, Grethe J S, Heringa J, 't Hoen P A C, Hooft R, Kuhn T, Kok R, Kok J, Lusher S J, Martone M E, Mons A, Packer A L, Persson B, Rocca-Serra P, Roos M, van Schaik R, Sansone S-A, Schultes E, Sengstag T, Slater T, Strawn G, Swertz M A, Thompson M, van der Lei J, van Mulligen E, Velterop J, Waagmeester A, Wittenburg P, Wolstencroft K, Zhao J, Mons B (2016) The FAIR Guiding Principles for scientific data management and stewardship. Scientific Data 3:160018. https://doi.org/10.1038/sdata.2016.18.", "first_author": "Wilkinson", "year": "2016", "title": "The FAIR Guiding Principles for scientific data management and stewardship", - "doi": "", + "doi": "10.1038/sdata.2016.18", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1038/sdata.2016.18", "key": "wilkinson:2016", "fingerprint": "the fair guiding principles for scientific data management and stewardship", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] } ], "body_citations": [ @@ -15867,7 +15512,7 @@ "year": "2023", "text": "National Institute of Standards and Technology 2023", "key": "nist:2023", - "context": "更稳妥的做法,是把合规拆成四个关口。这个拆分也可以和风险管理框架对齐:NIST AI RMF 强调按治理、映射、度量和管理组织 AI 风险,欧盟《人工智能法案》则进一步体现了按风险等级设置义务和边界的监管思路(National Institute of Standards and Technology 2023; Regulation (EU) 2024/1689)。" + "context": "更稳妥的做法,是把合规拆成四个关口。这个拆分也可以和风险管理框架对齐:NIST AI RMF 强调按治理、映射、度量和管理组织 AI 风险,欧盟《人工智能法案》则进一步体现了按风险等级设置义务和边界的监管思路(National Institute of Standards and Technology 2023; European Parliament and Council of the European Union 2024)。" }, { "file": 
"docs/zh/appendix_b_compliance_and_release_checklist.md", @@ -15903,7 +15548,7 @@ "year": "2023", "text": "Kwon et al. 2023", "key": "kwon:2023", - "context": "推理成本至少应分为三类。对于长上下文和高并发服务,PagedAttention 等内存管理机制已经成为推理服务成本估算的重要参照,vLLM 的工程文档也提供了部署和调优层面的实践入口(Kwon et al. 2023; vLLM Documentation)。" + "context": "推理成本至少应分为三类。对于长上下文和高并发服务,PagedAttention 等内存管理机制已经成为推理服务成本估算的重要参照,vLLM 的工程文档也提供了部署和调优层面的实践入口(Kwon et al. 2023; vLLM Project 2026)。" }, { "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", @@ -18326,6 +17971,87 @@ "key": "sheller:2020", "context": "回到本章开头的医疗案例。这个案例最能说明为什么单一技术往往不够。首先,数据属于 C3 高敏感,天然不允许自由流动;其次,文本、影像、基因属于跨模态高风险组合,单独脱敏并不足以消除对齐后的重识别风险;再次,参与方跨机构且互不完全信任,任何一方都不愿承担集中建湖的责任。医疗联邦学习研究已经展示了跨机构协作在不共享患者原始数据时的可行性与价值(Sheller et al. 2020)。" }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 29, + "author": "Masry", + "year": "2022", + "text": "Masry et al. 2022)", + "key": "masry:2022", + "context": "现有主流图表视觉问答数据集,如 ChartQA (Masry et al. 2022)、FigureQA (Kahou et al. 2017)、PlotQA (Methani et al. 2020) 等,在数据设计层面普遍遵循一图一问、单图闭环范式:单张样本图像只包含独立单一图表(单柱状图、单折线图、单饼图等),全部作答所需数据、图例、统计数值均被收拢在同一张图表内,模型仅需要通过定位图表坐标,读取标注数字,而后完成加减运算或..." + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 29, + "author": "Kahou", + "year": "2017", + "text": "Kahou et al. 2017)", + "key": "kahou:2017", + "context": "现有主流图表视觉问答数据集,如 ChartQA (Masry et al. 2022)、FigureQA (Kahou et al. 2017)、PlotQA (Methani et al. 2020) 等,在数据设计层面普遍遵循一图一问、单图闭环范式:单张样本图像只包含独立单一图表(单柱状图、单折线图、单饼图等),全部作答所需数据、图例、统计数值均被收拢在同一张图表内,模型仅需要通过定位图表坐标,读取标注数字,而后完成加减运算或..." + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 29, + "author": "Methani", + "year": "2020", + "text": "Methani et al. 2020)", + "key": "methani:2020", + "context": "现有主流图表视觉问答数据集,如 ChartQA (Masry et al. 2022)、FigureQA (Kahou et al. 2017)、PlotQA (Methani et al. 
2020) 等,在数据设计层面普遍遵循一图一问、单图闭环范式:单张样本图像只包含独立单一图表(单柱状图、单折线图、单饼图等),全部作答所需数据、图例、统计数值均被收拢在同一张图表内,模型仅需要通过定位图表坐标,读取标注数字,而后完成加减运算或..." + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 29, + "author": "Kafle", + "year": "2018", + "text": "Kafle et al. 2018", + "key": "kafle:2018", + "context": "现有主流图表视觉问答数据集,如 ChartQA (Masry et al. 2022)、FigureQA (Kahou et al. 2017)、PlotQA (Methani et al. 2020) 等,在数据设计层面普遍遵循一图一问、单图闭环范式:单张样本图像只包含独立单一图表(单柱状图、单折线图、单饼图等),全部作答所需数据、图例、统计数值均被收拢在同一张图表内,模型仅需要通过定位图表坐标,读取标注数字,而后完成加减运算或..." + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 29, + "author": "Zhu", + "year": "2025", + "text": "Zhu et al. 2025)", + "key": "zhu:2025", + "context": "现有主流图表视觉问答数据集,如 ChartQA (Masry et al. 2022)、FigureQA (Kahou et al. 2017)、PlotQA (Methani et al. 2020) 等,在数据设计层面普遍遵循一图一问、单图闭环范式:单张样本图像只包含独立单一图表(单柱状图、单折线图、单饼图等),全部作答所需数据、图例、统计数值均被收拢在同一张图表内,模型仅需要通过定位图表坐标,读取标注数字,而后完成加减运算或..." + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 31, + "author": "Masry", + "year": "2025", + "text": "Masry et al. 2025", + "key": "masry:2025", + "context": "从任务难度分层来看,单图表 VQA 任务基本止步于单步信息抽取,任务需求集中在极值查找、单类求和、单一占比计算,不存在跨视图数据联动需求 (Masry et al. 2025; Xie et al. 2026)。在实验室标准化数据集环境下,图表样式经过人工规整优化:图例排版规整、坐标轴标注无歧义、数据分区边界清晰、无附加备注文本与补充说明,数据环境经过降噪处理,和互联网、商业出版物原生信息图的制作规范存在本质区别。" + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 31, + "author": "Xie", + "year": "2026", + "text": "Xie et al. 2026)", + "key": "xie:2026", + "context": "从任务难度分层来看,单图表 VQA 任务基本止步于单步信息抽取,任务需求集中在极值查找、单类求和、单一占比计算,不存在跨视图数据联动需求 (Masry et al. 2025; Xie et al. 2026)。在实验室标准化数据集环境下,图表样式经过人工规整优化:图例排版规整、坐标轴标注无歧义、数据分区边界清晰、无附加备注文本与补充说明,数据环境经过降噪处理,和互联网、商业出版物原生信息图的制作规范存在本质区别。" + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 37, + "author": "Mathew", + "year": "2021", + "text": "Mathew et al. 
2021)", + "key": "mathew:2021", + "context": "复合信息图是多子图嵌套式可视化载体,整图为统一图片文件,内部切割为多个物理分区,每个分区承载独立类型子图表,辅以全局图例、分区注释、侧边文字说明、补充警示标注,也是多图表信息图推理数据集的样本底层形态 (Mathew et al. 2021)。对比单图表,真实复合信息图推理天然附带三类核心刚需,也是本数据集锚定的三大核心任务,本小节先从落地场景解释任务由来,第二章做标准化定义:" + }, + { + "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", + "line": 45, + "author": "Foroutan", + "year": "2025", + "text": "Foroutan et al. 2025)", + "key": "foroutan:2025", + "context": "当前全球公开的多模态图表推理基准数据集存在明显供给缺口:人工合成仿真图表数据集较多,基于网页、报刊、科普出版物抓取的原生真实复合信息图样本稀缺,多数数据集为了降低标注难度,人为拆分多子图信息图为多张独立图片,破坏原图的空间关联与上下文逻辑 (Foroutan et al. 2025)。在此背景下,多图表信息图推理数据集立足原生真实信息图抓取,保留原图多子图同屏布局、图例全局共用、分区备注穿插的原生结构,与上述依赖网页抓取的构建方式..." + }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 269, @@ -18499,7 +18225,25 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 99, + "line": 55, + "author": "Gheshlaghi Azar", + "year": "2024", + "text": "Gheshlaghi Azar et al. 2024", + "key": "gheshlaghiazar:2024", + "context": "在这些方法中,偏好学习并不只有“二选一排序”这一种理解方式。KTO 等方法把人类反馈解释为更接近前景理论的收益/损失信号,理论分析也在尝试统一 RLHF、DPO 与更一般的人类偏好学习范式(Ethayarajh et al. 2024; Gheshlaghi Azar et al. 2024)。这提醒数据工程侧不要只保存最终标签,还要保存反馈来源、评审理由和候选组结构。" + }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 55, + "author": "Ethayarajh", + "year": "2024", + "text": "Ethayarajh et al. 2024", + "key": "ethayarajh:2024", + "context": "在这些方法中,偏好学习并不只有“二选一排序”这一种理解方式。KTO 等方法把人类反馈解释为更接近前景理论的收益/损失信号,理论分析也在尝试统一 RLHF、DPO 与更一般的人类偏好学习范式(Ethayarajh et al. 2024; Gheshlaghi Azar et al. 2024)。这提醒数据工程侧不要只保存最终标签,还要保存反馈来源、评审理由和候选组结构。" + }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 101, "author": "Grattafiori", "year": "2024", "text": "Grattafiori et al. 2024)", @@ -18508,25 +18252,43 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 100, + "line": 102, "author": "Xu", - "year": "2024", - "text": "Xu et al. 
2024)", - "key": "xu:2024", - "context": "* **Qwen2.5** 对中文、多语、多任务和合成数据路线有重要参考价值。这里需要谨慎区分:Qwen2.5 报告中的合成数据路线与 Magpie (Xu et al. 2024) 这类无种子合成方法可以并列讨论,但不应在缺少明确来源时写成“官方采用 Magpie”。" + "year": "2025", + "text": "Xu et al. 2025)", + "key": "xu:2025", + "context": "* **Qwen2.5 / Qwen3** 对中文、多语、多任务和合成数据路线有重要参考价值。这里需要谨慎区分:Qwen 系列报告中的合成数据路线与 Magpie (Xu et al. 2025) 这类无种子合成方法可以并列讨论;Qwen3 技术报告也可作为后续版本路线的公开参照(Yang et al. 2025),但不应在缺少明确来源时写成“官方采用 Magpie”。" }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 101, + "line": 102, + "author": "Yang", + "year": "2025", + "text": "Yang et al. 2025", + "key": "yang:2025", + "context": "* **Qwen2.5 / Qwen3** 对中文、多语、多任务和合成数据路线有重要参考价值。这里需要谨慎区分:Qwen 系列报告中的合成数据路线与 Magpie (Xu et al. 2025) 这类无种子合成方法可以并列讨论;Qwen3 技术报告也可作为后续版本路线的公开参照(Yang et al. 2025),但不应在缺少明确来源时写成“官方采用 Magpie”。" + }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 103, "author": "Wang", "year": "2024", "text": "Wang et al. 2024", "key": "wang:2024", - "context": "* **Nemotron-4** 和 HelpSteer2 的价值在于偏好标注颗粒度。HelpSteer2 (Wang et al. 2024b) 不只记录总体偏好,还围绕有用性、正确性、连贯性、复杂度和冗余度等维度建立打分信号,为奖励模型数据设计提供了可参考样例。" + "context": "* **Nemotron-4** 和 HelpSteer2 的价值在于偏好标注颗粒度。HelpSteer2 (Wang et al. 2024b) 不只记录总体偏好,还围绕有用性、正确性、连贯性、复杂度和冗余度等维度建立打分信号,为奖励模型数据设计提供了可参考样例。奖励模型的工程技巧还可参考 Skywork-Reward 对数据混合、训练稳定性和评测口径的总结(Liu et al. 2024b)。" }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 114, + "line": 103, + "author": "Liu", + "year": "2024", + "text": "Liu et al. 2024", + "key": "liu:2024", + "context": "* **Nemotron-4** 和 HelpSteer2 的价值在于偏好标注颗粒度。HelpSteer2 (Wang et al. 2024b) 不只记录总体偏好,还围绕有用性、正确性、连贯性、复杂度和冗余度等维度建立打分信号,为奖励模型数据设计提供了可参考样例。奖励模型的工程技巧还可参考 Skywork-Reward 对数据混合、训练稳定性和评测口径的总结(Liu et al. 2024b)。" + }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 116, "author": "Wang", "year": "2023", "text": "Wang et al. 
2023)", @@ -18535,7 +18297,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 121, + "line": 123, "author": "Xu", "year": "2024", "text": "Xu et al. 2024)", @@ -18544,7 +18306,16 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 161, + "line": 151, + "author": "Liu", + "year": "2024", + "text": "Liu et al. 2024", + "key": "liu:2024", + "context": "大型开源技术报告也说明,后训练数据配比通常和预训练、合成数据、推理数据以及安全数据共同演化,而不是单独优化某一类样本。DeepSeek-V3 报告中的混合训练与后训练描述可以作为理解这类多阶段配方的补充参照(Liu et al. 2024a)。" + }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 165, "author": "Ouyang", "year": "2022", "text": "Ouyang et al. 2022)", @@ -18553,7 +18324,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 168, + "line": 172, "author": "Rafailov", "year": "2023", "text": "Rafailov et al. 2023)", @@ -18562,7 +18333,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 175, + "line": 179, "author": "Shao", "year": "2024", "text": "Shao et al. 2024)", @@ -18571,7 +18342,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 217, + "line": 221, "author": "Lambert", "year": "2025", "text": "Lambert et al. 2025)", @@ -18580,7 +18351,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 225, + "line": 229, "author": "Zheng", "year": "2023", "text": "Zheng et al. 2023)", @@ -18589,16 +18360,97 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 248, + "line": 252, "author": "Grattafiori", "year": "2024", "text": "Grattafiori et al. 2024)", "key": "grattafiori:2024", "context": "Llama-3 (Grattafiori et al. 2024) 的后训练代表了一类高投入工业路线。与 Tülu-3 更强调公开 recipe 的可复现性不同,Llama-3 报告强调多轮 RLHF 迭代。其关键不在于单个数据集,而在于偏好采集、奖励模型更新、拒绝采样和失败样本回流之间的工程流转机制。" }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 308, + "author": "Zhou", + "year": "2023", + "text": "Zhou et al. 
2023", + "key": "zhou:2023", + "context": "在后训练语境中,污染不只意味着题面重复,还包括模型通过指令合成、偏好筛选或 judge 反馈间接学到 benchmark 的答案模式。已有研究专门提醒,不应让 LLM 在训练和反馈环节中成为“评测作弊者”(Zhou et al. 2023)。" + }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 332, + "author": "Lightman", + "year": "2024", + "text": "Lightman et al. 2024", + "key": "lightman:2024", + "context": "过程监督的代表性工作表明,逐步验证可以把“答案是否正确”拆成更细的训练信号,从而改善数学推理中的错误定位和数据筛选(Lightman et al. 2024)。" + }, + { + "file": "docs/zh/part13/ch45_posttrain_recipes.md", + "line": 347, + "author": "Singhal", + "year": "2024", + "text": "Singhal et al. 2024", + "key": "singhal:2024", + "context": "* **RM 视野狭窄:** Reward model 的训练集如果覆盖的 prompt 类型过窄,模型在线上部署后,面对复杂的用户输入,其行为会被未知的奖励漏洞强力牵引,产生荒谬的输出。长度相关偏差尤其值得单独审计,因为 RLHF 数据与 RM 分数都可能把“更长”误当作“更好”(Singhal et al. 2024)。" + }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 205, + "line": 41, + "author": "Zelikman", + "year": "2022", + "text": "Zelikman et al. 2022", + "key": "zelikman:2022", + "context": "这一传统路线背后有多条代表性技术线索:STaR 强调从模型自身推理中自举出新监督信号,Self-Refine 强调利用反馈迭代修正回答,LIMA 则提醒少量高质量对齐样本也可能带来显著行为变化(Zelikman et al. 2022; Madaan et al. 2023; Zhou et al. 2023)。这些工作共同说明,推理数据的价值不只来自链条长度,也来自能否形成可筛选、可回流的改进过程。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 41, + "author": "Madaan", + "year": "2023", + "text": "Madaan et al. 2023", + "key": "madaan:2023", + "context": "这一传统路线背后有多条代表性技术线索:STaR 强调从模型自身推理中自举出新监督信号,Self-Refine 强调利用反馈迭代修正回答,LIMA 则提醒少量高质量对齐样本也可能带来显著行为变化(Zelikman et al. 2022; Madaan et al. 2023; Zhou et al. 2023)。这些工作共同说明,推理数据的价值不只来自链条长度,也来自能否形成可筛选、可回流的改进过程。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 41, + "author": "Zhou", + "year": "2023", + "text": "Zhou et al. 2023", + "key": "zhou:2023", + "context": "这一传统路线背后有多条代表性技术线索:STaR 强调从模型自身推理中自举出新监督信号,Self-Refine 强调利用反馈迭代修正回答,LIMA 则提醒少量高质量对齐样本也可能带来显著行为变化(Zelikman et al. 2022; Madaan et al. 2023; Zhou et al. 
2023)。这些工作共同说明,推理数据的价值不只来自链条长度,也来自能否形成可筛选、可回流的改进过程。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 92, + "author": "Touvron", + "year": "2023", + "text": "Touvron et al. 2023", + "key": "touvron:2023", + "context": "如果冷启动样本来自已有开源聊天模型或基础模型,还应注意不同模型族的许可、数据披露程度和对齐阶段差异。Llama 2 这类开放基础模型报告提供了理解模型卡、许可边界与微调数据透明度的早期参照(Touvron et al. 2023)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 189, + "author": "Zheng", + "year": "2023", + "text": "Zheng et al. 2023", + "key": "zheng:2023", + "context": "LLM-as-Judge 的工程使用需要单独校准。MT-Bench 和 Chatbot Arena 的经验显示,模型评审可以提高开放式评测吞吐,但也会引入位置偏差、长度偏差和模型家族偏好;奖励模型过度优化研究也说明,持续追逐单一 reward 可能让策略偏离真实质量目标(Zheng et al. 2023; Gao et al. 2023)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 189, + "author": "Gao", + "year": "2023", + "text": "Gao et al. 2023", + "key": "gao:2023", + "context": "LLM-as-Judge 的工程使用需要单独校准。MT-Bench 和 Chatbot Arena 的经验显示,模型评审可以提高开放式评测吞吐,但也会引入位置偏差、长度偏差和模型家族偏好;奖励模型过度优化研究也说明,持续追逐单一 reward 可能让策略偏离真实质量目标(Zheng et al. 2023; Gao et al. 2023)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 211, "author": "Meurer", "year": "2017", "text": "Meurer et al. 2017)", @@ -18607,7 +18459,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 241, + "line": 247, "author": "Cobbe", "year": "2021", "text": "Cobbe et al. 2021)", @@ -18616,7 +18468,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 241, + "line": 247, "author": "Hendrycks", "year": "2021", "text": "Hendrycks et al. 2021)", @@ -18625,7 +18477,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 241, + "line": 247, "author": "Chen", "year": "2021", "text": "Chen et al. 2021)", @@ -18634,7 +18486,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 270, + "line": 276, "author": "Guo", "year": "2025", "text": "Guo et al. 
2025)", @@ -18643,7 +18495,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 286, + "line": 292, "author": "Qwen Team", "year": "2025", "text": "Qwen Team 2025)", @@ -18652,7 +18504,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 296, + "line": 302, "author": "Kimi Team", "year": "2025", "text": "Kimi Team 2025)", @@ -18661,13 +18513,76 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 306, + "line": 312, "author": "Guha", "year": "2025", "text": "Guha et al. 2025)", "key": "guha:2025", "context": "OpenThoughts-114K (Guha et al. 2025) 是开源社区中重要的推理数据集之一,Hugging Face 数据集卡显示其以 Apache-2.0 许可证发布,并提供 parquet 格式数据 [D]。它的价值在于提供了可下载、可检查、可用于训练的 Long-CoT 样本,使研究者能够复现实验并研究推理数据配方。" }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 354, + "author": "Ott", + "year": "2023", + "text": "Ott et al. 2023", + "key": "ott:2023", + "context": "更早的 ThoughtSource 也把推理数据整理成可复用中心库,展示了把不同任务、不同推理格式和不同来源统一登记的必要性;这类资源适合作为推理样本 schema 和来源治理的参考,而不是简单拼接成训练集(Ott et al. 2023)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 370, + "author": "Hsieh", + "year": "2023", + "text": "Hsieh et al. 2023", + "key": "hsieh:2023", + "context": "蒸馏式推理数据还需要额外记录 teacher、rationale 和最终答案之间的关系。Distilling Step-by-Step 表明,中间解释可以帮助小模型学习,但前提是解释质量、答案正确性和训练目标之间保持一致(Hsieh et al. 2023)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 426, + "author": "Hosseini", + "year": "2024", + "text": "Hosseini et al. 2024", + "key": "hosseini:2024", + "context": "训练 verifier 本身也可以成为数据工程对象。V-STaR 这类工作展示了用自生成轨迹训练验证器的路线,说明 verifier 池既需要规则回归测试,也需要持续积累可判别的正负样本(Hosseini et al. 2024)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 428, + "author": "Lightman", + "year": "2024", + "text": "Lightman et al. 2024", + "key": "lightman:2024", + "context": "过程级验证也可以借鉴 “Let's Verify Step by Step” 的思路,把整题正确性拆到中间步骤,让数据管线能定位哪一步开始偏离,而不是只在最终答案上给出二元通过/失败(Lightman et al. 
2024)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 499, + "author": "Shi", + "year": "2022", + "text": "Shi et al. 2022", + "key": "shi:2022", + "context": "多语言推理数据尤其需要单独统计语言与任务类型。已有多语言 CoT 研究说明,跨语言推理能力并不总是随英文 CoT 自动迁移,因此语言混杂既是输出质量问题,也是数据分布问题(Shi et al. 2022)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 519, + "author": "Jaech", + "year": "2024", + "text": "Jaech et al. 2024", + "key": "jaech:2024", + "context": "闭源或系统卡披露的推理模型可以帮助团队理解能力边界,但不应被当作可直接复制的数据来源。以 o1 系统卡为例,其价值主要在于安全边界、评估维度和系统行为说明,而不是提供可复用训练轨迹(Jaech et al. 2024)。" + }, + { + "file": "docs/zh/part13/ch46_rl_reasoning_data.md", + "line": 557, + "author": "Patil", + "year": "2024", + "text": "Patil et al. 2024", + "key": "patil:2024", + "context": "工具调用任务还可以借鉴 Gorilla 这类 API 连接型模型的经验:当任务目标是选择工具、填写参数并解释返回结果时,数据不仅要记录自然语言答案,还要记录 API 文档版本、调用参数、执行结果和错误恢复路径(Patil et al. 2024)。" + }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 53, @@ -19154,6 +19069,15 @@ "key": "hendrycks:2021", "context": "`sampler.py` 在这一层负责从 GSM8K、MATH 和 MBPP 构造两类种子:数学任务抽取最终答案并拆成参考步骤,代码任务保留 `reference_code`、`test_setup_code` 与 `test_list`,最后统一落到 `seed_pool.jsonl` 与 `task_spec.json`。MATH 数据集的题目结构为多步推理和过程验证提供了典型来源 (Hendrycks et al. 2021..." }, + { + "file": "docs/zh/part14/p06_prm.md", + "line": 995, + "author": "DeepSeek-AI", + "year": "2025", + "text": "DeepSeek-AI 2025)", + "key": "deepseekai:2025", + "context": "只要把“step”的定义从文本推理扩展到动作与状态,PRM 数据工厂的方法就有机会迁移到更复杂场景,相关交互模式可参考 ReAct 的推理-行动协同范式 (Yao et al. 2023)。若继续走向强化学习式推理飞轮,则应明确区分本章的过程监督数据构造与 DeepSeek-R1 所代表的大规模推理强化学习系统 (DeepSeek-AI 2025)。" + }, { "file": "docs/zh/part14/p06_prm.md", "line": 995, @@ -19242,16 +19166,16 @@ "year": "2020", "text": "Kaplan et al. 2020)", "key": "kaplan:2020", - "context": "在预训练数据工程中,“按比例缩放(Scaling Laws)”(Kaplan et al. 2020) 不仅适用于模型参数,同样适用于数据配方的实验与验证。我们在前作 项目 1(Mini-C4)中,已经走通了单源语料的清洗流水线;但真实的工业级大模型(如 DeepSeek-V3 (Liu et al. 
2024))从来不是在单一语料上训练出来的,而是由网页、代码、数学、学术论文等多种数据源精确混合而成。" + "context": "在预训练数据工程中,“按比例缩放(Scaling Laws)”(Kaplan et al. 2020) 不仅适用于模型参数,同样适用于数据配方的实验与验证。我们在前作 项目 1(Mini-C4)中,已经走通了单源语料的清洗流水线;但真实的工业级大模型(如 DeepSeek-V3 (DeepSeek-AI et al. 2024))从来不是在单一语料上训练出来的,而是由网页、代码、数学、学术论文等多种数据源精确混合而成。" }, { "file": "docs/zh/part14/p11_mini_deepseek.md", "line": 65, - "author": "Liu", + "author": "DeepSeek-AI", "year": "2024", - "text": "Liu et al. 2024)", - "key": "liu:2024", - "context": "在预训练数据工程中,“按比例缩放(Scaling Laws)”(Kaplan et al. 2020) 不仅适用于模型参数,同样适用于数据配方的实验与验证。我们在前作 项目 1(Mini-C4)中,已经走通了单源语料的清洗流水线;但真实的工业级大模型(如 DeepSeek-V3 (Liu et al. 2024))从来不是在单一语料上训练出来的,而是由网页、代码、数学、学术论文等多种数据源精确混合而成。" + "text": "DeepSeek-AI et al. 2024)", + "key": "deepseekai:2024", + "context": "在预训练数据工程中,“按比例缩放(Scaling Laws)”(Kaplan et al. 2020) 不仅适用于模型参数,同样适用于数据配方的实验与验证。我们在前作 项目 1(Mini-C4)中,已经走通了单源语料的清洗流水线;但真实的工业级大模型(如 DeepSeek-V3 (DeepSeek-AI et al. 2024))从来不是在单一语料上训练出来的,而是由网页、代码、数学、学术论文等多种数据源精确混合而成。" }, { "file": "docs/zh/part14/p11_mini_deepseek.md", @@ -19292,11 +19216,11 @@ { "file": "docs/zh/part14/p11_mini_deepseek.md", "line": 174, - "author": "Liu", + "author": "DeepSeek-AI", "year": "2024", - "text": "Liu et al. 2024)", - "key": "liu:2024", - "context": "DeepSeek-V3 (Liu et al. 2024) 采用了一个规模为 150K 左右的超大词表(相较于 Llama-2 的 32K 提升巨大),这使其在处理中文与代码时效率极高。在此步骤,我们将以混合且去重后的数据训练 BPE Tokenizer。" + "text": "DeepSeek-AI et al. 2024)", + "key": "deepseekai:2024", + "context": "DeepSeek-V3 (DeepSeek-AI et al. 2024) 采用了一个规模为 150K 左右的超大词表(相较于 Llama-2 的 32K 提升巨大),这使其在处理中文与代码时效率极高。在此步骤,我们将以混合且去重后的数据训练 BPE Tokenizer。" }, { "file": "docs/zh/part14/p11_mini_deepseek.md", @@ -19626,10 +19550,10 @@ "file": "docs/zh/part2/ch05_cleaning_dedup.md", "line": 414, "author": "Honnibal", - "year": "2020", - "text": "Honnibal et al. 2020)", - "key": "honnibal:2020", - "context": "**命名实体识别(NER)模型**则覆盖规则难以枚举的 PII 类型,如真实人名、地址和机构名。推荐使用 spaCy (Honnibal et al. 
2020) 的中文模型(`zh_core_web_trf`)或 HuggingFace 上开源的中文 NER 模型,对人名(PER)、地点(LOC)、机构(ORG)等命名实体进行识别,再根据上下文判断是否需要脱敏。" + "year": "2023", + "text": "Honnibal et al. 2023)", + "key": "honnibal:2023", + "context": "**命名实体识别(NER)模型**则覆盖规则难以枚举的 PII 类型,如真实人名、地址和机构名。推荐使用 spaCy (Honnibal et al. 2023) 的中文模型(`zh_core_web_trf`)或 HuggingFace 上开源的中文 NER 模型,对人名(PER)、地点(LOC)、机构(ORG)等命名实体进行识别,再根据上下文判断是否需要脱敏。" }, { "file": "docs/zh/part2/ch05_cleaning_dedup.md", @@ -27914,6 +27838,130 @@ "fingerprint": "vllm documentation", "format_issues": [] }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 399, + "entry_no": 2, + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723.", + "first_author": "Gebru", + "year": "2021", + "title": "Datasheets for Datasets", + "doi": "10.1145/3458723", + "arxiv": "", + "url": "https://doi.org/10.1145/3458723", + "key": "gebru:2021", + "fingerprint": "datasheets for datasets", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 407, + "entry_no": 6, + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. 
https://doi.org/10.1145/3287560.3287596.", + "first_author": "Mitchell", + "year": "2019", + "title": "Model Cards for Model Reporting", + "doi": "10.1145/3287560.3287596", + "arxiv": "", + "url": "https://doi.org/10.1145/3287560.3287596", + "key": "mitchell:2019", + "fingerprint": "model cards for model reporting", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 409, + "entry_no": 7, + "entry": "Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231.", + "first_author": "Pushkarna", + "year": "2022", + "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI", + "doi": "10.1145/3531146.3533231", + "arxiv": "", + "url": "https://doi.org/10.1145/3531146.3533231", + "key": "pushkarna:2022", + "fingerprint": "data cards purposeful and transparent dataset documentation for responsible ai", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 411, + "entry_no": 8, + "entry": "Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. 
In: Advances in Neural Information Processing Systems 28.", + "first_author": "Sculley", + "year": "2015", + "title": "Hidden Technical Debt in Machine Learning Systems", + "doi": "", + "arxiv": "", + "url": "", + "key": "sculley:2015", + "fingerprint": "hidden technical debt in machine learning systems", + "format_issues": [ + "missing-doi-arxiv-url" + ] + }, + { + "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", + "line": 449, + "entry_no": 2, + "entry": "Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction. In: IEEE International Conference on Big Data, pp 1123-1132.", + "first_author": "Breck", + "year": "2017", + "title": "The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction", + "doi": "", + "arxiv": "", + "url": "", + "key": "breck:2017", + "fingerprint": "the ml test score a rubric for ml production readiness and technical debt reduction", + "format_issues": [ + "missing-doi-arxiv-url" + ] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 403, + "entry_no": 2, + "entry": "Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723.", + "first_author": "Gebru", + "year": "2021", + "title": "Datasheets for Datasets", + "doi": "10.1145/3458723", + "arxiv": "", + "url": "https://doi.org/10.1145/3458723", + "key": "gebru:2021", + "fingerprint": "datasheets for datasets", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 407, + "entry_no": 4, + "entry": "Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, pp 220-229. 
https://doi.org/10.1145/3287560.3287596.", + "first_author": "Mitchell", + "year": "2019", + "title": "Model Cards for Model Reporting", + "doi": "10.1145/3287560.3287596", + "arxiv": "", + "url": "https://doi.org/10.1145/3287560.3287596", + "key": "mitchell:2019", + "fingerprint": "model cards for model reporting", + "format_issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 409, + "entry_no": 5, + "entry": "Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, pp 1776-1826. https://doi.org/10.1145/3531146.3533231.", + "first_author": "Pushkarna", + "year": "2022", + "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI", + "doi": "10.1145/3531146.3533231", + "arxiv": "", + "url": "https://doi.org/10.1145/3531146.3533231", + "key": "pushkarna:2022", + "fingerprint": "data cards purposeful and transparent dataset documentation for responsible ai", + "format_issues": [] + }, { "file": "docs/zh/appendix_g_datagallery_note.md", "line": 67, @@ -28042,35 +28090,31 @@ "file": "docs/zh/part1/ch02_quality_framework.md", "line": 507, "entry_no": 1, - "entry": "Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46.", + "entry": "Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. 
https://doi.org/10.1177/001316446002000104.", "first_author": "Cohen", "year": "1960", "title": "A Coefficient of Agreement for Nominal Scales", - "doi": "", + "doi": "10.1177/001316446002000104", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1177/001316446002000104", "key": "cohen:1960", "fingerprint": "a coefficient of agreement for nominal scales", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch03_data_stack.md", "line": 345, "entry_no": 3, - "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29.", + "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900.", "first_author": "Broder", "year": "1997", "title": "On the Resemblance and Containment of Documents", - "doi": "", + "doi": "10.1109/sequen.1997.666900", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/sequen.1997.666900", "key": "broder:1997", "fingerprint": "on the resemblance and containment of documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part1/ch03_data_stack.md", @@ -28123,18 +28167,16 @@ "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", "line": 420, "entry_no": 1, - "entry": "Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131.", + "entry": "Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. 
https://doi.org/10.18653/v1/2021.acl-demo.15.", "first_author": "Barbaresi", "year": "2021", "title": "Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction", - "doi": "", + "doi": "10.18653/v1/2021.acl-demo.15", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2021.acl-demo.15", "key": "barbaresi:2021", "fingerprint": "trafilatura a web scraping library and command line tool for text discovery and extraction", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part10/ch32_auto_collection_parsing_cleaning.md", @@ -28174,18 +28216,16 @@ "file": "docs/zh/part10/ch34_dataops_agent.md", "line": 495, "entry_no": 13, - "entry": "Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23.", + "entry": "Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. https://doi.org/10.1109/synasc51798.2020.00015.", "first_author": "Tamburri", "year": "2020", "title": "Sustainable MLOps: Trends and Challenges", - "doi": "", + "doi": "10.1109/synasc51798.2020.00015", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/synasc51798.2020.00015", "key": "tamburri:2020", "fingerprint": "sustainable mlops trends and challenges", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", @@ -28208,35 +28248,31 @@ "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1194, "entry_no": 8, - "entry": "Hoepman J-H (2014) Privacy Design Strategies. In IFIP International Information Security Conference, pp 446-459.", + "entry": "Hoepman J-H (2014) Privacy Design Strategies. 
In IFIP International Information Security Conference, pp 446-459. https://doi.org/10.1007/978-3-642-55415-5_38.", "first_author": "Hoepman", "year": "2014", "title": "Privacy Design Strategies", - "doi": "", + "doi": "10.1007/978-3-642-55415-5_38", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-642-55415-5_38", "key": "hoepman:2014", "fingerprint": "privacy design strategies", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch36_compliance_framework_and_governance.md", "line": 1198, "entry_no": 10, - "entry": "Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19.", + "entry": "Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. https://doi.org/10.1007/978-3-540-79228-4_1.", "first_author": "Dwork", "year": "2008", "title": "Differential Privacy: A Survey of Results", - "doi": "", + "doi": "10.1007/978-3-540-79228-4_1", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1007/978-3-540-79228-4_1", "key": "dwork:2008", "fingerprint": "differential privacy a survey of results", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md", @@ -28257,7 +28293,22 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 598, + "line": 597, + "entry_no": 1, + "entry": "Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Benchmarks Track. 
https://arxiv.org/abs/2406.17557.", + "first_author": "Penedo", + "year": "2024", + "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale", + "doi": "", + "arxiv": "", + "url": "https://arxiv.org/abs/2406.17557", + "key": "penedo:2024", + "fingerprint": "the fineweb datasets decanting the web for the finest text data at scale", + "format_issues": [] + }, + { + "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", + "line": 599, "entry_no": 2, "entry": "Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb.", "first_author": "Hugging", @@ -28272,7 +28323,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 599, + "line": 601, "entry_no": 3, "entry": "Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py.", "first_author": "Hugging", @@ -28287,7 +28338,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 600, + "line": 603, "entry_no": 4, "entry": "Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove.", "first_author": "Penedo", @@ -28302,7 +28353,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 601, + "line": 605, "entry_no": 5, "entry": "Luccioni, S., & Viviano, J. (2021). What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732.", "first_author": "Luccioni", @@ -28317,7 +28368,22 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 604, + "line": 607, + "entry_no": 6, + "entry": "Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. 
https://arxiv.org/abs/2402.00159.", + "first_author": "Soldaini", + "year": "2024", + "title": "Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research", + "doi": "", + "arxiv": "", + "url": "https://arxiv.org/abs/2402.00159", + "key": "soldaini:2024", + "fingerprint": "dolma an open corpus of three trillion tokens for language model pretraining research", + "format_issues": [] + }, + { + "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", + "line": 609, "entry_no": 7, "entry": "Allen Institute for AI. (2023). Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64.", "first_author": "Allen", @@ -28332,7 +28398,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 605, + "line": 611, "entry_no": 8, "entry": "AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma.", "first_author": "AllenAI.", @@ -28347,7 +28413,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 606, + "line": 613, "entry_no": 9, "entry": "AllenAI. (2026). Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma.", "first_author": "AllenAI.", @@ -28362,7 +28428,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 607, + "line": 615, "entry_no": 10, "entry": "AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md.", "first_author": "AllenAI.", @@ -28377,7 +28443,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 608, + "line": 617, "entry_no": 11, "entry": "Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). OLMo: Accelerating the Science of Language Models. 
https://arxiv.org/abs/2402.00838.", "first_author": "Groeneveld", @@ -28407,7 +28473,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 265, + "line": 266, "entry_no": 2, "entry": "LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. https://laion.ai/blog/laion-5b/.", "first_author": "LAION.", @@ -28422,7 +28488,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 266, + "line": 268, "entry_no": 3, "entry": "LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec.", "first_author": "LAION-AI.", @@ -28437,7 +28503,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 267, + "line": 270, "entry_no": 4, "entry": "Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org/abs/2304.14108.", "first_author": "Gadre", @@ -28452,7 +28518,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 268, + "line": 272, "entry_no": 5, "entry": "DataComp Team. (2026). DataComp Benchmark Documentation. https://www.datacomp.ai/dcclip/.", "first_author": "DataComp", @@ -28467,7 +28533,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 269, + "line": 274, "entry_no": 6, "entry": "ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp.", "first_author": "ML", @@ -28629,18 +28695,16 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 746, "entry_no": 10, - "entry": "Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*.", + "entry": "Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. 
https://doi.org/10.1109/wacv48630.2021.00225.", "first_author": "Mathew", "year": "2021", "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", + "doi": "10.1109/wacv48630.2021.00225", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/wacv48630.2021.00225", "key": "mathew:2021", "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", @@ -28680,18 +28744,16 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 752, "entry_no": 13, - "entry": "Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*.", + "entry": "Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. arXiv:2305.18290.", "first_author": "Rafailov", "year": "2024", - "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model", + "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Proc. NeurIPS", "doi": "", - "arxiv": "", + "arxiv": "2305.18290", "url": "", "key": "rafailov:2024", - "fingerprint": "direct preference optimization your language model is secretly a reward model", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "direct preference optimization your language model is secretly a reward model proc neurips", + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", @@ -28891,7 +28953,7 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 778, "entry_no": 26, - "entry": "Hunyuan Vision Team, Lyu, P., Wan, X., et al. (2025). HunyuanOCR Technical Report. 
*arXiv preprint*.", + "entry": "Hunyuan Vision Team (2025). HunyuanOCR Technical Report. *arXiv preprint*.", "first_author": "Hunyuan", "year": "2025", "title": "HunyuanOCR Technical Report", @@ -28987,35 +29049,31 @@ "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 790, "entry_no": 32, - "entry": "Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*.", + "entry": "Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. https://doi.org/10.1109/cvpr52688.2022.00459.", "first_author": "Smock", "year": "2022", "title": "PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents", - "doi": "", + "doi": "10.1109/cvpr52688.2022.00459", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52688.2022.00459", "key": "smock:2022", "fingerprint": "pubtables 1m towards comprehensive table extraction from unstructured documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", "line": 792, "entry_no": 33, - "entry": "Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*.", + "entry": "Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. 
https://doi.org/10.18653/v1/2021.acl-long.254.", "first_author": "Zhu", "year": "2021", "title": "TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance", - "doi": "", + "doi": "10.18653/v1/2021.acl-long.254", "arxiv": "", - "url": "", + "url": "https://doi.org/10.18653/v1/2021.acl-long.254", "key": "zhu:2021", "fingerprint": "tat qa a question answering benchmark on a hybrid of tabular and textual content in finance", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch40_visual_document_table_data_engineering.md", @@ -29047,157 +29105,6 @@ "fingerprint": "apache arrow documentation", "format_issues": [] }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 827, - "entry_no": 1, - "entry": "Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022.", - "first_author": "Masry", - "year": "2022", - "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning", - "doi": "", - "arxiv": "", - "url": "", - "key": "masry:2022", - "fingerprint": "chartqa a benchmark for question answering about charts with visual and logical reasoning", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 829, - "entry_no": 2, - "entry": "Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. 
WACV 2020.", - "first_author": "Methani", - "year": "2020", - "title": "PlotQA: Reasoning over Scientific Plots", - "doi": "", - "arxiv": "", - "url": "", - "key": "methani:2020", - "fingerprint": "plotqa reasoning over scientific plots", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 831, - "entry_no": 3, - "entry": "Kahou, S. E., Michalski, V., Atkinson, A., Kádár, Á., Trischler, A., & Bengio, Y. (2017). FigureQA: An Annotated Figure Dataset for Visual Reasoning. arXiv:1710.07300.", - "first_author": "Kahou", - "year": "2017", - "title": "FigureQA: An Annotated Figure Dataset for Visual Reasoning", - "doi": "", - "arxiv": "1710.07300", - "url": "", - "key": "kahou:2017", - "fingerprint": "figureqa an annotated figure dataset for visual reasoning", - "format_issues": [] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 833, - "entry_no": 4, - "entry": "Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018.", - "first_author": "Kafle", - "year": "2018", - "title": "DVQA: Understanding Data Visualizations via Question Answering", - "doi": "", - "arxiv": "", - "url": "", - "key": "kafle:2018", - "fingerprint": "dvqa understanding data visualizations via question answering", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 835, - "entry_no": 5, - "entry": "Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. 
WACV 2021.", - "first_author": "Mathew", - "year": "2021", - "title": "DocVQA: A Dataset for VQA on Document Images", - "doi": "", - "arxiv": "", - "url": "", - "key": "mathew:2021", - "fingerprint": "docvqa a dataset for vqa on document images", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 837, - "entry_no": 6, - "entry": "Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). Chartqapro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 19123-19151).", - "first_author": "Masry", - "year": "2025", - "title": "Chartqapro: A more diverse and challenging benchmark for chart question answering", - "doi": "", - "arxiv": "", - "url": "", - "key": "masry:2025", - "fingerprint": "chartqapro a more diverse and challenging benchmark for chart question answering", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 839, - "entry_no": 7, - "entry": "Xie, T., Lin, M., Liu, M., Ye, Y., Chen, C., & Liu, S. (2026). Infochartqa: A benchmark for multimodal question answering on infographic charts. Advances in Neural Information Processing Systems, 38.", - "first_author": "Xie", - "year": "2026", - "title": "Infochartqa: A benchmark for multimodal question answering on infographic charts", - "doi": "", - "arxiv": "", - "url": "", - "key": "xie:2026", - "fingerprint": "infochartqa a benchmark for multimodal question answering on infographic charts", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 841, - "entry_no": 8, - "entry": "Foroutan, N., Romanou, A., Ansaripour, M., Eisenschlos, J. M., Aberer, K., & Lebret, R. (2025, July). 
Wikimixqa: a multimodal benchmark for question answering over tables and charts. In Findings of the Association for Computational Linguistics: ACL 2025 (pp. 24941-24958).", - "first_author": "Foroutan", - "year": "2025", - "title": "Wikimixqa: a multimodal benchmark for question answering over tables and charts", - "doi": "", - "arxiv": "", - "url": "", - "key": "foroutan:2025", - "fingerprint": "wikimixqa a multimodal benchmark for question answering over tables and charts", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", - "line": 843, - "entry_no": 9, - "entry": "Zhu, Z., Jia, M., Zhang, Z., Li, L., & Jiang, M. (2025, April). MultiChartQA: Benchmarking vision-language models on multi-chart problems. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers) (pp. 11341-11359).", - "first_author": "Zhu", - "year": "2025", - "title": "MultiChartQA: Benchmarking vision-language models on multi-chart problems", - "doi": "", - "arxiv": "", - "url": "", - "key": "zhu:2025", - "fingerprint": "multichartqa benchmarking vision language models on multi chart problems", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, { "file": "docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md", "line": 845, @@ -29386,18 +29293,16 @@ "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", "line": 452, "entry_no": 1, - "entry": "1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022.", + "entry": "1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. 
arXiv:2201.11903.", "first_author": "Wei", "year": "2022", - "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", + "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022", "doi": "", - "arxiv": "", + "arxiv": "2201.11903", "url": "", "key": "wei:2022", - "fingerprint": "chain of thought prompting elicits reasoning in large language models", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "chain of thought prompting elicits reasoning in large language models neurips 2022", + "format_issues": [] }, { "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", @@ -29433,35 +29338,31 @@ "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", "line": 455, "entry_no": 4, - "entry": "4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.", - "first_author": "DeepSeek-AI.", + "entry": "4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948.", + "first_author": "DeepSeek-AI", "year": "2025", "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", "doi": "", - "arxiv": "", + "arxiv": "2501.12948", "url": "", "key": "deepseekai:2025", "fingerprint": "deepseek r1 incentivizing reasoning capability in llms via reinforcement learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md", "line": 456, "entry_no": 5, - "entry": "5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021.", + "entry": "5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). 
Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874.", "first_author": "Hendrycks", "year": "2021", - "title": "Measuring Mathematical Problem Solving With the MATH Dataset", + "title": "Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021", "doi": "", - "arxiv": "", + "arxiv": "2103.03874", "url": "", "key": "hendrycks:2021", - "fingerprint": "measuring mathematical problem solving with the math dataset", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "fingerprint": "measuring mathematical problem solving with the math dataset neurips 2021", + "format_issues": [] }, { "file": "docs/zh/part13/ch44_pretrain_recipes.md", @@ -29527,364 +29428,6 @@ "missing-doi-arxiv-url" ] }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 381, - "entry_no": 4, - "entry": "Ethayarajh K, Xu W, Muennighoff N, Jurafsky D, Kiela D (2024) Model Alignment as Prospect Theoretic Optimization. Proceedings of the 41st International Conference on Machine Learning, pp 12634-12651.", - "first_author": "Ethayarajh", - "year": "2024", - "title": "Model Alignment as Prospect Theoretic Optimization", - "doi": "", - "arxiv": "", - "url": "", - "key": "ethayarajh:2024", - "fingerprint": "model alignment as prospect theoretic optimization", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 383, - "entry_no": 5, - "entry": "Gheshlaghi Azar M, Guo Z D, Piot B, Munos R, Rowland M, Valko M, Calandriello D (2024) A General Theoretical Paradigm to Understand Learning from Human Preferences. 
Proceedings of the 27th International Conference on Artificial Intelligence and Statistics, pp 4447-4455.", - "first_author": "Gheshlaghi", - "year": "2024", - "title": "A General Theoretical Paradigm to Understand Learning from Human Preferences", - "doi": "", - "arxiv": "", - "url": "", - "key": "gheshlaghi:2024", - "fingerprint": "a general theoretical paradigm to understand learning from human preferences", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 389, - "entry_no": 8, - "entry": "Yang A, Li A, Yang B, Zhang B, Hui B, Zheng B, Yu B, Gao C, Huang C, Lv C, others (2025) Qwen3 Technical Report. arXiv preprint arXiv:2505.09388.", - "first_author": "Yang", - "year": "2025", - "title": "Qwen3 Technical Report", - "doi": "", - "arxiv": "2505.09388", - "url": "", - "key": "yang:2025", - "fingerprint": "qwen3 technical report", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 395, - "entry_no": 11, - "entry": "Xu Z, Jiang F, Niu L, Deng Y, Poovendran R, Choi Y, Lin B Y (2025) Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing. International Conference on Learning Representations.", - "first_author": "Xu", - "year": "2025", - "title": "Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing", - "doi": "", - "arxiv": "", - "url": "", - "key": "xu:2025", - "fingerprint": "magpie alignment data synthesis from scratch by prompting aligned llms with nothing", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 397, - "entry_no": 12, - "entry": "Liu A, Feng B, Xue B, Wang B, Wu B, Lu C, Zhao C, Deng C, Zhang C, Ruan C, others (2024a) DeepSeek-V3 Technical Report. 
arXiv preprint arXiv:2412.19437.", - "first_author": "Liu", - "year": "2024", - "title": "DeepSeek-V3 Technical Report", - "doi": "", - "arxiv": "2412.19437", - "url": "", - "key": "liu:2024", - "fingerprint": "deepseek v3 technical report", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 399, - "entry_no": 13, - "entry": "Liu C Y, Zeng L, Liu J, Yan R, He J, Wang C, Yan S, Liu Y, Zhou Y (2024b) Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs. arXiv preprint arXiv:2410.18451.", - "first_author": "Liu", - "year": "2024", - "title": "Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs", - "doi": "", - "arxiv": "2410.18451", - "url": "", - "key": "liu:2024", - "fingerprint": "skywork reward bag of tricks for reward modeling in llms", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 401, - "entry_no": 14, - "entry": "Singhal P, Goyal T, Xu J, Durrett G (2024) A Long Way to Go: Investigating Length Correlations in RLHF. First Conference on Language Modeling.", - "first_author": "Singhal", - "year": "2024", - "title": "A Long Way to Go: Investigating Length Correlations in RLHF", - "doi": "", - "arxiv": "", - "url": "", - "key": "singhal:2024", - "fingerprint": "a long way to go investigating length correlations in rlhf", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 405, - "entry_no": 16, - "entry": "Zhou K, Zhu Y, Chen Z, Chen W, Zhao W X, Chen X, Lin Y, Wen J-R, Han J (2023) Don't Make Your LLM an Evaluation Benchmark Cheater. 
arXiv preprint arXiv:2311.01964.", - "first_author": "Zhou", - "year": "2023", - "title": "Don't Make Your LLM an Evaluation Benchmark Cheater", - "doi": "", - "arxiv": "2311.01964", - "url": "", - "key": "zhou:2023", - "fingerprint": "don t make your llm an evaluation benchmark cheater", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 409, - "entry_no": 18, - "entry": "Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations.", - "first_author": "Lightman", - "year": "2024", - "title": "Let's Verify Step by Step", - "doi": "", - "arxiv": "", - "url": "", - "key": "lightman:2024", - "fingerprint": "let s verify step by step", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 575, - "entry_no": 4, - "entry": "Touvron H, Martin L, Stone K, Albert P, Almahairi A, Babaei Y, Bashlykov N, Batra S, Bhargava P, Bhosale S, others (2023) Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv preprint arXiv:2307.09288.", - "first_author": "Touvron", - "year": "2023", - "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", - "doi": "", - "arxiv": "2307.09288", - "url": "", - "key": "touvron:2023", - "fingerprint": "llama 2 open foundation and fine tuned chat models", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 587, - "entry_no": 10, - "entry": "Zhou C, Liu P, Xu P, Iyer S, Sun J, Mao Y, Ma X, Efrat A, Yu P, Yu L, Zhang S, Ghosh G, Lewis M, Zettlemoyer L, Levy O (2023) LIMA: Less Is More for Alignment. 
Advances in Neural Information Processing Systems, 36, 55006-55021.", - "first_author": "Zhou", - "year": "2023", - "title": "LIMA: Less Is More for Alignment", - "doi": "", - "arxiv": "", - "url": "", - "key": "zhou:2023", - "fingerprint": "lima less is more for alignment", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 589, - "entry_no": 11, - "entry": "Zelikman E, Wu Y, Mu J, Goodman N (2022) STaR: Bootstrapping Reasoning with Reasoning. Advances in Neural Information Processing Systems, 35, 15476-15488.", - "first_author": "Zelikman", - "year": "2022", - "title": "STaR: Bootstrapping Reasoning with Reasoning", - "doi": "", - "arxiv": "", - "url": "", - "key": "zelikman:2022", - "fingerprint": "star bootstrapping reasoning with reasoning", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 591, - "entry_no": 12, - "entry": "Madaan A, Tandon N, Gupta P, Hallinan S, Gao L, Wiegreffe S, Alon U, Dziri N, Prabhumoye S, Yang Y, Gupta S, Majumder B P, Hermann K, Welleck S, Yazdanbakhsh A, Clark P (2023) Self-Refine: Iterative Refinement with Self-Feedback. Advances in Neural Information Processing Systems, 36, 46534-46594.", - "first_author": "Madaan", - "year": "2023", - "title": "Self-Refine: Iterative Refinement with Self-Feedback", - "doi": "", - "arxiv": "", - "url": "", - "key": "madaan:2023", - "fingerprint": "self refine iterative refinement with self feedback", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 593, - "entry_no": 13, - "entry": "Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. 
International Conference on Learning Representations.", - "first_author": "Lightman", - "year": "2024", - "title": "Let's Verify Step by Step", - "doi": "", - "arxiv": "", - "url": "", - "key": "lightman:2024", - "fingerprint": "let s verify step by step", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 595, - "entry_no": 14, - "entry": "Zheng L, Chiang W-L, Sheng Y, Zhuang S, Wu Z, Zhuang Y, Lin Z, Li Z, Li D, Xing E, Zhang H, Gonzalez J, Stoica I (2023) Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. Advances in Neural Information Processing Systems, 36, 46595-46623.", - "first_author": "Zheng", - "year": "2023", - "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", - "doi": "", - "arxiv": "", - "url": "", - "key": "zheng:2023", - "fingerprint": "judging llm as a judge with mt bench and chatbot arena", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 597, - "entry_no": 15, - "entry": "Gao L, Schulman J, Hilton J (2023) Scaling Laws for Reward Model Overoptimization. Proceedings of the 40th International Conference on Machine Learning, pp 10835-10866.", - "first_author": "Gao", - "year": "2023", - "title": "Scaling Laws for Reward Model Overoptimization", - "doi": "", - "arxiv": "", - "url": "", - "key": "gao:2023", - "fingerprint": "scaling laws for reward model overoptimization", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 599, - "entry_no": 16, - "entry": "Hosseini A, Yuan X, Malkin N, Courville A, Sordoni A, Agarwal R (2024) V-STaR: Training Verifiers for Self-Taught Reasoners. 
arXiv preprint arXiv:2402.06457.", - "first_author": "Hosseini", - "year": "2024", - "title": "V-STaR: Training Verifiers for Self-Taught Reasoners", - "doi": "", - "arxiv": "2402.06457", - "url": "", - "key": "hosseini:2024", - "fingerprint": "v star training verifiers for self taught reasoners", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 601, - "entry_no": 17, - "entry": "Shi F, Suzgun M, Freitag M, Wang X, Srivats S, Vosoughi S, Chung H W, Tay Y, Ruder S, Zhou D, others (2022) Language Models Are Multilingual Chain-of-Thought Reasoners. arXiv preprint arXiv:2210.03057.", - "first_author": "Shi", - "year": "2022", - "title": "Language Models Are Multilingual Chain-of-Thought Reasoners", - "doi": "", - "arxiv": "2210.03057", - "url": "", - "key": "shi:2022", - "fingerprint": "language models are multilingual chain of thought reasoners", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 603, - "entry_no": 18, - "entry": "Jaech A, Kalai A, Lerer A, Richardson A, El-Kishky A, Low A, Helyar A, Madry A, Beutel A, Carney A, others (2024) OpenAI o1 System Card. arXiv preprint arXiv:2412.16720.", - "first_author": "Jaech", - "year": "2024", - "title": "OpenAI o1 System Card", - "doi": "", - "arxiv": "2412.16720", - "url": "", - "key": "jaech:2024", - "fingerprint": "openai o1 system card", - "format_issues": [] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 605, - "entry_no": 19, - "entry": "Ott S, Hebenstreit K, Liévin V, others (2023) ThoughtSource: A Central Hub for Large Language Model Reasoning Data. 
Scientific Data, 10(1), 528.", - "first_author": "Ott", - "year": "2023", - "title": "ThoughtSource: A Central Hub for Large Language Model Reasoning Data", - "doi": "", - "arxiv": "", - "url": "", - "key": "ott:2023", - "fingerprint": "thoughtsource a central hub for large language model reasoning data", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 607, - "entry_no": 20, - "entry": "Hsieh C-Y, Li C-L, Yeh C-K, Nakhost H, Fujii Y, Ratner A, Krishna R, Lee C-Y, Pfister T (2023) Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes. Findings of the Association for Computational Linguistics: ACL 2023, pp 8003-8017.", - "first_author": "Hsieh", - "year": "2023", - "title": "Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes", - "doi": "", - "arxiv": "", - "url": "", - "key": "hsieh:2023", - "fingerprint": "distilling step by step outperforming larger language models with less training data and smaller model sizes", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, - { - "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 609, - "entry_no": 21, - "entry": "Patil S G, Zhang T, Wang X, Gonzalez J E (2024) Gorilla: Large Language Model Connected with Massive APIs. 
Advances in Neural Information Processing Systems, 38.", - "first_author": "Patil", - "year": "2024", - "title": "Gorilla: Large Language Model Connected with Massive APIs", - "doi": "", - "arxiv": "", - "url": "", - "key": "patil:2024", - "fingerprint": "gorilla large language model connected with massive apis", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, { "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 336, @@ -29923,18 +29466,16 @@ "file": "docs/zh/part13/ch47_vlm_data_recipes.md", "line": 350, "entry_no": 14, - "entry": "Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567.", + "entry": "Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 9556-9567. https://doi.org/10.1109/cvpr52733.2024.00913.", "first_author": "Yue", "year": "2024", "title": "MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI", - "doi": "", + "doi": "10.1109/cvpr52733.2024.00913", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/cvpr52733.2024.00913", "key": "yue:2024", "fingerprint": "mmmu a massive multi discipline multimodal understanding and reasoning benchmark for expert agi", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part13/ch48_t2i_t2v.md", @@ -30197,39 +29738,20 @@ "fingerprint": "great expectations documentation", "format_issues": [] }, - { - "file": "docs/zh/part14/p06_prm.md", - "line": 1143, - "entry_no": 4, - "entry": "4. DeepSeek-AI. (2025). 
DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning.", - "first_author": "DeepSeek-AI.", - "year": "2025", - "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", - "doi": "", - "arxiv": "", - "url": "", - "key": "deepseekai:2025", - "fingerprint": "deepseek r1 incentivizing reasoning capability in llms via reinforcement learning", - "format_issues": [ - "missing-doi-arxiv-url" - ] - }, { "file": "docs/zh/part14/p07_agent_tooluse.md", "line": 1169, "entry_no": 4, - "entry": "4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications.", + "entry": "4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/.", "first_author": "OWASP", "year": "2025", "title": "OWASP Top 10 for Large Language Model Applications", "doi": "", "arxiv": "", - "url": "", + "url": "https://genai.owasp.org/llm-top-10/", "key": "owasp:2025", "fingerprint": "owasp top 10 for large language model applications", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p07_agent_tooluse.md", @@ -30310,35 +29832,31 @@ "file": "docs/zh/part14/p09_privacy_pipeline.md", "line": 1129, "entry_no": 1, - "entry": "1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation.", + "entry": "1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation. 
https://eur-lex.europa.eu/eli/reg/2016/679/oj.", "first_author": "European", "year": "2016", "title": "Regulation (EU) 2016/679: General Data Protection Regulation", "doi": "", "arxiv": "", - "url": "", + "url": "https://eur-lex.europa.eu/eli/reg/2016/679/oj", "key": "european:2016", "fingerprint": "regulation eu 2016 679 general data protection regulation", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p09_privacy_pipeline.md", "line": 1133, "entry_no": 5, - "entry": "5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications.", + "entry": "5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/.", "first_author": "OWASP", "year": "2025", "title": "OWASP Top 10 for Large Language Model Applications", "doi": "", "arxiv": "", - "url": "", + "url": "https://genai.owasp.org/llm-top-10/", "key": "owasp:2025", "fingerprint": "owasp top 10 for large language model applications", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part14/p10_flywheel.md", @@ -30562,18 +30080,16 @@ "file": "docs/zh/part2/ch05_cleaning_dedup.md", "line": 627, "entry_no": 1, - "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29.", + "entry": "Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. 
https://doi.org/10.1109/sequen.1997.666900.", "first_author": "Broder", "year": "1997", "title": "On the Resemblance and Containment of Documents", - "doi": "", + "doi": "10.1109/sequen.1997.666900", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1109/sequen.1997.666900", "key": "broder:1997", "fingerprint": "on the resemblance and containment of documents", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part2/ch05_cleaning_dedup.md", @@ -30596,18 +30112,16 @@ "file": "docs/zh/part2/ch06_tokenization_loading.md", "line": 465, "entry_no": 2, - "entry": "Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. In: Advances in Neural Information Processing Systems 33, pp 1877-1901.", + "entry": "Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I, Amodei D (2020) Language Models are Few-Shot Learners. In: Advances in Neural Information Processing Systems 33, pp 1877-1901. 
arXiv:2005.14165.", "first_author": "Brown", "year": "2020", "title": "Language Models are Few-Shot Learners", "doi": "", - "arxiv": "", + "arxiv": "2005.14165", "url": "", "key": "brown:2020", "fingerprint": "language models are few shot learners", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part2/ch06_tokenization_loading.md", @@ -30675,18 +30189,16 @@ "file": "docs/zh/part3/ch08_multimodal_image.md", "line": 335, "entry_no": 12, - "entry": "Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing Systems 36.", + "entry": "Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing Systems 36. arXiv:2304.06939.", "first_author": "Zhu", "year": "2023", "title": "Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text", "doi": "", - "arxiv": "", + "arxiv": "2304.06939", "url": "", "key": "zhu:2023", "fingerprint": "multimodal c4 an open billion scale corpus of images interleaved with text", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part3/ch08_multimodal_image.md", @@ -30773,18 +30285,16 @@ "file": "docs/zh/part4/ch13_preference.md", "line": 504, "entry_no": 15, - "entry": "Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46.", + "entry": "Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46. 
https://doi.org/10.1177/001316446002000104.", "first_author": "Cohen", "year": "1960", "title": "A coefficient of agreement for nominal scales", - "doi": "", + "doi": "10.1177/001316446002000104", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1177/001316446002000104", "key": "cohen:1960", "fingerprint": "a coefficient of agreement for nominal scales", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part4/ch14_qa.md", @@ -30952,18 +30462,16 @@ "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", "line": 690, "entry_no": 9, - "entry": "Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142.", + "entry": "Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142. https://doi.org/10.1145/775066.775067.", "first_author": "Joachims", "year": "2002", "title": "Optimizing Search Engines Using Clickthrough Data", - "doi": "", + "doi": "10.1145/775066.775067", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1145/775066.775067", "key": "joachims:2002", "fingerprint": "optimizing search engines using clickthrough data", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part7/ch23_online_feedback_knowledge_update.md", @@ -31084,18 +30592,16 @@ "file": "docs/zh/part8/ch25_data_versioning_experiment_tracking.md", "line": 682, "entry_no": 13, - "entry": "Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227.", + "entry": "Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227. 
https://doi.org/10.1126/science.1213847.", "first_author": "Peng", "year": "2011", "title": "Reproducible Research in Computational Science", - "doi": "", + "doi": "10.1126/science.1213847", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1126/science.1213847", "key": "peng:2011", "fingerprint": "reproducible research in computational science", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part8/ch26_data_platform_observability.md", @@ -31388,18 +30894,16 @@ "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", "line": 561, "entry_no": 14, - "entry": "Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244.", + "entry": "Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244. https://doi.org/10.1002/9781118269053.ch4.", "first_author": "Otto", "year": "2011", "title": "Data Governance", - "doi": "", + "doi": "10.1002/9781118269053.ch4", "arxiv": "", - "url": "", + "url": "https://doi.org/10.1002/9781118269053.ch4", "key": "otto:2011", "fingerprint": "data governance", - "format_issues": [ - "missing-doi-arxiv-url" - ] + "format_issues": [] }, { "file": "docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md", @@ -31695,8 +31199,8 @@ "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", "line": 399, "entry_no": 2, - "key": "kreuzberger:2023", - "title": "Machine Learning Operations (MLOps): Overview, Definition, and Architecture", + "key": "gebru:2021", + "title": "Datasheets for Datasets", "status": "not-checked", "source": "", "matched_title": "", @@ -31710,8 +31214,8 @@ "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", "line": 401, "entry_no": 3, - "key": "longpre:2024", - "title": "A large-scale audit of dataset licensing and attribution in AI", + "key": "kreuzberger:2023", + "title": "Machine Learning Operations (MLOps): Overview, Definition, and Architecture", "status": "not-checked", 
"source": "", "matched_title": "", @@ -31725,8 +31229,8 @@ "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", "line": 403, "entry_no": 4, - "key": "mazumder:2023", - "title": "DataPerf: Benchmarks for Data-Centric AI Development", + "key": "longpre:2024", + "title": "A large-scale audit of dataset licensing and attribution in AI", "status": "not-checked", "source": "", "matched_title": "", @@ -31740,6 +31244,66 @@ "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", "line": 405, "entry_no": 5, + "key": "mazumder:2023", + "title": "DataPerf: Benchmarks for Data-Centric AI Development", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 407, + "entry_no": 6, + "key": "mitchell:2019", + "title": "Model Cards for Model Reporting", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 409, + "entry_no": 7, + "key": "pushkarna:2022", + "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 411, + "entry_no": 8, + "key": "sculley:2015", + "title": "Hidden Technical Debt in Machine Learning Systems", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_d_paper_to_implementation_guide.md", + "line": 413, + "entry_no": 9, "key": "zha:2023", "title": "Data-centric 
Artificial Intelligence: A Survey", "status": "not-checked", @@ -31770,8 +31334,8 @@ "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", "line": 449, "entry_no": 2, - "key": "chen:2024", - "title": "Data-Juicer: A One-Stop Data Processing System for Large Language Models", + "key": "breck:2017", + "title": "The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction", "status": "not-checked", "source": "", "matched_title": "", @@ -31785,8 +31349,8 @@ "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", "line": 451, "entry_no": 3, - "key": "chen:2025", - "title": "AIOpsLab: A Holistic Framework to Evaluate AI Agents for Enabling Autonomous Clouds", + "key": "chen:2024", + "title": "Data-Juicer: A One-Stop Data Processing System for Large Language Models", "status": "not-checked", "source": "", "matched_title": "", @@ -31800,8 +31364,8 @@ "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", "line": 453, "entry_no": 4, - "key": "kapoor:2023", - "title": "Leakage and the reproducibility crisis in machine-learning-based science", + "key": "chen:2025", + "title": "AIOpsLab: A Holistic Framework to Evaluate AI Agents for Enabling Autonomous Clouds", "status": "not-checked", "source": "", "matched_title": "", @@ -31815,6 +31379,21 @@ "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", "line": 455, "entry_no": 5, + "key": "kapoor:2023", + "title": "Leakage and the reproducibility crisis in machine-learning-based science", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_e_common_bug_debugging_manual.md", + "line": 457, + "entry_no": 6, "key": "pfitzmann:2022", "title": "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis", "status": "not-checked", @@ -31845,8 +31424,8 @@ "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", "line": 
403, "entry_no": 2, - "key": "liang:2023", - "title": "Holistic Evaluation of Language Models", + "key": "gebru:2021", + "title": "Datasheets for Datasets", "status": "not-checked", "source": "", "matched_title": "", @@ -31860,8 +31439,8 @@ "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", "line": 405, "entry_no": 3, - "key": "wang:2023", - "title": "DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models", + "key": "liang:2023", + "title": "Holistic Evaluation of Language Models", "status": "not-checked", "source": "", "matched_title": "", @@ -31875,6 +31454,51 @@ "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", "line": 407, "entry_no": 4, + "key": "mitchell:2019", + "title": "Model Cards for Model Reporting", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 409, + "entry_no": 5, + "key": "pushkarna:2022", + "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 411, + "entry_no": 6, + "key": "wang:2023", + "title": "DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models", + "status": "not-checked", + "source": "", + "matched_title": "", + "matched_year": "", + "score": 0.0, + "identifier": "", + "url_status": "", + "issues": [] + }, + { + "file": "docs/zh/appendix_f_terminology_and_chinese_english_mapping.md", + "line": 413, + "entry_no": 7, "key": "weidinger:2022", "title": "Taxonomy of Risks posed by Language Models", "status": "not-checked", @@ -34815,7 +34439,7 @@ "file": 
"docs/zh/part12/ch38_text_corpora_transparent_ledger.md", "line": 597, "entry_no": 1, - "key": "", + "key": "penedo:2024", "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale", "status": "not-checked", "source": "", @@ -34828,7 +34452,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 598, + "line": 599, "entry_no": 2, "key": "hugging:2026", "title": "HuggingFaceFW/fineweb Dataset Card", @@ -34843,7 +34467,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 599, + "line": 601, "entry_no": 3, "key": "hugging:2026", "title": "DataTrove FineWeb Processing Script", @@ -34858,7 +34482,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 600, + "line": 603, "entry_no": 4, "key": "penedo:2024", "title": "DataTrove large scale data processing", @@ -34873,7 +34497,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 601, + "line": 605, "entry_no": 5, "key": "luccioni:2021", "title": "What's in the Box? 
A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus", @@ -34888,9 +34512,9 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 603, + "line": 607, "entry_no": 6, - "key": "", + "key": "soldaini:2024", "title": "Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research", "status": "not-checked", "source": "", @@ -34903,7 +34527,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 604, + "line": 609, "entry_no": 7, "key": "allen:2023", "title": "Ai2 Dolma: 3 trillion token open corpus for language model pretraining", @@ -34918,7 +34542,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 605, + "line": 611, "entry_no": 8, "key": "allenai:2026", "title": "allenai/dolma Dataset Card", @@ -34933,7 +34557,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 606, + "line": 613, "entry_no": 9, "key": "allenai:2026", "title": "Dolma Dataset and Toolkit Repository", @@ -34948,7 +34572,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 607, + "line": 615, "entry_no": 10, "key": "allenai:2026", "title": "Dolma Toolkit Documentation", @@ -34963,7 +34587,7 @@ }, { "file": "docs/zh/part12/ch38_text_corpora_transparent_ledger.md", - "line": 608, + "line": 617, "entry_no": 11, "key": "groeneveld:2024", "title": "OLMo: Accelerating the Science of Language Models", @@ -34993,7 +34617,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 265, + "line": 266, "entry_no": 2, "key": "laion:2022", "title": "LAION-5B: A new era of open large-scale multi-modal datasets", @@ -35008,7 +34632,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 266, + "line": 268, "entry_no": 3, "key": "laionai:2022", "title": "dataset-spec", @@ -35023,7 +34647,7 @@ }, { "file": 
"docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 267, + "line": 270, "entry_no": 4, "key": "gadre:2023", "title": "DataComp: In search of the next generation of multimodal datasets", @@ -35038,7 +34662,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 268, + "line": 272, "entry_no": 5, "key": "datacomp:2026", "title": "DataComp Benchmark Documentation", @@ -35053,7 +34677,7 @@ }, { "file": "docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md", - "line": 269, + "line": 274, "entry_no": 6, "key": "ml:2023", "title": "DataComp codebase", @@ -35251,7 +34875,7 @@ "line": 752, "entry_no": 13, "key": "rafailov:2024", - "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model", + "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Proc. NeurIPS", "status": "not-checked", "source": "", "matched_title": "", @@ -36016,7 +35640,7 @@ "line": 452, "entry_no": 1, "key": "wei:2022", - "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", + "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022", "status": "not-checked", "source": "", "matched_title": "", @@ -36076,7 +35700,7 @@ "line": 456, "entry_no": 5, "key": "hendrycks:2021", - "title": "Measuring Mathematical Problem Solving With the MATH Dataset", + "title": "Measuring Mathematical Problem Solving With the MATH Dataset. 
NeurIPS 2021", "status": "not-checked", "source": "", "matched_title": "", @@ -36298,7 +35922,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 375, + "line": 383, "entry_no": 1, "key": "wang:2023", "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions", @@ -36313,7 +35937,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 377, + "line": 385, "entry_no": 2, "key": "ouyang:2022", "title": "Training Language Models to Follow Instructions with Human Feedback", @@ -36328,7 +35952,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 379, + "line": 387, "entry_no": 3, "key": "rafailov:2023", "title": "Direct Preference Optimization: Your Language Model Is Secretly a Reward Model", @@ -36343,7 +35967,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 381, + "line": 389, "entry_no": 4, "key": "ethayarajh:2024", "title": "Model Alignment as Prospect Theoretic Optimization", @@ -36358,9 +35982,9 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 383, + "line": 391, "entry_no": 5, - "key": "gheshlaghi:2024", + "key": "gheshlaghiazar:2024", "title": "A General Theoretical Paradigm to Understand Learning from Human Preferences", "status": "not-checked", "source": "", @@ -36373,7 +35997,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 385, + "line": 393, "entry_no": 6, "key": "grattafiori:2024", "title": "The Llama 3 Herd of Models", @@ -36388,7 +36012,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 387, + "line": 395, "entry_no": 7, "key": "lambert:2025", "title": "Tülu 3: Pushing Frontiers in Open Language Model Post-Training", @@ -36403,7 +36027,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 389, + "line": 397, "entry_no": 8, "key": "yang:2025", "title": "Qwen3 Technical Report", @@ -36418,7 +36042,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 391, + 
"line": 399, "entry_no": 9, "key": "wang:2024", "title": "HelpSteer 2: Open-Source Dataset for Training Top-Performing Reward Models", @@ -36433,7 +36057,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 393, + "line": 401, "entry_no": 10, "key": "xu:2024", "title": "WizardLM: Empowering Large Pre-Trained Language Models to Follow Complex Instructions", @@ -36448,7 +36072,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 395, + "line": 403, "entry_no": 11, "key": "xu:2025", "title": "Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing", @@ -36463,7 +36087,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 397, + "line": 405, "entry_no": 12, "key": "liu:2024", "title": "DeepSeek-V3 Technical Report", @@ -36478,7 +36102,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 399, + "line": 407, "entry_no": 13, "key": "liu:2024", "title": "Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs", @@ -36493,7 +36117,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 401, + "line": 409, "entry_no": 14, "key": "singhal:2024", "title": "A Long Way to Go: Investigating Length Correlations in RLHF", @@ -36508,7 +36132,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 403, + "line": 411, "entry_no": 15, "key": "shao:2024", "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", @@ -36523,7 +36147,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 405, + "line": 413, "entry_no": 16, "key": "zhou:2023", "title": "Don't Make Your LLM an Evaluation Benchmark Cheater", @@ -36538,7 +36162,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - "line": 407, + "line": 415, "entry_no": 17, "key": "zheng:2023", "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", @@ -36553,7 +36177,7 @@ }, { "file": "docs/zh/part13/ch45_posttrain_recipes.md", - 
"line": 409, + "line": 417, "entry_no": 18, "key": "lightman:2024", "title": "Let's Verify Step by Step", @@ -36568,7 +36192,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 569, + "line": 589, "entry_no": 1, "key": "guo:2025", "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", @@ -36583,7 +36207,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 571, + "line": 591, "entry_no": 2, "key": "kimiteam:2025", "title": "Kimi k1.5: Scaling Reinforcement Learning with LLMs", @@ -36598,7 +36222,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 573, + "line": 593, "entry_no": 3, "key": "qwenteam:2025", "title": "QwQ-32B Model Card", @@ -36613,7 +36237,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 575, + "line": 595, "entry_no": 4, "key": "touvron:2023", "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", @@ -36628,7 +36252,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 577, + "line": 597, "entry_no": 5, "key": "cobbe:2021", "title": "Training Verifiers to Solve Math Word Problems", @@ -36643,7 +36267,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 579, + "line": 599, "entry_no": 6, "key": "chen:2021", "title": "Evaluating Large Language Models Trained on Code", @@ -36658,7 +36282,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 581, + "line": 601, "entry_no": 7, "key": "hendrycks:2021", "title": "Measuring Mathematical Problem Solving With the MATH Dataset", @@ -36673,7 +36297,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 583, + "line": 603, "entry_no": 8, "key": "meurer:2017", "title": "SymPy: symbolic computing in Python", @@ -36688,7 +36312,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 585, + "line": 605, "entry_no": 9, "key": "guha:2025", "title": "OpenThoughts: Data Recipes for Reasoning Models", @@ 
-36703,7 +36327,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 587, + "line": 607, "entry_no": 10, "key": "zhou:2023", "title": "LIMA: Less Is More for Alignment", @@ -36718,7 +36342,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 589, + "line": 609, "entry_no": 11, "key": "zelikman:2022", "title": "STaR: Bootstrapping Reasoning with Reasoning", @@ -36733,7 +36357,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 591, + "line": 611, "entry_no": 12, "key": "madaan:2023", "title": "Self-Refine: Iterative Refinement with Self-Feedback", @@ -36748,7 +36372,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 593, + "line": 613, "entry_no": 13, "key": "lightman:2024", "title": "Let's Verify Step by Step", @@ -36763,7 +36387,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 595, + "line": 615, "entry_no": 14, "key": "zheng:2023", "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", @@ -36778,7 +36402,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 597, + "line": 617, "entry_no": 15, "key": "gao:2023", "title": "Scaling Laws for Reward Model Overoptimization", @@ -36793,7 +36417,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 599, + "line": 619, "entry_no": 16, "key": "hosseini:2024", "title": "V-STaR: Training Verifiers for Self-Taught Reasoners", @@ -36808,7 +36432,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 601, + "line": 621, "entry_no": 17, "key": "shi:2022", "title": "Language Models Are Multilingual Chain-of-Thought Reasoners", @@ -36823,7 +36447,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 603, + "line": 623, "entry_no": 18, "key": "jaech:2024", "title": "OpenAI o1 System Card", @@ -36838,7 +36462,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 605, + "line": 625, "entry_no": 19, "key": "ott:2023", "title": 
"ThoughtSource: A Central Hub for Large Language Model Reasoning Data", @@ -36853,7 +36477,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 607, + "line": 627, "entry_no": 20, "key": "hsieh:2023", "title": "Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes", @@ -36868,7 +36492,7 @@ }, { "file": "docs/zh/part13/ch46_rl_reasoning_data.md", - "line": 609, + "line": 629, "entry_no": 21, "key": "patil:2024", "title": "Gorilla: Large Language Model Connected with Massive APIs", @@ -37801,7 +37425,7 @@ "line": 1140, "entry_no": 1, "key": "wei:2022", - "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", + "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022", "status": "not-checked", "source": "", "matched_title": "", @@ -37861,7 +37485,7 @@ "line": 1144, "entry_no": 5, "key": "hendrycks:2021", - "title": "Measuring Mathematical Problem Solving With the MATH Dataset", + "title": "Measuring Mathematical Problem Solving With the MATH Dataset. 
NeurIPS 2021", "status": "not-checked", "source": "", "matched_title": "", @@ -38205,7 +37829,7 @@ "file": "docs/zh/part14/p11_mini_deepseek.md", "line": 528, "entry_no": 3, - "key": "liu:2024", + "key": "deepseekai:2024", "title": "DeepSeek-V3 Technical Report", "status": "not-checked", "source": "", @@ -38446,7 +38070,7 @@ "line": 550, "entry_no": 3, "key": "kwon:2023", - "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention (vLLM)", + "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention", "status": "not-checked", "source": "", "matched_title": "", @@ -38865,8 +38489,8 @@ "file": "docs/zh/part2/ch05_cleaning_dedup.md", "line": 631, "entry_no": 3, - "key": "honnibal:2020", - "title": "spaCy: Industrial-strength Natural Language Processing in Python", + "key": "honnibal:2023", + "title": "explosion/spaCy: v3.7.2: Fixes for APIs and requirements", "status": "not-checked", "source": "", "matched_title": "", diff --git a/publishing/final_review/reference_integrity_audit.md b/publishing/final_review/reference_integrity_audit.md index 4656fdcc..f71e44d1 100644 --- a/publishing/final_review/reference_integrity_audit.md +++ b/publishing/final_review/reference_integrity_audit.md @@ -1,24 +1,24 @@ # 全书引用完整性与真实性审计报告 -- 生成时间:2026-06-17T02:33:34.150066+00:00 +- 生成时间:2026-06-22T08:46:49.424273+00:00 - 范围:当前 Springer 中文交付稿 `docs/zh` 中的正文章、项目章和附录。 - 扫描文件:71 -- 参考文献条目:972 -- 正文 author-year 引用:1327 +- 参考文献条目:980 +- 正文 author-year 引用:1358 - 正文引用未在同章参考文献解析到:0 -- 章末参考文献未被同章正文引用:227 +- 章末参考文献未被同章正文引用:205 - 同章疑似重复参考文献组:0 ## 外部核验概览 | 状态 | 数量 | 含义 | | --- | ---: | --- | -| `not-checked` | 972 | | +| `not-checked` | 980 | | ## 主要结论 - 需要优先人工复核的外部核验问题:0 条。 -- 缺少 DOI / arXiv / URL 的条目:614 条;其中一部分可由 Crossref/OpenAlex 题名检索确认,但 Springer 终稿仍建议补 DOI 或稳定 URL。 +- 缺少 DOI / arXiv / URL 的条目:379 条;其中一部分可由 Crossref/OpenAlex 题名检索确认,但 Springer 终稿仍建议补 DOI 或稳定 URL。 - `url-reachable` 只能证明网页当前可达,不能证明引文格式、版本日期和题名完全符合出版社要求。 - 当前报告只确认“同章 
author-year 可解析对应关系”;对于一段话是否应该引用更精确来源,仍需人工学术编辑判断。 @@ -40,6 +40,14 @@ | `docs/zh/appendix_b_compliance_and_release_checklist.md` | 320 | 5 | `european:2024` | Regulation (EU) 2024/1689 laying down harmonised rules on artificial intelligence (Artificial Intelligence Act) | European Parliament and Council of the European Union (2024) Regulation (EU) 2024/1689 laying down harmonised rules on artificial intelligence (Artificial Intelligence Act). Available at: https://eur-lex.europa.eu/eli... | | `docs/zh/appendix_c_cost_estimation_and_resource_templates.md` | 322 | 4 | `kubernetes:2026` | Kubernetes Documentation | Kubernetes Authors (2026) Kubernetes Documentation. Available at: https://kubernetes.io/docs/. | | `docs/zh/appendix_c_cost_estimation_and_resource_templates.md` | 324 | 5 | `vllm:2026` | vLLM Documentation | vLLM Project (2026) vLLM Documentation. Available at: https://docs.vllm.ai/. | +| `docs/zh/appendix_d_paper_to_implementation_guide.md` | 399 | 2 | `gebru:2021` | Datasheets for Datasets | Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. | +| `docs/zh/appendix_d_paper_to_implementation_guide.md` | 407 | 6 | `mitchell:2019` | Model Cards for Model Reporting | Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, ... | +| `docs/zh/appendix_d_paper_to_implementation_guide.md` | 409 | 7 | `pushkarna:2022` | Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI | Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, ... 
| +| `docs/zh/appendix_d_paper_to_implementation_guide.md` | 411 | 8 | `sculley:2015` | Hidden Technical Debt in Machine Learning Systems | Sculley D, Holt G, Golovin D, Davydov E, Phillips T, Ebner D, Chaudhary V, Young M, Crespo J-F, Dennison D (2015) Hidden Technical Debt in Machine Learning Systems. In: Advances in Neural Information Processing System... | +| `docs/zh/appendix_e_common_bug_debugging_manual.md` | 449 | 2 | `breck:2017` | The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction | Breck E, Cai S, Nielsen E, Salib M, Sculley D (2017) The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction. In: IEEE International Conference on Big Data, pp 1123-1132. | +| `docs/zh/appendix_f_terminology_and_chinese_english_mapping.md` | 403 | 2 | `gebru:2021` | Datasheets for Datasets | Gebru T, Morgenstern J, Vecchione B, Vaughan J W, Wallach H, Daumé H, Crawford K (2021) Datasheets for Datasets. Communications of the ACM 64(12):86-92. https://doi.org/10.1145/3458723. | +| `docs/zh/appendix_f_terminology_and_chinese_english_mapping.md` | 407 | 4 | `mitchell:2019` | Model Cards for Model Reporting | Mitchell M, Wu S, Zaldivar A, Barnes P, Vasserman L, Hutchinson B, Spitzer E, Raji I D, Gebru T (2019) Model Cards for Model Reporting. In: Proceedings of the Conference on Fairness, Accountability, and Transparency, ... | +| `docs/zh/appendix_f_terminology_and_chinese_english_mapping.md` | 409 | 5 | `pushkarna:2022` | Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI | Pushkarna M, Zaldivar A, Kjartansson O (2022) Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI. In: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency, ... | | `docs/zh/appendix_g_datagallery_note.md` | 67 | 1 | `datagallery:2026` | DataGallery organization page | DataGallery Contributors (2026a) DataGallery organization page. 
Available at: https://gitcode.com/datagallery. | | `docs/zh/appendix_g_datagallery_note.md` | 69 | 2 | `datagallery:2026` | DataAgent source repository | DataGallery Contributors (2026b) DataAgent source repository. Available at: https://gitcode.com/datagallery/DataAgent. | | `docs/zh/appendix_h_mindspore_note.md` | 253 | 1 | `mindface:2026` | MindFace source repository | MindFace Contributors (2026) MindFace source repository. Available at: https://github.com/mindspore-lab/mindface. | @@ -48,34 +56,36 @@ | `docs/zh/appendix_h_mindspore_note.md` | 259 | 4 | `mindspore:2026` | Automatic Differentiation, MindSpore Tutorials | MindSpore Contributors (2026c) Automatic Differentiation, MindSpore Tutorials. Available at: https://www.mindspore.cn/tutorials/en/r2.9.0/beginner/autograd.html. | | `docs/zh/part1/ch01_data_change.md` | 328 | 13 | `heafield:2011` | KenLM: Faster and Smaller Language Model Queries | Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. | | `docs/zh/part1/ch01_data_change.md` | 330 | 14 | `broder:1997` | On the Resemblance and Containment of Documents | Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. | -| `docs/zh/part1/ch02_quality_framework.md` | 507 | 1 | `cohen:1960` | A Coefficient of Agreement for Nominal Scales | Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. | -| `docs/zh/part1/ch03_data_stack.md` | 345 | 3 | `broder:1997` | On the Resemblance and Containment of Documents | Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. 
| +| `docs/zh/part1/ch02_quality_framework.md` | 507 | 1 | `cohen:1960` | A Coefficient of Agreement for Nominal Scales | Cohen J (1960) A Coefficient of Agreement for Nominal Scales. Educational and Psychological Measurement 20(1):37-46. https://doi.org/10.1177/001316446002000104. | +| `docs/zh/part1/ch03_data_stack.md` | 345 | 3 | `broder:1997` | On the Resemblance and Containment of Documents | Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900. | | `docs/zh/part1/ch03_data_stack.md` | 347 | 4 | `heafield:2011` | KenLM: Faster and Smaller Language Model Queries | Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. | | `docs/zh/part1/ch03_data_stack.md` | 353 | 7 | `apache:2024` | Apache Iceberg: Table Specification and Documentation | Apache Software Foundation (2024) Apache Iceberg: Table Specification and Documentation. (accessed 2024-11). | | `docs/zh/part1/ch03_data_stack.md` | 359 | 10 | `together:2023` | RedPajama: An Open Dataset for Training Large Language Models | Together Computer (2023) RedPajama: An Open Dataset for Training Large Language Models. GitHub repository. . | -| `docs/zh/part10/ch32_auto_collection_parsing_cleaning.md` | 420 | 1 | `barbaresi:2021` | Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction | Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. 
| +| `docs/zh/part10/ch32_auto_collection_parsing_cleaning.md` | 420 | 1 | `barbaresi:2021` | Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction | Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics, pp 122-131. ... | | `docs/zh/part10/ch32_auto_collection_parsing_cleaning.md` | 450 | 16 | `penedo:2024` | The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale | Penedo G, Kydlíček H, Allal L B, Lozhkov A, Mitchell M, Raffel C, von Werra L, Wolf T (2024) The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. In: Advances in Neural Information Processing Sys... | | `docs/zh/part10/ch34_dataops_agent.md` | 479 | 5 | `huyen:2022` | Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications | Huyen C (2022) Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications. O'Reilly Media. | -| `docs/zh/part10/ch34_dataops_agent.md` | 495 | 13 | `tamburri:2020` | Sustainable MLOps: Trends and Challenges | Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. | +| `docs/zh/part10/ch34_dataops_agent.md` | 495 | 13 | `tamburri:2020` | Sustainable MLOps: Trends and Challenges | Tamburri D A (2020) Sustainable MLOps: Trends and Challenges. In: Proceedings of the 22nd International Symposium on Symbolic and Numeric Algorithms for Scientific Computing, pp 17-23. https://doi.org/10.1109/synasc51... | | `docs/zh/part11/ch36_compliance_framework_and_governance.md` | 1188 | 5 | `european:2022` | Data Protection Engineering | European Union Agency for Cybersecurity (ENISA) (2022) Data Protection Engineering. ENISA Report. 
| -| `docs/zh/part11/ch36_compliance_framework_and_governance.md` | 1194 | 8 | `hoepman:2014` | Privacy Design Strategies | Hoepman J-H (2014) Privacy Design Strategies. In IFIP International Information Security Conference, pp 446-459. | -| `docs/zh/part11/ch36_compliance_framework_and_governance.md` | 1198 | 10 | `dwork:2008` | Differential Privacy: A Survey of Results | Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. | +| `docs/zh/part11/ch36_compliance_framework_and_governance.md` | 1194 | 8 | `hoepman:2014` | Privacy Design Strategies | Hoepman J-H (2014) Privacy Design Strategies. In IFIP International Information Security Conference, pp 446-459. https://doi.org/10.1007/978-3-642-55415-5_38. | +| `docs/zh/part11/ch36_compliance_framework_and_governance.md` | 1198 | 10 | `dwork:2008` | Differential Privacy: A Survey of Results | Dwork C (2008) Differential Privacy: A Survey of Results. In Theory and Applications of Models of Computation, Springer Berlin Heidelberg, pp 1-19. https://doi.org/10.1007/978-3-540-79228-4_1. | | `docs/zh/part11/ch37_federated_learning_and_privacy_preserving_technologies.md` | 448 | 4 | `dwork:2011` | Differential Privacy | Dwork C (2011) Differential Privacy. In Encyclopedia of Cryptography and Security, Springer US, pp 338-340. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 598 | 2 | `hugging:2026` | HuggingFaceFW/fineweb Dataset Card | Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 599 | 3 | `hugging:2026` | DataTrove FineWeb Processing Script | Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py. 
| -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 600 | 4 | `penedo:2024` | DataTrove large scale data processing | Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 601 | 5 | `luccioni:2021` | What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus | Luccioni, S., & Viviano, J. (2021). What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 604 | 7 | `allen:2023` | Ai2 Dolma: 3 trillion token open corpus for language model pretraining | Allen Institute for AI. (2023). Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 605 | 8 | `allenai:2026` | allenai/dolma Dataset Card | AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 606 | 9 | `allenai:2026` | Dolma Dataset and Toolkit Repository | AllenAI. (2026). Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 607 | 10 | `allenai:2026` | Dolma Toolkit Documentation | AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md. | -| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 608 | 11 | `groeneveld:2024` | OLMo: Accelerating the Science of Language Models | Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). OLMo: Accelerating the Science of Language Models. https://arxiv.org/abs/2402.00838. 
| +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 597 | 1 | `penedo:2024` | The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale | Penedo, G., Kydlíček, H., Allal, L. B., Lozhkov, A., Mitchell, M., Raffel, C., von Werra, L., & Wolf, T. (2024). The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale. NeurIPS 2024 Datasets and Ben... | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 599 | 2 | `hugging:2026` | HuggingFaceFW/fineweb Dataset Card | Hugging Face. (2026). HuggingFaceFW/fineweb Dataset Card. https://huggingface.co/datasets/HuggingFaceFW/fineweb. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 601 | 3 | `hugging:2026` | DataTrove FineWeb Processing Script | Hugging Face. (2026). DataTrove FineWeb Processing Script. https://github.com/huggingface/datatrove/blob/main/examples/fineweb.py. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 603 | 4 | `penedo:2024` | DataTrove large scale data processing | Penedo, G., Kydlíček, H., Cappelli, A., Sasko, M., & Wolf, T. (2024). DataTrove large scale data processing. https://github.com/huggingface/datatrove. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 605 | 5 | `luccioni:2021` | What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus | Luccioni, S., & Viviano, J. (2021). What's in the Box? A Preliminary Analysis of Undesirable Content in the Common Crawl Corpus. https://arxiv.org/abs/2105.02732. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 607 | 6 | `soldaini:2024` | Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research | Soldaini, L., Kinney, R., Bhagia, A., Schwenk, D., Atkinson, D., Authur, R., et al. (2024). Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research. ACL 2024. https://arxiv.org/abs/2402.... 
| +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 609 | 7 | `allen:2023` | Ai2 Dolma: 3 trillion token open corpus for language model pretraining | Allen Institute for AI. (2023). Ai2 Dolma: 3 trillion token open corpus for language model pretraining. https://allenai.org/blog/dolma-3-trillion-tokens-open-llm-corpus-9a0ff4b8da64. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 611 | 8 | `allenai:2026` | allenai/dolma Dataset Card | AllenAI. (2026). allenai/dolma Dataset Card. https://huggingface.co/datasets/allenai/dolma. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 613 | 9 | `allenai:2026` | Dolma Dataset and Toolkit Repository | AllenAI. (2026). Dolma Dataset and Toolkit Repository. https://github.com/allenai/dolma. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 615 | 10 | `allenai:2026` | Dolma Toolkit Documentation | AllenAI. (2026). Dolma Toolkit Documentation. https://github.com/allenai/dolma/blob/main/docs/README.md. | +| `docs/zh/part12/ch38_text_corpora_transparent_ledger.md` | 617 | 11 | `groeneveld:2024` | OLMo: Accelerating the Science of Language Models | Groeneveld, D., Beltagy, I., Walsh, P., Bhagia, A., Kinney, R., Tafjord, O., et al. (2024). OLMo: Accelerating the Science of Language Models. https://arxiv.org/abs/2402.00838. | | `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 264 | 1 | `schuhmann:2022` | LAION-5B: An open large-scale dataset for training next generation image-text models | Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., et al. (2022). LAION-5B: An open large-scale dataset for training next generation image-text models. NeurIPS 2022 Datasets and Benchmarks T... | -| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 265 | 2 | `laion:2022` | LAION-5B: A new era of open large-scale multi-modal datasets | LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. 
https://laion.ai/blog/laion-5b/. | -| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 266 | 3 | `laionai:2022` | dataset-spec | LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec. | -| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 267 | 4 | `gadre:2023` | DataComp: In search of the next generation of multimodal datasets | Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org... | -| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 268 | 5 | `datacomp:2026` | DataComp Benchmark Documentation | DataComp Team. (2026). DataComp Benchmark Documentation. https://www.datacomp.ai/dcclip/. | -| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 269 | 6 | `ml:2023` | DataComp codebase | ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp. | +| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 266 | 2 | `laion:2022` | LAION-5B: A new era of open large-scale multi-modal datasets | LAION. (2022). LAION-5B: A new era of open large-scale multi-modal datasets. https://laion.ai/blog/laion-5b/. | +| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 268 | 3 | `laionai:2022` | dataset-spec | LAION-AI. (2022). dataset-spec. https://github.com/LAION-AI/dataset-spec. | +| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 270 | 4 | `gadre:2023` | DataComp: In search of the next generation of multimodal datasets | Gadre, S. Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., et al. (2023). DataComp: In search of the next generation of multimodal datasets. NeurIPS 2023 Datasets and Benchmarks Track. https://arxiv.org... 
| +| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 272 | 5 | `datacomp:2026` | DataComp Benchmark Documentation | DataComp Team. (2026). DataComp Benchmark Documentation. https://www.datacomp.ai/dcclip/. | +| `docs/zh/part12/ch39_image_text_candidate_pool_data_engineering.md` | 274 | 6 | `ml:2023` | DataComp codebase | ML Foundations. (2023). DataComp codebase. https://github.com/mlfoundations/datacomp. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 728 | 1 | `bai:2025` | Qwen2.5-VL Technical Report | Bai, S., Chen, K., Liu, X., et al. (2025). Qwen2.5-VL Technical Report. *arXiv preprint arXiv:2502.13923*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 730 | 2 | `blecher:2023` | Nougat: Neural Optical Understanding for Academic Documents | Blecher, L., Cucurull, G., Scialom, T., and Stojnic, R. (2023). Nougat: Neural Optical Understanding for Academic Documents. *arXiv preprint arXiv:2308.13418*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 732 | 3 | `huang:2022` | LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. Proc | Huang, Y., Lv, T., Cui, L., Lu, Y., and Wei, F. (2022). LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking. *Proc. ACM Multimedia*. | @@ -85,10 +95,10 @@ | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 740 | 7 | `kuhn:1955` | The Hungarian Method for the Assignment Problem | Kuhn, H.W. (1955). The Hungarian Method for the Assignment Problem. *Naval Research Logistics Quarterly*, 2(1–2), pp. 83–97. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 742 | 8 | `levenshtein:1965` | Binary Codes Capable of Correcting Deletions, Insertions and Reversals | Levenshtein, V.I. (1965). Binary Codes Capable of Correcting Deletions, Insertions and Reversals. *Soviet Physics Doklady*, 10, pp. 707–710. 
| | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 744 | 9 | `liu:2024` | A Survey on Hallucination in Large Vision-Language Models | Liu, H., Xue, W., Chen, Y., et al. (2024). A Survey on Hallucination in Large Vision-Language Models. *arXiv preprint arXiv:2402.00253*. | -| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 746 | 10 | `mathew:2021` | DocVQA: A Dataset for VQA on Document Images | Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. | +| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 746 | 10 | `mathew:2021` | DocVQA: A Dataset for VQA on Document Images | Mathew, M., Karatzas, D., and Jawahar, C.V. (2021). DocVQA: A Dataset for VQA on Document Images. *Proc. WACV*. https://doi.org/10.1109/wacv48630.2021.00225. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 748 | 11 | `niu:2025` | MinerU 2.5: A Decoupled Vision-Language Model for Efficient High-Resolution Document Parsing | Niu, J., Liu, Z., Gu, Z., et al. (2025). MinerU 2.5: A Decoupled Vision-Language Model for Efficient High-Resolution Document Parsing. *arXiv preprint*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 750 | 12 | `park:2019` | CORD: A Consolidated Receipt Dataset for Post-OCR Parsing | Park, S., Shin, S., Lee, B., et al. (2019). CORD: A Consolidated Receipt Dataset for Post-OCR Parsing. *NeurIPS Workshop on Document Intelligence*. | -| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 752 | 13 | `rafailov:2024` | Direct Preference Optimization: Your Language Model Is Secretly a Reward Model | Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. 
| +| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 752 | 13 | `rafailov:2024` | Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. Proc. NeurIPS | Rafailov, R., Sharma, A., Mitchell, E., Ermon, S., Manning, C.D., and Finn, C. (2024). Direct Preference Optimization: Your Language Model Is Secretly a Reward Model. *Proc. NeurIPS*. arXiv:2305.18290. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 754 | 14 | `schulman:2017` | Proximal Policy Optimization Algorithms | Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O. (2017). Proximal Policy Optimization Algorithms. *arXiv preprint arXiv:1707.06347*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 756 | 15 | `shao:2024` | DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models | Shao, Z., Wang, P., et al. (2024). DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. *arXiv preprint arXiv:2402.03300*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 758 | 16 | `tianchi:2022` | CHIP 2022 Shared Task: Medical Invoice OCR Element Extraction Dataset | Tianchi, A. and CHIP Committee (2022). CHIP 2022 Shared Task: Medical Invoice OCR Element Extraction Dataset. *Aliyun Tianchi Platform*. | @@ -101,25 +111,16 @@ | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 772 | 23 | `chatdoc:2025` | OCRFlux-3B: A Multimodal Large Language Model for Document Parsing | ChatDOC (2025). OCRFlux-3B: A Multimodal Large Language Model for Document Parsing. *Hugging Face Model Card*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 774 | 24 | `cui:2025` | PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vision-Language Model | Cui, C., Sun, T., Liang, S., et al. (2025). PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vision-Language Model. *arXiv preprint*. 
| | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 776 | 25 | `guo:2025` | DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning | Guo, D., Yang, D., Zhang, H., et al. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. *arXiv preprint arXiv:2501.12948*. | -| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 778 | 26 | `hunyuan:2025` | HunyuanOCR Technical Report | Hunyuan Vision Team, Lyu, P., Wan, X., et al. (2025). HunyuanOCR Technical Report. *arXiv preprint*. | +| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 778 | 26 | `hunyuan:2025` | HunyuanOCR Technical Report | Hunyuan Vision Team (2025). HunyuanOCR Technical Report. *arXiv preprint*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 780 | 27 | `li:2025` | Dots.OCR: Multilingual Document Layout Parsing in a Single Vision-Language Model | Li, Y., Yang, G., Liu, H., Wang, B., and Zhang, C. (2025a). Dots.OCR: Multilingual Document Layout Parsing in a Single Vision-Language Model. *arXiv preprint*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 782 | 28 | `poznanski:2025` | olmOCR 2: Unit Test Rewards for Document OCR | Poznanski, J., Soldaini, L., and Lo, K. (2025). olmOCR 2: Unit Test Rewards for Document OCR. *arXiv preprint arXiv:2510.19817*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 784 | 29 | `smock:2025` | PubTables-v2: A New Large-Scale Dataset for Full-Page and Multi-Page Table Extraction | Smock, B., Faucon-Morin, V., Sokolov, M., et al. (2025). PubTables-v2: A New Large-Scale Dataset for Full-Page and Multi-Page Table Extraction. *arXiv preprint arXiv:2512.10888*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 786 | 30 | `wang:2025` | InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency | Wang, W., Gao, Z., Gu, L., et al. (2025). 
InternVL3.5: Advancing Open-Source Multimodal Models in Versatility, Reasoning, and Efficiency. *arXiv preprint arXiv:2508.18265*. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 788 | 31 | `zhang:2025` | MonkeyOCR v1.5 Technical Report: Unlocking Robust Document Parsing for Complex Patterns | Zhang, J., Liu, Y., Wu, Z., et al. (2025). MonkeyOCR v1.5 Technical Report: Unlocking Robust Document Parsing for Complex Patterns. *arXiv preprint*. | -| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 790 | 32 | `smock:2022` | PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents | Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. | -| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 792 | 33 | `zhu:2021` | TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance | Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. | +| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 790 | 32 | `smock:2022` | PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents | Smock, B., Pesala, R., and Abraham, R. (2022). PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents. *Proc. CVPR*. https://doi.org/10.1109/cvpr52688.2022.00459. | +| `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 792 | 33 | `zhu:2021` | TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance | Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., and Chua, T.-S. (2021). TAT-QA: A Question Answering Benchmark on a Hybrid of Tabular and Textual Content in Finance. *Proc. ACL*. https://doi.org/10... 
| | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 794 | 34 | `pandas:2026` | pandas Documentation | Pandas Development Team. (2026). pandas Documentation. https://pandas.pydata.org/docs/. | | `docs/zh/part12/ch40_visual_document_table_data_engineering.md` | 796 | 35 | `apache:2026` | Apache Arrow Documentation | Apache Arrow Contributors. (2026). Apache Arrow Documentation. https://arrow.apache.org/docs/. | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 827 | 1 | `masry:2022` | ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning | Masry, A., Long, D. X., Tan, J. Q., Joty, S., & Hoque, E. (2022). ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning. ACL 2022. | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 829 | 2 | `methani:2020` | PlotQA: Reasoning over Scientific Plots | Methani, N., Ganguly, P., Khapra, M. M., & Kumar, P. (2020). PlotQA: Reasoning over Scientific Plots. WACV 2020. | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 831 | 3 | `kahou:2017` | FigureQA: An Annotated Figure Dataset for Visual Reasoning | Kahou, S. E., Michalski, V., Atkinson, A., Kádár, Á., Trischler, A., & Bengio, Y. (2017). FigureQA: An Annotated Figure Dataset for Visual Reasoning. arXiv:1710.07300. | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 833 | 4 | `kafle:2018` | DVQA: Understanding Data Visualizations via Question Answering | Kafle, K., Price, B., Cohen, S., & Kanan, C. (2018). DVQA: Understanding Data Visualizations via Question Answering. CVPR 2018. | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 835 | 5 | `mathew:2021` | DocVQA: A Dataset for VQA on Document Images | Mathew, M., Karatzas, D., & Jawahar, C. V. (2021). DocVQA: A Dataset for VQA on Document Images. WACV 2021. 
| -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 837 | 6 | `masry:2025` | Chartqapro: A more diverse and challenging benchmark for chart question answering | Masry, A., Islam, M. S., Ahmed, M., Bajaj, A., Kabir, F., Kartha, A., ... & Joty, S. (2025, July). Chartqapro: A more diverse and challenging benchmark for chart question answering. In Findings of the Association for ... | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 839 | 7 | `xie:2026` | Infochartqa: A benchmark for multimodal question answering on infographic charts | Xie, T., Lin, M., Liu, M., Ye, Y., Chen, C., & Liu, S. (2026). Infochartqa: A benchmark for multimodal question answering on infographic charts. Advances in Neural Information Processing Systems, 38. | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 841 | 8 | `foroutan:2025` | Wikimixqa: a multimodal benchmark for question answering over tables and charts | Foroutan, N., Romanou, A., Ansaripour, M., Eisenschlos, J. M., Aberer, K., & Lebret, R. (2025, July). Wikimixqa: a multimodal benchmark for question answering over tables and charts. In Findings of the Association for... | -| `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 843 | 9 | `zhu:2025` | MultiChartQA: Benchmarking vision-language models on multi-chart problems | Zhu, Z., Jia, M., Zhang, Z., Li, L., & Jiang, M. (2025, April). MultiChartQA: Benchmarking vision-language models on multi-chart problems. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter o... | | `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 845 | 10 | `antol:2015` | VQA: Visual Question Answering | Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C. L., & Parikh, D. (2015). VQA: Visual Question Answering. Proceedings of the IEEE International Conference on Computer Vision, 2425–2433. https://doi... 
| | `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 847 | 11 | `lau:2018` | A dataset of clinically generated visual questions and answers about radiology images | Lau, J. J., Gayen, S., Ben Abacha, A., & Demner-Fushman, D. (2018). A dataset of clinically generated visual questions and answers about radiology images. Scientific Data, 5, 180251. https://doi.org/10.1038/sdata.2018... | | `docs/zh/part12/ch41_visual_reasoning_tool_data_engineering.md` | 849 | 12 | `he:2020` | PathVQA: 30000+ Questions for Medical Visual Question Answering | He, X., Zhang, Y., Mou, L., Xing, E., & Xie, P. (2020). PathVQA: 30000+ Questions for Medical Visual Question Answering. arXiv:2003.10286. | @@ -132,27 +133,109 @@ | `docs/zh/part12/ch42_speech_audio_interaction_data_engineering.md` | 469 | 5 | `mittag:2021` | NISQA: A Deep CNN-Self-Attention Model for Multidimensional Speech Quality Prediction with Crowdsourced Datasets | Mittag G, Naderi B, Chehadi A, Möller S (2021) NISQA: A Deep CNN-Self-Attention Model for Multidimensional Speech Quality Prediction with Crowdsourced Datasets. In: Interspeech 2021, pp 2127-2131. | | `docs/zh/part12/ch42_speech_audio_interaction_data_engineering.md` | 471 | 6 | `song:2026` | S3Tokenizer: Reverse Engineering of Supervised Semantic Speech Tokenizer proposed in CosyVoice | Song X (2026) S3Tokenizer: Reverse Engineering of Supervised Semantic Speech Tokenizer proposed in CosyVoice. GitHub repository. https://github.com/xingchensong/S3Tokenizer. | | `docs/zh/part12/ch42_speech_audio_interaction_data_engineering.md` | 473 | 7 | `yang:2025` | Qwen3 Technical Report | Yang A, Li A, Yang B, Zhang B, Hui B, Zheng B, Yu B, Gao C, Huang C, Lv C, others (2025) Qwen3 Technical Report. arXiv preprint arXiv:2505.09388. | -| `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 452 | 1 | `wei:2022` | Chain-of-Thought Prompting Elicits Reasoning in Large Language Models | 1. 
Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. | +| `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 452 | 1 | `wei:2022` | Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022 | 1. Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F., Chi, E., Le, Q. V., & Zhou, D. (2022). Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. NeurIPS 2022. arXiv:2201.11903. | | `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 453 | 2 | `lightman:2023` | Let's Verify Step by Step | 2. Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., & Cobbe, K. (2023). Let's Verify Step by Step. arXiv:2305.20050. | | `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 454 | 3 | `yao:2023` | ReAct: Synergizing Reasoning and Acting in Language Models | 3. Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2023). ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629. | -| `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 455 | 4 | `deepseekai:2025` | DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning | 4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. | -| `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 456 | 5 | `hendrycks:2021` | Measuring Mathematical Problem Solving With the MATH Dataset | 5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. 
| +| `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 455 | 4 | `deepseekai:2025` | DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning | 4. DeepSeek-AI. (2025). DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948. | +| `docs/zh/part12/ch43_reasoning_trace_compression_data_engineering.md` | 456 | 5 | `hendrycks:2021` | Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021 | 5. Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring Mathematical Problem Solving With the MATH Dataset. NeurIPS 2021. arXiv:2103.03874. | | `docs/zh/part13/ch44_pretrain_recipes.md` | 284 | 1 | `bavarian:2022` | Efficient Training of Language Models to Fill in the Middle (FIM) | Bavarian M, Jun H, Tezak N, Schulman J, McLeavey C, Tworek J, Chen M (2022) Efficient Training of Language Models to Fill in the Middle (FIM). arXiv preprint arXiv:2207.14255. | | `docs/zh/part13/ch44_pretrain_recipes.md` | 288 | 3 | `broder:1997` | On the Resemblance and Containment of Documents | Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. | | `docs/zh/part13/ch44_pretrain_recipes.md` | 294 | 6 | `hoffmann:2022` | Training Compute-Optimal Large Language Models (Chinchilla) | Hoffmann J, Borgeaud S, Mensch A, Buchatskaya E, Cai T, Rutherford E, de Las Casas D, Hendricks L A, Welbl J, Clark A, others (2022) Training Compute-Optimal Large Language Models (Chinchilla). arXiv preprint arXiv:22... | | `docs/zh/part13/ch44_pretrain_recipes.md` | 304 | 11 | `sennrich:2016` | Neural Machine Translation of Rare Words with Subword Units (BPE) | Sennrich R, Haddow B, Birch A (2016) Neural Machine Translation of Rare Words with Subword Units (BPE). 
In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics, pp 1715-1725. | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 381 | 4 | `ethayarajh:2024` | Model Alignment as Prospect Theoretic Optimization | Ethayarajh K, Xu W, Muennighoff N, Jurafsky D, Kiela D (2024) Model Alignment as Prospect Theoretic Optimization. Proceedings of the 41st International Conference on Machine Learning, pp 12634-12651. | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 383 | 5 | `gheshlaghi:2024` | A General Theoretical Paradigm to Understand Learning from Human Preferences | Gheshlaghi Azar M, Guo Z D, Piot B, Munos R, Rowland M, Valko M, Calandriello D (2024) A General Theoretical Paradigm to Understand Learning from Human Preferences. Proceedings of the 27th International Conference on ... | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 389 | 8 | `yang:2025` | Qwen3 Technical Report | Yang A, Li A, Yang B, Zhang B, Hui B, Zheng B, Yu B, Gao C, Huang C, Lv C, others (2025) Qwen3 Technical Report. arXiv preprint arXiv:2505.09388. | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 395 | 11 | `xu:2025` | Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing | Xu Z, Jiang F, Niu L, Deng Y, Poovendran R, Choi Y, Lin B Y (2025) Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing. International Conference on Learning Representations. | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 397 | 12 | `liu:2024` | DeepSeek-V3 Technical Report | Liu A, Feng B, Xue B, Wang B, Wu B, Lu C, Zhao C, Deng C, Zhang C, Ruan C, others (2024a) DeepSeek-V3 Technical Report. arXiv preprint arXiv:2412.19437. | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 399 | 13 | `liu:2024` | Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs | Liu C Y, Zeng L, Liu J, Yan R, He J, Wang C, Yan S, Liu Y, Zhou Y (2024b) Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs. arXiv preprint arXiv:2410.18451. 
| -| `docs/zh/part13/ch45_posttrain_recipes.md` | 401 | 14 | `singhal:2024` | A Long Way to Go: Investigating Length Correlations in RLHF | Singhal P, Goyal T, Xu J, Durrett G (2024) A Long Way to Go: Investigating Length Correlations in RLHF. First Conference on Language Modeling. | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 405 | 16 | `zhou:2023` | Don't Make Your LLM an Evaluation Benchmark Cheater | Zhou K, Zhu Y, Chen Z, Chen W, Zhao W X, Chen X, Lin Y, Wen J-R, Han J (2023) Don't Make Your LLM an Evaluation Benchmark Cheater. arXiv preprint arXiv:2311.01964. | -| `docs/zh/part13/ch45_posttrain_recipes.md` | 409 | 18 | `lightman:2024` | Let's Verify Step by Step | Lightman H, Kosaraju V, Burda Y, Edwards H, Baker B, Lee T, Leike J, Schulman J, Sutskever I, Cobbe K (2024) Let's Verify Step by Step. International Conference on Learning Representations. | -| `docs/zh/part13/ch46_rl_reasoning_data.md` | 575 | 4 | `touvron:2023` | Llama 2: Open Foundation and Fine-Tuned Chat Models | Touvron H, Martin L, Stone K, Albert P, Almahairi A, Babaei Y, Bashlykov N, Batra S, Bhargava P, Bhosale S, others (2023) Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv preprint arXiv:2307.09288. | - -> 其余 107 条见 JSON 明细。 +| `docs/zh/part13/ch47_vlm_data_recipes.md` | 336 | 7 | `liu:2024` | Visual Instruction Tuning (LLaVA-1.5) | Liu H, Li C, Wu Q, Lee Y J (2024b) Visual Instruction Tuning (LLaVA-1.5). In: Advances in Neural Information Processing Systems 36. | +| `docs/zh/part13/ch47_vlm_data_recipes.md` | 344 | 11 | `radford:2021` | Learning Transferable Visual Models from Natural Language Supervision (CLIP) | Radford A, Kim J W, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, others (2021) Learning Transferable Visual Models from Natural Language Supervision (CLIP). In: Proceedings of the 38t... 
| +| `docs/zh/part13/ch47_vlm_data_recipes.md` | 350 | 14 | `yue:2024` | MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI | Yue X, Ni Y, Zhang K, Zheng T, Liu R, Zhang S, Stevens J, Jiang C, Zheng N, Sun T, others (2024) MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In: Proceedings of the... | +| `docs/zh/part13/ch48_t2i_t2v.md` | 403 | 1 | `pyscenedetect:2026` | PySceneDetect Documentation | PySceneDetect Contributors (2026) PySceneDetect Documentation. Available at: https://www.scenedetect.com/docs/latest/. | +| `docs/zh/part13/ch48_t2i_t2v.md` | 425 | 12 | `black:2025` | FLUX.1 Kontext [dev] Model Card | Black Forest Labs: FLUX.1 Kontext [dev] Model Card. Hugging Face model card (2025a). | +| `docs/zh/part13/ch48_t2i_t2v.md` | 427 | 13 | `black:2025` | FLUX.1 Kontext: Flow Matching for In-Context Image Generation and Editing in Latent Space | Black Forest Labs, Batifol, S., Blattmann, A., Boesel, F., Consul, S., Diagne, C., Dockhorn, T., English, J., English, Z., Esser, P., Kulal, S., Lacey, K., Levi, Y., Li, C., Lorenz, D., Müller, J., Podell, D., Rombach... | +| `docs/zh/part13/ch48_t2i_t2v.md` | 433 | 16 | `wanvideo:2025` | Wan2.2: Wan: Open and Advanced Large-Scale Video Generative Models | Wan-Video Team: Wan2.2: Wan: Open and Advanced Large-Scale Video Generative Models. GitHub repository and model documentation (2025). | +| `docs/zh/part13/ch48_t2i_t2v.md` | 441 | 20 | `thudm:2024` | CogVLM2-Caption for CogVideoX Training Data Captioning | THUDM: CogVLM2-Caption for CogVideoX Training Data Captioning. CogVideo official tool documentation (2024). | +| `docs/zh/part14/p01_mini_c4.md` | 1077 | 2 | `hugging:2026` | Datasets Documentation | 2. Hugging Face. (2026). Datasets Documentation. https://huggingface.co/docs/datasets/. | +| `docs/zh/part14/p01_mini_c4.md` | 1078 | 3 | `ray:2026` | Ray Data Documentation | 3. Ray Project. (2026). Ray Data Documentation. 
https://docs.ray.io/en/latest/data/data.html. | +| `docs/zh/part14/p01_mini_c4.md` | 1079 | 4 | `mlflow:2026` | MLflow Documentation | 4. MLflow Authors. (2026). MLflow Documentation. https://mlflow.org/docs/latest/. | +| `docs/zh/part14/p01_mini_c4.md` | 1080 | 5 | `great:2026` | Great Expectations Documentation | 5. Great Expectations Contributors. (2026). Great Expectations Documentation. https://docs.greatexpectations.io/. | +| `docs/zh/part14/p02_legal_sft.md` | 1217 | 2 | `hugging:2026` | Datasets Documentation | 2. Hugging Face. (2026). Datasets Documentation. https://huggingface.co/docs/datasets/. | +| `docs/zh/part14/p02_legal_sft.md` | 1218 | 3 | `ray:2026` | Ray Data Documentation | 3. Ray Project. (2026). Ray Data Documentation. https://docs.ray.io/en/latest/data/data.html. | +| `docs/zh/part14/p02_legal_sft.md` | 1219 | 4 | `mlflow:2026` | MLflow Documentation | 4. MLflow Authors. (2026). MLflow Documentation. https://mlflow.org/docs/latest/. | +| `docs/zh/part14/p02_legal_sft.md` | 1220 | 5 | `great:2026` | Great Expectations Documentation | 5. Great Expectations Contributors. (2026). Great Expectations Documentation. https://docs.greatexpectations.io/. | +| `docs/zh/part14/p04_synthetic_textbook.md` | 952 | 2 | `hugging:2026` | Datasets Documentation | 2. Hugging Face. (2026). Datasets Documentation. https://huggingface.co/docs/datasets/. | +| `docs/zh/part14/p04_synthetic_textbook.md` | 953 | 3 | `ray:2026` | Ray Data Documentation | 3. Ray Project. (2026). Ray Data Documentation. https://docs.ray.io/en/latest/data/data.html. | +| `docs/zh/part14/p04_synthetic_textbook.md` | 954 | 4 | `mlflow:2026` | MLflow Documentation | 4. MLflow Authors. (2026). MLflow Documentation. https://mlflow.org/docs/latest/. | +| `docs/zh/part14/p04_synthetic_textbook.md` | 955 | 5 | `great:2026` | Great Expectations Documentation | 5. Great Expectations Contributors. (2026). Great Expectations Documentation. https://docs.greatexpectations.io/. 
| +| `docs/zh/part14/p07_agent_tooluse.md` | 1169 | 4 | `owasp:2025` | OWASP Top 10 for Large Language Model Applications | 4. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/. | +| `docs/zh/part14/p07_agent_tooluse.md` | 1170 | 5 | `opentelemetry:2026` | OpenTelemetry Documentation | 5. OpenTelemetry Authors. (2026). OpenTelemetry Documentation. https://opentelemetry.io/docs/. | +| `docs/zh/part14/p08_dataops.md` | 1134 | 2 | `hugging:2026` | Datasets Documentation | 2. Hugging Face. (2026). Datasets Documentation. https://huggingface.co/docs/datasets/. | +| `docs/zh/part14/p08_dataops.md` | 1135 | 3 | `ray:2026` | Ray Data Documentation | 3. Ray Project. (2026). Ray Data Documentation. https://docs.ray.io/en/latest/data/data.html. | +| `docs/zh/part14/p08_dataops.md` | 1136 | 4 | `mlflow:2026` | MLflow Documentation | 4. MLflow Authors. (2026). MLflow Documentation. https://mlflow.org/docs/latest/. | +| `docs/zh/part14/p08_dataops.md` | 1137 | 5 | `great:2026` | Great Expectations Documentation | 5. Great Expectations Contributors. (2026). Great Expectations Documentation. https://docs.greatexpectations.io/. | +| `docs/zh/part14/p09_privacy_pipeline.md` | 1129 | 1 | `european:2016` | Regulation (EU) 2016/679: General Data Protection Regulation | 1. European Union. (2016). Regulation (EU) 2016/679: General Data Protection Regulation. https://eur-lex.europa.eu/eli/reg/2016/679/oj. | +| `docs/zh/part14/p09_privacy_pipeline.md` | 1133 | 5 | `owasp:2025` | OWASP Top 10 for Large Language Model Applications | 5. OWASP Foundation. (2025). OWASP Top 10 for Large Language Model Applications. https://genai.owasp.org/llm-top-10/. | +| `docs/zh/part14/p10_flywheel.md` | 1077 | 2 | `hugging:2026` | Datasets Documentation | 2. Hugging Face. (2026). Datasets Documentation. https://huggingface.co/docs/datasets/. | +| `docs/zh/part14/p10_flywheel.md` | 1078 | 3 | `ray:2026` | Ray Data Documentation | 3. Ray Project. 
(2026). Ray Data Documentation. https://docs.ray.io/en/latest/data/data.html. | +| `docs/zh/part14/p10_flywheel.md` | 1079 | 4 | `mlflow:2026` | MLflow Documentation | 4. MLflow Authors. (2026). MLflow Documentation. https://mlflow.org/docs/latest/. | +| `docs/zh/part14/p10_flywheel.md` | 1080 | 5 | `great:2026` | Great Expectations Documentation | 5. Great Expectations Contributors. (2026). Great Expectations Documentation. https://docs.greatexpectations.io/. | +| `docs/zh/part14/p11_mini_deepseek.md` | 524 | 1 | `broder:1997` | On the Resemblance and Containment of Documents | Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. | +| `docs/zh/part14/p14_video_generation.md` | 538 | 1 | `pyscenedetect:2026` | PySceneDetect Documentation | PySceneDetect Contributors (2026) PySceneDetect Documentation. Available at: https://www.scenedetect.com/docs/latest/. | +| `docs/zh/part14/p14_video_generation.md` | 544 | 4 | `farneback:2003` | Two-Frame Motion Estimation Based on Polynomial Expansion | Farnebäck G (2003) Two-Frame Motion Estimation Based on Polynomial Expansion. In: Proceedings of the 13th Scandinavian Conference on Image Analysis, pp 363-370. | +| `docs/zh/part14/p14_video_generation.md` | 546 | 5 | `pexels:2014` | Pexels: Free Stock Photos, Royalty Free Images & Videos | Pexels (2014) Pexels: Free Stock Photos, Royalty Free Images & Videos. Available at: https://www.pexels.com. | +| `docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md` | 908 | 5 | `dbt:2026` | dbt Documentation | 5. dbt Labs. (2026). dbt Documentation. https://docs.getdbt.com/. | +| `docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md` | 909 | 6 | `datagallery:2026` | DataGallery organization page | 6. DataGallery Contributors. (2026). DataGallery organization page. https://gitcode.com/datagallery. 
| +| `docs/zh/part14/p15_dataagent_semantic_nl2sql_agent.md` | 910 | 7 | `datagallery:2026` | DataAgent source repository | 7. DataGallery Contributors. (2026). DataAgent source repository. https://gitcode.com/datagallery/DataAgent. | +| `docs/zh/part2/ch04_data_sources.md` | 403 | 1 | `barbaresi:2021` | Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction | Barbaresi A (2021) Trafilatura: A Web Scraping Library and Command-Line Tool for Text Discovery and Extraction. In: Proceedings of the ACL-IJCNLP 2021 System Demonstrations, pp 122-131. | +| `docs/zh/part2/ch04_data_sources.md` | 409 | 4 | `joulin:2017` | FastText.zip: Compressing Text Classification Models | Joulin A, Grave E, Bojanowski P, Douze M, Jegou H, Mikolov T (2017) FastText.zip: Compressing Text Classification Models. arXiv preprint arXiv:1612.03651. | +| `docs/zh/part2/ch04_data_sources.md` | 412 | 5 | `lopez:2009` | GROBID: Combining Automatic Bibliographic Data Recognition and Term Extraction for Scholarship Publications | Lopez P (2009) GROBID: Combining Automatic Bibliographic Data Recognition and Term Extraction for Scholarship Publications. In: Proceedings of the 13th European Conference on Digital Libraries, pp 473-474. | +| `docs/zh/part2/ch05_cleaning_dedup.md` | 627 | 1 | `broder:1997` | On the Resemblance and Containment of Documents | Broder A Z (1997) On the Resemblance and Containment of Documents. In: Proceedings of the Compression and Complexity of Sequences, pp 21-29. https://doi.org/10.1109/sequen.1997.666900. | +| `docs/zh/part2/ch05_cleaning_dedup.md` | 629 | 2 | `heafield:2011` | KenLM: Faster and Smaller Language Model Queries | Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. 
| +| `docs/zh/part2/ch06_tokenization_loading.md` | 465 | 2 | `brown:2020` | Language Models are Few-Shot Learners | Brown T B, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D M, Wu J, Winter C, Hesse C, Chen ... | +| `docs/zh/part2/ch06_tokenization_loading.md` | 471 | 5 | `mosaic:2022` | MosaicML Streaming | Mosaic AI Research (2022) MosaicML Streaming. GitHub repository. . | +| `docs/zh/part2/ch07_data_operations.md` | 387 | 1 | `chen:2021` | Evaluating Large Language Models Trained on Code (HumanEval) | Chen M, Tworek J, Jun H, Yuan Q, Pinto H P d O, Kaplan J, Edwards H, Burda Y, Joseph N, Brockman G, Ray A, Puri R, Krueger G, Petrov M, Khlaaf H, Sastry G, Mishkin P, Chan B, Gray S, Ryder N, Pavlov M, Power A, Kaiser... | +| `docs/zh/part2/ch07_data_operations.md` | 393 | 4 | `heafield:2011` | KenLM: Faster and Smaller Language Model Queries | Heafield K (2011) KenLM: Faster and Smaller Language Model Queries. In: Proceedings of the Sixth Workshop on Statistical Machine Translation, pp 187-197. | +| `docs/zh/part3/ch08_multimodal_image.md` | 329 | 9 | `nvidia:2023` | NVIDIA Data Loading Library (DALI) | NVIDIA (2023) NVIDIA Data Loading Library (DALI). GitHub repository. . | +| `docs/zh/part3/ch08_multimodal_image.md` | 335 | 12 | `zhu:2023` | Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text | Zhu W, Hessel J, Awadalla A, Gadre S Y, Dodge J, Fang A, Yu Y, Schmidt L, Wang W Y, Choi Y (2023) Multimodal C4: An Open, Billion-scale Corpus of Images Interleaved with Text. Advances in Neural Information Processing... | +| `docs/zh/part3/ch08_multimodal_image.md` | 341 | 15 | `zhu:2023` | MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models | Zhu D, Chen J, Shen X, Li X, Elhoseiny M (2023) MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. 
arXiv preprint arXiv:2304.10592. | +| `docs/zh/part3/ch10_video_audio.md` | 360 | 8 | `zhang:2023` | Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding | Zhang H, Li X, Bing L (2023) Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. arXiv preprint arXiv:2306.02858. | +| `docs/zh/part3/ch11_cross_modal_alignment.md` | 365 | 3 | `rombach:2022` | High-Resolution Image Synthesis with Latent Diffusion Models | Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B (2022) High-Resolution Image Synthesis with Latent Diffusion Models. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pp 10684-1... | +| `docs/zh/part4/ch12_sft.md` | 726 | 24 | `moffatt:2024` | Civil Resolution Tribunal of British Columbia, 2024 BCCRT 149 | Moffatt v. Air Canada. 2024. *Civil Resolution Tribunal of British Columbia*, 2024 BCCRT 149. | +| `docs/zh/part4/ch12_sft.md` | 728 | 25 | `lifshitz:2024` | BC Tribunal Confirms Companies Remain Liable for Information Provided by AI Chatbot | Lifshitz, L. R., & Hung, R. 2024. BC Tribunal Confirms Companies Remain Liable for Information Provided by AI Chatbot. *ABA Business Law Today*. | +| `docs/zh/part4/ch13_preference.md` | 504 | 15 | `cohen:1960` | A coefficient of agreement for nominal scales | Cohen, J. (1960). A coefficient of agreement for nominal scales. *Educational and Psychological Measurement*, 20(1), 37–46. https://doi.org/10.1177/001316446002000104. | +| `docs/zh/part4/ch14_qa.md` | 670 | 15 | `cohen:1960` | A Coefficient of Agreement for Nominal Scales | Cohen, J. (1960). *A Coefficient of Agreement for Nominal Scales*. Educational and Psychological Measurement, 20(1), 37–46. DOI: 10.1177/001316446002000104. | +| `docs/zh/part4/ch14_qa.md` | 672 | 16 | `fleiss:1971` | Measuring Nominal Scale Agreement among Many Raters | Fleiss, J. L. (1971). *Measuring Nominal Scale Agreement among Many Raters*. Psychological Bulletin, 76(5), 378–382. 
DOI: 10.1037/h0031619. | +| `docs/zh/part4/ch14_qa.md` | 674 | 17 | `krippendorff:2004` | Reliability in Content Analysis: Some Common Misconceptions and Recommendations | Krippendorff, K. (2004). *Reliability in Content Analysis: Some Common Misconceptions and Recommendations*. Human Communication Research, 30(3), 411–433. DOI: 10.1111/j.1468-2958.2004.tb00738.x. | +| `docs/zh/part4/ch14_qa.md` | 678 | 19 | `settles:2009` | Active Learning Literature Survey | Settles, B. (2009). *Active Learning Literature Survey*. Computer Sciences Technical Report 1648, University of Wisconsin–Madison. | +| `docs/zh/part4/ch14_qa.md` | 682 | 21 | `thurstone:1927` | A Law of Comparative Judgment | Thurstone, L. L. (1927). *A Law of Comparative Judgment*. Psychological Review, 34(4), 273–286. DOI: 10.1037/h0070288. | +| `docs/zh/part6/ch18_cot.md` | 621 | 23 | `monperrus:2018` | Automatic Software Repair: A Bibliography | Monperrus, M. (2018). Automatic Software Repair: A Bibliography. ACM Computing Surveys, 51(1), Article 17. | +| `docs/zh/part6/ch20_agent.md` | 514 | 16 | `securities:2013` | In the Matter of Knight Capital Americas LLC: Order Instituting Administrative and Cease-and-Desist Proceedings, Release No | Securities and Exchange Commission. 2013. *In the Matter of Knight Capital Americas LLC: Order Instituting Administrative and Cease-and-Desist Proceedings, Release No. 70694*. U.S. Securities and Exchange Commission. | +| `docs/zh/part7/ch21_rag_pipeline.md` | 980 | 24 | `huyen:2022` | Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications | Huyen C (2022) Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications. O’Reilly Media. 
| +| `docs/zh/part7/ch22_multimodal_rag_visual_retrieval.md` | 583 | 20 | `huyen:2022` | Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications | Huyen C (2022) Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications. O’Reilly Media. | +| `docs/zh/part7/ch23_online_feedback_knowledge_update.md` | 688 | 8 | `huyen:2022` | Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications | Huyen C (2022) Designing Machine Learning Systems: An Iterative Process for Production-Ready Applications. O’Reilly Media. | +| `docs/zh/part7/ch23_online_feedback_knowledge_update.md` | 690 | 9 | `joachims:2002` | Optimizing Search Engines Using Clickthrough Data | Joachims T (2002) Optimizing Search Engines Using Clickthrough Data. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp 133–142. https://doi.org/10.1145/775066... | +| `docs/zh/part7/ch23_online_feedback_knowledge_update.md` | 706 | 17 | `settles:2009` | Active Learning Literature Survey | Settles B (2009) Active Learning Literature Survey. University of Wisconsin–Madison Computer Sciences Technical Report 1648. | +| `docs/zh/part8/ch24_dataops_flywheel_team.md` | 834 | 6 | `dama:2017` | DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition | DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications. | +| `docs/zh/part8/ch24_dataops_flywheel_team.md` | 836 | 7 | `dataops:2026` | | DataOps Manifesto (accessed 2026) The DataOps Manifesto: 18 DataOps Principles. Online manifesto. Available at: https://dataopsmanifesto.org/en/. | +| `docs/zh/part8/ch24_dataops_flywheel_team.md` | 838 | 8 | `dehghani:2022` | Data Mesh: Delivering Data-Driven Value at Scale | Dehghani Z (2022) Data Mesh: Delivering Data-Driven Value at Scale. O'Reilly Media. 
| +| `docs/zh/part8/ch24_dataops_flywheel_team.md` | 854 | 16 | `project:2021` | A Guide to the Project Management Body of Knowledge (PMBOK Guide), 7th Edition | Project Management Institute (2021) A Guide to the Project Management Body of Knowledge (PMBOK Guide), 7th Edition. Project Management Institute. | +| `docs/zh/part8/ch25_data_versioning_experiment_tracking.md` | 670 | 7 | `dama:2017` | DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition | DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications. | +| `docs/zh/part8/ch25_data_versioning_experiment_tracking.md` | 672 | 8 | `dvc:2024` | Data Version Control Documentation | DVC Documentation (2024) Data Version Control Documentation. Available at: https://dvc.org/doc. | +| `docs/zh/part8/ch25_data_versioning_experiment_tracking.md` | 682 | 13 | `peng:2011` | Reproducible Research in Computational Science | Peng R D (2011) Reproducible Research in Computational Science. Science 334(6060):1226-1227. https://doi.org/10.1126/science.1213847. | +| `docs/zh/part8/ch26_data_platform_observability.md` | 701 | 9 | `kleppmann:2017` | Designing Data-Intensive Applications | Kleppmann M (2017) Designing Data-Intensive Applications. O'Reilly Media. | +| `docs/zh/part8/ch26_data_platform_observability.md` | 707 | 12 | `nygard:2018` | Release It!: Design and Deploy Production-Ready Software, 2nd Edition | Nygard M T (2018) Release It!: Design and Deploy Production-Ready Software, 2nd Edition. Pragmatic Bookshelf. | +| `docs/zh/part8/ch26_data_platform_observability.md` | 711 | 14 | `opentelemetry:2024` | OpenTelemetry Specification | OpenTelemetry Authors (2024) OpenTelemetry Specification. Available at: https://opentelemetry.io/docs/specs/. | +| `docs/zh/part8/ch26_data_platform_observability.md` | 721 | 19 | `turnbull:2014` | The Art of Monitoring | Turnbull J (2014) The Art of Monitoring. James Turnbull. 
| +| `docs/zh/part9/ch27_data_catalog_and_metadata_governance.md` | 598 | 5 | `dama:2017` | DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition | DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications, Basking Ridge. | +| `docs/zh/part9/ch27_data_catalog_and_metadata_governance.md` | 608 | 10 | `herschel:2017` | A survey on provenance: What for? What form? What from? The VLDB Journal 26(6):881–906 | Herschel M, Diestelkämper R, Ben Lahmar H (2017) A survey on provenance: What for? What form? What from? The VLDB Journal 26(6):881–906. | +| `docs/zh/part9/ch28_data_productization_and_data_contracts.md` | 312 | 3 | `dehghani:2022` | Data Mesh: Delivering Data-Driven Value at Scale | Dehghani Z (2022) Data Mesh: Delivering Data-Driven Value at Scale. O'Reilly Media, Sebastopol. | +| `docs/zh/part9/ch28_data_productization_and_data_contracts.md` | 316 | 5 | `kleppmann:2017` | Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems | Kleppmann M (2017) Designing Data-Intensive Applications: The Big Ideas Behind Reliable, Scalable, and Maintainable Systems. O'Reilly Media, Sebastopol. | +| `docs/zh/part9/ch28_data_productization_and_data_contracts.md` | 328 | 11 | `redman:1998` | The Impact of Poor Data Quality on the Typical Enterprise | Redman T C (1998) The Impact of Poor Data Quality on the Typical Enterprise. Communications of the ACM 41(2):79–82. | +| `docs/zh/part9/ch29_data_valuation_and_reuse.md` | 685 | 9 | `laney:2017` | Infonomics: How to Monetize, Manage, and Measure Information as an Asset for Competitive Advantage | Laney D B (2017) Infonomics: How to Monetize, Manage, and Measure Information as an Asset for Competitive Advantage. Routledge, New York. 
| +| `docs/zh/part9/ch29_data_valuation_and_reuse.md` | 695 | 14 | `pei:2022` | A Survey on Data Pricing: From Economics to Data Science | Pei J (2022) A Survey on Data Pricing: From Economics to Data Science. IEEE Transactions on Knowledge and Data Engineering 34(10):4586–4608. | +| `docs/zh/part9/ch29_data_valuation_and_reuse.md` | 701 | 17 | `settles:2009` | Active Learning Literature Survey | Settles B (2009) Active Learning Literature Survey. Computer Sciences Technical Report 1648, University of Wisconsin–Madison. | +| `docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md` | 535 | 1 | `abraham:2019` | Data governance: A conceptual framework, structured review, and research agenda | Abraham R, Schneider J, vom Brocke J (2019) Data governance: A conceptual framework, structured review, and research agenda. International Journal of Information Management 49:424-438. | +| `docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md` | 539 | 3 | `dama:2017` | DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition | DAMA International (2017) DAMA-DMBOK: Data Management Body of Knowledge, 2nd Edition. Technics Publications. | +| `docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md` | 541 | 4 | `dehghani:2022` | Data Mesh: Delivering Data-Driven Value at Scale | Dehghani Z (2022) Data Mesh: Delivering Data-Driven Value at Scale. O'Reilly Media. | +| `docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md` | 551 | 9 | `ladley:2019` | Data Governance: How to Design, Deploy, and Sustain an Effective Data Governance Program, 2nd Edition | Ladley J (2019) Data Governance: How to Design, Deploy, and Sustain an Effective Data Governance Program, 2nd Edition. Academic Press. 
| +| `docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md` | 553 | 10 | `laney:2017` | Infonomics: How to Monetize, Manage, and Measure Information as an Asset for Competitive Advantage | Laney D B (2017) Infonomics: How to Monetize, Manage, and Measure Information as an Asset for Competitive Advantage. Routledge, New York. | +| `docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md` | 561 | 14 | `otto:2011` | Data Governance | Otto B (2011) Data Governance. Business & Information Systems Engineering 3(4):241-244. https://doi.org/10.1002/9781118269053.ch4. | +| `docs/zh/part9/ch30_internal_data_market_and_sharing_governance.md` | 571 | 19 | `weber:2009` | One Size Does Not Fit All: A Contingency Approach to Data Governance | Weber K, Otto B, Österle H (2009) One Size Does Not Fit All: A Contingency Approach to Data Governance. ACM Journal of Data and Information Quality 1(1):4. | ## 外部核验问题条目 @@ -163,5 +246,4 @@ | 问题 | 数量 | | --- | ---: | -| `missing-doi-arxiv-url` | 614 | -| `missing-first-author` | 2 | +| `missing-doi-arxiv-url` | 379 | diff --git a/publishing/final_review/style_report.md b/publishing/final_review/style_report.md index d3787dc3..8b53c0b9 100644 --- a/publishing/final_review/style_report.md +++ b/publishing/final_review/style_report.md @@ -251,24 +251,24 @@ ## docs/zh/part13/ch45_posttrain_recipes.md - `37` `rhetorical-not-but` `不是 SFT 之后的简单补充,而是`:* 偏好对齐(Preference Alignment)不是 SFT 之后的简单补充,而是一层独立的数据生产与评审机制。 -- `56` `rhetorical-not-but` `不是后训练的终点,而是`:模型发布上线并不是后训练的终点,而是起点。第三层决定了模型能否随着真实业务演进而自我修复。 -- `65` `weak-booster` `很容易`:从工程落地角度看,三段论还意味着三类完全不同的数据资产管理方式。SFT 数据更像“行为模板库”,它需要稳定、干净、覆盖常见任务,并且字段结构尽量简单。团队通常会围绕 `messages`、`instruction`、`input`、`... -- `77` `rhetorical-not-but` `不是评价谁的榜单得分更高,而是`:在构建自己的后训练管线前,我们需要横向比较当前主流开源模型的公开路线。本节选取 Tülu-3、Llama-3、Qwen2.5 与 Nemotron-4 作为四类代表路线进行剖析。我们的核心不是评价谁的榜单得分更高,而是建立“公开信息如... 
-- `145` `weak-booster` `真正`:这四道门禁最好以“自动过滤 + 人工抽检”组合实现。自动过滤适合处理格式错误、重复、长度异常、低质量模板、敏感词和明显安全问题;人工抽检适合判断指令是否自然、答案是否真正有帮助、复杂任务是否保持了原始意图。尤其在 Evol-Instr... -- `147` `rhetorical-not-but` `不是完全复制某个比例,而是`:SFT 数据还需要分层配比,而不是简单混合。建议至少拆成六类:通用问答、知识解释、复杂指令遵循、代码与工具、数学与推理、安全与拒答。每一类都要单独统计数量、平均长度、来源、过滤率和抽检通过率。对于开源模型 recipe 的复现来说,最... -- `149` `weak-booster` `很容易`:还要注意,SFT 阶段的好数据不一定适合所有训练轮次。第一轮 SFT 更适合使用结构清晰、回答稳定、覆盖面广的数据,帮助模型建立基础助手行为。后续增量 SFT 则更适合加入 hard cases、领域任务、工具调用和安全边界修复数据。... -- `157` `rhetorical-not-but` `不是“生成样本”,而是`:这一层的任务已经不是“生成样本”,而是“构造偏好/奖励信号”。也就是说,上一节的 SFT 合成方法与本节的 RLHF、DPO、GRPO、RLVR 属于不同层级,不能写成同一种数据工程动作。 +- `58` `rhetorical-not-but` `不是后训练的终点,而是`:模型发布上线并不是后训练的终点,而是起点。第三层决定了模型能否随着真实业务演进而自我修复。 +- `67` `weak-booster` `很容易`:从工程落地角度看,三段论还意味着三类完全不同的数据资产管理方式。SFT 数据更像“行为模板库”,它需要稳定、干净、覆盖常见任务,并且字段结构尽量简单。团队通常会围绕 `messages`、`instruction`、`input`、`... +- `79` `rhetorical-not-but` `不是评价谁的榜单得分更高,而是`:在构建自己的后训练管线前,我们需要横向比较当前主流开源模型的公开路线。本节选取 Tülu-3、Llama-3、Qwen2.5 与 Nemotron-4 作为四类代表路线进行剖析。我们的核心不是评价谁的榜单得分更高,而是建立“公开信息如... +- `147` `weak-booster` `真正`:这四道门禁最好以“自动过滤 + 人工抽检”组合实现。自动过滤适合处理格式错误、重复、长度异常、低质量模板、敏感词和明显安全问题;人工抽检适合判断指令是否自然、答案是否真正有帮助、复杂任务是否保持了原始意图。尤其在 Evol-Instr... +- `149` `rhetorical-not-but` `不是完全复制某个比例,而是`:SFT 数据还需要分层配比,而不是简单混合。建议至少拆成六类:通用问答、知识解释、复杂指令遵循、代码与工具、数学与推理、安全与拒答。每一类都要单独统计数量、平均长度、来源、过滤率和抽检通过率。对于开源模型 recipe 的复现来说,最... +- `153` `weak-booster` `很容易`:还要注意,SFT 阶段的好数据不一定适合所有训练轮次。第一轮 SFT 更适合使用结构清晰、回答稳定、覆盖面广的数据,帮助模型建立基础助手行为。后续增量 SFT 则更适合加入 hard cases、领域任务、工具调用和安全边界修复数据。... +- `161` `rhetorical-not-but` `不是“生成样本”,而是`:这一层的任务已经不是“生成样本”,而是“构造偏好/奖励信号”。也就是说,上一节的 SFT 合成方法与本节的 RLHF、DPO、GRPO、RLVR 属于不同层级,不能写成同一种数据工程动作。 - 其余 11 条见 JSON 明细。 ## docs/zh/part13/ch46_rl_reasoning_data.md - `39` `weak-booster` `真正`:在早期指令微调阶段,团队经常把推理能力理解为“给模型更多带步骤的答案”。例如,对数学题写出详细解法,对代码题写出逐步分析,对逻辑题写出推导链。这样的数据确实可以让模型学会“像在推理一样回答”,但它有一个天然限制:模型只是在模仿已经写好... 
-- `55` `rhetorical-not-but` `不是强化学习算法本身,而是`:因此,本章讨论的不是强化学习算法本身,而是 RL 范式下的数据工程问题:任务从哪里来,验证器如何写,采样轨迹如何存,哪些轨迹进入二轮 SFT,哪些失败轨迹进入 hard case 池,以及如何防止模型在奖励信号上产生投机行为。 -- `57` `blog-transition` `换句话说`:在这个范式中,数据工程师面对的对象也发生了变化。过去一条样本的主要边界是 prompt 和 answer;现在一条样本可能对应一个任务族、一组采样参数、若干条候选轨迹、多个验证器输出、一次人工审计结论和后续训练去向。换句话说,推理数据... -- `57` `rhetorical-not-but` `不是“更长的回答”,而是`:在这个范式中,数据工程师面对的对象也发生了变化。过去一条样本的主要边界是 prompt 和 answer;现在一条样本可能对应一个任务族、一组采样参数、若干条候选轨迹、多个验证器输出、一次人工审计结论和后续训练去向。换句话说,推理数据... -- `69` `rhetorical-not-but` `不是线性的一次性流程,而是`:R1 风格推理数据飞轮可以拆成四个阶段:冷启动 SFT、大规模 RL、拒绝采样、二轮 SFT。这四个阶段不是线性的一次性流程,而是可以反复运行的闭环。 -- `76` `rhetorical-not-but` `不是把模型训练成高性能推理模型,而是`:冷启动 SFT 的目标不是把模型训练成高性能推理模型,而是让模型具备可读、稳定、可解析的推理输出格式。这个阶段通常需要少量高质量 Long-CoT 样本,覆盖数学、代码、逻辑题、格式遵循和必要的通用问答。 -- `90` `rhetorical-not-but` `不是为了制造冗长,而是`:冷启动阶段最容易出现的误区,是把样本写得过于“完美”。真实 RL 后的推理轨迹通常包含试探、检查、回看条件和修正,而人工冷启动样本如果只呈现线性推导,模型会学到一种过分整齐的解释风格。这样的风格在简单题上可读性很好,但在复杂题上可能缺... -- `92` `weak-booster` `真正`:冷启动数据还要控制“答案泄漏”。在很多合成数据中,生成器先知道标准答案,再倒写推理过程,容易出现步骤与结论强绑定的问题。模型学到这类样本后,可能在没有真正推理的情况下直接靠模式猜答案。更稳妥的做法是保留题目、标准答案和推理过程之间的检... +- `57` `rhetorical-not-but` `不是强化学习算法本身,而是`:因此,本章讨论的不是强化学习算法本身,而是 RL 范式下的数据工程问题:任务从哪里来,验证器如何写,采样轨迹如何存,哪些轨迹进入二轮 SFT,哪些失败轨迹进入 hard case 池,以及如何防止模型在奖励信号上产生投机行为。 +- `59` `blog-transition` `换句话说`:在这个范式中,数据工程师面对的对象也发生了变化。过去一条样本的主要边界是 prompt 和 answer;现在一条样本可能对应一个任务族、一组采样参数、若干条候选轨迹、多个验证器输出、一次人工审计结论和后续训练去向。换句话说,推理数据... +- `59` `rhetorical-not-but` `不是“更长的回答”,而是`:在这个范式中,数据工程师面对的对象也发生了变化。过去一条样本的主要边界是 prompt 和 answer;现在一条样本可能对应一个任务族、一组采样参数、若干条候选轨迹、多个验证器输出、一次人工审计结论和后续训练去向。换句话说,推理数据... +- `71` `rhetorical-not-but` `不是线性的一次性流程,而是`:R1 风格推理数据飞轮可以拆成四个阶段:冷启动 SFT、大规模 RL、拒绝采样、二轮 SFT。这四个阶段不是线性的一次性流程,而是可以反复运行的闭环。 +- `78` `rhetorical-not-but` `不是把模型训练成高性能推理模型,而是`:冷启动 SFT 的目标不是把模型训练成高性能推理模型,而是让模型具备可读、稳定、可解析的推理输出格式。这个阶段通常需要少量高质量 Long-CoT 样本,覆盖数学、代码、逻辑题、格式遵循和必要的通用问答。 +- `94` `rhetorical-not-but` `不是为了制造冗长,而是`:冷启动阶段最容易出现的误区,是把样本写得过于“完美”。真实 RL 后的推理轨迹通常包含试探、检查、回看条件和修正,而人工冷启动样本如果只呈现线性推导,模型会学到一种过分整齐的解释风格。这样的风格在简单题上可读性很好,但在复杂题上可能缺... 
+- `96` `weak-booster` `真正`:冷启动数据还要控制“答案泄漏”。在很多合成数据中,生成器先知道标准答案,再倒写推理过程,容易出现步骤与结论强绑定的问题。模型学到这类样本后,可能在没有真正推理的情况下直接靠模式猜答案。更稳妥的做法是保留题目、标准答案和推理过程之间的检... - 其余 15 条见 JSON 明细。 ## docs/zh/part13/ch47_vlm_data_recipes.md diff --git a/scripts/reference_integrity_audit.py b/scripts/reference_integrity_audit.py index 3fae2393..062509e2 100644 --- a/scripts/reference_integrity_audit.py +++ b/scripts/reference_integrity_audit.py @@ -49,8 +49,8 @@ AUTHOR_TOKEN_RE = r"[A-Z][A-Za-zÀ-ÖØ-öø-ÿ'’`.-]+" CITATION_PATTERNS = [ re.compile(r"\b(National Institute of Standards and Technology)\s*\(?((?:19|20)\d{2})\)?"), - re.compile(r"\b(Nait\s+Saada|Jimeno\s+Yepes|Ortiz\s+Suárez)\s+et\s+al\.\s*\(?((?:19|20)\d{2})\)?"), - re.compile(r"\b(Kimi\s+Team|Qwen\s+Team|Gemini\s+Team|Open-Sora\s+Team|Wan\s+Team)\s*\(?((?:19|20)\d{2})\)?"), + re.compile(r"\b(Gheshlaghi\s+Azar|Nait\s+Saada|Jimeno\s+Yepes|Ortiz\s+Suárez)\s+et\s+al\.\s*\(?((?:19|20)\d{2})\)?"), + re.compile(r"\b(DeepSeek-AI|Kimi\s+Team|Qwen\s+Team|Gemini\s+Team|Open-Sora\s+Team|Wan\s+Team)\s*\(?((?:19|20)\d{2})\)?"), re.compile(rf"\b({AUTHOR_TOKEN_RE}),\s+{AUTHOR_TOKEN_RE}\s+et\s+al\.\s*\(?((?:19|20)\d{{2}})\)?"), re.compile(rf"\b({AUTHOR_TOKEN_RE})\s+et\s+al\.\s*\(?((?:19|20)\d{{2}})\)?"), re.compile(rf"\b({AUTHOR_TOKEN_RE}),\s+{AUTHOR_TOKEN_RE}\s+and\s+{AUTHOR_TOKEN_RE}\s*\(?((?:19|20)\d{{2}})\)?"), @@ -262,6 +262,8 @@ def first_author_from_entry(entry: str) -> str: entry = re.sub(r"^\d+\.\s*", "", entry).strip() if entry.startswith("National Institute of Standards and Technology"): return "NIST" + if entry.startswith("Gheshlaghi Azar"): + return "Gheshlaghi Azar" if entry.startswith("Nait Saada"): return "Nait Saada" if entry.startswith("Jimeno Yepes"): @@ -270,6 +272,8 @@ def first_author_from_entry(entry: str) -> str: return "Ortiz Suárez" if entry.startswith("Team Kimi"): return "Kimi Team" + if entry.startswith("DeepSeek-AI"): + return "DeepSeek-AI" for team_name in ("Qwen Team", "Gemini Team", "Open-Sora Team", "Wan Team"): if 
entry.startswith(team_name): return team_name