diff --git a/demos/Makefile b/demos/Makefile index 73a0f30..2105bd9 100644 --- a/demos/Makefile +++ b/demos/Makefile @@ -25,3 +25,6 @@ batch-download: batch-list: uv run mp batch list $(PROJECT_NAME) + +run-notebooks: + ./run_notebooks.sh diff --git a/demos/__init__.py b/demos/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/demos/notebooks/__init__.py b/demos/notebooks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/demos/notebooks/banking/Banking Credit Scoring/credit_scoring_ai_act.ipynb b/demos/notebooks/banking/Banking Credit Scoring/credit_scoring_ai_act.ipynb index 91e7cde..8766317 100644 --- a/demos/notebooks/banking/Banking Credit Scoring/credit_scoring_ai_act.ipynb +++ b/demos/notebooks/banking/Banking Credit Scoring/credit_scoring_ai_act.ipynb @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "a1b2c3d4-0007", "metadata": { "ExecuteTime": { @@ -122,25 +122,8 @@ "start_time": "2026-03-16T19:11:40.034779Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MLflow tracking URI : http://model-platform.com/registry/test-ia-act/\n" - ] - } - ], - "source": [ - "# Author: Octo Technology MLOps Tribe\n", - "# \u2699\ufe0f Adapter l'URI \u00e0 votre projet Model Platform\n", - "PROJECT_NAME = \"test-ia-act\"\n", - "MLFLOW_TRACKING_URI = f\"http://model-platform.com/registry/{PROJECT_NAME}/\"\n", - "\n", - "mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n", - "mlflow.set_experiment(\"credit_default_prediction\")\n", - "print(f\"MLflow tracking URI : {MLFLOW_TRACKING_URI}\")" - ] + "outputs": [], + "source": "# Author: Octo Technology MLOps Tribe\n# \u2699\ufe0f Adapter l'URI \u00e0 votre projet Model Platform\nPROJECT_NAME = \"Credit-Risk-Assessment\"\nMLFLOW_TRACKING_URI = f\"http://model-platform.com/registry/{PROJECT_NAME}/\"\n\nmlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\nmlflow.set_experiment(\"credit_default_prediction\")\nprint(f\"MLflow tracking URI : {MLFLOW_TRACKING_URI}\")" }, { "cell_type": "markdown", diff --git a/demos/notebooks/medical/Medical Document NLP/document_type_classifier_ai_act.ipynb b/demos/notebooks/medical/Medical Document NLP/document_type_classifier_ai_act.ipynb index 73ca321..5d11966 100644 --- a/demos/notebooks/medical/Medical Document NLP/document_type_classifier_ai_act.ipynb +++ b/demos/notebooks/medical/Medical Document NLP/document_type_classifier_ai_act.ipynb @@ -2,65 +2,148 @@ "cells": [ { "cell_type": "markdown", + "id": "3f33ab220d776fe6", "metadata": {}, "source": "# Classification du Type de Documents Medicaux \u2014 Modele a Risque Minimal (AI Act)\n\n**Risque minimal** \u2014 Outil de tri documentaire sans impact sur les soins" }, { "cell_type": "markdown", + "id": "295e4f1b41a71a97", "metadata": {}, "source": "## 0. Dependances optionnelles" }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "id": "2312440a51bc99cd", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-23T10:38:54.841640Z", + "start_time": "2026-03-23T10:38:45.626626Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/philippe.stepniewski/projets/model_platform/.venv/bin/python: No module named pip\r\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": "# Author: Octo Technology MLOps Tribe\n%pip install shap pandera --quiet" }, { "cell_type": "markdown", + "id": "1ab4457cda4aa7d4", "metadata": {}, "source": "## 1. Imports" }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "id": "acd1272b719f581e", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-23T10:39:28.627459Z", + "start_time": "2026-03-23T10:38:54.843701Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repertoire artefacts temporaires : /var/folders/k0/jqyr0y117r5gkrs84q5rb43c0000gn/T/tmpfrlg_lui\n" + ] + } + ], "source": "# Author: Octo Technology MLOps Tribe\nimport json\nimport tempfile\nimport warnings\nfrom pathlib import Path\n\nimport matplotlib.pyplot as plt\nimport mlflow\nimport mlflow.sklearn\nimport numpy as np\nimport pandas as pd\nimport pandera as pa\nfrom mlflow.models.signature import infer_signature\nfrom sklearn.svm import LinearSVC\nfrom sklearn.calibration import CalibratedClassifierCV\nfrom sklearn.metrics import (\n accuracy_score, classification_report, f1_score,\n precision_score, recall_score, roc_auc_score, roc_curve,\n)\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom scipy.sparse import issparse\n\nwarnings.filterwarnings(\"ignore\")\nARTIFACTS_DIR = Path(tempfile.mkdtemp())\nprint(f\"Repertoire artefacts temporaires : {ARTIFACTS_DIR}\")" }, { "cell_type": "markdown", + "id": "2baf034a9ece137d", "metadata": {}, "source": "## 2. Configuration MLflow" }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "id": "8d80286c84b8d737", + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-23T10:40:32.268349Z", + "start_time": "2026-03-23T10:39:28.715599Z" + } + }, + "outputs": [ + { + "ename": "MlflowException", + "evalue": "API request to http://model-platform.com/registry/Medical-Document-NLP/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='model-platform.com', port=80): Max retries exceeded with url: /registry/Medical-Document-NLP/api/2.0/mlflow/experiments/get-by-name?experiment_name=medical_document_classification (Caused by ResponseError('too many 502 error responses'))", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mResponseError\u001b[39m Traceback (most recent call last)", + "\u001b[31mResponseError\u001b[39m: too many 502 error responses", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[31mMaxRetryError\u001b[39m Traceback (most recent call last)", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/requests/adapters.py:644\u001b[39m, in \u001b[36mHTTPAdapter.send\u001b[39m\u001b[34m(self, request, stream, timeout, verify, cert, proxies)\u001b[39m\n\u001b[32m 643\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m644\u001b[39m resp = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 647\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 650\u001b[39m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 651\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 652\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 653\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 654\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 655\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 656\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 658\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/urllib3/connectionpool.py:942\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 941\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mRetry: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, url)\n\u001b[32m--> \u001b[39m\u001b[32m942\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 943\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 944\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 945\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 946\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 947\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 948\u001b[39m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m=\u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 949\u001b[39m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m=\u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 950\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 951\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 952\u001b[39m \u001b[43m \u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 953\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 954\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 955\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 956\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 957\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 958\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m response\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/urllib3/connectionpool.py:942\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 941\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mRetry: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, url)\n\u001b[32m--> \u001b[39m\u001b[32m942\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 943\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 944\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 945\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 946\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 947\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 948\u001b[39m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m=\u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 949\u001b[39m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m=\u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 950\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 951\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 952\u001b[39m \u001b[43m \u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 953\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 954\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 955\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 956\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 957\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 958\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m response\n", + " \u001b[31m[... skipping similar frames: HTTPConnectionPool.urlopen at line 942 (2 times)]\u001b[39m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/urllib3/connectionpool.py:942\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 941\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mRetry: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m, url)\n\u001b[32m--> \u001b[39m\u001b[32m942\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 943\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 944\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 945\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 946\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 947\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 948\u001b[39m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m=\u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 949\u001b[39m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m=\u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 950\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 951\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpool_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 952\u001b[39m \u001b[43m \u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrelease_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 953\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 954\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbody_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 955\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 956\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 957\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 958\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m response\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/urllib3/connectionpool.py:932\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 931\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m932\u001b[39m retries = \u001b[43mretries\u001b[49m\u001b[43m.\u001b[49m\u001b[43mincrement\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_pool\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 933\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m MaxRetryError:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/urllib3/util/retry.py:535\u001b[39m, in \u001b[36mRetry.increment\u001b[39m\u001b[34m(self, method, url, response, error, _pool, _stacktrace)\u001b[39m\n\u001b[32m 534\u001b[39m reason = error \u001b[38;5;129;01mor\u001b[39;00m ResponseError(cause)\n\u001b[32m--> \u001b[39m\u001b[32m535\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m MaxRetryError(_pool, url, reason) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mreason\u001b[39;00m \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[32m 537\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mIncremented Retry for (url=\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m): \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[33m\"\u001b[39m, url, new_retry)\n", + "\u001b[31mMaxRetryError\u001b[39m: HTTPConnectionPool(host='model-platform.com', port=80): Max retries exceeded with url: /registry/Medical-Document-NLP/api/2.0/mlflow/experiments/get-by-name?experiment_name=medical_document_classification (Caused by ResponseError('too many 502 error responses'))", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[31mRetryError\u001b[39m Traceback (most recent call last)", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/utils/rest_utils.py:189\u001b[39m, in \u001b[36mhttp_request\u001b[39m\u001b[34m(host_creds, endpoint, method, max_retries, backoff_factor, backoff_jitter, extra_headers, retry_codes, timeout, raise_on_status, respect_retry_after_header, **kwargs)\u001b[39m\n\u001b[32m 188\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m189\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_get_http_response_with_retries\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 190\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 191\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 192\u001b[39m \u001b[43m \u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 193\u001b[39m \u001b[43m \u001b[49m\u001b[43mbackoff_factor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 194\u001b[39m \u001b[43m \u001b[49m\u001b[43mbackoff_jitter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 195\u001b[39m \u001b[43m \u001b[49m\u001b[43mretry_codes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 196\u001b[39m \u001b[43m \u001b[49m\u001b[43mraise_on_status\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 197\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 198\u001b[39m \u001b[43m \u001b[49m\u001b[43mverify\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhost_creds\u001b[49m\u001b[43m.\u001b[49m\u001b[43mverify\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 199\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 200\u001b[39m \u001b[43m \u001b[49m\u001b[43mrespect_retry_after_header\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrespect_retry_after_header\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 201\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 202\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 203\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m requests.exceptions.Timeout \u001b[38;5;28;01mas\u001b[39;00m to:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/utils/request_utils.py:237\u001b[39m, in \u001b[36m_get_http_response_with_retries\u001b[39m\u001b[34m(method, url, max_retries, backoff_factor, backoff_jitter, retry_codes, raise_on_status, allow_redirects, respect_retry_after_header, **kwargs)\u001b[39m\n\u001b[32m 235\u001b[39m allow_redirects = env_value \u001b[38;5;28;01mif\u001b[39;00m allow_redirects \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m allow_redirects\n\u001b[32m--> \u001b[39m\u001b[32m237\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallow_redirects\u001b[49m\u001b[43m=\u001b[49m\u001b[43mallow_redirects\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/requests/sessions.py:589\u001b[39m, in \u001b[36mSession.request\u001b[39m\u001b[34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[39m\n\u001b[32m 588\u001b[39m send_kwargs.update(settings)\n\u001b[32m--> \u001b[39m\u001b[32m589\u001b[39m resp = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 591\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/requests/sessions.py:703\u001b[39m, in \u001b[36mSession.send\u001b[39m\u001b[34m(self, request, **kwargs)\u001b[39m\n\u001b[32m 702\u001b[39m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m703\u001b[39m r = \u001b[43madapter\u001b[49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 705\u001b[39m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/requests/adapters.py:668\u001b[39m, in \u001b[36mHTTPAdapter.send\u001b[39m\u001b[34m(self, request, stream, timeout, verify, cert, proxies)\u001b[39m\n\u001b[32m 667\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e.reason, ResponseError):\n\u001b[32m--> \u001b[39m\u001b[32m668\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m RetryError(e, request=request)\n\u001b[32m 670\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e.reason, _ProxyError):\n", + "\u001b[31mRetryError\u001b[39m: HTTPConnectionPool(host='model-platform.com', port=80): Max retries exceeded with url: /registry/Medical-Document-NLP/api/2.0/mlflow/experiments/get-by-name?experiment_name=medical_document_classification (Caused by ResponseError('too many 502 error responses'))", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[31mMlflowException\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 3\u001b[39m MLFLOW_TRACKING_URI = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mhttp://model-platform.com/registry/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mPROJECT_NAME\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 5\u001b[39m mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43mmlflow\u001b[49m\u001b[43m.\u001b[49m\u001b[43mset_experiment\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmedical_document_classification\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMLflow tracking URI : \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mMLFLOW_TRACKING_URI\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/tracking/fluent.py:157\u001b[39m, in \u001b[36mset_experiment\u001b[39m\u001b[34m(experiment_name, experiment_id)\u001b[39m\n\u001b[32m 155\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _experiment_lock:\n\u001b[32m 156\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m experiment_id \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m157\u001b[39m experiment = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_experiment_by_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexperiment_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 158\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m experiment:\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/tracking/client.py:1257\u001b[39m, in \u001b[36mMlflowClient.get_experiment_by_name\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 1225\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_experiment_by_name\u001b[39m(\u001b[38;5;28mself\u001b[39m, name: \u001b[38;5;28mstr\u001b[39m) -> Optional[Experiment]:\n\u001b[32m 1226\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Retrieve an experiment by experiment name from the backend store\u001b[39;00m\n\u001b[32m 1227\u001b[39m \n\u001b[32m 1228\u001b[39m \u001b[33;03m Args:\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 1255\u001b[39m \u001b[33;03m Lifecycle_stage: active\u001b[39;00m\n\u001b[32m 1256\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1257\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_tracking_client\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_experiment_by_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/tracking/_tracking_service/client.py:502\u001b[39m, in \u001b[36mTrackingServiceClient.get_experiment_by_name\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 494\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_experiment_by_name\u001b[39m(\u001b[38;5;28mself\u001b[39m, name):\n\u001b[32m 495\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 496\u001b[39m \u001b[33;03m Args:\u001b[39;00m\n\u001b[32m 497\u001b[39m \u001b[33;03m name: The experiment name.\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 500\u001b[39m \u001b[33;03m :py:class:`mlflow.entities.Experiment`\u001b[39;00m\n\u001b[32m 501\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m502\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mstore\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_experiment_by_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/store/tracking/rest_store.py:522\u001b[39m, in \u001b[36mRestStore.get_experiment_by_name\u001b[39m\u001b[34m(self, experiment_name)\u001b[39m\n\u001b[32m 520\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 521\u001b[39m req_body = message_to_json(GetExperimentByName(experiment_name=experiment_name))\n\u001b[32m--> \u001b[39m\u001b[32m522\u001b[39m response_proto = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_endpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mGetExperimentByName\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreq_body\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 523\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m Experiment.from_proto(response_proto.experiment)\n\u001b[32m 524\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m MlflowException \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/store/tracking/rest_store.py:82\u001b[39m, in \u001b[36mRestStore._call_endpoint\u001b[39m\u001b[34m(self, api, json_body, endpoint)\u001b[39m\n\u001b[32m 80\u001b[39m endpoint, method = _METHOD_TO_INFO[api]\n\u001b[32m 81\u001b[39m response_proto = api.Response()\n\u001b[32m---> \u001b[39m\u001b[32m82\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcall_endpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_host_creds\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson_body\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresponse_proto\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/utils/rest_utils.py:365\u001b[39m, in \u001b[36mcall_endpoint\u001b[39m\u001b[34m(host_creds, endpoint, method, json_body, response_proto, extra_headers)\u001b[39m\n\u001b[32m 363\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m method == \u001b[33m\"\u001b[39m\u001b[33mGET\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 364\u001b[39m call_kwargs[\u001b[33m\"\u001b[39m\u001b[33mparams\u001b[39m\u001b[33m\"\u001b[39m] = json_body\n\u001b[32m--> \u001b[39m\u001b[32m365\u001b[39m response = \u001b[43mhttp_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mcall_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 366\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 367\u001b[39m call_kwargs[\u001b[33m\"\u001b[39m\u001b[33mjson\u001b[39m\u001b[33m\"\u001b[39m] = json_body\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/projets/model_platform/.venv/lib/python3.11/site-packages/mlflow/utils/rest_utils.py:212\u001b[39m, in \u001b[36mhttp_request\u001b[39m\u001b[34m(host_creds, endpoint, method, max_retries, backoff_factor, backoff_jitter, extra_headers, retry_codes, timeout, raise_on_status, respect_retry_after_header, **kwargs)\u001b[39m\n\u001b[32m 210\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m InvalidUrlException(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInvalid url: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01miu\u001b[39;00m\n\u001b[32m 211\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m212\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m MlflowException(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAPI request to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m failed with exception \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[31mMlflowException\u001b[39m: API request to http://model-platform.com/registry/Medical-Document-NLP/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='model-platform.com', port=80): Max retries exceeded with url: /registry/Medical-Document-NLP/api/2.0/mlflow/experiments/get-by-name?experiment_name=medical_document_classification (Caused by ResponseError('too many 502 error responses'))" + ] + } + ], "source": "# Author: Octo Technology MLOps Tribe\nPROJECT_NAME = \"Medical-Document-NLP\"\nMLFLOW_TRACKING_URI = f\"http://model-platform.com/registry/{PROJECT_NAME}/\"\n\nmlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\nmlflow.set_experiment(\"medical_document_classification\")\nprint(f\"MLflow tracking URI : {MLFLOW_TRACKING_URI}\")" }, { "cell_type": "markdown", + "id": "98aeaebb36d3e806", "metadata": {}, "source": "## 3. Generation des donnees synthetiques" }, { "cell_type": "code", "execution_count": null, + "id": "55ea59983b8cb199", "metadata": {}, "outputs": [], - "source": "# Author: Octo Technology MLOps Tribe\nnp.random.seed(42)\nrandom.seed(42)\nN = 3000\n\nimport random\n\nDOC_TYPES = {\n 0: {\n \"name\": \"compte-rendu de consultation\",\n \"templates\": [\n \"Compte-rendu de consultation du {date}. Patient suivi pour {motif}. Examen clinique : {exam}. Conclusion : {concl}.\",\n \"Consultation specialisee. Motif : {motif}. Antecedents : {atcd}. Plan therapeutique : {plan}.\",\n ],\n },\n 1: {\n \"name\": \"ordonnance\",\n \"templates\": [\n \"Ordonnance medicale. Prescrit : {medicament} {posologie} pendant {duree}. Renouvellement : {renouvellement}.\",\n \"Je soussigne prescris : {medicament} {posologie}. {duree}. Ne pas depasser la dose prescrite.\",\n ],\n },\n 2: {\n \"name\": \"compte-rendu d'hospitalisation\",\n \"templates\": [\n \"Compte-rendu d'hospitalisation du {date_entree} au {date_sortie}. Motif d'admission : {motif}. Evolution : {evolution}. Sortie : {sortie}.\",\n \"Hospitalisation pour {motif}. Bilan realise : {bilan}. Traitement installe : {traitement}. Suivi ambulatoire recommande.\",\n ],\n },\n 3: {\n \"name\": \"resultat d'examen\",\n \"templates\": [\n \"Resultat d'analyse : {examen}. Valeur : {valeur} {unite}. Norme : {norme}. Interpretation : {interpretation}.\",\n \"Compte-rendu d'imagerie. Type : {examen}. Indication : {indication}. Resultat : {resultat}.\",\n ],\n },\n 4: {\n \"name\": \"lettre de sortie\",\n \"templates\": [\n \"Lettre de sortie du {date}. Cher confrere, je vous adresse ce patient pour {raison}. Diagnostic retenu : {diagnostic}. Traitement de sortie : {traitement}.\",\n \"Courrier de liaison. Suite a l'hospitalisation pour {raison}, le patient sort ce jour avec : {traitement}. Prochain RDV : {rdv}.\",\n ],\n },\n}\n\nFILLERS = {\n \"date\": [\"12/01/2024\", \"03/03/2024\", \"15/06/2024\"],\n \"motif\": [\"douleur abdominale\", \"bilan cardiologique\", \"suivi diabete\", \"lombalgie aigue\"],\n \"exam\": [\"normal\", \"sans anomalie\", \"discret souffle systolique\", \"abdomen souple\"],\n \"concl\": [\"RAS\", \"surveillance\", \"traitement adapte\", \"bilan complementaire demande\"],\n \"atcd\": [\"HTA connue\", \"diabete type 2\", \"allergie penicilline\", \"sans antecedent notable\"],\n \"plan\": [\"poursuite traitement\", \"ajustement posologie\", \"kinesitherapie\", \"consultation cardiologie\"],\n \"medicament\": [\"metformine 1g\", \"amlodipine 5mg\", \"omeprazole 20mg\", \"sertraline 50mg\"],\n \"posologie\": [\"1 cp matin\", \"2 cp/jour\", \"1 cp au coucher\", \"3 cp/jour pendant les repas\"],\n \"duree\": [\"30 jours\", \"3 mois\", \"6 mois\", \"traitement continu\"],\n \"renouvellement\": [\"non renouvelable\", \"renouvelable 2 fois\", \"renouvelable sur avis medical\"],\n \"date_entree\": [\"10/01/2024\"], \"date_sortie\": [\"14/01/2024\"],\n \"evolution\": [\"favorable\", \"stable\", \"amelioration progressive\", \"complication geree\"],\n \"sortie\": [\"retour domicile\", \"transfert SSR\", \"sortie avec HAD\"],\n \"bilan\": [\"ECG normal\", \"biologie standard\", \"scanner thoracique\", \"echo cardiaque\"],\n \"traitement\": [\"metformine + amlodipine\", \"antalgiques + AINS\", \"antibiotiques 7j\"],\n \"examen\": [\"glycemie\", \"NFS\", \"echo abdominale\", \"radio thorax\", \"IRM lombaire\"],\n \"valeur\": [\"6.8\", \"12.5\", \"140\", \"98\", \"0.5\"],\n \"unite\": [\"mmol/L\", \"g/dL\", \"mmHg\", \"U/L\", \"mg/L\"],\n \"norme\": [\"[4-6 mmol/L]\", \"[12-16 g/dL]\", \"[<14 mmHg]\"],\n \"interpretation\": [\"normal\", \"dans les normes\", \"legerement eleve \u2014 a surveiller\", \"pathologique\"],\n \"indication\": [\"douleur\", \"suivi\", \"bilan\"],\n \"resultat\": [\"sans anomalie\", \"hernie discale L4-L5\", \"epanchement pleural minime\"],\n \"raison\": [\"bilan cardiologique\", \"prise en charge diabete\", \"episode infectieux\"],\n \"diagnostic\": [\"insuffisance cardiaque stade II\", \"diabete desequilibre\", \"pneumopathie\"],\n \"rdv\": [\"dans 1 mois\", \"dans 3 mois avec le medecin traitant\"],\n}\n\ntexts, labels = [], []\nfor _ in range(N):\n label = random.randint(0, 4)\n doc_info = DOC_TYPES[label]\n tpl = random.choice(doc_info[\"templates\"])\n filled = tpl\n for key, vals in FILLERS.items():\n placeholder = \"{\" + key + \"}\"\n if placeholder in filled:\n filled = filled.replace(placeholder, random.choice(vals), 1)\n texts.append(filled)\n labels.append(label)\n\nTARGET = \"doc_type\"\ndf = pd.DataFrame({\n \"text\": texts,\n \"text_length\": [len(t) for t in texts],\n \"word_count\": [len(t.split()) for t in texts],\n TARGET: labels,\n})\n\nPROTECTED_ATTRIBUTES = []\nprint(f\"Dataset : {len(df):,} lignes | Taux cible : {df[TARGET].mean():.1%}\")" + "source": "# Author: Octo Technology MLOps Tribe\nimport random\n\nnp.random.seed(42)\nrandom.seed(42)\nN = 3000\n\nDOC_TYPES = {\n 0: {\n \"name\": \"compte-rendu de consultation\",\n \"templates\": [\n \"Compte-rendu de consultation du {date}. Patient suivi pour {motif}. Examen clinique : {exam}. Conclusion : {concl}.\",\n \"Consultation specialisee. Motif : {motif}. Antecedents : {atcd}. Plan therapeutique : {plan}.\",\n ],\n },\n 1: {\n \"name\": \"ordonnance\",\n \"templates\": [\n \"Ordonnance medicale. Prescrit : {medicament} {posologie} pendant {duree}. Renouvellement : {renouvellement}.\",\n \"Je soussigne prescris : {medicament} {posologie}. {duree}. Ne pas depasser la dose prescrite.\",\n ],\n },\n 2: {\n \"name\": \"compte-rendu d'hospitalisation\",\n \"templates\": [\n \"Compte-rendu d'hospitalisation du {date_entree} au {date_sortie}. Motif d'admission : {motif}. Evolution : {evolution}. Sortie : {sortie}.\",\n \"Hospitalisation pour {motif}. Bilan realise : {bilan}. Traitement installe : {traitement}. Suivi ambulatoire recommande.\",\n ],\n },\n 3: {\n \"name\": \"resultat d'examen\",\n \"templates\": [\n \"Resultat d'analyse : {examen}. Valeur : {valeur} {unite}. Norme : {norme}. Interpretation : {interpretation}.\",\n \"Compte-rendu d'imagerie. Type : {examen}. Indication : {indication}. Resultat : {resultat}.\",\n ],\n },\n 4: {\n \"name\": \"lettre de sortie\",\n \"templates\": [\n \"Lettre de sortie du {date}. Cher confrere, je vous adresse ce patient pour {raison}. Diagnostic retenu : {diagnostic}. Traitement de sortie : {traitement}.\",\n \"Courrier de liaison. Suite a l'hospitalisation pour {raison}, le patient sort ce jour avec : {traitement}. Prochain RDV : {rdv}.\",\n ],\n },\n}\n\nFILLERS = {\n \"date\": [\"12/01/2024\", \"03/03/2024\", \"15/06/2024\"],\n \"motif\": [\"douleur abdominale\", \"bilan cardiologique\", \"suivi diabete\", \"lombalgie aigue\"],\n \"exam\": [\"normal\", \"sans anomalie\", \"discret souffle systolique\", \"abdomen souple\"],\n \"concl\": [\"RAS\", \"surveillance\", \"traitement adapte\", \"bilan complementaire demande\"],\n \"atcd\": [\"HTA connue\", \"diabete type 2\", \"allergie penicilline\", \"sans antecedent notable\"],\n \"plan\": [\"poursuite traitement\", \"ajustement posologie\", \"kinesitherapie\", \"consultation cardiologie\"],\n \"medicament\": [\"metformine 1g\", \"amlodipine 5mg\", \"omeprazole 20mg\", \"sertraline 50mg\"],\n \"posologie\": [\"1 cp matin\", \"2 cp/jour\", \"1 cp au coucher\", \"3 cp/jour pendant les repas\"],\n \"duree\": [\"30 jours\", \"3 mois\", \"6 mois\", \"traitement continu\"],\n \"renouvellement\": [\"non renouvelable\", \"renouvelable 2 fois\", \"renouvelable sur avis medical\"],\n \"date_entree\": [\"10/01/2024\"], \"date_sortie\": [\"14/01/2024\"],\n \"evolution\": [\"favorable\", \"stable\", \"amelioration progressive\", \"complication geree\"],\n \"sortie\": [\"retour domicile\", \"transfert SSR\", \"sortie avec HAD\"],\n \"bilan\": [\"ECG normal\", \"biologie standard\", \"scanner thoracique\", \"echo cardiaque\"],\n \"traitement\": [\"metformine + amlodipine\", \"antalgiques + AINS\", \"antibiotiques 7j\"],\n \"examen\": [\"glycemie\", \"NFS\", \"echo abdominale\", \"radio thorax\", \"IRM lombaire\"],\n \"valeur\": [\"6.8\", \"12.5\", \"140\", \"98\", \"0.5\"],\n \"unite\": [\"mmol/L\", \"g/dL\", \"mmHg\", \"U/L\", \"mg/L\"],\n \"norme\": [\"[4-6 mmol/L]\", \"[12-16 g/dL]\", \"[<14 mmHg]\"],\n \"interpretation\": [\"normal\", \"dans les normes\", \"legerement eleve \u2014 a surveiller\", \"pathologique\"],\n \"indication\": [\"douleur\", \"suivi\", \"bilan\"],\n \"resultat\": [\"sans anomalie\", \"hernie discale L4-L5\", \"epanchement pleural minime\"],\n \"raison\": [\"bilan cardiologique\", \"prise en charge diabete\", \"episode infectieux\"],\n \"diagnostic\": [\"insuffisance cardiaque stade II\", \"diabete desequilibre\", \"pneumopathie\"],\n \"rdv\": [\"dans 1 mois\", \"dans 3 mois avec le medecin traitant\"],\n}\n\ntexts, labels = [], []\nfor _ in range(N):\n label = random.randint(0, 4)\n doc_info = DOC_TYPES[label]\n tpl = random.choice(doc_info[\"templates\"])\n filled = tpl\n for key, vals in FILLERS.items():\n placeholder = \"{\" + key + \"}\"\n if placeholder in filled:\n filled = filled.replace(placeholder, random.choice(vals), 1)\n texts.append(filled)\n labels.append(label)\n\nTARGET = \"doc_type\"\ndf = pd.DataFrame({\n \"text\": texts,\n \"text_length\": [len(t) for t in texts],\n \"word_count\": [len(t.split()) for t in texts],\n TARGET: labels,\n})\n\nPROTECTED_ATTRIBUTES = []\nprint(f\"Dataset : {len(df):,} lignes | Taux cible : {df[TARGET].mean():.1%}\")" }, { "cell_type": "markdown", + "id": "9f92f677d129bc39", "metadata": {}, "source": "## 4. Validation Pandera & statistiques descriptives" }, { "cell_type": "code", "execution_count": null, + "id": "7878810fadb75567", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\nSCHEMA = pa.DataFrameSchema(\n name=\"medical_document_input_schema\",\n description=\"Contrat de donnees \u2014 Classification du Type de Documents Medicaux \u2014 Modele a Risque Minimal (AI Act)\",\n columns={\n \"text\": pa.Column(str, checks=pa.Check(lambda s: s.str.len() >= 10, error=\"Texte trop court\"), nullable=False, description=\"Texte du document\"),\n \"text_length\": pa.Column(int, checks=pa.Check.in_range(10, 5000), nullable=False, description=\"Longueur du texte (caracteres)\"),\n \"word_count\": pa.Column(int, checks=pa.Check.in_range(2, 1000), nullable=False, description=\"Nombre de mots\"),\n TARGET: pa.Column(int, checks=pa.Check.isin(list(range(5))), nullable=False, description=\"Variable cible\"),\n },\n coerce=False,\n strict=True,\n)\n\ntry:\n SCHEMA.validate(df[[\"text\", \"text_length\", \"word_count\", TARGET]], lazy=True)\n PANDERA_STATUS = \"PASS\"\n PANDERA_ERRORS = 0\n print(\"Validation Pandera : SUCCES\")\nexcept pa.errors.SchemaErrors as exc:\n PANDERA_STATUS = \"FAIL\"\n PANDERA_ERRORS = len(exc.failure_cases)\n print(f\"Validation Pandera : ECHEC ({PANDERA_ERRORS} erreurs)\")" @@ -68,6 +151,7 @@ { "cell_type": "code", "execution_count": null, + "id": "d42606134050ad1", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\nprint(\"=== Statistiques descriptives ===\")\nprint(f\"Nombre de documents : {len(df):,}\")\nprint(f\"Longueur moyenne des textes : {df['text_length'].mean():.0f} caracteres\")\nprint(f\"Nombre de mots moyen : {df['word_count'].mean():.0f}\")\nprint(f\"\\n=== Distribution de la variable cible ===\")\nvc = df[TARGET].value_counts().sort_index()\nfor k, v in vc.items():\n print(f\" Classe {k} : {v:,} ({v/len(df):.1%})\")" @@ -75,42 +159,49 @@ { "cell_type": "code", "execution_count": null, + "id": "1e2d22f28b04d06b", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\nSCHEMA_YAML_EXPORTED = False\ntry:\n schema_yaml_path = ARTIFACTS_DIR / \"pandera_schema.yaml\"\n with open(schema_yaml_path, \"w\") as f:\n f.write(SCHEMA.to_yaml())\n SCHEMA_YAML_EXPORTED = True\nexcept Exception as e:\n print(f\"Export YAML non disponible : {e}\")\n\nvalidation_report = {\n \"schema_name\": SCHEMA.name,\n \"validation_status\": PANDERA_STATUS,\n \"validation_errors\": PANDERA_ERRORS,\n \"n_rows_validated\": int(len(df)),\n \"protected_attributes\": PROTECTED_ATTRIBUTES,\n}\nvalidation_report_path = ARTIFACTS_DIR / \"data_validation_report.json\"\nwith open(validation_report_path, \"w\") as f:\n json.dump(validation_report, f, indent=2, ensure_ascii=False)\nprint(\"Rapport de validation exporte.\")" }, { "cell_type": "markdown", + "id": "3dd3dc03a88cd65a", "metadata": {}, "source": "## 5. Pretraitement" }, { "cell_type": "code", "execution_count": null, + "id": "96dc6c27c9140d5f", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\nX_text, y = df[\"text\"], df[TARGET]\n\nX_train_full, X_test, y_train_full, y_test = train_test_split(X_text, y, test_size=0.20, random_state=42, stratify=y)\nX_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.15, random_state=42, stratify=y_train_full)\n\nvectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 2), sublinear_tf=True)\nX_train_sc = vectorizer.fit_transform(X_train)\nX_val_sc = vectorizer.transform(X_val)\nX_test_sc = vectorizer.transform(X_test)\n\nFEATURES = vectorizer.get_feature_names_out().tolist()\nprint(f\"Train : {len(X_train):,} | Validation : {len(X_val):,} | Test : {len(X_test):,}\")\nprint(f\"Vocabulaire TF-IDF : {len(FEATURES):,} tokens\")" }, { "cell_type": "markdown", + "id": "70d66a2c1fbc6472", "metadata": {}, "source": "## 6. Entrainement du modele" }, { "cell_type": "code", "execution_count": null, + "id": "84a0b3b1aee6da6e", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\nPARAMS = {\"estimator\": LinearSVC(C=1.0, max_iter=2000, random_state=42), \"cv\": 3}\n\nmodel = CalibratedClassifierCV(**PARAMS)\nmodel.fit(X_train_sc, y_train)\n\nval_pred = model.predict(X_val_sc)\nval_proba = model.predict_proba(X_val_sc)\nif 5 == 2:\n val_auc = roc_auc_score(y_val, val_proba[:, 1])\nelse:\n val_auc = roc_auc_score(y_val, val_proba, multi_class=\"ovr\", average=\"macro\")\nprint(f\"AUC-ROC validation : {val_auc:.4f}\")\nprint(\"Entrainement termine.\")" }, { "cell_type": "markdown", + "id": "5083c5aecce74600", "metadata": {}, "source": "## 7. Evaluation sur le jeu de test" }, { "cell_type": "code", "execution_count": null, + "id": "d30b249b4705b5cb", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\ny_pred = model.predict(X_test_sc)\ny_proba = model.predict_proba(X_test_sc)\n\nif 5 == 2:\n auc = round(float(roc_auc_score(y_test, y_proba[:, 1])), 4)\nelse:\n auc = round(float(roc_auc_score(y_test, y_proba, multi_class=\"ovr\", average=\"macro\")), 4)\n\nMETRICS = {\n \"accuracy\": round(float(accuracy_score(y_test, y_pred)), 4),\n \"precision\": round(float(precision_score(y_test, y_pred, average=\"macro\", zero_division=0)), 4),\n \"recall\": round(float(recall_score(y_test, y_pred, average=\"macro\", zero_division=0)), 4),\n \"f1_score\": round(float(f1_score(y_test, y_pred, average=\"macro\", zero_division=0)), 4),\n \"auc_roc\": auc,\n}\n\nreport_dict = classification_report(y_test, y_pred, output_dict=True)\nprint(classification_report(y_test, y_pred))\nprint(\"\\n\".join(f\"{k:<12}: {v}\" for k, v in METRICS.items()))" @@ -118,30 +209,35 @@ { "cell_type": "code", "execution_count": null, + "id": "6d35efb2a1f272c0", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\n# Multi-class: confusion matrix instead of ROC\nfrom sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix\nfig, ax = plt.subplots(figsize=(8, 6))\ncm = confusion_matrix(y_test, y_pred)\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[\"CR Consultation\", \"Ordonnance\", \"CR Hospit.\", \"Resultat Exam.\", \"Lettre Sortie\"])\ndisp.plot(ax=ax, colorbar=False, cmap=\"Blues\")\nax.set_title(\"Matrice de Confusion \u2014 Classification Documentaire\")\nplt.tight_layout()\nroc_path = ARTIFACTS_DIR / \"confusion_matrix.png\"\nplt.savefig(roc_path, dpi=150)\nplt.show()" }, { "cell_type": "markdown", + "id": "60379fb6e58dbfca", "metadata": {}, "source": "## 8. Analyse d'equite (Fairness)" }, { "cell_type": "code", "execution_count": null, + "id": "44a9ba864824c646", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\n# Modele NLP \u2014 pas d'attribut protege dans les features textuelles synthetiques\nFAIRNESS_REPORT = {\n \"protected_attributes\": [],\n \"note\": \"Aucun attribut protege dans les features TF-IDF synthetiques. En production, des biais linguistiques (genre, origine) pourraient etre presents dans les textes reels.\",\n \"global_accuracy\": METRICS[\"accuracy\"],\n \"global_f1\": METRICS[\"f1_score\"],\n}\n\n\nfairness_path = ARTIFACTS_DIR / \"fairness_report.json\"\nwith open(fairness_path, \"w\") as f:\n json.dump(FAIRNESS_REPORT, f, indent=2, ensure_ascii=False)\nprint(\"Rapport d'equite exporte.\")" }, { "cell_type": "markdown", + "id": "5e449af0c6c26652", "metadata": {}, "source": "## 9. Explicabilite" }, { "cell_type": "code", "execution_count": null, + "id": "ecd2c29e32543952", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\n# Top features TF-IDF par importance (coefficients du modele)\nif hasattr(model, \"coef_\"):\n coef = model.coef_\n if coef.ndim > 1:\n importances = np.abs(coef).mean(axis=0)\n else:\n importances = np.abs(coef[0])\n top_n = min(20, len(importances))\n top_idx = np.argsort(importances)[-top_n:][::-1]\n fi_df = pd.DataFrame({\"feature\": [FEATURES[i] for i in top_idx], \"importance\": importances[top_idx]})\nelse:\n fi_df = pd.DataFrame({\"feature\": FEATURES[:20], \"importance\": np.ones(20)})\n\nfig, ax = plt.subplots(figsize=(8, 5))\nax.barh(fi_df[\"feature\"][::-1], fi_df[\"importance\"][::-1], color=\"mediumorchid\")\nax.set_title(\"Top tokens TF-IDF \u2014 Importance des features\")\nax.set_xlabel(\"Importance relative (|coefficient|)\")\nplt.tight_layout()\nfi_path = ARTIFACTS_DIR / \"feature_importance.png\"\nplt.savefig(fi_path, dpi=150)\nplt.show()\nprint(fi_df.head(10).to_string(index=False))" @@ -149,30 +245,35 @@ { "cell_type": "code", "execution_count": null, + "id": "8e770de6930e5bda", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\n# SHAP non applicable directement aux modeles TF-IDF lineaires de cette maniere \u2014 on utilise LIME si disponible\nSHAP_AVAILABLE = False\ntry:\n import lime\n import lime.lime_text\n explainer = lime.lime_text.LimeTextExplainer(class_names=[str(i) for i in range(model.classes_.max() + 1)])\n sample_text = X_test.iloc[0]\n exp = explainer.explain_instance(sample_text, lambda x: model.predict_proba(vectorizer.transform(x)), num_features=10)\n print(\"LIME explanation pour le premier exemple de test :\")\n for feat, weight in exp.as_list():\n print(f\" {feat}: {weight:.4f}\")\n SHAP_AVAILABLE = True\nexcept ImportError:\n print(\"LIME non disponible. Installer avec : pip install lime\")" }, { "cell_type": "markdown", + "id": "ad3fa7a716961dc0", "metadata": {}, "source": "## 10. Preparation des artefacts de documentation" }, { "cell_type": "code", "execution_count": null, + "id": "dda4b4f48869f637", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\ncr_path = ARTIFACTS_DIR / \"classification_report.json\"\nwith open(cr_path, \"w\") as f:\n json.dump(report_dict, f, indent=2, ensure_ascii=False)\n\nfi_csv_path = ARTIFACTS_DIR / \"feature_importance.csv\"\nfi_df.to_csv(fi_csv_path, index=False)\n\npp_context = {\"schema_name\": SCHEMA.name, \"pandera_status\": PANDERA_STATUS, \"pandera_errors\": PANDERA_ERRORS}\npreprocessing_md = Path(\"preprocessing_description_doctype_template.md\").read_text(encoding=\"utf-8\").format_map(pp_context)\npp_path = ARTIFACTS_DIR / \"preprocessing_description.md\"\npp_path.write_text(preprocessing_md, encoding=\"utf-8\")\n\nprint(\"Artefacts prepares :\")\nfor p in sorted(ARTIFACTS_DIR.iterdir()):\n print(f\" {p.name}\")" }, { "cell_type": "markdown", + "id": "c2966aab6c0ef401", "metadata": {}, "source": "## 11. Logging MLflow" }, { "cell_type": "code", "execution_count": null, + "id": "f52b715951b53477", "metadata": {}, "outputs": [], "source": "# Author: Octo Technology MLOps Tribe\nMODEL_NAME = \"document_type_classifier\"\nTEAM = \"mlops-tribe\"\nENVIRONMENT = \"staging\"\n\nmodel_card_text = Path(\"model_card_doctype.md\").read_text(encoding=\"utf-8\")\nmodel_card_path = ARTIFACTS_DIR / \"model_card.md\"\nmodel_card_path.write_text(model_card_text, encoding=\"utf-8\")\n\nwith mlflow.start_run(run_name=f\"{MODEL_NAME}_v1\") as run:\n mlflow.log_params(PARAMS)\n mlflow.log_param(\"vectorizer\", \"TfidfVectorizer\")\n mlflow.log_param(\"feature_count\", len(FEATURES))\n mlflow.log_param(\"train_size\", len(X_train))\n mlflow.log_param(\"val_size\", len(X_val))\n mlflow.log_param(\"test_size\", len(X_test))\n\n mlflow.log_metrics(METRICS)\n mlflow.log_metric(\"auc_roc_validation\", round(val_auc, 4))\n mlflow.log_metric(\"dataset_total_size\", N)\n\n mlflow.set_tag(\"model_type\", \"TF-IDF + LinearSVC (calibrated)\")\n mlflow.set_tag(\"framework\", \"scikit-learn\")\n mlflow.set_tag(\"data_source\", \"Textes medicaux synthetiques\")\n mlflow.set_tag(\"data_language\", \"francais\")\n mlflow.set_tag(\"contains_personal_data\", \"non \u2014 textes synthetiques\")\n mlflow.set_tag(\"protected_attributes\", \"aucun\")\n mlflow.set_tag(\"threshold_accuracy\", \"0.88\")\n mlflow.set_tag(\"threshold_f1\", \"0.85\")\n\n mlflow.set_tag(\"model.author\", \"Octo Technology MLOps Tribe\")\n mlflow.set_tag(\"model.team\", TEAM)\n mlflow.set_tag(\"model.environment\", ENVIRONMENT)\n mlflow.set_tag(\"data.synthetic\", \"true\")\n mlflow.set_tag(\"pandera.status\", PANDERA_STATUS)\n mlflow.set_tag(\"ai_act.risk_level\", \"minimal\")\n mlflow.set_tag(\"ai_act.annex_ref\", \"N/A\")\n mlflow.set_tag(\"ai_act.domain\", \"Gestion documentaire medicale\")\n mlflow.set_tag(\"mlflow.note.content\", model_card_text)\n\n for art in [cr_path, fi_csv_path, fi_path, fairness_path, roc_path, pp_path, validation_report_path, model_card_path]:\n mlflow.log_artifact(str(art))\n if SCHEMA_YAML_EXPORTED:\n mlflow.log_artifact(str(schema_yaml_path))\n\n # Log pipeline (vectorizer + model) comme modele MLflow\n from sklearn.pipeline import Pipeline\n pipeline = Pipeline([(\"tfidf\", vectorizer), (\"clf\", model)])\n signature = infer_signature(X_train.tolist(), model.predict(X_train_sc))\n input_example = X_train.head(3).tolist()\n mlflow.sklearn.log_model(\n sk_model=pipeline,\n artifact_path=\"custom_model\",\n registered_model_name=MODEL_NAME,\n input_example={\"document_type_classifier_input\": input_example},\n )\n run_id = run.info.run_id\n\nprint(f\"\\nRun MLflow : {run_id}\")\nprint(f\" Modele enregistre : {MODEL_NAME}\")\nprint(f\" Accuracy : {METRICS['accuracy']} | F1 : {METRICS['f1_score']} | AUC : {METRICS['auc_roc']}\")" diff --git a/demos/notebooks/Hugging_Face/Hugging_face_model.ipynb b/demos/notebooks/old_demos/Hugging_Face/Hugging_face_model.ipynb similarity index 100% rename from demos/notebooks/Hugging_Face/Hugging_face_model.ipynb rename to demos/notebooks/old_demos/Hugging_Face/Hugging_face_model.ipynb diff --git a/demos/notebooks/Hugging_Face/__init__.py b/demos/notebooks/old_demos/Hugging_Face/__init__.py similarity index 100% rename from demos/notebooks/Hugging_Face/__init__.py rename to demos/notebooks/old_demos/Hugging_Face/__init__.py diff --git a/demos/notebooks/Hugging_Face/conda.yaml b/demos/notebooks/old_demos/Hugging_Face/conda.yaml similarity index 100% rename from demos/notebooks/Hugging_Face/conda.yaml rename to demos/notebooks/old_demos/Hugging_Face/conda.yaml diff --git a/demos/notebooks/Hugging_Face/hf_predict.http b/demos/notebooks/old_demos/Hugging_Face/hf_predict.http similarity index 100% rename from demos/notebooks/Hugging_Face/hf_predict.http rename to demos/notebooks/old_demos/Hugging_Face/hf_predict.http diff --git a/demos/notebooks/Photovoltaic Physical Model /06_PVLIB_simple_example.ipynb b/demos/notebooks/old_demos/Photovoltaic Physical Model /06_PVLIB_simple_example.ipynb similarity index 100% rename from demos/notebooks/Photovoltaic Physical Model /06_PVLIB_simple_example.ipynb rename to demos/notebooks/old_demos/Photovoltaic Physical Model /06_PVLIB_simple_example.ipynb diff --git a/demos/notebooks/Photovoltaic Physical Model /model_card.md b/demos/notebooks/old_demos/Photovoltaic Physical Model /model_card.md similarity index 100% rename from demos/notebooks/Photovoltaic Physical Model /model_card.md rename to demos/notebooks/old_demos/Photovoltaic Physical Model /model_card.md diff --git a/demos/notebooks/Photovoltaic Physical Model /predict_pvlib.http b/demos/notebooks/old_demos/Photovoltaic Physical Model /predict_pvlib.http similarity index 100% rename from demos/notebooks/Photovoltaic Physical Model /predict_pvlib.http rename to demos/notebooks/old_demos/Photovoltaic Physical Model /predict_pvlib.http diff --git a/demos/notebooks/Profit Optimization/03_ortools_simple_example.ipynb b/demos/notebooks/old_demos/Profit Optimization/03_ortools_simple_example.ipynb similarity index 100% rename from demos/notebooks/Profit Optimization/03_ortools_simple_example.ipynb rename to demos/notebooks/old_demos/Profit Optimization/03_ortools_simple_example.ipynb diff --git a/demos/notebooks/Profit Optimization/model_card.md b/demos/notebooks/old_demos/Profit Optimization/model_card.md similarity index 100% rename from demos/notebooks/Profit Optimization/model_card.md rename to demos/notebooks/old_demos/Profit Optimization/model_card.md diff --git a/demos/notebooks/Profit Optimization/optim_model.py b/demos/notebooks/old_demos/Profit Optimization/optim_model.py similarity index 100% rename from demos/notebooks/Profit Optimization/optim_model.py rename to demos/notebooks/old_demos/Profit Optimization/optim_model.py diff --git a/demos/notebooks/Profit Optimization/profit_optim.http b/demos/notebooks/old_demos/Profit Optimization/profit_optim.http similarity index 100% rename from demos/notebooks/Profit Optimization/profit_optim.http rename to demos/notebooks/old_demos/Profit Optimization/profit_optim.http diff --git a/demos/notebooks/Pytorch Image Classifier/04_pytorch_simple_example.ipynb b/demos/notebooks/old_demos/Pytorch Image Classifier/04_pytorch_simple_example.ipynb similarity index 100% rename from demos/notebooks/Pytorch Image Classifier/04_pytorch_simple_example.ipynb rename to demos/notebooks/old_demos/Pytorch Image Classifier/04_pytorch_simple_example.ipynb diff --git a/demos/notebooks/Pytorch Image Classifier/model_card.md b/demos/notebooks/old_demos/Pytorch Image Classifier/model_card.md similarity index 100% rename from demos/notebooks/Pytorch Image Classifier/model_card.md rename to demos/notebooks/old_demos/Pytorch Image Classifier/model_card.md diff --git a/demos/notebooks/Random Forest classifier/01_train_sklearn_random_forest.py.ipynb b/demos/notebooks/old_demos/Random Forest classifier/01_train_sklearn_random_forest.py.ipynb similarity index 100% rename from demos/notebooks/Random Forest classifier/01_train_sklearn_random_forest.py.ipynb rename to demos/notebooks/old_demos/Random Forest classifier/01_train_sklearn_random_forest.py.ipynb diff --git a/demos/notebooks/Random Forest classifier/model_card.md b/demos/notebooks/old_demos/Random Forest classifier/model_card.md similarity index 100% rename from demos/notebooks/Random Forest classifier/model_card.md rename to demos/notebooks/old_demos/Random Forest classifier/model_card.md diff --git a/demos/notebooks/Random Forest classifier/random_forest_predict.http b/demos/notebooks/old_demos/Random Forest classifier/random_forest_predict.http similarity index 100% rename from demos/notebooks/Random Forest classifier/random_forest_predict.http rename to demos/notebooks/old_demos/Random Forest classifier/random_forest_predict.http diff --git a/demos/notebooks/old_demos/__init__.py b/demos/notebooks/old_demos/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/demos/notebooks/tensorflow/01_marker_quality_control.ipynb b/demos/notebooks/old_demos/tensorflow/01_marker_quality_control.ipynb similarity index 100% rename from demos/notebooks/tensorflow/01_marker_quality_control.ipynb rename to demos/notebooks/old_demos/tensorflow/01_marker_quality_control.ipynb diff --git a/demos/notebooks/tensorflow/images/10.jpg b/demos/notebooks/old_demos/tensorflow/images/10.jpg similarity index 100% rename from demos/notebooks/tensorflow/images/10.jpg rename to demos/notebooks/old_demos/tensorflow/images/10.jpg diff --git a/demos/notebooks/tensorflow/images/22.jpg b/demos/notebooks/old_demos/tensorflow/images/22.jpg similarity index 100% rename from demos/notebooks/tensorflow/images/22.jpg rename to demos/notebooks/old_demos/tensorflow/images/22.jpg diff --git a/demos/notebooks/tensorflow/marker_predict.http b/demos/notebooks/old_demos/tensorflow/marker_predict.http similarity index 100% rename from demos/notebooks/tensorflow/marker_predict.http rename to demos/notebooks/old_demos/tensorflow/marker_predict.http diff --git a/demos/notebooks/tensorflow/marker_quality_control_wrapper.py b/demos/notebooks/old_demos/tensorflow/marker_quality_control_wrapper.py similarity index 100% rename from demos/notebooks/tensorflow/marker_quality_control_wrapper.py rename to demos/notebooks/old_demos/tensorflow/marker_quality_control_wrapper.py diff --git a/demos/run_notebooks.sh b/demos/run_notebooks.sh new file mode 100755 index 0000000..363801f --- /dev/null +++ b/demos/run_notebooks.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Author: Octo Technology MLOps Tribe +# Run all AI Act demo notebooks sequentially via papermill + +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +OUTPUT_DIR="$SCRIPT_DIR/notebooks/output" +mkdir -p "$OUTPUT_DIR" + +NOTEBOOKS=( + "notebooks/banking/Banking Credit Scoring/credit_scoring_ai_act.ipynb" + "notebooks/banking/Customer Segmentation/customer_segment_classifier_ai_act.ipynb" + "notebooks/banking/Transaction Fraud Detection/transaction_fraud_ai_act.ipynb" + "notebooks/banking/Transaction Fraud Detection/transaction_anomaly_ai_act.ipynb" + "notebooks/ecommerce/Ecommerce Recommendation/product_recommender_ai_act.ipynb" + "notebooks/ecommerce/Ecommerce Recommendation/customer_churn_predictor_ai_act.ipynb" + "notebooks/hr/Employee Attrition Prediction/employee_attrition_ai_act.ipynb" + "notebooks/hr/Employee Attrition Prediction/satisfaction_scorer_ai_act.ipynb" + "notebooks/medical/Medical Document NLP/document_type_classifier_ai_act.ipynb" + "notebooks/medical/Medical Document NLP/clinical_entity_extractor_ai_act.ipynb" + "notebooks/supply_chain/Supply Chain Optimization/demand_forecaster_ai_act.ipynb" + "notebooks/supply_chain/Supply Chain Optimization/supplier_risk_scorer_ai_act.ipynb" +) + +TOTAL=${#NOTEBOOKS[@]} +PASSED=0 +FAILED=0 +FAILED_LIST=() + +cd "$SCRIPT_DIR" + +for i in "${!NOTEBOOKS[@]}"; do + NB="${NOTEBOOKS[$i]}" + NAME=$(basename "$NB" .ipynb) + DIR=$(dirname "$NB") + NUM=$((i + 1)) + + echo "" + echo "=== [$NUM/$TOTAL] $NAME ===" + + if uv run papermill "$NB" "$OUTPUT_DIR/${NAME}_out.ipynb" --cwd "$DIR" 2>&1; then + echo "PASS: $NAME" + PASSED=$((PASSED + 1)) + else + echo "FAIL: $NAME" + FAILED=$((FAILED + 1)) + FAILED_LIST+=("$NAME") + fi +done + +echo "" +echo "=========================================" +echo "Results: $PASSED/$TOTAL passed, $FAILED failed" + +if [ $FAILED -gt 0 ]; then + echo "Failed notebooks:" + for f in "${FAILED_LIST[@]}"; do echo " - $f"; done + exit 1 +fi diff --git a/frontend/js/pages/projects.js b/frontend/js/pages/projects.js index 62fac0e..fc96e15 100644 --- a/frontend/js/pages/projects.js +++ b/frontend/js/pages/projects.js @@ -5,56 +5,56 @@ const ProjectsPage = (() => { { name: "Credit-Risk-Assessment", owner: "BancoFrance - Risk Management Division", - description: "Credit scoring and customer segmentation", + description: "Scoring crédit et segmentation client", team: "Risk & Data Science", domain: "Banking", - scope: "Credit scoring model to assess default risk of business clients. Uses transactional, financial and behavioral data to predict 12-month default probability. Production deployment to support advisor decision-making.", - data_perimeter: "French business customer data (2019-2024): financial statements, cash flows, credit history, financial ratios, business sector. ~500k companies. GDPR: Compliant - Pseudonymization via SHA-256 hashing of SIREN/SIRET identifiers.", + scope: "Modèle de scoring crédit pour évaluer le risque de défaut des clients professionnels. Utilise des données transactionnelles, financières et comportementales pour prédire la probabilité de défaut à 12 mois. Déploiement en production pour aide à la décision des conseillers.", + data_perimeter: "Données clients entreprises France (2019-2024): bilans comptables, flux de trésorerie, historique de crédit, ratios financiers, secteur d'activité. ~500k entreprises. RGPD: Conforme - Pseudonymisation par hachage SHA-256 des identifiants SIREN/SIRET.", }, { name: "Medical-Document-NLP", owner: "AP-HP (Assistance Publique Hôpitaux de Paris) - DSI Innovation", - description: "Clinical entity extraction and document classification", + description: "Extraction d'entités cliniques et classification de documents", team: "NLP & Clinical AI", domain: "Healthcare", - scope: "Automatic extraction of clinical information from unstructured medical reports. NLP to identify pathologies, treatments, exam results, allergies. Supports PMSI coding and clinical research.", - data_perimeter: "Medical reports (hospitalization, consultations, imaging) from 2015 to 2024. ~50M documents. GDPR: Compliant - Pseudonymization per MR-004 CNIL. HDS-certified hosting. Ethics committee consulted.", + scope: "Extraction automatique d'informations cliniques depuis les comptes-rendus médicaux non structurés. NLP pour identifier pathologies, traitements, résultats d'examens, allergies. Aide au codage PMSI et à la recherche clinique.", + data_perimeter: "Comptes-rendus médicaux (hospitalisation, consultations, imagerie) de 2015 à 2024. ~50M de documents. RGPD: Conforme - Pseudonymisation selon MR-004 CNIL. Hébergement certifié HDS. Comité éthique consulté.", }, { name: "Employee-Attrition-Prediction", - owner: "TalentCorp HR - People Analytics Division", - description: "Attrition prediction and satisfaction scoring", + owner: "TalentCorp RH - People Analytics Division", + description: "Prédiction de l'attrition et scoring de satisfaction", team: "People Analytics", domain: "Human Resources", - scope: "Prediction of voluntary employee departure risk and satisfaction scoring. Models trained on synthetic HR data: tenure, satisfaction, performance, mobility. Supports retention and workforce planning.", - data_perimeter: "Synthetic HR data: employee profiles, performance history, satisfaction surveys, internal mobility. ~80k simulated employees. GDPR: Compliant - Entirely synthetic data.", + scope: "Prédiction du risque de départ volontaire des collaborateurs et scoring de satisfaction. Modèles entraînés sur des données RH synthétiques: ancienneté, satisfaction, performance, mobilité. Aide à la rétention et à la gestion prévisionnelle des emplois.", + data_perimeter: "Données RH synthétiques: profils collaborateurs, historique de performance, enquêtes de satisfaction, mobilités internes. ~80k collaborateurs simulés. RGPD: Conforme - Données entièrement synthétiques.", }, { name: "Fraud-Detection-Payments", owner: "EuroBank Systems - Cyber & Fraud Prevention", - description: "Fraud detection and transaction anomalies", + description: "Détection de fraude et anomalies transactionnelles", team: "Fraud & Cyber Analytics", domain: "Banking", - scope: "Real-time fraud detection on card payments (e-commerce and retail). ML model assessing each transaction's risk in <100ms. Reduces false positives while maintaining >95% detection rate.", - data_perimeter: "Card transactions from the last 24 months: amount, merchant type, location, time, currency. ~2 billion transactions/year. GDPR: Compliant - PCI-DSS v4.0 tokenization, customer identity pseudonymization.", + scope: "Détection en temps réel des fraudes sur paiements par carte (e-commerce et retail). Modèle de ML évaluant le risque de chaque transaction en <100ms. Réduction des faux positifs tout en maintenant un taux de détection >95%.", + data_perimeter: "Transactions par carte des 24 derniers mois: montant, type de commerce, localisation, heure, devise. ~2 milliards de transactions/an. RGPD: Conforme - Tokenisation PCI-DSS v4.0, pseudonymisation des identités clients.", }, { name: "Ecommerce-Recommendation", owner: "ShopNow Digital - Personalization & Growth", - description: "Product recommendation and customer churn prediction", + description: "Recommandation produit et prédiction du churn client", team: "Personalization ML", domain: "E-commerce", - scope: "Personalized product recommendation system and customer churn prediction. Models trained on purchase behavior, browsing and engagement. Optimizes average cart value and reduces attrition rate.", - data_perimeter: "Synthetic e-commerce behavioral data: purchase history, browsing, ratings, interactions. ~2M simulated profiles. GDPR: Compliant - Entirely synthetic data.", + scope: "Système de recommandation produit personnalisé et prédiction du churn client. Modèles entraînés sur les comportements d'achat, navigation et engagement. Optimisation du panier moyen et réduction du taux d'attrition.", + data_perimeter: "Données comportementales e-commerce synthétiques: historique d'achats, navigation, notes, interactions. ~2M de profils simulés. RGPD: Conforme - Données entièrement synthétiques.", }, { name: "Supply-Chain-Optimization", owner: "MarchéPlus Distribution - Supply Chain & Advanced Analytics", - description: "Demand forecasting and supplier risk scoring", + description: "Prévision de la demande et scoring risque fournisseur", team: "Supply Chain Analytics", domain: "Retail & Logistics", - scope: "Inventory optimization and demand forecasting for fresh products in large retail. Supplier risk scoring to secure supply chains. Reduces food waste and stockouts.", - data_perimeter: "Daily sales history over 3 years for 12,000 SKUs across 5,000 stores. Local weather data, events, logistics, supplier ratings. GDPR: Compliant - No personal data collected.", + scope: "Optimisation des stocks et prévision de la demande pour les produits frais en grande distribution. Scoring du risque fournisseur pour sécuriser les approvisionnements. Réduction du gaspillage alimentaire et des ruptures de stock.", + data_perimeter: "Historique des ventes quotidiennes sur 3 ans pour 12000 références dans 5000 magasins. Données météo locale, événements, logistiques, évaluations fournisseurs. RGPD: Conforme - Aucune donnée personnelle collectée.", }, ];