From c5bce85978f8c6e7c14bbe5ee59a3a5cf01a9c80 Mon Sep 17 00:00:00 2001 From: Indrayudd Roy Chowdhury Date: Tue, 10 Feb 2026 21:17:55 -0500 Subject: [PATCH 1/5] TutorTask696: Add input handling, date time formatting and integrity checking of dataset at ingestion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../jupyterlab_extension_backbend/.gitignore | 17 ++ .../config/config.py | 133 +++++++++++ .../datasets/T1_slice.csv | 101 ++++++++ .../src/format_datetime.py | 220 +++++++++++++++++ .../src/handle_inputs.py | 96 ++++++++ .../src/integrity.py | 224 ++++++++++++++++++ .../tools/input_tools.py | 126 ++++++++++ 7 files changed, 917 insertions(+) create mode 100644 agentic_eda/jupyterlab_extension_backbend/.gitignore create mode 100644 agentic_eda/jupyterlab_extension_backbend/config/config.py create mode 100644 agentic_eda/jupyterlab_extension_backbend/datasets/T1_slice.csv create mode 100644 agentic_eda/jupyterlab_extension_backbend/src/format_datetime.py create mode 100644 agentic_eda/jupyterlab_extension_backbend/src/handle_inputs.py create mode 100644 agentic_eda/jupyterlab_extension_backbend/src/integrity.py create mode 100644 agentic_eda/jupyterlab_extension_backbend/tools/input_tools.py diff --git a/agentic_eda/jupyterlab_extension_backbend/.gitignore b/agentic_eda/jupyterlab_extension_backbend/.gitignore new file mode 100644 index 000000000..082bc2d6e --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backbend/.gitignore @@ -0,0 +1,17 @@ +# OS files +.DS_Store + +# Python cache/build artifacts +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# Secrets and local environment files +.env +*.env +config/.env +*.secret +*secret* +*.key +*.pem diff --git a/agentic_eda/jupyterlab_extension_backbend/config/config.py b/agentic_eda/jupyterlab_extension_backbend/config/config.py new file mode 100644 index 000000000..8f66572e9 --- /dev/null +++ 
b/agentic_eda/jupyterlab_extension_backbend/config/config.py @@ -0,0 +1,133 @@ +import os +import dataclasses +import functools +import pydantic + +import dotenv +import langchain_openai +import langchain_anthropic # ChatAnthropic +import langchain_google_genai # ChatGoogleGenerativeAI +# import langchain_groq # ChatGroq +# import langchain_mistralai # ChatMistralAI +# import langchain_ollama # ChatOllama + + +dataclass = dataclasses.dataclass +lru_cache = functools.lru_cache +ChatOpenAI = langchain_openai.ChatOpenAI +ChatAnthropic = langchain_anthropic.ChatAnthropic +ChatGoogleGenerativeAI = langchain_google_genai.ChatGoogleGenerativeAI +# ChatGroq = langchain_groq.ChatGroq +# ChatMistralAI = langchain_mistralai.ChatMistralAI +# ChatOllama = langchain_ollama.ChatOllama +SecretStr = pydantic.SecretStr + +# Load Variables +dotenv.load_dotenv() + + +# Immutable data class +@dataclass(frozen=True) +class Settings: + provider: str + model: str + temperature: float + timeout: float + max_retries: int + +def _need(name:str) -> str: + v = os.getenv(name) + if v is None or v == "": + raise RuntimeError(f"Missing required environment variable: {name}") + return v + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + return Settings( + provider=os.getenv("LLM_PROVIDER", "openai"), + model=os.getenv("LLM_MODEL", "gpt-5-nano"), + temperature=float(os.getenv("LLM_TEMP", 0.2)), + timeout=float(os.getenv("LLM_TIMEOUT", 60)), + max_retries=int(os.getenv("LLM_MAX_RETRIES", 2)), + + ) + +@lru_cache(maxsize=1) +def get_chat_model(model=get_settings().model): + s = get_settings() + + # OpenAI-adjacent + + if s.provider == "openai": + + # READ API KEY. + _need("OPENAI_API_KEY") + + # Return the chatmodel + + return ChatOpenAI( + model=s.model, + temperature=s.temperature, + timeout=s.timeout, + max_retries=s.max_retries, + ) + + if s.provider == "openai_compatible": + + # Secrets. 
+ base_url = _need("OPENAI_COMPAT_BASE_URL") + api_key = _need("OPENAI_COMPAT_API_KEY") + return ChatOpenAI( + model=model, + base_url=base_url, + api_key=SecretStr(api_key), + temperature=s.temperature, + timeout=s.timeout, + max_retries=s.max_retries, + + ) + + if s.provider == "azure_openai_v1": + + # Secrets. + azure_base = _need("AZURE_OPENAI_BASE_URL") + azure_key = SecretStr(_need("AZURE_OPENAI_API_KEY")) + + return ChatOpenAI( + model=s.model, + base_url=azure_base, + api_key=azure_key, + temperature=s.temperature, + timeout=s.timeout, + max_retries=s.max_retries, + + ) + + # Anthropic + + if s.provider == "anthropic": + + # Secrets. + _need("ANTHROPIC_API_KEY") + return ChatAnthropic( + model_name=s.model, + temperature=s.temperature, + timeout=s.timeout, + max_retries=s.max_retries, + stop=None + ) + + # Google + if s.provider in ("google", "gemini", "google_genai"): + # Secrets. + _need("GOOGLE_API_KEY") + return ChatGoogleGenerativeAI( + model=s.model, + temperature=s.temperature, + ) + + + + + + raise ValueError("TODO(*): expand support!") diff --git a/agentic_eda/jupyterlab_extension_backbend/datasets/T1_slice.csv b/agentic_eda/jupyterlab_extension_backbend/datasets/T1_slice.csv new file mode 100644 index 000000000..fd8bb93b2 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backbend/datasets/T1_slice.csv @@ -0,0 +1,101 @@ +Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°) +01 01 2018 00:00,380.047790527343,5.31133604049682,416.328907824861,259.994903564453 +01 01 2018 00:10,453.76919555664,5.67216682434082,519.917511061494,268.64111328125 +01 01 2018 00:20,306.376586914062,5.21603679656982,390.900015810951,272.564788818359 +01 01 2018 00:30,419.645904541015,5.65967416763305,516.127568975674,271.258087158203 +01 01 2018 00:40,380.650695800781,5.57794094085693,491.702971953588,265.674285888671 +01 01 2018 00:50,402.391998291015,5.60405206680297,499.436385024805,264.57861328125 +01 01 2018 
01:00,447.605712890625,5.79300785064697,557.372363290225,266.163604736328 +01 01 2018 01:10,387.2421875,5.30604982376098,414.898178826186,257.949493408203 +01 01 2018 01:20,463.651214599609,5.58462905883789,493.677652137077,253.480697631835 +01 01 2018 01:30,439.725708007812,5.52322816848754,475.706782818068,258.72378540039 +01 01 2018 01:40,498.181701660156,5.72411584854125,535.841397042263,251.850997924804 +01 01 2018 01:50,526.816223144531,5.93419885635375,603.014076510633,265.504699707031 +01 01 2018 02:00,710.587280273437,6.54741382598876,824.662513585882,274.23291015625 +01 01 2018 02:10,655.194274902343,6.19974613189697,693.472641075637,266.733184814453 +01 01 2018 02:20,754.762512207031,6.50538301467895,808.098138482693,266.76040649414 +01 01 2018 02:30,790.173278808593,6.63411617279052,859.459020788565,270.493194580078 +01 01 2018 02:40,742.985290527343,6.37891292572021,759.434536596592,266.593292236328 +01 01 2018 02:50,748.229614257812,6.4466528892517,785.28100987646,265.571807861328 +01 01 2018 03:00,736.647827148437,6.41508293151855,773.172863451736,261.15869140625 +01 01 2018 03:10,787.246215820312,6.43753099441528,781.7712157188,257.56021118164 +01 01 2018 03:20,722.864074707031,6.22002410888671,700.764699868076,255.926498413085 +01 01 2018 03:30,935.033386230468,6.89802598953247,970.736626881787,250.012893676757 +01 01 2018 03:40,1220.60900878906,7.60971117019653,1315.04892785216,255.985702514648 +01 01 2018 03:50,1053.77197265625,7.28835582733154,1151.26574355584,255.444595336914 +01 01 2018 04:00,1493.80798339843,7.94310188293457,1497.58372354361,256.407409667968 +01 01 2018 04:10,1724.48803710937,8.37616157531738,1752.19966204818,252.41259765625 +01 01 2018 04:20,1636.93505859375,8.23695755004882,1668.47070685152,247.979400634765 +01 01 2018 04:30,1385.48803710937,7.87959098815917,1461.81579081391,238.609603881835 +01 01 2018 04:40,1098.93200683593,7.10137605667114,1062.28503444311,245.095596313476 +01 01 2018 
04:50,1021.4580078125,6.95530700683593,995.995854606612,245.410202026367 +01 01 2018 05:00,1164.89294433593,7.09829807281494,1060.85971215544,235.227905273437 +01 01 2018 05:10,1073.33203125,6.95363092422485,995.250960801046,242.872695922851 +01 01 2018 05:20,1165.30798339843,7.24957799911499,1132.4168612641,244.835693359375 +01 01 2018 05:30,1177.98999023437,7.29469108581542,1154.36530469206,242.48159790039 +01 01 2018 05:40,1170.53601074218,7.37636995315551,1194.8430985043,247.97720336914 +01 01 2018 05:50,1145.53601074218,7.44855403900146,1231.43070603717,249.682998657226 +01 01 2018 06:00,1114.02697753906,7.2392520904541,1127.43320551345,248.401000976562 +01 01 2018 06:10,1153.18505859375,7.32921123504638,1171.35504358957,244.621704101562 +01 01 2018 06:20,1125.3310546875,7.13970518112182,1080.13908466205,244.631805419921 +01 01 2018 06:30,1228.73205566406,7.47422885894775,1244.63353439737,245.785995483398 +01 01 2018 06:40,1021.79302978515,7.03317403793334,1030.99268581181,248.652206420898 +01 01 2018 06:50,957.378173828125,6.88645505905151,965.683334443832,244.611694335937 +01 01 2018 07:00,909.887817382812,6.88782119750976,966.279104864065,235.84829711914 +01 01 2018 07:10,1000.95397949218,7.21643209457397,1116.4718990154,232.842697143554 +01 01 2018 07:20,1024.47802734375,7.0685977935791,1047.17023059277,229.933197021484 +01 01 2018 07:30,1009.53399658203,6.93829584121704,988.451940715539,230.13670349121 +01 01 2018 07:40,899.492980957031,6.53668785095214,820.416658585943,234.933807373046 +01 01 2018 07:50,725.110107421875,6.18062496185302,686.636942163399,232.837905883789 +01 01 2018 08:00,585.259399414062,5.81682586669921,564.927659543473,240.328796386718 +01 01 2018 08:10,443.913909912109,5.45015096664428,454.773587146918,238.12629699707 +01 01 2018 08:20,565.253784179687,5.81814908981323,565.349093224668,235.80029296875 +01 01 2018 08:30,644.037780761718,6.13027286529541,668.823569309414,224.958694458007 +01 01 2018 
08:40,712.058898925781,6.34707784652709,747.460673422601,216.803894042968 +01 01 2018 08:50,737.394775390625,6.34743690490722,747.595109122642,205.785293579101 +01 01 2018 09:00,725.868103027343,6.19436883926391,691.546334303948,199.848495483398 +01 01 2018 09:10,408.997406005859,4.97719812393188,330.417630427964,207.997802734375 +01 01 2018 09:20,628.436828613281,5.95911121368408,611.283836510667,210.954895019531 +01 01 2018 09:30,716.1005859375,6.21137619018554,697.649474372052,215.69400024414 +01 01 2018 09:40,711.49560546875,6.11145305633544,662.235163012206,220.84260559082 +01 01 2018 09:50,838.151916503906,6.45632219314575,789.011422412419,237.065307617187 +01 01 2018 10:00,881.062072753906,6.66665792465209,872.739625855708,235.667495727539 +01 01 2018 10:10,663.703125,6.16287899017333,680.327891653483,229.329696655273 +01 01 2018 10:20,578.261596679687,6.01316785812377,628.442560754699,234.900604248046 +01 01 2018 10:30,465.620086669921,5.56120300292968,486.779567601972,230.422805786132 +01 01 2018 10:40,311.050903320312,4.96073198318481,326.411025380213,229.537506103515 +01 01 2018 10:50,230.05549621582,4.60387516021728,244.31624421611,231.79849243164 +01 01 2018 11:00,233.990600585937,4.55453395843505,233.632780531927,234.105606079101 +01 01 2018 11:10,175.592193603515,4.26362895965576,173.573663122312,228.776702880859 +01 01 2018 11:20,118.133102416992,3.89413905143737,108.571221110423,227.938995361328 +01 01 2018 11:30,142.202499389648,4.03876113891601,130.229989593698,224.46499633789 +01 01 2018 11:40,212.566192626953,4.50565099716186,223.196784083793,224.950500488281 +01 01 2018 11:50,222.610000610351,4.54339790344238,231.242507343633,229.12759399414 +01 01 2018 12:00,194.181198120117,4.32376098632812,185.598479588255,227.039993286132 +01 01 2018 12:10,82.6407470703125,3.63443708419799,68.5028197987886,230.31460571289 +01 01 2018 12:20,75.8952178955078,3.70551204681396,78.3961653540173,233.953292846679 +01 01 2018 
12:30,41.9472389221191,3.25396800041198,29.2869556318446,233.06590270996 +01 01 2018 12:40,118.534599304199,3.77513694763183,88.8713653309387,227.753494262695 +01 01 2018 12:50,250.755905151367,4.69350099563598,264.119257409418,229.896606445312 +01 01 2018 13:00,346.86441040039,5.00293922424316,336.721998240131,235.279495239257 +01 01 2018 13:10,416.417907714843,5.36474990844726,430.92108895689,235.585296630859 +01 01 2018 13:20,331.941497802734,5.01618194580078,339.984940156412,229.942901611328 +01 01 2018 13:30,583.479919433593,5.97040796279907,615.05563084927,235.69529724121 +01 01 2018 13:40,776.552673339843,6.6555209159851,868.180844867276,241.457397460937 +01 01 2018 13:50,752.726379394531,6.60090398788452,846.029409522117,242.782104492187 +01 01 2018 14:00,589.073120117187,5.98137807846069,618.731442665699,234.984405517578 +01 01 2018 14:10,1109.12805175781,7.42459392547607,1219.19978672882,235.14729309082 +01 01 2018 14:20,1482.4599609375,8.18645191192626,1638.50890923271,238.479095458984 +01 01 2018 14:30,1523.43005371093,8.27493000030517,1691.1470390233,237.033203125 +01 01 2018 14:40,1572.17004394531,8.44920253753662,1796.76309010091,238.332397460937 +01 01 2018 14:50,1698.93994140625,8.5759744644165,1875.04719734159,235.641403198242 +01 01 2018 15:00,1616.84594726562,8.28225994110107,1695.53877696245,236.461395263671 +01 01 2018 15:10,1796.82397460937,8.73455238342285,1974.47580025242,234.354797363281 +01 01 2018 15:20,1885.86096191406,8.76410388946533,1993.17071186444,231.001602172851 +01 01 2018 15:30,2327.51196289062,9.66943168640136,2568.82712862015,227.60009765625 +01 01 2018 15:40,2499.162109375,10.1410903930664,2876.75361614448,227.73159790039 +01 01 2018 15:50,2820.51293945312,10.7724199295043,3186.02988321436,225.276397705078 +01 01 2018 16:00,2812.27905273437,10.6475200653076,3133.25922420184,224.680603027343 +01 01 2018 16:10,2530.44702148437,9.98266124725341,2781.27404078649,225.519500732421 +01 01 2018 
16:20,2399.12109375,9.87438583374023,2711.49245838958,227.273803710937 +01 01 2018 16:30,2335.587890625,9.78547954559326,2651.34100928894,229.255493164062 diff --git a/agentic_eda/jupyterlab_extension_backbend/src/format_datetime.py b/agentic_eda/jupyterlab_extension_backbend/src/format_datetime.py new file mode 100644 index 000000000..c3a1ed7e9 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backbend/src/format_datetime.py @@ -0,0 +1,220 @@ +import src.handle_inputs as handle_inputs +import pandas as pd +import numpy as np +from typing_extensions import TypedDict +from langgraph.graph import StateGraph, START, END +from pathlib import Path +from config.config import get_chat_model +from langchain.agents import create_agent +from langchain.tools import tool +from langchain_core.messages import HumanMessage +from tools.input_tools import load_dataset, extract_head +from pydantic import BaseModel, ConfigDict + +def _score_parse(dt: pd.Series) -> float: + # Force dtype to datetime (safe even if already datetime) + dt = pd.to_datetime(dt, errors="coerce") + + if dt.isna().all(): + return -1.0 + + parsed = dt.notna().mean() + + dmin, dmax = dt.min(), dt.max() + sane_range = 1.0 + if dmin < pd.Timestamp("1990-01-01") or dmax > pd.Timestamp("2035-01-01"): + sane_range = 0.7 + + dt2 = dt.dropna() + mono = 0.0 + if len(dt2) >= 3: + deltas = dt2.diff() # this is Timedelta series now + inversions = (deltas < pd.Timedelta(0)).mean() + mono = 1.0 - float(inversions) + + return float(parsed) * 0.65 + sane_range * 0.15 + mono * 0.20 + +class _Candidate(BaseModel): + model_config = ConfigDict(extra="forbid") + + # Keep all keys REQUIRED but nullable, to satisfy strict tool-schema validators. 
+ format: str | None + dayfirst: bool | None + yearfirst: bool | None + utc: bool + + +class _ParseWithCandidatesArgs(BaseModel): + model_config = ConfigDict(extra="forbid") + + path: str + col_name: str + candidates: list[_Candidate] + + +@tool(args_schema=_ParseWithCandidatesArgs) +def _parse_with_candidates(path: str, col_name: str, candidates: list[_Candidate]): + """ + Try multiple datetime parsing “candidates” for a single column and pick the best one. + + This helper normalizes the input series to strings, then iterates + over a list of candidate parse configurations (format/dayfirst/yearfirst/utc), parses the column + with `pandas.to_datetime`, and selects the candidate with the highest score as computed by `_score_parse`. + + Scoring (via `_score_parse`) favors: + - high parse success rate (fraction of non-NaT values), + - a “sane” min/max timestamp range (1990-01-01 through 2035-01-01), + - monotonicity / low rate of backwards time jumps (for columns that look like time). + + Parameters + ---------- + col: + A pandas Series containing the raw values for a single candidate + time column. + Values are coerced to `str`, stripped, and empty strings as well + as the literal + strings `"nan"` and `"NaT"` are treated as missing. + candidates: + A list of dicts, each describing one parsing attempt. Supported keys: + - "format": `str | None` + Passed to `pd.to_datetime(..., format=...)`. Common values: + - a strptime format (e.g. 
"%Y-%m-%d %H:%M:%S") + - "ISO8601" (pandas special value) + - "mixed" (pandas special value for per-element inference) + - None (let pandas infer) + - "dayfirst": `bool` (default False) + - "yearfirst": `bool` (default False) + - "utc": `bool` (default False) + + eg, + {"format": "%d %m %Y %H:%M", "dayfirst": None, "yearfirst": None, "utc": False} + {"format": "mixed", "dayfirst": True, "yearfirst": False, "utc": False} + {"format": "ISO8601", "dayfirst": None, "yearfirst": None, "utc": True} + + Returns + ------- + dict: + JSON-serializable summary of the best candidate: + - best_candidate: {format, dayfirst, yearfirst, utc} + - best_score: float + - parsed_fraction: float in [0, 1] + """ + _path = Path(path) + data = load_dataset(_path) + col: pd.Series = data[col_name] + best_score = -1.0 + best_meta = None + best_parsed_fraction = 0.0 + + s = col.astype(str).str.strip().replace({"": np.nan, "nan": np.nan, "NaT": np.nan}) + + for c in candidates: + c_dict = c if isinstance(c, dict) else c.model_dump() + fmt = c_dict.get("format", None) + dayfirst = c_dict.get("dayfirst", None) + yearfirst = c_dict.get("yearfirst", None) + utc = c_dict.get("utc", None) + + kwargs = {k: v for k, v in {"format": fmt, "dayfirst": dayfirst, "yearfirst": yearfirst, "utc": utc}.items() if v is not None} + + try: + dt = pd.to_datetime( + s, + errors="coerce", + **kwargs, + ) + except Exception: + continue + + sc = _score_parse(dt) + if sc > best_score: + best_score = sc + best_meta = c_dict + best_parsed_fraction = float(dt.notna().mean()) + + return { + "best_candidate": best_meta, + "best_score": float(best_score), + "parsed_fraction": float(best_parsed_fraction), + } + + +class DateFormatterState(TypedDict): + path: str + time_col: str + candidates: list[dict] + winner_formatter: dict + +class DateFormatterOutput(BaseModel): + model_config = ConfigDict(extra="forbid") + + candidates: list[_Candidate] + winner_formatter: _Candidate + +def run_formatting_agent( + state: 
DateFormatterState +): + system_prompt: str = """Use the tools at your disposal to convert the time column provided into a correct datetime format. The docstring for the function + has information on how to pass the arguments. To get an idea of formatting strings, use the extract_head tool as needed. + + Steps: + 1. Use extract_head to get an idea of what the temporal column looks like and create a list of dict candidates looking like: + [{"format": "%d %m %Y %H:%M", "dayfirst": None, "yearfirst": None, "utc": False}, + {"format": "mixed", "dayfirst": True, "yearfirst": False, "utc": False}, + ... + ] + + 2. Pass all the information needed by _parse_with_candidates and find out the winning format. e.g. {"format": "%d %m %Y %H:%M", "dayfirst": None, "yearfirst": None, "utc": False} + +""" + llm = get_chat_model(model="gpt-4.1") + agent = create_agent( + model = llm, + tools = [_parse_with_candidates, extract_head], + system_prompt=system_prompt, + response_format=DateFormatterOutput, + ) + + out = agent.invoke( + {"messages": [HumanMessage(content=f"The dataset path is {state['path']} and the time column name is {state['time_col']}")]} + ) + + sr = out["structured_response"].model_dump() + return {"candidates": sr["candidates"], "winner_formatter": sr["winner_formatter"]} + +def call_input_handler(state: DateFormatterState) -> dict: + # Call compiled subgraph like a function + out = handle_inputs.run_input_handler(state["path"]) + temporal_cols = out.get("temporal_cols") or [] + if not temporal_cols: + raise ValueError("No temporal columns found by input handler.") + return {"time_col": temporal_cols[0]} + +date_formatter = StateGraph(DateFormatterState) +date_formatter.add_node("input_handler", call_input_handler) +date_formatter.add_node("run_formatting_agent", run_formatting_agent) +date_formatter.add_edge(START, "input_handler") +date_formatter.add_edge("input_handler", "run_formatting_agent") +date_formatter.add_edge("run_formatting_agent", END) +graph = 
date_formatter.compile() + + +def run_date_formatter(path: str): + + inp = { + "path": path, + + } + out: DateFormatterState = graph.invoke(inp) #type: ignore + print(out["winner_formatter"]) + + _path = Path(path) + data = load_dataset(_path) + raw_args: dict = out["winner_formatter"] + format_args = {k: v for k, v in raw_args.items() if v is not None} + print(type(pd.to_datetime(data[out["time_col"]], **format_args))) # type: ignore + + +if __name__ == "__main__": + run_date_formatter("datasets/T1_slice.csv") + diff --git a/agentic_eda/jupyterlab_extension_backbend/src/handle_inputs.py b/agentic_eda/jupyterlab_extension_backbend/src/handle_inputs.py new file mode 100644 index 000000000..0a6665ec4 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backbend/src/handle_inputs.py @@ -0,0 +1,96 @@ +""" +Docstring for src.handle_inputs + +Input dataset checks to run: +- Does it have headers? If not throw error [Header Gate] +- What are the temporal, numeric value, and categorical headers? 
+ +""" + +from __future__ import annotations +import argparse +from typing import TypedDict +from langgraph.graph import START, END, StateGraph +from pathlib import Path +from tools.input_tools import extract_head, extract_metadata, headerAnalysis +from config.config import get_chat_model +from pydantic import BaseModel +from langchain.agents import create_agent +import pandas as pd +from langchain_core.messages import HumanMessage + +class InputState(TypedDict): + path: str | Path + done: list + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + +class LLMOutput(BaseModel): + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + + +def header_classification_agent( + state: InputState +) -> dict: + + llm = get_chat_model(model="gpt-4.1") + agent = create_agent( + model = llm, + tools = [extract_head, extract_metadata], + system_prompt="""You are a header classifier agent. Use any of the tools at your disposal to ultimately convey which columns are temporal, and of the remaining value columns which ones are purely numeric and which ones are categorical. The final output has all lists of columns. 
+ OUTPUT FORMAT: {"temporal_cols":["..."],"numeric_val_cols": ["..."],"categorical_val_cols":[]} + """, + response_format=LLMOutput, + ) + + out = agent.invoke( + {"messages": [HumanMessage(content=f"The dataset is in {state['path']}")]} + ) + + return out["structured_response"].model_dump() + +def error_node( + state: InputState +): + + print(state['error']) + +def hasHeader(state) -> bool: + return state['has_header'] + +def run_input_handler(path: str | Path): + g = StateGraph(InputState) + + g.add_node("headerAnalysis", headerAnalysis) + g.add_node("header_classification_agent", header_classification_agent) + g.add_node("error", error_node) + + g.add_edge(START, "headerAnalysis") + # g.add_edge("hasHeader", "header_classification_agent") + g.add_conditional_edges("headerAnalysis", hasHeader, {True: "header_classification_agent", False: "error"}) + g.add_edge("error", END) + g.add_edge("header_classification_agent", END) + + graph = g.compile() + + init: InputState = { #type: ignore + "path": path, + + } + + out = graph.invoke(init) + + print(out) + + return out + +if __name__ == "__main__": + run_input_handler('datasets/T1_slice.csv') diff --git a/agentic_eda/jupyterlab_extension_backbend/src/integrity.py b/agentic_eda/jupyterlab_extension_backbend/src/integrity.py new file mode 100644 index 000000000..94907775b --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backbend/src/integrity.py @@ -0,0 +1,224 @@ +import src.handle_inputs as handle_inputs +import src.format_datetime as format_datetime +import pandas as pd +import numpy as np +from typing import Literal +from typing_extensions import TypedDict +from langgraph.graph import StateGraph, START, END +from pathlib import Path +from tools.input_tools import load_dataset +from config.config import get_chat_model +from pydantic import BaseModel +from langchain.agents import create_agent +from langchain_core.messages import HumanMessage + + +class IntegrityState(TypedDict): + path: str + time_col: str + 
winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + + +class IntegrityJudgeOutput(BaseModel): + summary: str + flag: Literal["yes", "no"] + + +def call_date_formatter(state: IntegrityState) -> dict: + out: format_datetime.DateFormatterState = format_datetime.graph.invoke( # type: ignore + {"path": state["path"]} #type:ignore + ) + return {"time_col": out["time_col"], "winner_formatter": out["winner_formatter"]} + +def _maybe_infer_columns(state: IntegrityState) -> dict: + if state.get("numeric_cols"): + return {} + out = handle_inputs.run_input_handler(state["path"]) + numeric_cols = out.get("numeric_val_cols") or [] + return {"numeric_cols": numeric_cols} + + +def run_integrity_checks(state: IntegrityState) -> dict: + path = Path(state["path"]) + df = load_dataset(path) + + issues: list[dict] = [] + summary: dict = { + "n_rows": int(df.shape[0]), + "n_cols": int(df.shape[1]), + } + + if df.shape[0] == 0: + issues.append({"type": "empty_dataset", "msg": "Dataset has 0 rows"}) + return {"report": {"summary": summary, "issues": issues}} + + time_col = state.get("time_col") + if not time_col or time_col not in df.columns: + issues.append({"type": "missing_time_col", "msg": f"time_col missing: {time_col!r}"}) + return {"report": {"summary": summary, "issues": issues}} + format_args = state.get("winner_formatter") or {} + format_args = {k: v for k, v in format_args.items() if v is not None} + try: + ts = pd.to_datetime(df[time_col], errors="coerce", **format_args) + except Exception: + ts = pd.to_datetime(df[time_col], errors="coerce") + summary["n_nat_time"] = int(ts.isna().sum()) + summary["min_time"] = None if ts.dropna().empty else str(ts.dropna().min()) + summary["max_time"] = None if ts.dropna().empty else str(ts.dropna().max()) + + dup_ts = int(ts.dropna().duplicated().sum()) + summary["duplicate_timestamps"] = dup_ts + if dup_ts > 0: + 
issues.append({"type": "duplicate_timestamps", "count": dup_ts}) + + entity_col = state.get("entity_col") or None + if entity_col and entity_col in df.columns: + summary["n_entities"] = int(df[entity_col].nunique(dropna=True)) + tmp = df[[entity_col]].copy() + tmp["_ts"] = ts + dup_pairs = int(tmp.dropna(subset=[entity_col, "_ts"]).duplicated(subset=[entity_col, "_ts"]).sum()) + summary["duplicate_entity_timestamp_pairs"] = dup_pairs + if dup_pairs > 0: + issues.append({"type": "duplicate_entity_timestamp_pairs", "count": dup_pairs}) + else: + summary["duplicate_entity_timestamp_pairs"] = None + + numeric_cols = state.get("numeric_cols") or [] + numeric_cols = [c for c in numeric_cols if c in df.columns] + + nonnegative_cols = state.get("nonnegative_cols") or [] + neg_report: dict = {} + for c in nonnegative_cols: + if c not in df.columns: + continue + s = pd.to_numeric(df[c], errors="coerce") + nneg = int((s < 0).sum(skipna=True)) + if nneg > 0: + neg_report[c] = nneg + summary["negatives_in_nonnegative_cols"] = neg_report + if len(neg_report) > 0: + issues.append({"type": "negative_values", "details": neg_report}) + + jump_mult = float(state.get("jump_mult") or 20.0) + jumps: dict = {} + if numeric_cols: + tmp = df[[time_col] + ([entity_col] if entity_col and entity_col in df.columns else []) + numeric_cols].copy() + tmp["_ts"] = ts + sort_cols = ["_ts"] if not (entity_col and entity_col in tmp.columns) else [entity_col, "_ts"] + tmp = tmp.sort_values(sort_cols) + + for c in numeric_cols: + tmp[c] = pd.to_numeric(tmp[c], errors="coerce") + if entity_col and entity_col in tmp.columns: + diff = tmp.groupby(entity_col)[c].diff() + else: + diff = tmp[c].diff() + diff_abs = diff.abs() + + scale = diff_abs.median() + if pd.isna(scale) or float(scale) <= 0.0: + scale = diff_abs.mean() + if pd.isna(scale) or float(scale) <= 0.0: + continue + + threshold = float(scale) * float(jump_mult) + flag = diff_abs > threshold + n_flag = int(flag.sum(skipna=True)) + if n_flag <= 0: 
+ continue + + examples = [] + for i in tmp.index[flag.fillna(False)][:5]: + d = diff.loc[i] + curr = tmp.loc[i, c] + prev = None if pd.isna(d) or pd.isna(curr) else float(curr - d) + examples.append( + { + "col": c, + "entity": None if not (entity_col and entity_col in tmp.columns) else tmp.loc[i, entity_col], + "time": None if pd.isna(tmp.loc[i, "_ts"]) else str(tmp.loc[i, "_ts"]), + "prev": prev, + "curr": None if pd.isna(curr) else float(curr), #type:ignore + "diff": None if pd.isna(d) else float(d), + "threshold": float(threshold), + } + ) + + jumps[c] = {"count": n_flag, "threshold": threshold, "examples": examples} + issues.append({"type": "impossible_jumps", "col": c, "count": n_flag}) + + summary["jump_mult"] = float(jump_mult) + summary["jumps"] = jumps + + return {"report": {"summary": summary, "issues": issues}} + +def integrity_llm_summary(state: IntegrityState) -> dict: + llm = get_chat_model(model="gpt-4.1") + agent = create_agent( + model=llm, + tools=[], + system_prompt="""You are an integrity judge. +You get an integrity report dict from a dataset. +Decide if everything looks normal enough to proceed. + +Output format: +{ "summary": "...", "flag": "yes" or "no" } + +Rules: +- flag = "yes" only if the report has no meaningful integrity issues. +- flag = "no" if there are clear issues (duplicates, impossible jumps, bad timestamps, etc.). +- Keep summary short and direct. 
+""", + response_format=IntegrityJudgeOutput, + ) + out = agent.invoke( + { + "messages": [ + HumanMessage( + content=f"Here is the integrity report: {state['report']}" + ) + ] + } + ) + sr = out["structured_response"].model_dump() + return {"summary": sr["summary"], "flag": sr["flag"]} + + +integrity = StateGraph(IntegrityState) +integrity.add_node("date_formatter", call_date_formatter) +integrity.add_node("maybe_infer_columns", _maybe_infer_columns) +integrity.add_node("integrity", run_integrity_checks) +integrity.add_node("integrity_llm_summary", integrity_llm_summary) +integrity.add_edge(START, "date_formatter") +integrity.add_edge("date_formatter", "maybe_infer_columns") +integrity.add_edge("maybe_infer_columns", "integrity") +integrity.add_edge("integrity", "integrity_llm_summary") +integrity.add_edge("integrity_llm_summary", END) +graph = integrity.compile() + + +def run_integrity(path: str, time_col: str | None = None, entity_col: str | None = None): + init: IntegrityState = { # type: ignore + "path": path, + "time_col": time_col, #type:ignore + "winner_formatter": {}, + "entity_col": entity_col, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + } + out = graph.invoke(init) + print(out["report"]) + print({"summary": out["summary"], "flag": out["flag"]}) + return {"report": out["report"], "summary": out["summary"], "flag": out["flag"]} + + +if __name__ == "__main__": + run_integrity("datasets/T1_slice.csv") diff --git a/agentic_eda/jupyterlab_extension_backbend/tools/input_tools.py b/agentic_eda/jupyterlab_extension_backbend/tools/input_tools.py new file mode 100644 index 000000000..e37a0725d --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backbend/tools/input_tools.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +import json +from typing_extensions import Annotated +from langchain.tools import tool +from langgraph.prebuilt import InjectedState +import pandas as pd +from pathlib import Path +import re +# from 
src.handle_inputs import InputState + +# @tool +# def dataset_brief( +# question: str, +# dataset_meta: Annotated[dict, InjectedState("dataset_meta")], # (not visible to the LLM) +# ) -> str: +# """ +# Answer a question using system-provided dataset metadata +# """ +# # dataset_meta comes from state["dataset_meta"], injected at runtime +# payload = { +# "question": question, +# "n_rows": dataset_meta.get("n_rows"), +# "n_cols": dataset_meta.get("n_cols"), +# "columns": dataset_meta.get("columns"), +# "freq": dataset_meta.get("freq"), +# } +# return json.dumps(payload) + + +def load_dataset(path: Path) -> pd.DataFrame: + # Load dataset. + + ext = path.suffix.lower() + + if ext in {'.csv'}: + data = pd.read_csv(path) + # TODO: Extend to other types of data. + + return data + +def headerAnalysis( + state +) -> dict: + path = Path(state['path']) + data = load_dataset(path) + cols = list(data.columns) + has_header: bool = True + error: str = "" + _valid_start = re.compile(r"^[A-Za-z_]") + if all(isinstance(c, int) for c in cols) and cols == list(range(len(cols))): + has_header = False + error += "No column names;" + + return {'has_header': has_header, 'error': error} + + for i, c in enumerate(cols): + if c is None: + has_header = False + error += "One or more column names missing" + return {'has_header': has_header, 'error': error} + name = str(c).strip() + if name[0].isdigit() or not _valid_start.match(name): + has_header = False + error += "One or more column names missing (headers are numbers)" + return {'has_header': has_header, 'error': error} + + + + return {'has_header': has_header, 'dataset': data} + + +@tool +def extract_metadata( + path: str +) -> dict: + """ + Return minimal dataset metadata. 
+ + Only includes: + - number of rows + - number of columns + - number of unique values per column + + :param dataset: dataset to process + :return: metadata + """ + d_path = Path(path) + dataset = load_dataset(d_path) + n_rows, n_cols = dataset.shape + nunique = dataset.nunique(dropna=True) + nunique_map = {str(col): int(nunique[col]) for col in nunique.index} + + return { + "n_rows": int(n_rows), + "n_cols": int(n_cols), + "n_unique": nunique_map, + } + +@tool +def extract_head( + path: str, + n: int = 5 +) -> dict: + """ + Return dataset head + + :param dataset: dataset to process + :param n: number of head rows + :return: the first n rows + """ + d_path = Path(path) + dataset = load_dataset(d_path) + n_int = int(n) + if n_int <= 0: + n_int = 5 + n_int = min(n_int, 50) + + head = dataset.head(n_int) + # Use to_json so datetimes become ISO strings and NaNs become null-ish. + rows = json.loads(head.to_json(orient="records", date_format="iso")) + return { + "n": n_int, + "columns": [str(c) for c in head.columns.tolist()], + "rows": rows, + } From 699106330090d865237bde02d98c2a0a01f52789 Mon Sep 17 00:00:00 2001 From: Indrayudd Roy Chowdhury Date: Thu, 19 Feb 2026 15:54:01 -0500 Subject: [PATCH 2/5] TutorTask696: Fix typo on folder name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../.gitignore | 0 .../config/config.py | 0 .../datasets/T1_slice.csv | 0 .../src/format_datetime.py | 0 .../src/handle_inputs.py | 0 .../src/integrity.py | 0 .../tools/input_tools.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename agentic_eda/{jupyterlab_extension_backbend => jupyterlab_extension_backend}/.gitignore (100%) rename agentic_eda/{jupyterlab_extension_backbend => jupyterlab_extension_backend}/config/config.py (100%) rename agentic_eda/{jupyterlab_extension_backbend => jupyterlab_extension_backend}/datasets/T1_slice.csv (100%) rename agentic_eda/{jupyterlab_extension_backbend => 
jupyterlab_extension_backend}/src/format_datetime.py (100%) rename agentic_eda/{jupyterlab_extension_backbend => jupyterlab_extension_backend}/src/handle_inputs.py (100%) rename agentic_eda/{jupyterlab_extension_backbend => jupyterlab_extension_backend}/src/integrity.py (100%) rename agentic_eda/{jupyterlab_extension_backbend => jupyterlab_extension_backend}/tools/input_tools.py (100%) diff --git a/agentic_eda/jupyterlab_extension_backbend/.gitignore b/agentic_eda/jupyterlab_extension_backend/.gitignore similarity index 100% rename from agentic_eda/jupyterlab_extension_backbend/.gitignore rename to agentic_eda/jupyterlab_extension_backend/.gitignore diff --git a/agentic_eda/jupyterlab_extension_backbend/config/config.py b/agentic_eda/jupyterlab_extension_backend/config/config.py similarity index 100% rename from agentic_eda/jupyterlab_extension_backbend/config/config.py rename to agentic_eda/jupyterlab_extension_backend/config/config.py diff --git a/agentic_eda/jupyterlab_extension_backbend/datasets/T1_slice.csv b/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv similarity index 100% rename from agentic_eda/jupyterlab_extension_backbend/datasets/T1_slice.csv rename to agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv diff --git a/agentic_eda/jupyterlab_extension_backbend/src/format_datetime.py b/agentic_eda/jupyterlab_extension_backend/src/format_datetime.py similarity index 100% rename from agentic_eda/jupyterlab_extension_backbend/src/format_datetime.py rename to agentic_eda/jupyterlab_extension_backend/src/format_datetime.py diff --git a/agentic_eda/jupyterlab_extension_backbend/src/handle_inputs.py b/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py similarity index 100% rename from agentic_eda/jupyterlab_extension_backbend/src/handle_inputs.py rename to agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py diff --git a/agentic_eda/jupyterlab_extension_backbend/src/integrity.py 
b/agentic_eda/jupyterlab_extension_backend/src/integrity.py similarity index 100% rename from agentic_eda/jupyterlab_extension_backbend/src/integrity.py rename to agentic_eda/jupyterlab_extension_backend/src/integrity.py diff --git a/agentic_eda/jupyterlab_extension_backbend/tools/input_tools.py b/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py similarity index 100% rename from agentic_eda/jupyterlab_extension_backbend/tools/input_tools.py rename to agentic_eda/jupyterlab_extension_backend/tools/input_tools.py From d1349b4da8b80f3d0b51e9614eaec51b93645741 Mon Sep 17 00:00:00 2001 From: Indrayudd Roy Chowdhury Date: Thu, 26 Feb 2026 15:48:00 -0500 Subject: [PATCH 3/5] TutorTask696: Align code with coding guidelines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../jupyterlab_extension_backend/README.md | 19 + .../config/config.py | 161 ++++--- .../src/format_datetime.py | 343 ++++++++------- .../src/handle_inputs.py | 182 +++++--- .../src/integrity.py | 396 +++++++++++------- .../jupyterlab_extension_backend/src/main.py | 78 ++++ .../tools/input_tools.py | 206 +++++---- 7 files changed, 842 insertions(+), 543 deletions(-) create mode 100644 agentic_eda/jupyterlab_extension_backend/README.md create mode 100644 agentic_eda/jupyterlab_extension_backend/src/main.py diff --git a/agentic_eda/jupyterlab_extension_backend/README.md b/agentic_eda/jupyterlab_extension_backend/README.md new file mode 100644 index 000000000..d3a9b1185 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/README.md @@ -0,0 +1,19 @@ +# JupyterLab Extension Backend + +Run the backend entrypoint from this directory: + +```bash +cd /Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend +python -m src.main \ + --mode integrity \ + --path /Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv +``` + +If you run from a different directory, set 
`PYTHONPATH`: + +```bash +PYTHONPATH=/Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend \ +python -m src.main \ + --mode integrity \ + --path /Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv +``` diff --git a/agentic_eda/jupyterlab_extension_backend/config/config.py b/agentic_eda/jupyterlab_extension_backend/config/config.py index 8f66572e9..56a61fabe 100644 --- a/agentic_eda/jupyterlab_extension_backend/config/config.py +++ b/agentic_eda/jupyterlab_extension_backend/config/config.py @@ -1,133 +1,128 @@ -import os +""" +Import as: + +import config.config as cconf +""" + import dataclasses import functools -import pydantic +import os import dotenv +import langchain_anthropic +import langchain_google_genai import langchain_openai -import langchain_anthropic # ChatAnthropic -import langchain_google_genai # ChatGoogleGenerativeAI -# import langchain_groq # ChatGroq -# import langchain_mistralai # ChatMistralAI -# import langchain_ollama # ChatOllama - +import pydantic dataclass = dataclasses.dataclass lru_cache = functools.lru_cache ChatOpenAI = langchain_openai.ChatOpenAI ChatAnthropic = langchain_anthropic.ChatAnthropic ChatGoogleGenerativeAI = langchain_google_genai.ChatGoogleGenerativeAI -# ChatGroq = langchain_groq.ChatGroq -# ChatMistralAI = langchain_mistralai.ChatMistralAI -# ChatOllama = langchain_ollama.ChatOllama SecretStr = pydantic.SecretStr -# Load Variables dotenv.load_dotenv() -# Immutable data class @dataclass(frozen=True) class Settings: + """ + Store model provider settings. + """ + provider: str model: str temperature: float timeout: float max_retries: int -def _need(name:str) -> str: - v = os.getenv(name) - if v is None or v == "": + +def _need(name: str) -> str: + """ + Read a required environment variable. 
+ + :param name: environment variable name + :return: environment variable value + """ + value = os.getenv(name) + if value is None or value == "": raise RuntimeError(f"Missing required environment variable: {name}") - return v + return value + @lru_cache(maxsize=1) def get_settings() -> Settings: - return Settings( + """ + Build settings from environment variables. + + :return: configured settings + """ + settings = Settings( provider=os.getenv("LLM_PROVIDER", "openai"), model=os.getenv("LLM_MODEL", "gpt-5-nano"), temperature=float(os.getenv("LLM_TEMP", 0.2)), timeout=float(os.getenv("LLM_TIMEOUT", 60)), max_retries=int(os.getenv("LLM_MAX_RETRIES", 2)), - ) + return settings -@lru_cache(maxsize=1) -def get_chat_model(model=get_settings().model): - s = get_settings() - - # OpenAI-adjacent - if s.provider == "openai": - - # READ API KEY. +@lru_cache(maxsize=1) +def get_chat_model(*, model: str | None = None) -> object: + """ + Build the configured chat model client. + + :param model: optional model override + :return: langchain chat model client + """ + settings = get_settings() + model_name = settings.model if model is None else model + provider = settings.provider + if provider == "openai": _need("OPENAI_API_KEY") - - # Return the chatmodel - - return ChatOpenAI( - model=s.model, - temperature=s.temperature, - timeout=s.timeout, - max_retries=s.max_retries, + chat_model = ChatOpenAI( + model=model_name, + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, ) - - if s.provider == "openai_compatible": - - # Secrets. 
+ elif provider == "openai_compatible": base_url = _need("OPENAI_COMPAT_BASE_URL") api_key = _need("OPENAI_COMPAT_API_KEY") - return ChatOpenAI( - model=model, + chat_model = ChatOpenAI( + model=model_name, base_url=base_url, api_key=SecretStr(api_key), - temperature=s.temperature, - timeout=s.timeout, - max_retries=s.max_retries, - + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, ) - - if s.provider == "azure_openai_v1": - - # Secrets. + elif provider == "azure_openai_v1": azure_base = _need("AZURE_OPENAI_BASE_URL") azure_key = SecretStr(_need("AZURE_OPENAI_API_KEY")) - - return ChatOpenAI( - model=s.model, + chat_model = ChatOpenAI( + model=model_name, base_url=azure_base, api_key=azure_key, - temperature=s.temperature, - timeout=s.timeout, - max_retries=s.max_retries, - + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, ) - - # Anthropic - - if s.provider == "anthropic": - - # Secrets. - _need("ANTHROPIC_API_KEY") - return ChatAnthropic( - model_name=s.model, - temperature=s.temperature, - timeout=s.timeout, - max_retries=s.max_retries, - stop=None - ) - - # Google - if s.provider in ("google", "gemini", "google_genai"): - # Secrets. 
+ elif provider == "anthropic": + _need("ANTHROPIC_API_KEY") + chat_model = ChatAnthropic( + model_name=model_name, + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, + stop=None, + ) + elif provider in ("google", "gemini", "google_genai"): _need("GOOGLE_API_KEY") - return ChatGoogleGenerativeAI( - model=s.model, - temperature=s.temperature, + chat_model = ChatGoogleGenerativeAI( + model=model_name, + temperature=settings.temperature, ) - - - - - - raise ValueError("TODO(*): expand support!") + else: + raise ValueError(f"Unsupported provider='{provider}'") + return chat_model diff --git a/agentic_eda/jupyterlab_extension_backend/src/format_datetime.py b/agentic_eda/jupyterlab_extension_backend/src/format_datetime.py index c3a1ed7e9..8b538ff35 100644 --- a/agentic_eda/jupyterlab_extension_backend/src/format_datetime.py +++ b/agentic_eda/jupyterlab_extension_backend/src/format_datetime.py @@ -1,220 +1,251 @@ -import src.handle_inputs as handle_inputs -import pandas as pd +""" +Import as: + +import src.format_datetime as sfordat +""" + +import logging +import pathlib +from typing import TypedDict + +import langchain.agents as lagents +import langchain.tools as ltools +import langchain_core.messages as lmessages +import langgraph.graph as lgraph import numpy as np -from typing_extensions import TypedDict -from langgraph.graph import StateGraph, START, END -from pathlib import Path -from config.config import get_chat_model -from langchain.agents import create_agent -from langchain.tools import tool -from langchain_core.messages import HumanMessage -from tools.input_tools import load_dataset, extract_head -from pydantic import BaseModel, ConfigDict +import pandas as pd +import pydantic -def _score_parse(dt: pd.Series) -> float: - # Force dtype to datetime (safe even if already datetime) - dt = pd.to_datetime(dt, errors="coerce") +import config.config as cconf +import src.handle_inputs as shainp +import tools.input_tools as 
tinptool - if dt.isna().all(): - return -1.0 +_LOG = logging.getLogger(__name__) - parsed = dt.notna().mean() - dmin, dmax = dt.min(), dt.max() - sane_range = 1.0 - if dmin < pd.Timestamp("1990-01-01") or dmax > pd.Timestamp("2035-01-01"): - sane_range = 0.7 +def _score_parse(dt: pd.Series) -> float: + """ + Score datetime parse quality. - dt2 = dt.dropna() - mono = 0.0 - if len(dt2) >= 3: - deltas = dt2.diff() # this is Timedelta series now - inversions = (deltas < pd.Timedelta(0)).mean() - mono = 1.0 - float(inversions) + :param dt: candidate datetime series + :return: score where larger means better + """ + datetime_series = pd.to_datetime(dt, errors="coerce", utc=True) + if datetime_series.isna().all(): + score = -1.0 + return score + parsed_fraction = float(datetime_series.notna().mean()) + min_timestamp = datetime_series.min() + max_timestamp = datetime_series.max() + range_score = 1.0 + min_bound = pd.Timestamp("1990-01-01", tz="UTC") + max_bound = pd.Timestamp("2035-01-01", tz="UTC") + if min_timestamp < min_bound or max_timestamp > max_bound: + range_score = 0.7 + datetime_no_na = datetime_series.dropna() + monotonic_score = 0.0 + if len(datetime_no_na) >= 3: + deltas = datetime_no_na.diff() + inversions = float((deltas < pd.Timedelta(0)).mean()) + monotonic_score = 1.0 - inversions + score = ( + parsed_fraction * 0.65 + range_score * 0.15 + monotonic_score * 0.20 + ) + return float(score) - return float(parsed) * 0.65 + sane_range * 0.15 + mono * 0.20 -class _Candidate(BaseModel): - model_config = ConfigDict(extra="forbid") +class _Candidate(pydantic.BaseModel): + """ + Store one datetime parse candidate. + """ - # Keep all keys REQUIRED but nullable, to satisfy strict tool-schema validators. 
+ model_config = pydantic.ConfigDict(extra="forbid") format: str | None dayfirst: bool | None yearfirst: bool | None utc: bool -class _ParseWithCandidatesArgs(BaseModel): - model_config = ConfigDict(extra="forbid") +class _ParseWithCandidatesArgs(pydantic.BaseModel): + """ + Store tool arguments for candidate parsing. + """ + model_config = pydantic.ConfigDict(extra="forbid") path: str col_name: str candidates: list[_Candidate] -@tool(args_schema=_ParseWithCandidatesArgs) -def _parse_with_candidates(path: str, col_name: str, candidates: list[_Candidate]): - """ - Try multiple datetime parsing “candidates” for a single column and pick the best one. - - This helper normalizes the input series to strings, then iterates - over a list of candidate parse configurations (format/dayfirst/yearfirst/utc), parses the column - with `pandas.to_datetime`, and selects the candidate with the highest score as computed by `_score_parse`. - - Scoring (via `_score_parse`) favors: - - high parse success rate (fraction of non-NaT values), - - a “sane” min/max timestamp range (1990-01-01 through 2035-01-01), - - monotonicity / low rate of backwards time jumps (for columns that look like time). - - Parameters - ---------- - col: - A pandas Series containing the raw values for a single candidate - time column. - Values are coerced to `str`, stripped, and empty strings as well - as the literal - strings `"nan"` and `"NaT"` are treated as missing. - candidates: - A list of dicts, each describing one parsing attempt. Supported keys: - - "format": `str | None` - Passed to `pd.to_datetime(..., format=...)`. Common values: - - a strptime format (e.g. 
"%Y-%m-%d %H:%M:%S") - - "ISO8601" (pandas special value) - - "mixed" (pandas special value for per-element inference) - - None (let pandas infer) - - "dayfirst": `bool` (default False) - - "yearfirst": `bool` (default False) - - "utc": `bool` (default False) - - eg, - {"format": "%d %m %Y %H:%M", "dayfirst": None, "yearfirst": None, "utc": False} - {"format": "mixed", "dayfirst": True, "yearfirst": False, "utc": False} - {"format": "ISO8601", "dayfirst": None, "yearfirst": None, "utc": True} - - Returns - ------- - dict: - JSON-serializable summary of the best candidate: - - best_candidate: {format, dayfirst, yearfirst, utc} - - best_score: float - - parsed_fraction: float in [0, 1] - """ - _path = Path(path) - data = load_dataset(_path) - col: pd.Series = data[col_name] +@ltools.tool(args_schema=_ParseWithCandidatesArgs) +def _parse_with_candidates( + path: str, + col_name: str, + candidates: list[_Candidate], +) -> dict: + """ + Parse one column with multiple datetime candidates and pick the best. 
+ + :param path: dataset path + :param col_name: target column name + :param candidates: parse candidates + :return: best candidate summary + """ + dataset_path = pathlib.Path(path) + dataset = tinptool.load_dataset(dataset_path) + col = dataset[col_name] best_score = -1.0 - best_meta = None + best_candidate = None best_parsed_fraction = 0.0 - - s = col.astype(str).str.strip().replace({"": np.nan, "nan": np.nan, "NaT": np.nan}) - - for c in candidates: - c_dict = c if isinstance(c, dict) else c.model_dump() - fmt = c_dict.get("format", None) - dayfirst = c_dict.get("dayfirst", None) - yearfirst = c_dict.get("yearfirst", None) - utc = c_dict.get("utc", None) - - kwargs = {k: v for k, v in {"format": fmt, "dayfirst": dayfirst, "yearfirst": yearfirst, "utc": utc}.items() if v is not None} - + series = col.astype(str).str.strip().replace( + { + "": np.nan, + "nan": np.nan, + "NaT": np.nan, + } + ) + for candidate in candidates: + candidate_dict = candidate.model_dump() + format_val = candidate_dict["format"] + dayfirst_val = candidate_dict["dayfirst"] + yearfirst_val = candidate_dict["yearfirst"] + utc_val = candidate_dict["utc"] + kwargs = { + key: val + for key, val in { + "format": format_val, + "dayfirst": dayfirst_val, + "yearfirst": yearfirst_val, + "utc": utc_val, + }.items() + if val is not None + } try: - dt = pd.to_datetime( - s, + datetime_series = pd.to_datetime( + series, errors="coerce", **kwargs, ) except Exception: continue - - sc = _score_parse(dt) - if sc > best_score: - best_score = sc - best_meta = c_dict - best_parsed_fraction = float(dt.notna().mean()) - - return { - "best_candidate": best_meta, + score = _score_parse(datetime_series) + if score > best_score: + best_score = score + best_candidate = candidate_dict + best_parsed_fraction = float(datetime_series.notna().mean()) + payload = { + "best_candidate": best_candidate, "best_score": float(best_score), "parsed_fraction": float(best_parsed_fraction), } + return payload class 
DateFormatterState(TypedDict): + """ + Store graph state for datetime formatting. + """ + path: str time_col: str candidates: list[dict] winner_formatter: dict -class DateFormatterOutput(BaseModel): - model_config = ConfigDict(extra="forbid") +class DateFormatterOutput(pydantic.BaseModel): + """ + Store structured formatter output. + """ + + model_config = pydantic.ConfigDict(extra="forbid") candidates: list[_Candidate] winner_formatter: _Candidate -def run_formatting_agent( - state: DateFormatterState -): - system_prompt: str = """Use the tools at your disposal to convert the time column provided into a correct datetime format. The docstring for the function - has information on how to pass the arguments. To get an idea of formatting strings, use the extract_head tool as needed. - - Steps: - 1. Use extract_head to get an idea of what the temporal column looks like and create a list of dict candidates looking like: - [{"format": "%d %m %Y %H:%M", "dayfirst": None, "yearfirst": None, "utc": False}, - {"format": "mixed", "dayfirst": True, "yearfirst": False, "utc": False}, - ... - ] - - 2. Pass all the information needed by _parse_with_candidates and find out the winning format. e.g. {"format": "%d %m %Y %H:%M", "dayfirst": None, "yearfirst": None, "utc": False} - -""" - llm = get_chat_model(model="gpt-4.1") - agent = create_agent( - model = llm, - tools = [_parse_with_candidates, extract_head], + +def run_formatting_agent(state: DateFormatterState) -> dict: + """ + Run LLM tool-calling to find the best datetime parser. + + :param state: formatter graph state + :return: candidate list and winner formatter + """ + system_prompt = ( + "Use tools to convert the provided time column into a correct datetime " + "format.\n" + "1. Use extract_head to inspect the temporal column and propose parse " + "candidates.\n" + "2. Call _parse_with_candidates with those candidates.\n" + "3. Return all candidates and the winning formatter." 
+ ) + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[_parse_with_candidates, tinptool.extract_head], system_prompt=system_prompt, response_format=DateFormatterOutput, ) - out = agent.invoke( - {"messages": [HumanMessage(content=f"The dataset path is {state['path']} and the time column name is {state['time_col']}")]} + { + "messages": [ + lmessages.HumanMessage( + content=( + f"The dataset path is {state['path']} and the time " + f"column name is {state['time_col']}" + ) + ) + ] + } ) + structured_response = out["structured_response"].model_dump() + payload = { + "candidates": structured_response["candidates"], + "winner_formatter": structured_response["winner_formatter"], + } + return payload - sr = out["structured_response"].model_dump() - return {"candidates": sr["candidates"], "winner_formatter": sr["winner_formatter"]} def call_input_handler(state: DateFormatterState) -> dict: - # Call compiled subgraph like a function - out = handle_inputs.run_input_handler(state["path"]) + """ + Run input handler and pick the first temporal column. 
+ + :param state: formatter graph state + :return: selected temporal column + """ + out = shainp.run_input_handler(state["path"]) temporal_cols = out.get("temporal_cols") or [] if not temporal_cols: raise ValueError("No temporal columns found by input handler.") - return {"time_col": temporal_cols[0]} + payload = {"time_col": temporal_cols[0]} + return payload + -date_formatter = StateGraph(DateFormatterState) +date_formatter = lgraph.StateGraph(DateFormatterState) date_formatter.add_node("input_handler", call_input_handler) date_formatter.add_node("run_formatting_agent", run_formatting_agent) -date_formatter.add_edge(START, "input_handler") +date_formatter.add_edge(lgraph.START, "input_handler") date_formatter.add_edge("input_handler", "run_formatting_agent") -date_formatter.add_edge("run_formatting_agent", END) +date_formatter.add_edge("run_formatting_agent", lgraph.END) graph = date_formatter.compile() -def run_date_formatter(path: str): - - inp = { - "path": path, +def run_date_formatter(path: str) -> dict: + """ + Execute datetime formatter graph and parse the selected time column. 
+ :param path: dataset path + :return: output including selected formatter and parsed dtype + """ + graph_in = {"path": path} + out: DateFormatterState = graph.invoke(graph_in) # type: ignore[assignment] + dataset_path = pathlib.Path(path) + dataset = tinptool.load_dataset(dataset_path) + raw_args = out["winner_formatter"] + format_args = {key: val for key, val in raw_args.items() if val is not None} + parsed_time = pd.to_datetime(dataset[out["time_col"]], **format_args) + payload = { + "time_col": out["time_col"], + "winner_formatter": out["winner_formatter"], + "parsed_dtype": str(parsed_time.dtype), } - out: DateFormatterState = graph.invoke(inp) #type: ignore - print(out["winner_formatter"]) - - _path = Path(path) - data = load_dataset(_path) - raw_args: dict = out["winner_formatter"] - format_args = {k: v for k, v in raw_args.items() if v is not None} - print(type(pd.to_datetime(data[out["time_col"]], **format_args))) # type: ignore - - -if __name__ == "__main__": - run_date_formatter("datasets/T1_slice.csv") - + _LOG.info("Date formatter output: %s", payload) + return payload diff --git a/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py b/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py index 0a6665ec4..5c3e6ba68 100644 --- a/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py +++ b/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py @@ -1,96 +1,168 @@ """ -Docstring for src.handle_inputs - -Input dataset checks to run: -- Does it have headers? If not throw error [Header Gate] -- What are the temporal, numeric value, and categorical headers? 
+Import as: +import src.handle_inputs as shainp """ from __future__ import annotations + import argparse +import logging +import pathlib from typing import TypedDict -from langgraph.graph import START, END, StateGraph -from pathlib import Path -from tools.input_tools import extract_head, extract_metadata, headerAnalysis -from config.config import get_chat_model -from pydantic import BaseModel -from langchain.agents import create_agent -import pandas as pd -from langchain_core.messages import HumanMessage + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pydantic + +import config.config as cconf +import tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + class InputState(TypedDict): - path: str | Path - done: list + """ + Store graph state for input checks. + """ + + path: str | pathlib.Path + done: list[str] has_header: bool has_missing_values: bool error: str info: str - cols: list + cols: list[str] temporal_cols: list[str] numeric_val_cols: list[str] categorical_val_cols: list[str] -class LLMOutput(BaseModel): + +class LLMOutput(pydantic.BaseModel): + """ + Store structured output from the header classifier. + """ + temporal_cols: list[str] numeric_val_cols: list[str] categorical_val_cols: list[str] -def header_classification_agent( - state: InputState -) -> dict: - - llm = get_chat_model(model="gpt-4.1") - agent = create_agent( - model = llm, - tools = [extract_head, extract_metadata], - system_prompt="""You are a header classifier agent. Use any of the tools at your disposal to ultimately convey which columns are temporal, and of the remaining value columns which ones are purely numeric and which ones are categorical. The final output has all lists of columns. 
- OUTPUT FORMAT: {"temporal_cols":["..."],"numeric_val_cols": ["..."],"categorical_val_cols":[]} - """, +def header_classification_agent(state: InputState) -> dict: + """ + Classify temporal, numeric, and categorical columns. + + :param state: input graph state + :return: column classification payload + """ + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[tinptool.extract_head, tinptool.extract_metadata], + system_prompt=( + "You are a header classifier agent. Use tools to identify temporal " + "columns and classify the remaining value columns as numeric or " + "categorical. Output JSON with keys temporal_cols, " + "numeric_val_cols, and categorical_val_cols." + ), response_format=LLMOutput, ) - out = agent.invoke( - {"messages": [HumanMessage(content=f"The dataset is in {state['path']}")]} + { + "messages": [ + lmessages.HumanMessage( + content=f"The dataset is in {state['path']}" + ) + ] + } ) + result = out["structured_response"].model_dump() + return result - return out["structured_response"].model_dump() -def error_node( - state: InputState -): - - print(state['error']) +def error_node(state: InputState) -> dict: + """ + Log an error node transition. -def hasHeader(state) -> bool: - return state['has_header'] + :param state: input graph state + :return: empty update + """ + _LOG.error("Input handler failed: %s", state["error"]) + return {} -def run_input_handler(path: str | Path): - g = StateGraph(InputState) - g.add_node("headerAnalysis", headerAnalysis) - g.add_node("header_classification_agent", header_classification_agent) - g.add_node("error", error_node) +def has_header(state: InputState) -> bool: + """ + Check if header validation passed. 
- g.add_edge(START, "headerAnalysis") - # g.add_edge("hasHeader", "header_classification_agent") - g.add_conditional_edges("headerAnalysis", hasHeader, {True: "header_classification_agent", False: "error"}) - g.add_edge("error", END) - g.add_edge("header_classification_agent", END) + :param state: input graph state + :return: true when headers are valid + """ + has_header_flag = state["has_header"] + return has_header_flag - graph = g.compile() - init: InputState = { #type: ignore - "path": path, +def run_input_handler(path: str | pathlib.Path) -> dict: + """ + Run dataset header and column classification checks. + :param path: path to dataset + :return: final graph output + """ + graph_builder = lgraph.StateGraph(InputState) + graph_builder.add_node("header_analysis", tinptool.analyze_header) + graph_builder.add_node( + "header_classification_agent", + header_classification_agent, + ) + graph_builder.add_node("error", error_node) + graph_builder.add_edge(lgraph.START, "header_analysis") + graph_builder.add_conditional_edges( + "header_analysis", + has_header, + { + True: "header_classification_agent", + False: "error", + }, + ) + graph_builder.add_edge("error", lgraph.END) + graph_builder.add_edge("header_classification_agent", lgraph.END) + graph = graph_builder.compile() + init_state: InputState = { + "path": str(path), + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], } + out = graph.invoke(init_state) + _LOG.info("Input handler output: %s", out) + return out - out = graph.invoke(init) - print(out) +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. 
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args - return out if __name__ == "__main__": - run_input_handler('datasets/T1_slice.csv') + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_input_handler(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/integrity.py b/agentic_eda/jupyterlab_extension_backend/src/integrity.py index 94907775b..c606de1ed 100644 --- a/agentic_eda/jupyterlab_extension_backend/src/integrity.py +++ b/agentic_eda/jupyterlab_extension_backend/src/integrity.py @@ -1,21 +1,35 @@ -import src.handle_inputs as handle_inputs -import src.format_datetime as format_datetime -import pandas as pd -import numpy as np +""" +Import as: + +import src.integrity as sinteg +""" + +import logging +import pathlib from typing import Literal -from typing_extensions import TypedDict -from langgraph.graph import StateGraph, START, END -from pathlib import Path -from tools.input_tools import load_dataset -from config.config import get_chat_model -from pydantic import BaseModel -from langchain.agents import create_agent -from langchain_core.messages import HumanMessage +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pandas as pd +import pydantic + +import config.config as cconf +import src.format_datetime as sfordat +import src.handle_inputs as shainp +import tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) class IntegrityState(TypedDict): + """ + Store graph state for integrity checks. 
+ """ + path: str - time_col: str + time_col: str | None winner_formatter: dict entity_col: str | None numeric_cols: list[str] @@ -26,199 +40,295 @@ class IntegrityState(TypedDict): flag: str -class IntegrityJudgeOutput(BaseModel): +class IntegrityJudgeOutput(pydantic.BaseModel): + """ + Store structured LLM judgment. + """ + summary: str flag: Literal["yes", "no"] def call_date_formatter(state: IntegrityState) -> dict: - out: format_datetime.DateFormatterState = format_datetime.graph.invoke( # type: ignore - {"path": state["path"]} #type:ignore + """ + Run the datetime formatter graph. + + :param state: integrity graph state + :return: selected time column and formatter + """ + out: sfordat.DateFormatterState = sfordat.graph.invoke( # type: ignore + {"path": state["path"]} ) - return {"time_col": out["time_col"], "winner_formatter": out["winner_formatter"]} + payload = { + "time_col": out["time_col"], + "winner_formatter": out["winner_formatter"], + } + return payload + def _maybe_infer_columns(state: IntegrityState) -> dict: + """ + Infer numeric columns when they are not provided. + + :param state: integrity graph state + :return: optional numeric column update + """ if state.get("numeric_cols"): - return {} - out = handle_inputs.run_input_handler(state["path"]) - numeric_cols = out.get("numeric_val_cols") or [] - return {"numeric_cols": numeric_cols} + payload = {} + else: + out = shainp.run_input_handler(state["path"]) + numeric_cols = out.get("numeric_val_cols") or [] + payload = {"numeric_cols": numeric_cols} + return payload def run_integrity_checks(state: IntegrityState) -> dict: - path = Path(state["path"]) - df = load_dataset(path) - + """ + Run deterministic integrity checks on a dataset. 
+ + :param state: integrity graph state + :return: report payload + """ + dataset_path = pathlib.Path(state["path"]) + dataset = tinptool.load_dataset(dataset_path) issues: list[dict] = [] summary: dict = { - "n_rows": int(df.shape[0]), - "n_cols": int(df.shape[1]), + "n_rows": int(dataset.shape[0]), + "n_cols": int(dataset.shape[1]), } - - if df.shape[0] == 0: - issues.append({"type": "empty_dataset", "msg": "Dataset has 0 rows"}) - return {"report": {"summary": summary, "issues": issues}} - + if dataset.shape[0] == 0: + issues.append({"type": "empty_dataset", "msg": "Dataset has 0 rows."}) + report = {"summary": summary, "issues": issues} + payload = {"report": report} + return payload time_col = state.get("time_col") - if not time_col or time_col not in df.columns: - issues.append({"type": "missing_time_col", "msg": f"time_col missing: {time_col!r}"}) - return {"report": {"summary": summary, "issues": issues}} + if time_col is None or time_col not in dataset.columns: + issues.append( + { + "type": "missing_time_col", + "msg": f"time_col missing: {time_col!r}", + } + ) + report = {"summary": summary, "issues": issues} + payload = {"report": report} + return payload format_args = state.get("winner_formatter") or {} - format_args = {k: v for k, v in format_args.items() if v is not None} + format_args = { + key: val + for key, val in format_args.items() + if val is not None + } try: - ts = pd.to_datetime(df[time_col], errors="coerce", **format_args) + timestamp = pd.to_datetime( + dataset[time_col], + errors="coerce", + **format_args, + ) except Exception: - ts = pd.to_datetime(df[time_col], errors="coerce") - summary["n_nat_time"] = int(ts.isna().sum()) - summary["min_time"] = None if ts.dropna().empty else str(ts.dropna().min()) - summary["max_time"] = None if ts.dropna().empty else str(ts.dropna().max()) - - dup_ts = int(ts.dropna().duplicated().sum()) - summary["duplicate_timestamps"] = dup_ts - if dup_ts > 0: - issues.append({"type": "duplicate_timestamps", 
"count": dup_ts}) - - entity_col = state.get("entity_col") or None - if entity_col and entity_col in df.columns: - summary["n_entities"] = int(df[entity_col].nunique(dropna=True)) - tmp = df[[entity_col]].copy() - tmp["_ts"] = ts - dup_pairs = int(tmp.dropna(subset=[entity_col, "_ts"]).duplicated(subset=[entity_col, "_ts"]).sum()) - summary["duplicate_entity_timestamp_pairs"] = dup_pairs - if dup_pairs > 0: - issues.append({"type": "duplicate_entity_timestamp_pairs", "count": dup_pairs}) + timestamp = pd.to_datetime(dataset[time_col], errors="coerce") + summary["n_nat_time"] = int(timestamp.isna().sum()) + summary["min_time"] = ( + None if timestamp.dropna().empty else str(timestamp.dropna().min()) + ) + summary["max_time"] = ( + None if timestamp.dropna().empty else str(timestamp.dropna().max()) + ) + duplicate_timestamps = int(timestamp.dropna().duplicated().sum()) + summary["duplicate_timestamps"] = duplicate_timestamps + if duplicate_timestamps > 0: + issues.append( + {"type": "duplicate_timestamps", "count": duplicate_timestamps} + ) + entity_col = state.get("entity_col") + if entity_col is not None and entity_col in dataset.columns: + summary["n_entities"] = int(dataset[entity_col].nunique(dropna=True)) + tmp = dataset[[entity_col]].copy() + tmp["_ts"] = timestamp + duplicate_pairs = int( + tmp.dropna(subset=[entity_col, "_ts"]) + .duplicated(subset=[entity_col, "_ts"]) + .sum() + ) + summary["duplicate_entity_timestamp_pairs"] = duplicate_pairs + if duplicate_pairs > 0: + issues.append( + { + "type": "duplicate_entity_timestamp_pairs", + "count": duplicate_pairs, + } + ) else: summary["duplicate_entity_timestamp_pairs"] = None - - numeric_cols = state.get("numeric_cols") or [] - numeric_cols = [c for c in numeric_cols if c in df.columns] - - nonnegative_cols = state.get("nonnegative_cols") or [] - neg_report: dict = {} - for c in nonnegative_cols: - if c not in df.columns: + numeric_cols = [col for col in state.get("numeric_cols") or []] + numeric_cols = 
[col for col in numeric_cols if col in dataset.columns] + nonnegative_cols = [col for col in state.get("nonnegative_cols") or []] + negative_report: dict = {} + for col in nonnegative_cols: + if col not in dataset.columns: continue - s = pd.to_numeric(df[c], errors="coerce") - nneg = int((s < 0).sum(skipna=True)) - if nneg > 0: - neg_report[c] = nneg - summary["negatives_in_nonnegative_cols"] = neg_report - if len(neg_report) > 0: - issues.append({"type": "negative_values", "details": neg_report}) - + series = pd.to_numeric(dataset[col], errors="coerce") + n_negative = int((series < 0).sum(skipna=True)) + if n_negative > 0: + negative_report[col] = n_negative + summary["negatives_in_nonnegative_cols"] = negative_report + if negative_report: + issues.append({"type": "negative_values", "details": negative_report}) jump_mult = float(state.get("jump_mult") or 20.0) jumps: dict = {} if numeric_cols: - tmp = df[[time_col] + ([entity_col] if entity_col and entity_col in df.columns else []) + numeric_cols].copy() - tmp["_ts"] = ts - sort_cols = ["_ts"] if not (entity_col and entity_col in tmp.columns) else [entity_col, "_ts"] + selected_cols = [time_col] + if entity_col is not None and entity_col in dataset.columns: + selected_cols.append(entity_col) + selected_cols.extend(numeric_cols) + tmp = dataset[selected_cols].copy() + tmp["_ts"] = timestamp + if entity_col is None or entity_col not in tmp.columns: + sort_cols = ["_ts"] + else: + sort_cols = [entity_col, "_ts"] tmp = tmp.sort_values(sort_cols) - - for c in numeric_cols: - tmp[c] = pd.to_numeric(tmp[c], errors="coerce") - if entity_col and entity_col in tmp.columns: - diff = tmp.groupby(entity_col)[c].diff() + for col in numeric_cols: + tmp[col] = pd.to_numeric(tmp[col], errors="coerce") + if entity_col is None or entity_col not in tmp.columns: + diff = tmp[col].diff() else: - diff = tmp[c].diff() + diff = tmp.groupby(entity_col)[col].diff() diff_abs = diff.abs() - scale = diff_abs.median() if pd.isna(scale) or 
float(scale) <= 0.0: scale = diff_abs.mean() if pd.isna(scale) or float(scale) <= 0.0: continue - - threshold = float(scale) * float(jump_mult) - flag = diff_abs > threshold - n_flag = int(flag.sum(skipna=True)) - if n_flag <= 0: + threshold = float(scale) * jump_mult + flagged = diff_abs > threshold + n_flagged = int(flagged.sum(skipna=True)) + if n_flagged <= 0: continue - - examples = [] - for i in tmp.index[flag.fillna(False)][:5]: - d = diff.loc[i] - curr = tmp.loc[i, c] - prev = None if pd.isna(d) or pd.isna(curr) else float(curr - d) - examples.append( - { - "col": c, - "entity": None if not (entity_col and entity_col in tmp.columns) else tmp.loc[i, entity_col], - "time": None if pd.isna(tmp.loc[i, "_ts"]) else str(tmp.loc[i, "_ts"]), - "prev": prev, - "curr": None if pd.isna(curr) else float(curr), #type:ignore - "diff": None if pd.isna(d) else float(d), - "threshold": float(threshold), - } - ) - - jumps[c] = {"count": n_flag, "threshold": threshold, "examples": examples} - issues.append({"type": "impossible_jumps", "col": c, "count": n_flag}) - - summary["jump_mult"] = float(jump_mult) + examples: list[dict] = [] + flagged_idx = tmp.index[flagged.fillna(False)][:5] + for idx in flagged_idx: + diff_val = diff.loc[idx] + curr_val = tmp.loc[idx, col] + if pd.isna(diff_val) or pd.isna(curr_val): + prev_val = None + else: + prev_val = float(curr_val - diff_val) + example = { + "col": col, + "entity": ( + None + if entity_col is None or entity_col not in tmp.columns + else tmp.loc[idx, entity_col] + ), + "time": ( + None + if pd.isna(tmp.loc[idx, "_ts"]) + else str(tmp.loc[idx, "_ts"]) + ), + "prev": prev_val, + "curr": None if pd.isna(curr_val) else float(curr_val), + "diff": None if pd.isna(diff_val) else float(diff_val), + "threshold": float(threshold), + } + examples.append(example) + jumps[col] = { + "count": n_flagged, + "threshold": threshold, + "examples": examples, + } + issues.append( + { + "type": "impossible_jumps", + "col": col, + "count": 
n_flagged, + } + ) + summary["jump_mult"] = jump_mult summary["jumps"] = jumps + report = {"summary": summary, "issues": issues} + payload = {"report": report} + return payload - return {"report": {"summary": summary, "issues": issues}} def integrity_llm_summary(state: IntegrityState) -> dict: - llm = get_chat_model(model="gpt-4.1") - agent = create_agent( + """ + Summarize integrity report and provide go/no-go flag. + + :param state: integrity graph state + :return: summary and decision flag + """ + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( model=llm, tools=[], - system_prompt="""You are an integrity judge. -You get an integrity report dict from a dataset. -Decide if everything looks normal enough to proceed. - -Output format: -{ "summary": "...", "flag": "yes" or "no" } - -Rules: -- flag = "yes" only if the report has no meaningful integrity issues. -- flag = "no" if there are clear issues (duplicates, impossible jumps, bad timestamps, etc.). -- Keep summary short and direct. -""", + system_prompt=( + "You are an integrity judge. Decide if the dataset can proceed. " + "Return JSON with keys summary and flag. Set flag to yes only when " + "there are no meaningful integrity issues." 
+ ), response_format=IntegrityJudgeOutput, ) out = agent.invoke( { "messages": [ - HumanMessage( + lmessages.HumanMessage( content=f"Here is the integrity report: {state['report']}" ) ] } ) - sr = out["structured_response"].model_dump() - return {"summary": sr["summary"], "flag": sr["flag"]} + structured_response = out["structured_response"].model_dump() + payload = { + "summary": structured_response["summary"], + "flag": structured_response["flag"], + } + return payload -integrity = StateGraph(IntegrityState) +integrity = lgraph.StateGraph(IntegrityState) integrity.add_node("date_formatter", call_date_formatter) integrity.add_node("maybe_infer_columns", _maybe_infer_columns) -integrity.add_node("integrity", run_integrity_checks) +integrity.add_node("run_integrity_checks", run_integrity_checks) integrity.add_node("integrity_llm_summary", integrity_llm_summary) -integrity.add_edge(START, "date_formatter") +integrity.add_edge(lgraph.START, "date_formatter") integrity.add_edge("date_formatter", "maybe_infer_columns") -integrity.add_edge("maybe_infer_columns", "integrity") -integrity.add_edge("integrity", "integrity_llm_summary") -integrity.add_edge("integrity_llm_summary", END) +integrity.add_edge("maybe_infer_columns", "run_integrity_checks") +integrity.add_edge("run_integrity_checks", "integrity_llm_summary") +integrity.add_edge("integrity_llm_summary", lgraph.END) graph = integrity.compile() -def run_integrity(path: str, time_col: str | None = None, entity_col: str | None = None): - init: IntegrityState = { # type: ignore +def run_integrity( + path: str, + *, + time_col: str | None = None, + entity_col: str | None = None, +) -> dict: + """ + Execute integrity graph end to end. 
+ + :param path: dataset path + :param time_col: optional time column override + :param entity_col: optional entity column + :return: integrity report with summary and flag + """ + init_state: IntegrityState = { "path": path, - "time_col": time_col, #type:ignore + "time_col": time_col, "winner_formatter": {}, "entity_col": entity_col, "numeric_cols": [], "nonnegative_cols": [], "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", } - out = graph.invoke(init) - print(out["report"]) - print({"summary": out["summary"], "flag": out["flag"]}) - return {"report": out["report"], "summary": out["summary"], "flag": out["flag"]} - - -if __name__ == "__main__": - run_integrity("datasets/T1_slice.csv") + out = graph.invoke(init_state) + payload = { + "report": out["report"], + "summary": out["summary"], + "flag": out["flag"], + } + _LOG.info("Integrity output: %s", payload) + return payload diff --git a/agentic_eda/jupyterlab_extension_backend/src/main.py b/agentic_eda/jupyterlab_extension_backend/src/main.py new file mode 100644 index 000000000..9d60ccd46 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/main.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +""" +Import as: + +import src.main as smain +""" + +import argparse +import json +import logging + +import src.format_datetime as sfordat +import src.handle_inputs as shainp +import src.integrity as sinteg + +_LOG = logging.getLogger(__name__) + + +def _parse_args() -> argparse.Namespace: + """ + Parse CLI arguments. 
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", + required=True, + choices=["input", "format", "integrity"], + help="Pipeline stage to execute.", + ) + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + parser.add_argument( + "--time_col", + default=None, + help="Optional time column override for integrity mode.", + ) + parser.add_argument( + "--entity_col", + default=None, + help="Optional entity column for integrity mode.", + ) + args = parser.parse_args() + return args + + +def _run_cli(args: argparse.Namespace) -> dict: + """ + Execute selected backend stage. + + :param args: parsed CLI args + :return: stage output payload + """ + mode = args.mode + if mode == "input": + payload = shainp.run_input_handler(args.path) + elif mode == "format": + payload = sfordat.run_date_formatter(args.path) + elif mode == "integrity": + payload = sinteg.run_integrity( + args.path, + time_col=args.time_col, + entity_col=args.entity_col, + ) + else: + raise ValueError(f"Unsupported mode='{mode}'") + return payload + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + cli_args = _parse_args() + output = _run_cli(cli_args) + _LOG.info("Pipeline output: %s", json.dumps(output, default=str, indent=2)) diff --git a/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py b/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py index e37a0725d..28d1c4c22 100644 --- a/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py +++ b/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py @@ -1,126 +1,120 @@ -from __future__ import annotations +""" +Import as: + +import tools.input_tools as tinptool +""" import json -from typing_extensions import Annotated -from langchain.tools import tool -from langgraph.prebuilt import InjectedState -import pandas as pd -from pathlib import Path +import pathlib import re -# from src.handle_inputs import InputState - 
-# @tool -# def dataset_brief( -# question: str, -# dataset_meta: Annotated[dict, InjectedState("dataset_meta")], # (not visible to the LLM) -# ) -> str: -# """ -# Answer a question using system-provided dataset metadata -# """ -# # dataset_meta comes from state["dataset_meta"], injected at runtime -# payload = { -# "question": question, -# "n_rows": dataset_meta.get("n_rows"), -# "n_cols": dataset_meta.get("n_cols"), -# "columns": dataset_meta.get("columns"), -# "freq": dataset_meta.get("freq"), -# } -# return json.dumps(payload) - - -def load_dataset(path: Path) -> pd.DataFrame: - # Load dataset. - + +import langchain.tools as ltools +import pandas as pd + +_VALID_HEADER_START_RE = re.compile(r"^[A-Za-z_]") + + +def load_dataset(path: pathlib.Path) -> pd.DataFrame: + """ + Load a supported dataset from disk. + + :param path: path to dataset file + :return: dataset as dataframe + """ ext = path.suffix.lower() + if ext == ".csv": + dataset = pd.read_csv(path) + else: + raise ValueError(f"Unsupported file extension='{ext}'") + return dataset + + +def analyze_header(state: dict) -> dict: + """ + Validate dataset headers. - if ext in {'.csv'}: - data = pd.read_csv(path) - # TODO: Extend to other types of data. 
- - return data - -def headerAnalysis( - state -) -> dict: - path = Path(state['path']) - data = load_dataset(path) - cols = list(data.columns) - has_header: bool = True - error: str = "" - _valid_start = re.compile(r"^[A-Za-z_]") - if all(isinstance(c, int) for c in cols) and cols == list(range(len(cols))): + :param state: graph state containing dataset path + :return: updated state fields with header status + """ + path = pathlib.Path(str(state["path"])) + dataset = load_dataset(path) + cols = list(dataset.columns) + has_header = True + error = "" + if ( + all(isinstance(col, int) for col in cols) + and cols == list(range(len(cols))) + ): has_header = False - error += "No column names;" - - return {'has_header': has_header, 'error': error} - - for i, c in enumerate(cols): - if c is None: - has_header = False - error += "One or more column names missing" - return {'has_header': has_header, 'error': error} - name = str(c).strip() - if name[0].isdigit() or not _valid_start.match(name): - has_header = False - error += "One or more column names missing (headers are numbers)" - return {'has_header': has_header, 'error': error} - - - - return {'has_header': has_header, 'dataset': data} - - -@tool -def extract_metadata( - path: str -) -> dict: + error = "No column names." + else: + for col in cols: + if col is None: + has_header = False + error = "One or more column names missing." + break + col_name = str(col).strip() + if col_name == "": + has_header = False + error = "One or more column names missing." + break + if ( + col_name[0].isdigit() + or not _VALID_HEADER_START_RE.match(col_name) + ): + has_header = False + error = ( + "One or more column names start with invalid characters." + ) + break + if has_header: + result = {"has_header": has_header, "dataset": dataset} + else: + result = {"has_header": has_header, "error": error} + return result + + +@ltools.tool +def extract_metadata(path: str) -> dict: """ Return minimal dataset metadata. 
- Only includes: - - number of rows - - number of columns - - number of unique values per column - - :param dataset: dataset to process - :return: metadata + :param path: dataset path + :return: metadata with shape and per-column cardinality """ - d_path = Path(path) - dataset = load_dataset(d_path) + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) n_rows, n_cols = dataset.shape - nunique = dataset.nunique(dropna=True) - nunique_map = {str(col): int(nunique[col]) for col in nunique.index} - - return { + n_unique = dataset.nunique(dropna=True) + n_unique_map = {str(col): int(n_unique[col]) for col in n_unique.index} + metadata = { "n_rows": int(n_rows), "n_cols": int(n_cols), - "n_unique": nunique_map, + "n_unique": n_unique_map, } + return metadata -@tool -def extract_head( - path: str, - n: int = 5 -) -> dict: + +@ltools.tool +def extract_head(path: str, *, n: int = 5) -> dict: """ - Return dataset head - - :param dataset: dataset to process - :param n: number of head rows - :return: the first n rows + Return the first rows from a dataset. + + :param path: dataset path + :param n: number of rows to return + :return: head rows serialized as JSON-compatible payload """ - d_path = Path(path) - dataset = load_dataset(d_path) - n_int = int(n) - if n_int <= 0: - n_int = 5 - n_int = min(n_int, 50) - - head = dataset.head(n_int) - # Use to_json so datetimes become ISO strings and NaNs become null-ish. 
+ dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + n_rows = int(n) + if n_rows <= 0: + n_rows = 5 + n_rows = min(n_rows, 50) + head = dataset.head(n_rows) rows = json.loads(head.to_json(orient="records", date_format="iso")) - return { - "n": n_int, - "columns": [str(c) for c in head.columns.tolist()], + payload = { + "n": n_rows, + "columns": [str(col) for col in head.columns.tolist()], "rows": rows, } + return payload From ca53817a5ec78e92a2a8c8a3421b7fe36e9b3fd0 Mon Sep 17 00:00:00 2001 From: Indrayudd Roy Chowdhury Date: Sun, 22 Mar 2026 11:52:34 -0700 Subject: [PATCH 4/5] TutorTask696: Ingestion done in agent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../jupyterlab_extension_backend/.gitignore | 3 + .../src/config/__init__.py | 3 + .../{ => src}/config/config.py | 2 +- .../src/handle_inputs.py | 168 ---- .../src/ingest/__init__.py | 3 + .../src/ingest/compute_temporal_stats.py | 223 +++++ .../src/{ => ingest}/format_datetime.py | 8 +- .../src/ingest/handle_inputs.py | 646 ++++++++++++++ .../src/ingest/infer_structure.py | 194 +++++ .../src/ingest/infer_type.py | 222 +++++ .../src/{ => ingest}/integrity.py | 110 ++- .../jupyterlab_extension_backend/src/main.py | 40 +- .../src/tools/__init__.py | 3 + .../src/tools/input_tools.py | 794 ++++++++++++++++++ .../tools/input_tools.py | 120 --- 15 files changed, 2206 insertions(+), 333 deletions(-) create mode 100644 agentic_eda/jupyterlab_extension_backend/src/config/__init__.py rename agentic_eda/jupyterlab_extension_backend/{ => src}/config/config.py (99%) delete mode 100644 agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/ingest/__init__.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/ingest/compute_temporal_stats.py rename agentic_eda/jupyterlab_extension_backend/src/{ => ingest}/format_datetime.py (97%) 
create mode 100644 agentic_eda/jupyterlab_extension_backend/src/ingest/handle_inputs.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/ingest/infer_structure.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/ingest/infer_type.py rename agentic_eda/jupyterlab_extension_backend/src/{ => ingest}/integrity.py (75%) create mode 100644 agentic_eda/jupyterlab_extension_backend/src/tools/__init__.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/tools/input_tools.py delete mode 100644 agentic_eda/jupyterlab_extension_backend/tools/input_tools.py diff --git a/agentic_eda/jupyterlab_extension_backend/.gitignore b/agentic_eda/jupyterlab_extension_backend/.gitignore index 082bc2d6e..b41013075 100644 --- a/agentic_eda/jupyterlab_extension_backend/.gitignore +++ b/agentic_eda/jupyterlab_extension_backend/.gitignore @@ -15,3 +15,6 @@ config/.env *secret* *.key *.pem +langchain-reference +AGENTS.md +traces/ diff --git a/agentic_eda/jupyterlab_extension_backend/src/config/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/config/__init__.py new file mode 100644 index 000000000..2a18c45cd --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/config/__init__.py @@ -0,0 +1,3 @@ +""" +Backend configuration package. 
+""" diff --git a/agentic_eda/jupyterlab_extension_backend/config/config.py b/agentic_eda/jupyterlab_extension_backend/src/config/config.py similarity index 99% rename from agentic_eda/jupyterlab_extension_backend/config/config.py rename to agentic_eda/jupyterlab_extension_backend/src/config/config.py index 56a61fabe..f64d5fa9a 100644 --- a/agentic_eda/jupyterlab_extension_backend/config/config.py +++ b/agentic_eda/jupyterlab_extension_backend/src/config/config.py @@ -1,7 +1,7 @@ """ Import as: -import config.config as cconf +import src.config.config as cconf """ import dataclasses diff --git a/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py b/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py deleted file mode 100644 index 5c3e6ba68..000000000 --- a/agentic_eda/jupyterlab_extension_backend/src/handle_inputs.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Import as: - -import src.handle_inputs as shainp -""" - -from __future__ import annotations - -import argparse -import logging -import pathlib -from typing import TypedDict - -import langchain.agents as lagents -import langchain_core.messages as lmessages -import langgraph.graph as lgraph -import pydantic - -import config.config as cconf -import tools.input_tools as tinptool - -_LOG = logging.getLogger(__name__) - - -class InputState(TypedDict): - """ - Store graph state for input checks. - """ - - path: str | pathlib.Path - done: list[str] - has_header: bool - has_missing_values: bool - error: str - info: str - cols: list[str] - temporal_cols: list[str] - numeric_val_cols: list[str] - categorical_val_cols: list[str] - - -class LLMOutput(pydantic.BaseModel): - """ - Store structured output from the header classifier. - """ - - temporal_cols: list[str] - numeric_val_cols: list[str] - categorical_val_cols: list[str] - - -def header_classification_agent(state: InputState) -> dict: - """ - Classify temporal, numeric, and categorical columns. 
- - :param state: input graph state - :return: column classification payload - """ - llm = cconf.get_chat_model(model="gpt-4.1") - agent = lagents.create_agent( - model=llm, - tools=[tinptool.extract_head, tinptool.extract_metadata], - system_prompt=( - "You are a header classifier agent. Use tools to identify temporal " - "columns and classify the remaining value columns as numeric or " - "categorical. Output JSON with keys temporal_cols, " - "numeric_val_cols, and categorical_val_cols." - ), - response_format=LLMOutput, - ) - out = agent.invoke( - { - "messages": [ - lmessages.HumanMessage( - content=f"The dataset is in {state['path']}" - ) - ] - } - ) - result = out["structured_response"].model_dump() - return result - - -def error_node(state: InputState) -> dict: - """ - Log an error node transition. - - :param state: input graph state - :return: empty update - """ - _LOG.error("Input handler failed: %s", state["error"]) - return {} - - -def has_header(state: InputState) -> bool: - """ - Check if header validation passed. - - :param state: input graph state - :return: true when headers are valid - """ - has_header_flag = state["has_header"] - return has_header_flag - - -def run_input_handler(path: str | pathlib.Path) -> dict: - """ - Run dataset header and column classification checks. 
- - :param path: path to dataset - :return: final graph output - """ - graph_builder = lgraph.StateGraph(InputState) - graph_builder.add_node("header_analysis", tinptool.analyze_header) - graph_builder.add_node( - "header_classification_agent", - header_classification_agent, - ) - graph_builder.add_node("error", error_node) - graph_builder.add_edge(lgraph.START, "header_analysis") - graph_builder.add_conditional_edges( - "header_analysis", - has_header, - { - True: "header_classification_agent", - False: "error", - }, - ) - graph_builder.add_edge("error", lgraph.END) - graph_builder.add_edge("header_classification_agent", lgraph.END) - graph = graph_builder.compile() - init_state: InputState = { - "path": str(path), - "done": [], - "has_header": True, - "has_missing_values": False, - "error": "", - "info": "", - "cols": [], - "temporal_cols": [], - "numeric_val_cols": [], - "categorical_val_cols": [], - } - out = graph.invoke(init_state) - _LOG.info("Input handler output: %s", out) - return out - - -def _parse_args() -> argparse.Namespace: - """ - Parse command-line arguments. - - :return: parsed arguments - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--path", - required=True, - help="Path to dataset file.", - ) - args = parser.parse_args() - return args - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - args = _parse_args() - run_input_handler(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/__init__.py new file mode 100644 index 000000000..176a9790e --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/__init__.py @@ -0,0 +1,3 @@ +""" +Ingestion stages for the Jupyter backend. 
+""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/compute_temporal_stats.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/compute_temporal_stats.py new file mode 100644 index 000000000..1b323d8c8 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/compute_temporal_stats.py @@ -0,0 +1,223 @@ +""" +Import as: + +import src.ingest.compute_temporal_stats as sctstats +""" + +from __future__ import annotations + +import argparse +import logging +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.infer_structure as sinferstruct +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class TemporalStatsState(TypedDict): + """ + Store deterministic temporal statistics. + """ + + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + + +class CompositeState(TypedDict): + """ + Store graph state for temporal statistics. 
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + + +def call_infer_structure(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to feature-structure inference. + + :param state: graph state + :return: composite payload from infer_structure + """ + payload = sinferstruct.run_infer_structure(state["path"]) + return payload + + +def compute_temporal_stats(state: CompositeState) -> dict: + """ + Compute deterministic temporal range, coverage, and frequency statistics. 
+ + :param state: graph state + :return: temporal statistics payload + """ + temporal_report = tinptool.compute_temporal_stats.invoke( + { + "path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "winner_formatter": state["winner_formatter"], + } + ) + trace_payload = { + "primary_key": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "temporal_report": temporal_report, + } + tinptool.write_stage_trace(state["path"], "compute_temporal_stats", trace_payload) + payload = { + "n_nat_time": temporal_report["n_nat_time"], + "min_time": temporal_report["min_time"], + "max_time": temporal_report["max_time"], + "typical_delta_mode": temporal_report["typical_delta_mode"], + "typical_delta_median": temporal_report["typical_delta_median"], + "expected_frequency": temporal_report["expected_frequency"], + "dominant_frequency_fraction": temporal_report["dominant_frequency_fraction"], + "is_irregular_sampling": temporal_report["is_irregular_sampling"], + "resampling_decision": temporal_report["resampling_decision"], + "coverage_summary": temporal_report["coverage_summary"], + "coverage_per_entity": temporal_report["coverage_per_entity"], + } + return payload + + +temporal_stats = lgraph.StateGraph(CompositeState) +temporal_stats.add_node("infer_structure_pipeline", call_infer_structure) +temporal_stats.add_node("compute_temporal_stats", compute_temporal_stats) +temporal_stats.add_edge(lgraph.START, "infer_structure_pipeline") +temporal_stats.add_edge("infer_structure_pipeline", "compute_temporal_stats") +temporal_stats.add_edge("compute_temporal_stats", lgraph.END) +graph = temporal_stats.compile() + + +def run_compute_temporal_stats(path: str) -> dict: + """ + Execute temporal statistics end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Temporal stats output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. 
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_compute_temporal_stats(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/format_datetime.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/format_datetime.py similarity index 97% rename from agentic_eda/jupyterlab_extension_backend/src/format_datetime.py rename to agentic_eda/jupyterlab_extension_backend/src/ingest/format_datetime.py index 8b538ff35..6af3065d7 100644 --- a/agentic_eda/jupyterlab_extension_backend/src/format_datetime.py +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/format_datetime.py @@ -1,7 +1,7 @@ """ Import as: -import src.format_datetime as sfordat +import src.ingest.format_datetime as sfordat """ import logging @@ -16,9 +16,9 @@ import pandas as pd import pydantic -import config.config as cconf -import src.handle_inputs as shainp -import tools.input_tools as tinptool +import src.config.config as cconf +import src.ingest.handle_inputs as shainp +import src.tools.input_tools as tinptool _LOG = logging.getLogger(__name__) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/handle_inputs.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/handle_inputs.py new file mode 100644 index 000000000..84a3474c1 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/handle_inputs.py @@ -0,0 +1,646 @@ +""" +Import as: + +import src.ingest.handle_inputs as shainp +""" + +from __future__ import annotations + +import argparse +import logging +import pathlib +from typing import Any +from typing import Literal +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pandas as pd +import pydantic 
import src.config.config as cconf
import src.tools.input_tools as tinptool

_LOG = logging.getLogger(__name__)


class InputState(TypedDict):
    """
    Store graph state for input checks.
    """

    path: str | pathlib.Path
    done: list[str]
    has_header: bool
    has_missing_values: bool
    error: str
    info: str
    cols: list[str]
    temporal_cols: list[str]
    numeric_val_cols: list[str]
    categorical_val_cols: list[str]
    bad_rows: list[dict]


class LLMOutput(pydantic.BaseModel):
    """
    Store structured output from the header classifier.
    """

    temporal_cols: list[str]
    numeric_val_cols: list[str]
    categorical_val_cols: list[str]


class SeriesStructureFallbackOutput(pydantic.BaseModel):
    """
    Store structured fallback output for ambiguous series-structure cases.
    """

    model_config = pydantic.ConfigDict(extra="forbid")
    secondary_keys: list[str]


class BadRowDescriptor(pydantic.BaseModel):
    """
    Store one fuzzy descriptor for a bad row.
    """

    model_config = pydantic.ConfigDict(extra="forbid")
    row_index: int
    fuzzy_descriptor: str


class BadRowDescriptorOutput(pydantic.BaseModel):
    """
    Store structured fuzzy descriptors for detected bad rows.
    """

    model_config = pydantic.ConfigDict(extra="forbid")
    descriptors: list[BadRowDescriptor]


class SeriesStructureAssessment(TypedDict):
    """
    Store deterministic and fallback evidence for series-structure inference.
    """

    duplicate_timestamps: int
    duplicate_timestamp_fraction: float
    timestamps_mostly_unique: bool
    candidate_entity_cols: list[str]
    entity_candidate_report: dict
    secondary_keys: list[str]
    confidence: Literal["high", "medium", "low"]
    method: Literal["deterministic", "deterministic_no_panel", "fuzzy"]


def _json_safe_value(value: Any) -> Any:
    """
    Convert dataframe cell values into JSON-safe Python values.

    :param value: raw cell value
    :return: JSON-safe value
    """
    if pd.isna(value):
        return None
    # NumPy scalars expose `.item()`; fall back to `str` for exotic types.
    if hasattr(value, "item"):
        try:
            return value.item()
        except Exception:
            return str(value)
    return value


def _row_to_record(row: pd.Series) -> dict[str, Any]:
    """
    Convert one dataframe row into a JSON-safe mapping.

    :param row: dataframe row
    :return: serialized row mapping
    """
    return {
        str(col): _json_safe_value(value)
        for col, value in row.to_dict().items()
    }


def detect_bad_rows(state: InputState) -> dict:
    """
    Detect rows that do not behave like observations because their temporal
    fields are missing or unparseable.

    Theory: observation rows should participate in the time axis. Rows whose
    temporal fields cannot be parsed are often metadata, annotation, footer,
    or malformed rows; capturing them explicitly preserves evidence for
    downstream handling instead of silently dropping information.

    :param state: input graph state
    :return: detected bad-row payload
    """
    temporal_cols = state.get("temporal_cols") or []
    if not temporal_cols:
        return {"bad_rows": []}
    dataset_path = pathlib.Path(str(state["path"]))
    dataset = tinptool.load_dataset(dataset_path)
    valid_temporal_cols = [col for col in temporal_cols if col in dataset.columns]
    if not valid_temporal_cols:
        return {"bad_rows": []}
    parse_matrix: dict[str, pd.Series] = {}
    normalized_matrix: dict[str, pd.Series] = {}
    # Tokens produced by `astype(str)` for the various pandas/Python nulls.
    # BUG FIX: `None` stringifies to "None" and `pd.NA` to "<NA>"; the
    # original map only covered ""/"nan"/"NaT", so truly missing cells were
    # misreported as unparseable rather than missing.
    null_tokens = {
        "": pd.NA,
        "nan": pd.NA,
        "NaT": pd.NA,
        "None": pd.NA,
        "<NA>": pd.NA,
    }
    for col in valid_temporal_cols:
        normalized = dataset[col].astype(str).str.strip().replace(null_tokens)
        normalized_matrix[col] = normalized
        parse_matrix[col] = pd.to_datetime(normalized, errors="coerce")
    bad_rows: list[dict[str, Any]] = []
    for row_idx in range(int(dataset.shape[0])):
        reasons: list[str] = []
        temporal_values: dict[str, Any] = {}
        has_temporal_signal = False
        has_parseable_temporal = False
        for col in valid_temporal_cols:
            raw_value = normalized_matrix[col].iloc[row_idx]
            parsed_value = parse_matrix[col].iloc[row_idx]
            temporal_values[col] = _json_safe_value(raw_value)
            if not pd.isna(raw_value):
                has_temporal_signal = True
            if not pd.isna(parsed_value):
                has_parseable_temporal = True
                continue
            if pd.isna(raw_value):
                reasons.append(f"missing_temporal_value:{col}")
            else:
                raw_text = str(raw_value).strip()
                reasons.append(f"unparseable_temporal_value:{col}")
                # Trailing ':' frequently marks section headers / annotations.
                if raw_text.endswith(":"):
                    reasons.append(f"annotation_like_temporal_value:{col}")
        if has_parseable_temporal:
            # At least one temporal field parsed: treat as a real observation.
            continue
        if not has_temporal_signal and not reasons:
            continue
        row = dataset.iloc[row_idx]
        bad_rows.append(
            {
                "row_index": int(row_idx),
                # +2: 1-based rows plus the header line.
                "csv_row_number": int(row_idx) + 2,
                "temporal_values": temporal_values,
                "reasons": sorted(dict.fromkeys(reasons)),
                "raw_row": _row_to_record(row),
                "fuzzy_descriptor": "",
            }
        )
    return {"bad_rows": bad_rows}


def describe_bad_rows(state: InputState) -> dict:
    """
    Attach short fuzzy descriptors to already-detected bad rows.

    Theory: deterministic rules reliably say *that* a row is not a data
    observation; a constrained model adds a short human-readable guess about
    the row's likely role without being allowed to invent row IDs or alter
    the deterministic evidence.

    :param state: input graph state
    :return: bad rows with fuzzy descriptors
    """
    bad_rows = [dict(row) for row in (state.get("bad_rows") or [])]
    if not bad_rows:
        return {"bad_rows": []}
    llm = cconf.get_chat_model(model="gpt-4.1")
    agent = lagents.create_agent(
        model=llm,
        tools=[],
        system_prompt=(
            "You are labeling already-detected bad rows in a dataset. "
            "For each row_index, return a short fuzzy descriptor such as "
            "'metadata/control row', 'blank/incomplete row', "
            "'annotation row', or 'malformed timestamp row'. "
            "Do not change row_index values and do not add rows."
        ),
        response_format=BadRowDescriptorOutput,
    )
    out = agent.invoke(
        {
            "messages": [
                lmessages.HumanMessage(
                    content=f"Detected bad rows: {bad_rows}"
                )
            ]
        }
    )
    descriptors = out["structured_response"].model_dump().get("descriptors") or []
    descriptor_map = {
        int(item["row_index"]): str(item["fuzzy_descriptor"]).strip()
        for item in descriptors
    }
    for row in bad_rows:
        # Fall back to a generic label when the model skipped a row.
        row["fuzzy_descriptor"] = descriptor_map.get(
            int(row["row_index"]),
            "bad/non-data row",
        )
    return {"bad_rows": bad_rows}


def _parse_time_series(
    path: str | pathlib.Path,
    time_col: str,
    winner_formatter: dict | None = None,
) -> pd.Series:
    """
    Parse a proposed time column to measure whether it behaves like a real
    time axis.

    Theory: parseability is the empirical question behind schema inference —
    can the observed values be converted into usable timestamps with only a
    small failure rate — rather than relying on column labels or LLM guesses.

    :param path: dataset path
    :param time_col: selected time column
    :param winner_formatter: optional datetime parsing kwargs
    :return: parsed timestamp series
    """
    dataset = tinptool.load_dataset(pathlib.Path(str(path)))
    format_args = winner_formatter or {}
    format_args = {key: val for key, val in format_args.items() if val is not None}
    try:
        return pd.to_datetime(dataset[time_col], errors="coerce", **format_args)
    except Exception:
        # Fall back to format-free parsing when the winner formatter is invalid.
        return pd.to_datetime(dataset[time_col], errors="coerce")


def _select_entity_candidate_cols(
    *,
    cols: list[str],
    time_col: str,
    numeric_val_cols: list[str],
    categorical_val_cols: list[str],
    column_profiles: dict,
) -> list[str]:
    """
    Select plausible entity-key candidates using value-level heuristics.

    Theory: entity keys behave like identifiers that partition repeated
    timestamps into coherent per-entity series. The filter keeps
    identifier-like categoricals and only a narrow class of integer-like
    numerics, excluding continuous measurements, binary flags, and
    near-row-unique columns.

    :param cols: all dataset columns
    :param time_col: selected time column
    :param numeric_val_cols: numeric value columns
    :param categorical_val_cols: categorical value columns
    :param column_profiles: per-column deterministic profiles
    :return: filtered candidate entity columns
    """
    candidates: list[str] = []
    numeric_set = set(numeric_val_cols)
    categorical_set = set(categorical_val_cols)
    for col in cols:
        if col == time_col:
            continue
        profile = column_profiles.get(col) or {}
        n_unique = int(profile.get("n_unique", 0))
        unique_ratio = float(profile.get("unique_ratio", 1.0))
        # Constant or near-row-unique columns cannot partition timestamps.
        if n_unique <= 1 or unique_ratio >= 0.95:
            continue
        if col in categorical_set:
            candidates.append(col)
            continue
        if col in numeric_set:
            # Only integer-like, nonnegative, low-cardinality numerics can
            # plausibly be IDs; everything else is a measurement.
            if bool(profile.get("is_binary_like")):
                continue
            if not bool(profile.get("is_integer_like")):
                continue
            if not bool(profile.get("is_nonnegative_like")):
                continue
            if n_unique > 200:
                continue
            if unique_ratio > 0.50:
                continue
            candidates.append(col)
    return candidates


def _fuzzy_secondary_key_agent(
    *,
    path: str,
    time_col: str,
    candidate_entity_cols: list[str],
    entity_candidate_report: dict,
    column_profiles: dict,
) -> list[str]:
    """
    Resolve ambiguous panel-vs-multivariate cases with a constrained LLM tie
    breaker.

    Theory: the model acts as a constrained judge over a narrow candidate set
    using the deterministic evidence report, so fuzzy reasoning stays bounded
    and explainable instead of inventing columns freely.

    :param path: dataset path
    :param time_col: selected time column
    :param candidate_entity_cols: filtered entity-key candidates
    :param entity_candidate_report: deterministic scoring report
    :param column_profiles: per-column profiles
    :return: chosen secondary keys, possibly empty
    """
    if not candidate_entity_cols:
        return []
    llm = cconf.get_chat_model(model="gpt-4.1")
    agent = lagents.create_agent(
        model=llm,
        tools=[tinptool.extract_head, tinptool.extract_metadata],
        system_prompt=(
            "You are resolving an ambiguous series-structure classification. "
            "Choose secondary keys only from the provided candidate_entity_cols. "
            "Return [] if the dataset still looks like a single or wide "
            "multivariate time series rather than panel data. Prefer the "
            "deterministic evidence report over column names."
        ),
        response_format=SeriesStructureFallbackOutput,
    )
    profile_subset = {
        col: column_profiles.get(col, {})
        for col in candidate_entity_cols
    }
    out = agent.invoke(
        {
            "messages": [
                lmessages.HumanMessage(
                    content=(
                        f"Dataset path: {path}\n"
                        f"time_col: {time_col}\n"
                        f"candidate_entity_cols: {candidate_entity_cols}\n"
                        f"entity_candidate_report: {entity_candidate_report}\n"
                        f"column_profiles: {profile_subset}"
                    )
                )
            ]
        }
    )
    structured = out["structured_response"].model_dump()
    # Keep only allowed, deduplicated columns in the model's answer order.
    secondary_keys: list[str] = []
    seen: set[str] = set()
    allowed = set(candidate_entity_cols)
    for col in structured.get("secondary_keys") or []:
        col_name = str(col)
        if col_name not in allowed or col_name in seen:
            continue
        seen.add(col_name)
        secondary_keys.append(col_name)
    return secondary_keys


def assess_series_structure(
    *,
    path: str | pathlib.Path,
    cols: list[str],
    time_col: str,
    numeric_val_cols: list[str],
    categorical_val_cols: list[str],
    winner_formatter: dict | None = None,
) -> SeriesStructureAssessment:
    """
    Assess whether the dataset behaves like a single series, panel, or wide
    multivariate time series.

    Theory: the decisive signal is the time axis itself. If timestamps are
    already mostly unique the table behaves like one wide time-indexed
    series; only when timestamps repeat meaningfully do we search for
    identifier columns that make `(entity, time)` close to unique. This
    staging avoids promoting ordinary measurement columns into fake entity
    IDs.

    :param path: dataset path
    :param cols: all dataset columns
    :param time_col: selected time column
    :param numeric_val_cols: numeric value columns
    :param categorical_val_cols: categorical value columns
    :param winner_formatter: optional datetime parsing kwargs
    :return: series-structure assessment
    """
    string_path = str(path)
    timestamp = _parse_time_series(string_path, time_col, winner_formatter)
    valid_ts = timestamp.dropna()
    duplicate_timestamps = int(valid_ts.duplicated().sum())
    duplicate_fraction = (
        0.0
        if valid_ts.empty
        else float(duplicate_timestamps / max(1, int(valid_ts.shape[0])))
    )
    timestamps_mostly_unique = (
        duplicate_timestamps == 0 or duplicate_fraction < 0.01
    )
    profiles_out = tinptool.extract_column_profiles.invoke({"path": string_path})
    column_profiles = profiles_out.get("column_profiles") or {}
    candidate_entity_cols = _select_entity_candidate_cols(
        cols=cols,
        time_col=time_col,
        numeric_val_cols=numeric_val_cols,
        categorical_val_cols=categorical_val_cols,
        column_profiles=column_profiles,
    )
    if timestamps_mostly_unique:
        # Unique time axis: no panel search needed.
        return {
            "duplicate_timestamps": duplicate_timestamps,
            "duplicate_timestamp_fraction": duplicate_fraction,
            "timestamps_mostly_unique": True,
            "candidate_entity_cols": [],
            "entity_candidate_report": {
                "time_col": time_col,
                "candidate_cols": [],
                "candidates": [],
                "recommended_secondary_keys": [],
            },
            "secondary_keys": [],
            "confidence": "high",
            "method": "deterministic_no_panel",
        }
    entity_candidate_report = tinptool.score_entity_candidates.invoke(
        {
            "path": string_path,
            "time_col": time_col,
            "candidate_cols": candidate_entity_cols,
            "max_combo_size": 2,
        }
    )
    recommended_secondary_keys = (
        entity_candidate_report.get("recommended_secondary_keys") or []
    )
    candidates = entity_candidate_report.get("candidates") or []
    top_score = 0.0 if not candidates else float(candidates[0].get("score", 0.0))
    if recommended_secondary_keys:
        confidence: Literal["high", "medium", "low"] = (
            "high" if top_score >= 0.75 else "medium"
        )
        return {
            "duplicate_timestamps": duplicate_timestamps,
            "duplicate_timestamp_fraction": duplicate_fraction,
            "timestamps_mostly_unique": False,
            "candidate_entity_cols": candidate_entity_cols,
            "entity_candidate_report": entity_candidate_report,
            "secondary_keys": recommended_secondary_keys,
            "confidence": confidence,
            "method": "deterministic",
        }
    # Deterministic scoring was inconclusive: fall back to the bounded judge.
    fuzzy_secondary_keys = _fuzzy_secondary_key_agent(
        path=string_path,
        time_col=time_col,
        candidate_entity_cols=candidate_entity_cols,
        entity_candidate_report=entity_candidate_report,
        column_profiles=column_profiles,
    )
    return {
        "duplicate_timestamps": duplicate_timestamps,
        "duplicate_timestamp_fraction": duplicate_fraction,
        "timestamps_mostly_unique": False,
        "candidate_entity_cols": candidate_entity_cols,
        "entity_candidate_report": entity_candidate_report,
        "secondary_keys": fuzzy_secondary_keys,
        "confidence": "low" if fuzzy_secondary_keys else "medium",
        "method": "fuzzy",
    }


def header_classification_agent(state: InputState) -> dict:
    """
    Classify temporal, numeric, and categorical columns.

    :param state: input graph state
    :return: column classification payload
    """
    llm = cconf.get_chat_model(model="gpt-4.1")
    agent = lagents.create_agent(
        model=llm,
        tools=[tinptool.extract_head, tinptool.extract_metadata],
        system_prompt=(
            "You are a header classifier agent. Use tools to identify temporal "
            "columns and classify the remaining value columns as numeric or "
            "categorical. Output JSON with keys temporal_cols, "
            "numeric_val_cols, and categorical_val_cols."
        ),
        response_format=LLMOutput,
    )
    out = agent.invoke(
        {
            "messages": [
                lmessages.HumanMessage(
                    content=f"The dataset is in {state['path']}"
                )
            ]
        }
    )
    return out["structured_response"].model_dump()


def error_node(state: InputState) -> dict:
    """
    Log an error node transition.

    :param state: input graph state
    :return: empty update
    """
    _LOG.error("Input handler failed: %s", state["error"])
    return {}


def has_header(state: InputState) -> bool:
    """
    Check if header validation passed.

    :param state: input graph state
    :return: true when headers are valid
    """
    return state["has_header"]


def run_input_handler(path: str | pathlib.Path) -> dict:
    """
    Run dataset header and column classification checks.

    :param path: path to dataset
    :return: final graph output
    """
    graph_builder = lgraph.StateGraph(InputState)
    graph_builder.add_node("header_analysis", tinptool.analyze_header)
    graph_builder.add_node(
        "header_classification_agent",
        header_classification_agent,
    )
    graph_builder.add_node("detect_bad_rows", detect_bad_rows)
    graph_builder.add_node("describe_bad_rows", describe_bad_rows)
    graph_builder.add_node("error", error_node)
    graph_builder.add_edge(lgraph.START, "header_analysis")
    graph_builder.add_conditional_edges(
        "header_analysis",
        has_header,
        {
            True: "header_classification_agent",
            False: "error",
        },
    )
    graph_builder.add_edge("error", lgraph.END)
    graph_builder.add_edge("header_classification_agent", "detect_bad_rows")
    graph_builder.add_edge("detect_bad_rows", "describe_bad_rows")
    graph_builder.add_edge("describe_bad_rows", lgraph.END)
    graph = graph_builder.compile()
    init_state: InputState = {
        "path": str(path),
        "done": [],
        "has_header": True,
        "has_missing_values": False,
        "error": "",
        "info": "",
        "cols": [],
        "temporal_cols": [],
        "numeric_val_cols": [],
        "categorical_val_cols": [],
        "bad_rows": [],
    }
    out = graph.invoke(init_state)
    _LOG.info("Input handler output: %s", out)
    return out


def _parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments.

    :return: parsed arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path",
        required=True,
        help="Path to dataset file.",
    )
    return parser.parse_args()
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_input_handler(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_structure.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_structure.py new file mode 100644 index 000000000..a57f094f6 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_structure.py @@ -0,0 +1,194 @@ +""" +Import as: + +import src.ingest.infer_structure as sinferstruct +""" + +from __future__ import annotations + +import argparse +import logging +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.infer_type as sinfert +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class FeatureStructureState(TypedDict): + """ + Store inferred semantic feature groupings. + """ + + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + + +class CompositeState(TypedDict): + """ + Store graph state for feature-structure inference. 
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + + +def call_infer_type(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to series-type inference. + + :param state: graph state + :return: composite payload from infer_type + """ + payload = sinfert.run_infer_type(state["path"]) + return payload + + +def infer_structure(state: CompositeState) -> dict: + """ + Infer semantic feature roles for EDA deterministically from observed column + behavior. 
+ + :param state: graph state + :return: inferred feature groupings + """ + feature_bucket_report = tinptool.infer_feature_buckets.invoke( + { + "path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + } + ) + trace_payload = { + "primary_key": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "series_type": state["type"], + "feature_bucket_report": feature_bucket_report, + } + tinptool.write_stage_trace(state["path"], "infer_structure", trace_payload) + payload = { + "numeric_continuous_cols": feature_bucket_report["numeric_continuous_cols"], + "numeric_count_cols": feature_bucket_report["numeric_count_cols"], + "binary_flag_cols": feature_bucket_report["binary_flag_cols"], + "categorical_feature_cols": feature_bucket_report["categorical_feature_cols"], + "known_exogenous_cols": feature_bucket_report["known_exogenous_cols"], + "target_cols": feature_bucket_report["target_cols"], + "covariate_cols": feature_bucket_report["covariate_cols"], + } + return payload + + +feature_structure = lgraph.StateGraph(CompositeState) +feature_structure.add_node("infer_type_pipeline", call_infer_type) +feature_structure.add_node("infer_structure", infer_structure) +feature_structure.add_edge(lgraph.START, "infer_type_pipeline") +feature_structure.add_edge("infer_type_pipeline", "infer_structure") +feature_structure.add_edge("infer_structure", lgraph.END) +graph = feature_structure.compile() + + +def run_infer_structure(path: str) -> dict: + """ + Execute feature-structure inference end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Feature structure output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. 
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_infer_structure(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_type.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_type.py new file mode 100644 index 000000000..e3fe05786 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_type.py @@ -0,0 +1,222 @@ +""" +Import as: + +import src.ingest.infer_type as sinfert +""" + +from __future__ import annotations + +import argparse +import logging +import pathlib +from typing import Literal +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.format_datetime as sfordat +import src.ingest.handle_inputs as shainp +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class SeriesTypeState(TypedDict): + """ + Store the inferred series structure. + """ + + type: Literal["single", "multiple", "multivariate"] + primary_key: str + secondary_keys: list[str] + + +class CompositeState(TypedDict): + """ + Store graph state for series-structure inference. 
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: Literal["single", "multiple", "multivariate"] + primary_key: str + secondary_keys: list[str] + + +def call_input_handler(state: CompositeState) -> dict: + """ + Run input handler and collect column metadata. + + :param state: graph state + :return: column classification payload + """ + dataset_path = pathlib.Path(state["path"]) + dataset = tinptool.load_dataset(dataset_path) + out = shainp.run_input_handler(state["path"]) + metadata = tinptool.extract_metadata.invoke({"path": state["path"]}) + payload = { + "done": out.get("done") or [], + "has_header": bool(out.get("has_header", True)), + "has_missing_values": bool(out.get("has_missing_values", False)), + "error": str(out.get("error") or ""), + "info": str(out.get("info") or ""), + "cols": [str(col) for col in dataset.columns.tolist()], + "temporal_cols": out.get("temporal_cols") or [], + "numeric_val_cols": out.get("numeric_val_cols") or [], + "categorical_val_cols": out.get("categorical_val_cols") or [], + "bad_rows": out.get("bad_rows") or [], + "numeric_cols": out.get("numeric_val_cols") or [], + "metadata": metadata, + } + return payload + + +def call_date_formatter(state: CompositeState) -> dict: + """ + Run the datetime formatter graph. 
+ + :param state: graph state + :return: selected time column + """ + out: sfordat.DateFormatterState = sfordat.graph.invoke( # type: ignore + {"path": state["path"]} + ) + payload = { + "time_col": out["time_col"], + "candidates": out.get("candidates") or [], + "winner_formatter": out.get("winner_formatter") or {}, + } + return payload + + +def infer_type(state: CompositeState) -> dict: + """ + Infer whether the dataset is single-series, panel, or multivariate using + deterministic value-level evidence. + + :param state: graph state + :return: inferred series structure + """ + structure_assessment = shainp.assess_series_structure( + path=state["path"], + cols=state["cols"], + time_col=state["time_col"], + numeric_val_cols=state["numeric_val_cols"], + categorical_val_cols=state["categorical_val_cols"], + winner_formatter=state["winner_formatter"], + ) + primary_key = state["time_col"] + secondary_keys = structure_assessment.get("secondary_keys") or [] + if secondary_keys: + inferred_type: Literal["single", "multiple", "multivariate"] = "multiple" + elif len(state["numeric_val_cols"]) > 1: + inferred_type = "multivariate" + else: + inferred_type = "single" + trace_payload = { + "time_col": primary_key, + "structure_assessment": structure_assessment, + "inferred_type": inferred_type, + "secondary_keys": secondary_keys, + } + tinptool.write_stage_trace(state["path"], "infer_type", trace_payload) + payload = { + "type": inferred_type, + "primary_key": primary_key, + "secondary_keys": secondary_keys, + "entity_col": secondary_keys[0] if secondary_keys else None, + } + return payload + + +series_type = lgraph.StateGraph(CompositeState) +series_type.add_node("input_handler", call_input_handler) +series_type.add_node("date_formatter", call_date_formatter) +series_type.add_node("infer_type", infer_type) +series_type.add_edge(lgraph.START, "input_handler") +series_type.add_edge("input_handler", "date_formatter") +series_type.add_edge("date_formatter", "infer_type") 
+series_type.add_edge("infer_type", lgraph.END) +graph = series_type.compile() + + +def run_infer_type(path: str) -> dict: + """ + Execute series-structure inference end to end. + + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "single", + "primary_key": "", + "secondary_keys": [], + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Series type output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. 
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_infer_type(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/integrity.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/integrity.py similarity index 75% rename from agentic_eda/jupyterlab_extension_backend/src/integrity.py rename to agentic_eda/jupyterlab_extension_backend/src/ingest/integrity.py index c606de1ed..71ee4670c 100644 --- a/agentic_eda/jupyterlab_extension_backend/src/integrity.py +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/integrity.py @@ -1,7 +1,7 @@ """ Import as: -import src.integrity as sinteg +import src.ingest.integrity as sinteg """ import logging @@ -15,10 +15,11 @@ import pandas as pd import pydantic -import config.config as cconf -import src.format_datetime as sfordat -import src.handle_inputs as shainp -import tools.input_tools as tinptool +import src.config.config as cconf +import src.ingest.format_datetime as sfordat +import src.ingest.handle_inputs as shainp +import src.ingest.infer_type as sinfert +import src.tools.input_tools as tinptool _LOG = logging.getLogger(__name__) @@ -31,8 +32,14 @@ class IntegrityState(TypedDict): path: str time_col: str | None winner_formatter: dict + cols: list[str] + temporal_cols: list[str] + bad_rows: list[dict] entity_col: str | None numeric_cols: list[str] + categorical_val_cols: list[str] + metadata: dict + secondary_keys: list[str] nonnegative_cols: list[str] jump_mult: float report: dict @@ -68,17 +75,75 @@ def call_date_formatter(state: IntegrityState) -> dict: def _maybe_infer_columns(state: IntegrityState) -> dict: """ - Infer numeric columns when they are not provided. + Collect schema context needed by downstream integrity checks. 
:param state: integrity graph state - :return: optional numeric column update + :return: schema-related state updates """ - if state.get("numeric_cols"): + if ( + state.get("cols") + and state.get("temporal_cols") + and state.get("numeric_cols") + and state.get("metadata") + ): payload = {} else: + dataset_path = pathlib.Path(state["path"]) + dataset = tinptool.load_dataset(dataset_path) out = shainp.run_input_handler(state["path"]) - numeric_cols = out.get("numeric_val_cols") or [] - payload = {"numeric_cols": numeric_cols} + metadata = tinptool.extract_metadata.invoke({"path": state["path"]}) + payload = { + "cols": [str(col) for col in dataset.columns.tolist()], + "temporal_cols": out.get("temporal_cols") or [], + "bad_rows": out.get("bad_rows") or [], + "numeric_cols": out.get("numeric_val_cols") or [], + "categorical_val_cols": out.get("categorical_val_cols") or [], + "metadata": metadata, + } + return payload + + +def call_infer_type(state: IntegrityState) -> dict: + """ + Infer the series structure and derive the temporary entity key. 
+ + :param state: integrity graph state + :return: inferred secondary keys and first entity key + """ + infer_state: sinfert.CompositeState = { + "path": state["path"], + "cols": state.get("cols") or [], + "temporal_cols": state.get("temporal_cols") or [], + "numeric_val_cols": state.get("numeric_cols") or [], + "categorical_val_cols": state.get("categorical_val_cols") or [], + "bad_rows": state.get("bad_rows") or [], + "metadata": state.get("metadata") or {}, + "time_col": state["time_col"] or "", + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "candidates": [], + "winner_formatter": state.get("winner_formatter") or {}, + "entity_col": None, + "numeric_cols": state.get("numeric_cols") or [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "single", + "primary_key": "", + "secondary_keys": [], + } + out = sinfert.infer_type(infer_state) + secondary_keys = out.get("secondary_keys") or [] + entity_col = secondary_keys[0] if secondary_keys else None + payload = { + "secondary_keys": secondary_keys, + "entity_col": entity_col, + } return payload @@ -140,6 +205,8 @@ def run_integrity_checks(state: IntegrityState) -> dict: {"type": "duplicate_timestamps", "count": duplicate_timestamps} ) entity_col = state.get("entity_col") + # TODO: Use all inferred secondary_keys as a composite entity key for + # integrity checks; for now we temporarily use only the first key. 
if entity_col is not None and entity_col in dataset.columns: summary["n_entities"] = int(dataset[entity_col].nunique(dropna=True)) tmp = dataset[[entity_col]].copy() @@ -288,36 +355,37 @@ def integrity_llm_summary(state: IntegrityState) -> dict: integrity = lgraph.StateGraph(IntegrityState) integrity.add_node("date_formatter", call_date_formatter) integrity.add_node("maybe_infer_columns", _maybe_infer_columns) +integrity.add_node("infer_type", call_infer_type) integrity.add_node("run_integrity_checks", run_integrity_checks) integrity.add_node("integrity_llm_summary", integrity_llm_summary) integrity.add_edge(lgraph.START, "date_formatter") integrity.add_edge("date_formatter", "maybe_infer_columns") -integrity.add_edge("maybe_infer_columns", "run_integrity_checks") +integrity.add_edge("maybe_infer_columns", "infer_type") +integrity.add_edge("infer_type", "run_integrity_checks") integrity.add_edge("run_integrity_checks", "integrity_llm_summary") integrity.add_edge("integrity_llm_summary", lgraph.END) graph = integrity.compile() -def run_integrity( - path: str, - *, - time_col: str | None = None, - entity_col: str | None = None, -) -> dict: +def run_integrity(path: str) -> dict: """ Execute integrity graph end to end. 
:param path: dataset path - :param time_col: optional time column override - :param entity_col: optional entity column :return: integrity report with summary and flag """ init_state: IntegrityState = { "path": path, - "time_col": time_col, + "time_col": None, "winner_formatter": {}, - "entity_col": entity_col, + "cols": [], + "temporal_cols": [], + "bad_rows": [], + "entity_col": None, "numeric_cols": [], + "categorical_val_cols": [], + "metadata": {}, + "secondary_keys": [], "nonnegative_cols": [], "jump_mult": 20.0, "report": {}, diff --git a/agentic_eda/jupyterlab_extension_backend/src/main.py b/agentic_eda/jupyterlab_extension_backend/src/main.py index 9d60ccd46..6e9a3765c 100644 --- a/agentic_eda/jupyterlab_extension_backend/src/main.py +++ b/agentic_eda/jupyterlab_extension_backend/src/main.py @@ -9,9 +9,12 @@ import json import logging -import src.format_datetime as sfordat -import src.handle_inputs as shainp -import src.integrity as sinteg +import src.ingest.compute_temporal_stats as sctstats +import src.ingest.format_datetime as sfordat +import src.ingest.handle_inputs as shainp +import src.ingest.infer_structure as sinferstruct +import src.ingest.infer_type as sinfert +import src.ingest.integrity as sinteg _LOG = logging.getLogger(__name__) @@ -26,7 +29,14 @@ def _parse_args() -> argparse.Namespace: parser.add_argument( "--mode", required=True, - choices=["input", "format", "integrity"], + choices=[ + "input", + "format", + "infer_type", + "infer_structure", + "compute_temporal_stats", + "integrity", + ], help="Pipeline stage to execute.", ) parser.add_argument( @@ -34,16 +44,6 @@ def _parse_args() -> argparse.Namespace: required=True, help="Path to dataset file.", ) - parser.add_argument( - "--time_col", - default=None, - help="Optional time column override for integrity mode.", - ) - parser.add_argument( - "--entity_col", - default=None, - help="Optional entity column for integrity mode.", - ) args = parser.parse_args() return args @@ -61,11 +61,13 @@ 
def _run_cli(args: argparse.Namespace) -> dict: elif mode == "format": payload = sfordat.run_date_formatter(args.path) elif mode == "integrity": - payload = sinteg.run_integrity( - args.path, - time_col=args.time_col, - entity_col=args.entity_col, - ) + payload = sinteg.run_integrity(args.path) + elif mode == "infer_type": + payload = sinfert.run_infer_type(args.path) + elif mode == "infer_structure": + payload = sinferstruct.run_infer_structure(args.path) + elif mode == "compute_temporal_stats": + payload = sctstats.run_compute_temporal_stats(args.path) else: raise ValueError(f"Unsupported mode='{mode}'") return payload diff --git a/agentic_eda/jupyterlab_extension_backend/src/tools/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/tools/__init__.py new file mode 100644 index 000000000..46d455292 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/tools/__init__.py @@ -0,0 +1,3 @@ +""" +Backend tool package. +""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/tools/input_tools.py b/agentic_eda/jupyterlab_extension_backend/src/tools/input_tools.py new file mode 100644 index 000000000..4190c5533 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/tools/input_tools.py @@ -0,0 +1,794 @@ +""" +Import as: + +import src.tools.input_tools as tinptool +""" + +import json +import itertools +import pathlib +import re +from typing import Any + +import langchain.tools as ltools +import pandas as pd +import pydantic + +_VALID_HEADER_START_RE = re.compile(r"^[A-Za-z_]") + + +def load_dataset(path: pathlib.Path) -> pd.DataFrame: + """ + Load a supported dataset from disk. 
+ + :param path: path to dataset file + :return: dataset as dataframe + """ + ext = path.suffix.lower() + if ext == ".csv": + dataset = pd.read_csv(path) + else: + raise ValueError(f"Unsupported file extension='{ext}'") + return dataset + + +def _sample_values(series: pd.Series, *, limit: int = 5) -> list[str]: + """ + Return a small deterministic sample of distinct non-null values. + + Theory: + A short value sample gives downstream logic human-interpretable evidence + about whether a column behaves like a flag, identifier, category, or + free-form measurement, without depending on the column name alone. + + :param series: input series + :param limit: max number of sample values + :return: stringified sample values + """ + values: list[str] = [] + seen: set[str] = set() + for value in series.dropna().tolist(): + key = str(value) + if key in seen: + continue + seen.add(key) + values.append(key) + if len(values) >= limit: + break + return values + + +def _normalized_non_null_fraction(series: pd.Series) -> float: + """ + Compute the non-null fraction for a series. + + Theory: + Missingness changes how much confidence we should place in any inferred + semantic role. Columns with very little observed data provide weak evidence + for type inference, so completeness is a foundational statistic. + + :param series: input series + :return: non-null fraction + """ + if len(series) == 0: + return 0.0 + return float(series.notna().mean()) + + +def _coerce_numeric(series: pd.Series) -> pd.Series: + """ + Convert a series to numeric values where possible. + + Theory: + Many semantic distinctions begin with whether values actually behave like + numbers in the data, not whether the declared dtype says so. Numeric + coercion exposes columns that are numerically meaningful even when loaded + as strings. 
+ + :param series: input series + :return: numeric series with NaN for non-numeric values + """ + return pd.to_numeric(series, errors="coerce") + + +def _is_integer_like(series: pd.Series) -> bool: + """ + Check whether numeric values are effectively integers. + + Theory: + Count variables and encoded flags often live on the integers, whereas + continuous measurements usually do not. Integer support is therefore a + useful deterministic signal for separating counts from continuous values. + + :param series: numeric-like series + :return: true when all observed values are close to integers + """ + numeric = _coerce_numeric(series).dropna() + if numeric.empty: + return False + rounded = numeric.round() + return bool((numeric - rounded).abs().le(1e-9).all()) + + +def _is_binary_like(series: pd.Series) -> bool: + """ + Check whether a column behaves like a binary flag. + + Theory: + Binary indicators are characterized by two logical states regardless of + whether they are stored as booleans, strings, or numeric codes. Recognizing + this two-state support helps prevent flags from being misclassified as + general categoricals or counts. + + :param series: input series + :return: true when the column has exactly two logical states + """ + non_null = series.dropna() + if non_null.empty: + return False + unique_raw = {str(value).strip().lower() for value in non_null.unique()} + binary_vocab = { + "0", + "1", + "true", + "false", + "t", + "f", + "yes", + "no", + "y", + "n", + } + if unique_raw and unique_raw.issubset(binary_vocab) and len(unique_raw) <= 2: + return True + return len(unique_raw) == 2 + + +def _build_column_profiles(dataset: pd.DataFrame) -> dict[str, dict[str, Any]]: + """ + Build deterministic per-column profiles used by downstream schema tools. + + Theory: + Robust schema inference should summarize how each column behaves in the + observed data: completeness, cardinality, numeric support, integer support, + binary support, and value examples. 
Those empirical signals are what later + stages use to infer keys and semantic feature types in a reproducible way. + + :param dataset: input dataframe + :return: map of column name to summary statistics + """ + profiles: dict[str, dict[str, Any]] = {} + n_rows = int(dataset.shape[0]) + for col in dataset.columns: + series = dataset[col] + non_null = series.dropna() + n_non_null = int(non_null.shape[0]) + n_unique = int(non_null.nunique(dropna=True)) + unique_ratio = 0.0 if n_non_null == 0 else float(n_unique / n_non_null) + numeric = _coerce_numeric(series) + numeric_non_null = numeric.dropna() + numeric_fraction = ( + 0.0 if n_non_null == 0 else float(numeric_non_null.shape[0] / n_non_null) + ) + integer_like = _is_integer_like(series) + nonnegative_like = ( + False + if numeric_non_null.empty + else bool((numeric_non_null >= 0).all()) + ) + profile = { + "dtype": str(series.dtype), + "n_rows": n_rows, + "n_non_null": n_non_null, + "non_null_fraction": _normalized_non_null_fraction(series), + "n_unique": n_unique, + "unique_ratio": unique_ratio, + "is_numeric_like": bool(numeric_fraction >= 0.95 and not numeric_non_null.empty), + "numeric_fraction": numeric_fraction, + "is_integer_like": integer_like, + "is_binary_like": _is_binary_like(series), + "is_nonnegative_like": nonnegative_like, + "sample_values": _sample_values(series), + } + if not numeric_non_null.empty: + profile["min_numeric"] = float(numeric_non_null.min()) + profile["max_numeric"] = float(numeric_non_null.max()) + else: + profile["min_numeric"] = None + profile["max_numeric"] = None + profiles[str(col)] = profile + return profiles + + +def write_stage_trace(path: str, stage: str, payload: dict[str, Any]) -> str: + """ + Persist diagnostic findings for one pipeline stage to a backend-local trace + file. 
+ + :param path: dataset path + :param stage: pipeline stage name + :param payload: JSON-serializable diagnostic payload + :return: absolute trace file path + """ + dataset_path = pathlib.Path(path) + trace_root = pathlib.Path(__file__).resolve().parents[1] / "traces" + trace_root.mkdir(parents=True, exist_ok=True) + filename = f"{dataset_path.stem}.{stage}.json" + trace_path = trace_root / filename + trace_payload = { + "dataset_path": str(dataset_path), + "stage": stage, + "payload": payload, + } + trace_path.write_text( + json.dumps(trace_payload, default=str, indent=2), + encoding="utf-8", + ) + return str(trace_path) + + +def _parse_time_series( + dataset: pd.DataFrame, + time_col: str, + winner_formatter: dict[str, Any] | None = None, +) -> pd.Series: + """ + Parse the selected time column with the best-known formatter settings. + + Theory: + Temporal statistics are only meaningful once the time axis has been mapped + into a consistent datetime representation. Reusing the formatter selected + earlier in the pipeline avoids accidental drift between schema inference and + downstream coverage/frequency calculations. + + :param dataset: input dataframe + :param time_col: selected time column + :param winner_formatter: optional datetime parsing kwargs + :return: parsed timestamp series + """ + format_args = winner_formatter or {} + format_args = {key: val for key, val in format_args.items() if val is not None} + try: + return pd.to_datetime(dataset[time_col], errors="coerce", **format_args) + except Exception: + return pd.to_datetime(dataset[time_col], errors="coerce") + + +def _format_timedelta(delta: pd.Timedelta | None) -> str | None: + """ + Convert a timedelta into a stable string representation. + + Theory: + Frequency and gap summaries are easier to compare across stages when they + are rendered into a canonical textual duration rather than leaking pandas- + specific objects into the public payload. 
+ + :param delta: input timedelta + :return: normalized string or None + """ + if delta is None or pd.isna(delta): + return None + return str(delta) + + +def _series_identifier(keys: list[str], values: tuple[Any, ...]) -> dict[str, Any] | None: + """ + Package one composite entity identifier as a JSON-friendly mapping. + + Theory: + Coverage and frequency statistics are naturally computed per series. When a + panel uses composite entity keys, the identifier must preserve every key + component so the reported findings still point back to the original series. + + :param keys: entity key column names + :param values: grouped key values + :return: key-value mapping or None for single-series data + """ + if not keys: + return None + return {key: value for key, value in zip(keys, values, strict=True)} + + +class _TemporalStatsArgs(pydantic.BaseModel): + """ + Store arguments for deterministic temporal statistics. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + time_col: str + secondary_keys: list[str] | None = None + winner_formatter: dict[str, Any] | None = None + + +@ltools.tool(args_schema=_TemporalStatsArgs) +def compute_temporal_stats( + path: str, + time_col: str, + secondary_keys: list[str] | None = None, + winner_formatter: dict[str, Any] | None = None, +) -> dict: + """ + Compute deterministic temporal range, coverage, and sampling-frequency + statistics. + + Theory: + Time-series coverage is defined relative to an expected sampling interval. + Once the timestamps are parsed, the empirical deltas between consecutive + observations reveal the dominant cadence of the data. That cadence becomes + the expected frequency against which we can measure irregular sampling, + missing timestamps, longest gaps, and per-entity coverage. For panel data, + these statistics must be computed per entity (or per composite entity key), + because a dataset can be well covered overall while still containing weak or + sparse individual series. 
+ + :param path: dataset path + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param winner_formatter: optional datetime parsing kwargs + :return: temporal statistics payload + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + if time_col not in dataset.columns: + raise KeyError(f"time_col '{time_col}' not found in dataset") + secondary_keys = [ + key for key in (secondary_keys or []) if key in dataset.columns and key != time_col + ] + timestamp = _parse_time_series(dataset, time_col, winner_formatter) + valid_rows = dataset.copy() + valid_rows["_ts"] = timestamp + valid_rows = valid_rows.dropna(subset=["_ts"]) + if secondary_keys: + grouped_iter = valid_rows.groupby(secondary_keys, dropna=True) + group_items = list(grouped_iter) + else: + group_items = [(tuple(), valid_rows)] + + all_deltas: list[pd.Timedelta] = [] + per_entity: list[dict[str, Any]] = [] + global_min = None if valid_rows.empty else valid_rows["_ts"].min() + global_max = None if valid_rows.empty else valid_rows["_ts"].max() + + for raw_key, frame in group_items: + key_tuple = raw_key if isinstance(raw_key, tuple) else (raw_key,) + unique_ts = ( + frame["_ts"].dropna().drop_duplicates().sort_values().reset_index(drop=True) + ) + n_observed = int(unique_ts.shape[0]) + if n_observed >= 2: + deltas = unique_ts.diff().dropna() + positive_deltas = deltas[deltas > pd.Timedelta(0)] + else: + positive_deltas = pd.Series(dtype="timedelta64[ns]") + all_deltas.extend(list(positive_deltas.tolist())) + per_entity.append( + { + "entity": _series_identifier(secondary_keys, key_tuple), + "n_observed_timestamps": n_observed, + "min_time": None if unique_ts.empty else str(unique_ts.min()), + "max_time": None if unique_ts.empty else str(unique_ts.max()), + "_positive_deltas": positive_deltas, + } + ) + + if all_deltas: + delta_series = pd.Series(all_deltas, dtype="timedelta64[ns]") + mode_candidates = delta_series.mode() + mode_delta = None 
if mode_candidates.empty else mode_candidates.iloc[0] + median_delta = delta_series.median() + dominant_fraction = ( + 0.0 + if mode_delta is None + else float((delta_series == mode_delta).mean()) + ) + expected_delta = mode_delta if dominant_fraction >= 0.5 else median_delta + is_irregular_sampling = bool( + expected_delta is not None + and float((delta_series == expected_delta).mean()) < 0.8 + ) + else: + delta_series = pd.Series(dtype="timedelta64[ns]") + mode_delta = None + median_delta = None + dominant_fraction = 0.0 + expected_delta = None + is_irregular_sampling = False + + coverage_values: list[float] = [] + total_gaps = 0 + for item in per_entity: + positive_deltas = item.pop("_positive_deltas") + n_observed = item["n_observed_timestamps"] + if n_observed == 0 or expected_delta is None or pd.isna(expected_delta): + coverage_pct = None + n_expected = n_observed + gap_mask = pd.Series(dtype=bool) + longest_gap = None + else: + span = pd.Timestamp(item["max_time"]) - pd.Timestamp(item["min_time"]) + if expected_delta <= pd.Timedelta(0): + n_expected = n_observed + else: + n_expected = int(span / expected_delta) + 1 + n_expected = max(n_expected, n_observed, 1) + coverage_pct = float(100.0 * n_observed / n_expected) + gap_mask = positive_deltas > expected_delta + longest_gap = ( + None if positive_deltas.empty else positive_deltas.max() + ) + n_gaps = int(gap_mask.sum()) if not gap_mask.empty else 0 + total_gaps += n_gaps + if coverage_pct is not None: + coverage_values.append(coverage_pct) + item["n_expected_timestamps"] = int(n_expected) + item["coverage_pct"] = coverage_pct + item["n_gaps"] = n_gaps + item["longest_gap"] = _format_timedelta(longest_gap) + + if expected_delta is None: + resampling_decision = "insufficient_data" + elif is_irregular_sampling: + resampling_decision = "keep_irregular_gap_aware" + elif coverage_values and min(coverage_values) < 99.0: + resampling_decision = "resample_to_regular_grid" + else: + resampling_decision = 
"already_regular" + + coverage_summary = { + "n_series": len(per_entity), + "mean_coverage_pct": ( + None if not coverage_values else float(pd.Series(coverage_values).mean()) + ), + "min_coverage_pct": ( + None if not coverage_values else float(pd.Series(coverage_values).min()) + ), + "max_coverage_pct": ( + None if not coverage_values else float(pd.Series(coverage_values).max()) + ), + "total_gaps": int(total_gaps), + } + + return { + "time_col": time_col, + "secondary_keys": secondary_keys, + "n_nat_time": int(timestamp.isna().sum()), + "min_time": None if global_min is None else str(global_min), + "max_time": None if global_max is None else str(global_max), + "typical_delta_mode": _format_timedelta(mode_delta), + "typical_delta_median": _format_timedelta(median_delta), + "expected_frequency": _format_timedelta(expected_delta), + "dominant_frequency_fraction": dominant_fraction, + "is_irregular_sampling": is_irregular_sampling, + "resampling_decision": resampling_decision, + "coverage_summary": coverage_summary, + "coverage_per_entity": per_entity, + } + + +def analyze_header(state: dict) -> dict: + """ + Validate dataset headers. + + :param state: graph state containing dataset path + :return: updated state fields with header status + """ + path = pathlib.Path(str(state["path"])) + dataset = load_dataset(path) + cols = list(dataset.columns) + has_header = True + error = "" + if ( + all(isinstance(col, int) for col in cols) + and cols == list(range(len(cols))) + ): + has_header = False + error = "No column names." + else: + for col in cols: + if col is None: + has_header = False + error = "One or more column names missing." + break + col_name = str(col).strip() + if col_name == "": + has_header = False + error = "One or more column names missing." + break + if ( + col_name[0].isdigit() + or not _VALID_HEADER_START_RE.match(col_name) + ): + has_header = False + error = ( + "One or more column names start with invalid characters." 
+ ) + break + if has_header: + result = {"has_header": has_header, "dataset": dataset} + else: + result = {"has_header": has_header, "error": error} + return result + + +@ltools.tool +def extract_metadata(path: str) -> dict: + """ + Return minimal dataset metadata. + + :param path: dataset path + :return: metadata with shape and per-column cardinality + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + n_rows, n_cols = dataset.shape + n_unique = dataset.nunique(dropna=True) + n_unique_map = {str(col): int(n_unique[col]) for col in n_unique.index} + metadata = { + "n_rows": int(n_rows), + "n_cols": int(n_cols), + "n_unique": n_unique_map, + } + return metadata + + +@ltools.tool +def extract_column_profiles(path: str) -> dict: + """ + Profile each column using value-level statistics rather than relying on + names alone. + + Theory: + Semantic feature inference becomes more robust when it is grounded in + empirical column behavior. Binary flags tend to have two states, counts + tend to be nonnegative integers, continuous measurements usually have many + distinct real-valued observations, and identifiers often repeat but are not + numeric measurements. These profile statistics give later stages stable + evidence even when column names are unhelpful. + + :param path: dataset path + :return: per-column profile map + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + profiles = _build_column_profiles(dataset) + return {"column_profiles": profiles} + + +class _EntityCandidateArgs(pydantic.BaseModel): + """ + Store arguments for deterministic entity-key scoring. 
+ """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + time_col: str + candidate_cols: list[str] | None = None + max_combo_size: int = 2 + + +@ltools.tool(args_schema=_EntityCandidateArgs) +def score_entity_candidates( + path: str, + time_col: str, + candidate_cols: list[str] | None = None, + max_combo_size: int = 2, +) -> dict: + """ + Score candidate entity keys by how well they partition repeated time-series + observations into stable per-entity trajectories. + + Theory: + A useful entity key in panel data should do three things. First, entities + should reappear across multiple rows, otherwise the key behaves like a + row-level identifier rather than a series identifier. Second, the pair + `(entity_key, time_col)` should be close to unique, because that pair is + the natural coordinate system of a panel time series. Third, a good entity + key should explain repeated timestamps by reducing collisions once the + entity dimension is included. These criteria are deterministic and more + reliable than name-based guessing. 
+ + :param path: dataset path + :param time_col: selected time column + :param candidate_cols: optional candidate entity columns + :param max_combo_size: max size of composite key combinations to evaluate + :return: scored candidate report with recommended secondary keys + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + if time_col not in dataset.columns: + raise KeyError(f"time_col '{time_col}' not found in dataset") + timestamp = pd.to_datetime(dataset[time_col], errors="coerce") + profiles = _build_column_profiles(dataset) + available_cols = [str(col) for col in dataset.columns if str(col) != time_col] + if candidate_cols is None: + selected = [] + for col in available_cols: + profile = profiles[col] + if profile["n_unique"] <= 1: + continue + if profile["unique_ratio"] >= 1.0: + continue + selected.append(col) + candidate_cols = selected + else: + candidate_cols = [ + col for col in candidate_cols if col in dataset.columns and col != time_col + ] + candidate_cols = sorted(dict.fromkeys(candidate_cols)) + max_combo_size = max(1, min(int(max_combo_size), 2)) + duplicate_timestamps = int(timestamp.dropna().duplicated().sum()) + candidates: list[dict[str, Any]] = [] + for combo_size in range(1, max_combo_size + 1): + for combo in itertools.combinations(candidate_cols, combo_size): + subset = dataset[list(combo)].copy() + subset["_ts"] = timestamp + valid = subset.dropna(subset=[*combo, "_ts"]) + if valid.empty: + continue + group_sizes = valid.groupby(list(combo), dropna=True).size() + if group_sizes.empty: + continue + n_entities = int(group_sizes.shape[0]) + mean_obs_per_entity = float(group_sizes.mean()) + entity_reuse_fraction = float((group_sizes > 1).mean()) + duplicate_pairs = int( + valid.duplicated(subset=[*combo, "_ts"]).sum() + ) + pair_uniqueness = float( + 1.0 - (duplicate_pairs / max(1, int(valid.shape[0]))) + ) + if duplicate_timestamps > 0: + collision_reduction = float( + 1.0 - (duplicate_pairs / max(1, 
duplicate_timestamps)) + ) + else: + collision_reduction = 1.0 if mean_obs_per_entity > 1.0 else 0.0 + repeatability_score = float(min(max((mean_obs_per_entity - 1.0) / 4.0, 0.0), 1.0)) + score = float( + 0.35 * pair_uniqueness + + 0.35 * repeatability_score + + 0.20 * entity_reuse_fraction + + 0.10 * max(0.0, min(collision_reduction, 1.0)) + ) + candidates.append( + { + "secondary_keys": list(combo), + "n_entities": n_entities, + "mean_obs_per_entity": mean_obs_per_entity, + "entity_reuse_fraction": entity_reuse_fraction, + "duplicate_entity_timestamp_pairs": duplicate_pairs, + "pair_uniqueness": pair_uniqueness, + "collision_reduction": collision_reduction, + "score": score, + } + ) + candidates.sort( + key=lambda item: ( + item["score"], + item["entity_reuse_fraction"], + item["mean_obs_per_entity"], + -len(item["secondary_keys"]), + ), + reverse=True, + ) + top_candidate = candidates[0] if candidates else None + if ( + top_candidate is not None + and top_candidate["score"] >= 0.60 + and top_candidate["n_entities"] >= 2 + and top_candidate["mean_obs_per_entity"] >= 2.0 + ): + recommended_secondary_keys = top_candidate["secondary_keys"] + else: + recommended_secondary_keys = [] + return { + "time_col": time_col, + "duplicate_timestamps": duplicate_timestamps, + "candidate_cols": candidate_cols, + "candidates": candidates[:10], + "recommended_secondary_keys": recommended_secondary_keys, + } + + +class _FeatureBucketsArgs(pydantic.BaseModel): + """ + Store arguments for deterministic semantic feature typing. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + time_col: str + secondary_keys: list[str] | None = None + + +@ltools.tool(args_schema=_FeatureBucketsArgs) +def infer_feature_buckets( + path: str, + time_col: str, + secondary_keys: list[str] | None = None, +) -> dict: + """ + Deterministically type features from their observed value behavior. 
+ + Theory: + The semantic distinction between counts, binary flags, continuous measures, + and categoricals can often be established directly from the support of the + observed values. Binary flags exhibit two states, counts live on the + nonnegative integers, continuous measures take broader real-valued ranges, + and categorical features are residual non-key columns that do not behave + like numeric measurements. Weakly inferred classes such as targets or + exogenous drivers are intentionally left empty because their meaning depends + more on task context than on value support alone. + + :param path: dataset path + :param time_col: selected time column + :param secondary_keys: optional entity key columns to exclude + :return: semantic feature buckets + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + profiles = _build_column_profiles(dataset) + excluded = {time_col, *(secondary_keys or [])} + numeric_continuous_cols: list[str] = [] + numeric_count_cols: list[str] = [] + binary_flag_cols: list[str] = [] + categorical_feature_cols: list[str] = [] + for col in [str(value) for value in dataset.columns]: + if col in excluded: + continue + profile = profiles[col] + if profile["is_binary_like"]: + binary_flag_cols.append(col) + elif ( + profile["is_numeric_like"] + and profile["is_integer_like"] + and profile["is_nonnegative_like"] + and profile["n_unique"] > 2 + ): + numeric_count_cols.append(col) + elif profile["is_numeric_like"]: + numeric_continuous_cols.append(col) + else: + categorical_feature_cols.append(col) + covariate_cols = ( + numeric_continuous_cols + + numeric_count_cols + + binary_flag_cols + + categorical_feature_cols + ) + return { + "numeric_continuous_cols": numeric_continuous_cols, + "numeric_count_cols": numeric_count_cols, + "binary_flag_cols": binary_flag_cols, + "categorical_feature_cols": categorical_feature_cols, + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": covariate_cols, + 
"column_profiles": profiles, + } + + +@ltools.tool +def extract_head(path: str, *, n: int = 5) -> dict: + """ + Return the first rows from a dataset. + + :param path: dataset path + :param n: number of rows to return + :return: head rows serialized as JSON-compatible payload + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + n_rows = int(n) + if n_rows <= 0: + n_rows = 5 + n_rows = min(n_rows, 50) + head = dataset.head(n_rows) + rows = json.loads(head.to_json(orient="records", date_format="iso")) + payload = { + "n": n_rows, + "columns": [str(col) for col in head.columns.tolist()], + "rows": rows, + } + return payload diff --git a/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py b/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py deleted file mode 100644 index 28d1c4c22..000000000 --- a/agentic_eda/jupyterlab_extension_backend/tools/input_tools.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -Import as: - -import tools.input_tools as tinptool -""" - -import json -import pathlib -import re - -import langchain.tools as ltools -import pandas as pd - -_VALID_HEADER_START_RE = re.compile(r"^[A-Za-z_]") - - -def load_dataset(path: pathlib.Path) -> pd.DataFrame: - """ - Load a supported dataset from disk. - - :param path: path to dataset file - :return: dataset as dataframe - """ - ext = path.suffix.lower() - if ext == ".csv": - dataset = pd.read_csv(path) - else: - raise ValueError(f"Unsupported file extension='{ext}'") - return dataset - - -def analyze_header(state: dict) -> dict: - """ - Validate dataset headers. - - :param state: graph state containing dataset path - :return: updated state fields with header status - """ - path = pathlib.Path(str(state["path"])) - dataset = load_dataset(path) - cols = list(dataset.columns) - has_header = True - error = "" - if ( - all(isinstance(col, int) for col in cols) - and cols == list(range(len(cols))) - ): - has_header = False - error = "No column names." 
- else: - for col in cols: - if col is None: - has_header = False - error = "One or more column names missing." - break - col_name = str(col).strip() - if col_name == "": - has_header = False - error = "One or more column names missing." - break - if ( - col_name[0].isdigit() - or not _VALID_HEADER_START_RE.match(col_name) - ): - has_header = False - error = ( - "One or more column names start with invalid characters." - ) - break - if has_header: - result = {"has_header": has_header, "dataset": dataset} - else: - result = {"has_header": has_header, "error": error} - return result - - -@ltools.tool -def extract_metadata(path: str) -> dict: - """ - Return minimal dataset metadata. - - :param path: dataset path - :return: metadata with shape and per-column cardinality - """ - dataset_path = pathlib.Path(path) - dataset = load_dataset(dataset_path) - n_rows, n_cols = dataset.shape - n_unique = dataset.nunique(dropna=True) - n_unique_map = {str(col): int(n_unique[col]) for col in n_unique.index} - metadata = { - "n_rows": int(n_rows), - "n_cols": int(n_cols), - "n_unique": n_unique_map, - } - return metadata - - -@ltools.tool -def extract_head(path: str, *, n: int = 5) -> dict: - """ - Return the first rows from a dataset. 
- - :param path: dataset path - :param n: number of rows to return - :return: head rows serialized as JSON-compatible payload - """ - dataset_path = pathlib.Path(path) - dataset = load_dataset(dataset_path) - n_rows = int(n) - if n_rows <= 0: - n_rows = 5 - n_rows = min(n_rows, 50) - head = dataset.head(n_rows) - rows = json.loads(head.to_json(orient="records", date_format="iso")) - payload = { - "n": n_rows, - "columns": [str(col) for col in head.columns.tolist()], - "rows": rows, - } - return payload From 24c2e039127268039e9c65786104d75d4c458389 Mon Sep 17 00:00:00 2001 From: Indrayudd Roy Chowdhury Date: Wed, 25 Mar 2026 08:25:06 -0700 Subject: [PATCH 5/5] TutorTask696: Add quality handling and univariate analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../jupyterlab_extension_backend/src/main.py | 20 + .../src/quality_handling/__init__.py | 3 + .../src/quality_handling/audit_missingness.py | 209 +++ .../quality_handling/handle_missingness.py | 386 ++++++ .../src/quality_handling/standardize.py | 488 +++++++ .../src/tools/input_tools.py | 1182 ++++++++++++++++- .../src/univariate_analysis/__init__.py | 3 + .../univariate_analysis/test_transforms.py | 212 +++ .../univariate_metrics_plotting.py | 214 +++ 9 files changed, 2714 insertions(+), 3 deletions(-) create mode 100644 agentic_eda/jupyterlab_extension_backend/src/quality_handling/__init__.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/quality_handling/audit_missingness.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/quality_handling/handle_missingness.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/quality_handling/standardize.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/__init__.py create mode 100644 agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/test_transforms.py create mode 100644 
agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/univariate_metrics_plotting.py diff --git a/agentic_eda/jupyterlab_extension_backend/src/main.py b/agentic_eda/jupyterlab_extension_backend/src/main.py index 6e9a3765c..f5fd3e70f 100644 --- a/agentic_eda/jupyterlab_extension_backend/src/main.py +++ b/agentic_eda/jupyterlab_extension_backend/src/main.py @@ -15,6 +15,11 @@ import src.ingest.infer_structure as sinferstruct import src.ingest.infer_type as sinfert import src.ingest.integrity as sinteg +import src.quality_handling.audit_missingness as sauditmiss +import src.quality_handling.handle_missingness as shandlemiss +import src.quality_handling.standardize as sstandard +import src.univariate_analysis.test_transforms as stransforms +import src.univariate_analysis.univariate_metrics_plotting as sunivar _LOG = logging.getLogger(__name__) @@ -36,6 +41,11 @@ def _parse_args() -> argparse.Namespace: "infer_structure", "compute_temporal_stats", "integrity", + "audit_missingness", + "handle_missingness", + "standardize", + "univariate_metrics_plotting", + "test_transforms", ], help="Pipeline stage to execute.", ) @@ -68,6 +78,16 @@ def _run_cli(args: argparse.Namespace) -> dict: payload = sinferstruct.run_infer_structure(args.path) elif mode == "compute_temporal_stats": payload = sctstats.run_compute_temporal_stats(args.path) + elif mode == "audit_missingness": + payload = sauditmiss.run_audit_missingness(args.path) + elif mode == "handle_missingness": + payload = shandlemiss.run_handle_missingness(args.path) + elif mode == "standardize": + payload = sstandard.run_standardize(args.path) + elif mode == "univariate_metrics_plotting": + payload = sunivar.run_univariate_metrics_plotting(args.path) + elif mode == "test_transforms": + payload = stransforms.run_test_transforms(args.path) else: raise ValueError(f"Unsupported mode='{mode}'") return payload diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/__init__.py 
b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/__init__.py new file mode 100644 index 000000000..b6cf94fe8 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/__init__.py @@ -0,0 +1,3 @@ +""" +Quality-handling stages and helpers for the Jupyter backend. +""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/audit_missingness.py b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/audit_missingness.py new file mode 100644 index 000000000..a037ca02a --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/audit_missingness.py @@ -0,0 +1,209 @@ +""" +Import as: + +import src.quality_handling.audit_missingness as sauditmiss +""" + +from __future__ import annotations + +import argparse +import logging +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.compute_temporal_stats as sctstats +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class MissingnessAuditState(TypedDict): + """ + Store deterministic missingness audit output. + """ + + missingness_report: dict + + +class CompositeState(TypedDict): + """ + Store graph state for missingness auditing. 
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + + +def call_compute_temporal_stats(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to temporal statistics. + + :param state: graph state + :return: composite payload from compute_temporal_stats + """ + payload = sctstats.run_compute_temporal_stats(state["path"]) + return payload + + +def audit_missingness(state: CompositeState) -> dict: + """ + Audit value missingness and timestamp missingness deterministically. 
+ + :param state: graph state + :return: missingness report payload + """ + missingness_report = tinptool.audit_missingness.invoke( + { + "path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "winner_formatter": state["winner_formatter"], + } + ) + trace_payload = { + "primary_key": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "missingness_report": missingness_report, + } + tinptool.write_stage_trace(state["path"], "audit_missingness", trace_payload) + payload = { + "missingness_report": missingness_report, + "has_missing_values": bool( + missingness_report["value_missingness_summary"]["total_missing_cells"] > 0 + or missingness_report["timestamp_missingness_summary"]["total_missing_timestamps"] > 0 + ), + } + return payload + + +missingness_audit = lgraph.StateGraph(CompositeState) +missingness_audit.add_node("compute_temporal_stats_pipeline", call_compute_temporal_stats) +missingness_audit.add_node("audit_missingness", audit_missingness) +missingness_audit.add_edge(lgraph.START, "compute_temporal_stats_pipeline") +missingness_audit.add_edge("compute_temporal_stats_pipeline", "audit_missingness") +missingness_audit.add_edge("audit_missingness", lgraph.END) +graph = missingness_audit.compile() + + +def run_audit_missingness(path: str) -> dict: + """ + Execute missingness auditing end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + "missingness_report": {}, + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Missingness audit output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. 
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_audit_missingness(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/handle_missingness.py b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/handle_missingness.py new file mode 100644 index 000000000..325f1cdd4 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/handle_missingness.py @@ -0,0 +1,386 @@ +""" +Import as: + +import src.quality_handling.handle_missingness as shandlemiss +""" + +from __future__ import annotations + +import argparse +import logging +from typing import Literal +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pydantic + +import src.config.config as cconf +import src.quality_handling.audit_missingness as sauditmiss +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +def _build_missingness_plan_summary(actions: list[dict], *, defaulted_cols: int) -> str: + """ + Build a summary from the normalized missingness actions. + + :param actions: normalized action list + :param defaulted_cols: number of columns defaulted during normalization + :return: summary text aligned with the final plan + """ + if not actions: + return "No non-time columns required missingness handling." + counts: dict[str, int] = {} + for action in actions: + strategy = str(action["strategy"]) + counts[strategy] = counts.get(strategy, 0) + 1 + ordered_counts = ", ".join( + f"{strategy}={counts[strategy]}" + for strategy in sorted(counts) + ) + summary = ( + f"Normalized missingness plan for {len(actions)} columns: {ordered_counts}. 
" + "Actions reflect the final bounded plan after validation against eligible strategies." + ) + if defaulted_cols > 0: + summary += f" {defaulted_cols} columns were defaulted conservatively during normalization." + return summary + + +class MissingnessDecision(pydantic.BaseModel): + """ + Store one bounded missingness decision. + """ + + col: str + strategy: Literal[ + "leave_as_nan", + "forward_fill", + "interpolate", + "zero_fill", + "drop_rows", + ] + create_missingness_flag: bool = True + reason: str + + +class MissingnessPlanOutput(pydantic.BaseModel): + """ + Store LLM-produced missingness plan. + """ + + summary: str + actions: list[MissingnessDecision] + + +class CompositeState(TypedDict): + """ + Store graph state for missingness handling. + """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + missingness_plan: dict + missingness_handling_report: dict + quality_dataset_path: str + + +def call_audit_missingness(state: CompositeState) 
-> dict: + """ + Run the sequential pipeline up to missingness auditing. + + :param state: graph state + :return: composite payload from audit_missingness + """ + payload = sauditmiss.run_audit_missingness(state["path"]) + return payload + + +def _normalize_missingness_plan(state: CompositeState, raw_plan: dict) -> dict: + """ + Ensure every missing column has one supported action. + + :param state: graph state + :param raw_plan: LLM-produced plan + :return: normalized deterministic plan + """ + audit_report = state["missingness_report"] + missing_cols = [ + item + for item in audit_report["value_missingness_by_column"] + if item["n_missing"] > 0 and item["col"] != state["primary_key"] + ] + eligible_by_col = { + item["col"]: set(item["eligible_strategies"]) + for item in missing_cols + } + plan_by_col = {} + defaulted_cols = 0 + for item in raw_plan.get("actions") or []: + col = str(item.get("col") or "") + if col not in eligible_by_col: + continue + strategy = str(item.get("strategy") or "leave_as_nan") + if strategy not in eligible_by_col[col]: + strategy = "leave_as_nan" + plan_by_col[col] = { + "col": col, + "strategy": strategy, + "create_missingness_flag": bool(item.get("create_missingness_flag", True)), + "reason": str(item.get("reason") or ""), + } + normalized_actions = [] + for item in missing_cols: + col = item["col"] + action = plan_by_col.get( + col, + { + "col": col, + "strategy": "leave_as_nan", + "create_missingness_flag": True, + "reason": "Defaulted conservatively because no valid explicit plan was provided.", + }, + ) + normalized_actions.append(action) + if col not in plan_by_col: + defaulted_cols += 1 + return { + "summary": _build_missingness_plan_summary( + normalized_actions, + defaulted_cols=defaulted_cols, + ), + "actions": normalized_actions, + } + + +def choose_missingness_plan(state: CompositeState) -> dict: + """ + Choose bounded missingness actions using deterministic evidence. 
+ + :param state: graph state + :return: normalized missingness plan + """ + missing_cols = [ + item + for item in state["missingness_report"]["value_missingness_by_column"] + if item["n_missing"] > 0 and item["col"] != state["primary_key"] + ] + if not missing_cols: + payload = { + "missingness_plan": { + "summary": "No non-time columns contain missing values requiring handling.", + "actions": [], + } + } + return payload + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are a missingness planner for a time-series EDA backend. " + "Choose exactly one bounded strategy per column with missing values. " + "Allowed strategies are leave_as_nan, forward_fill, interpolate, " + "zero_fill, and drop_rows. Prefer conservative choices when the " + "evidence is weak. Use zero_fill only for true count-like variables " + "where structural zeros are plausible. Use interpolate only for " + "numeric columns. Use forward_fill for stateful or slowly varying " + "features when continuity is plausible. Missing timestamps are a " + "separate issue from missing cell values; do not pretend that a cell " + "imputation solves timestamp holes." 
+ ), + response_format=MissingnessPlanOutput, + ) + evidence = { + "series_type": state["type"], + "expected_frequency": state["expected_frequency"], + "is_irregular_sampling": state["is_irregular_sampling"], + "timestamp_missingness_summary": state["missingness_report"]["timestamp_missingness_summary"], + "columns_with_missing_values": missing_cols, + "numeric_continuous_cols": state["numeric_continuous_cols"], + "numeric_count_cols": state["numeric_count_cols"], + "binary_flag_cols": state["binary_flag_cols"], + "categorical_feature_cols": state["categorical_feature_cols"], + } + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Plan missingness handling from this evidence: {evidence}" + ) + ] + } + ) + raw_plan = out["structured_response"].model_dump() + normalized_plan = _normalize_missingness_plan(state, raw_plan) + payload = {"missingness_plan": normalized_plan} + return payload + + +def apply_missingness_plan(state: CompositeState) -> dict: + """ + Apply the chosen missingness plan deterministically. 
+ + :param state: graph state + :return: handling report and output dataset path + """ + handling_report = tinptool.apply_missingness_actions.invoke( + { + "source_path": state["path"], + "input_path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "winner_formatter": state["winner_formatter"], + "actions": state["missingness_plan"]["actions"], + } + ) + trace_payload = { + "missingness_plan": state["missingness_plan"], + "missingness_handling_report": handling_report, + } + tinptool.write_stage_trace(state["path"], "handle_missingness", trace_payload) + payload = { + "missingness_handling_report": handling_report, + "quality_dataset_path": handling_report["output_path"], + } + return payload + + +missingness_handling = lgraph.StateGraph(CompositeState) +missingness_handling.add_node("audit_missingness_pipeline", call_audit_missingness) +missingness_handling.add_node("choose_missingness_plan", choose_missingness_plan) +missingness_handling.add_node("apply_missingness_plan", apply_missingness_plan) +missingness_handling.add_edge(lgraph.START, "audit_missingness_pipeline") +missingness_handling.add_edge("audit_missingness_pipeline", "choose_missingness_plan") +missingness_handling.add_edge("choose_missingness_plan", "apply_missingness_plan") +missingness_handling.add_edge("apply_missingness_plan", lgraph.END) +graph = missingness_handling.compile() + + +def run_handle_missingness(path: str) -> dict: + """ + Execute missingness handling end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + "missingness_report": {}, + "missingness_plan": {}, + "missingness_handling_report": {}, + "quality_dataset_path": "", + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Missingness handling output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. 
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_handle_missingness(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/standardize.py b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/standardize.py new file mode 100644 index 000000000..0dab99163 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/standardize.py @@ -0,0 +1,488 @@ +""" +Import as: + +import src.quality_handling.standardize as sstandard +""" + +from __future__ import annotations + +import argparse +import logging +from typing import Literal +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pydantic + +import src.config.config as cconf +import src.quality_handling.handle_missingness as shandlemiss +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +def _build_standardization_plan_summary(actions: list[dict], *, defaulted_cols: int) -> str: + """ + Build a summary from the normalized standardization actions. + + :param actions: normalized action list + :param defaulted_cols: number of columns defaulted during normalization + :return: summary text aligned with the final plan + """ + if not actions: + return "No numeric candidate columns were selected for optional standardization." + counts: dict[str, int] = {} + for action in actions: + transform = str(action["action"]) + counts[transform] = counts.get(transform, 0) + 1 + ordered_counts = ", ".join( + f"{transform}={counts[transform]}" + for transform in sorted(counts) + ) + summary = ( + f"Normalized standardization plan for {len(actions)} columns: {ordered_counts}. 
" + "This summary reflects the final validated transform choices, not the raw LLM prose." + ) + if defaulted_cols > 0: + summary += f" {defaulted_cols} columns defaulted conservatively to `none`." + return summary + + +class StandardizationDecision(pydantic.BaseModel): + """ + Store one bounded standardization decision. + """ + + col: str + action: Literal["none", "robust_scale", "log1p", "log1p_then_robust_scale"] + reason: str + + +class StandardizationPlanOutput(pydantic.BaseModel): + """ + Store LLM-produced standardization plan. + """ + + summary: str + actions: list[StandardizationDecision] + + +class StandardizationGateOutput(pydantic.BaseModel): + """ + Store the dataset-level standardization gate decision. + """ + + should_standardize: bool + reason: str + + +class CompositeState(TypedDict): + """ + Store graph state for optional standardization. + """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + missingness_plan: dict + 
missingness_handling_report: dict + quality_dataset_path: str + standardization_profile: dict + standardization_gate: dict + standardization_plan: dict + standardization_report: dict + standardized_dataset_path: str + + +def call_handle_missingness(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to missingness handling. + + :param state: graph state + :return: composite payload from handle_missingness + """ + payload = shandlemiss.run_handle_missingness(state["path"]) + return payload + + +def profile_standardization(state: CompositeState) -> dict: + """ + Profile numeric feature scale and tail behavior deterministically. + + :param state: graph state + :return: scale profile report + """ + input_path = state["quality_dataset_path"] or state["path"] + profile = tinptool.profile_standardization_candidates.invoke( + { + "path": input_path, + "numeric_continuous_cols": state["numeric_continuous_cols"], + "numeric_count_cols": state["numeric_count_cols"], + "binary_flag_cols": state["binary_flag_cols"], + } + ) + payload = {"standardization_profile": profile} + return payload + + +def choose_standardization_gate(state: CompositeState) -> dict: + """ + Decide whether optional standardization should run at all. + + :param state: graph state + :return: dataset-level gate decision + """ + per_column = state["standardization_profile"].get("per_column") or [] + if not per_column: + return { + "standardization_gate": { + "should_standardize": False, + "reason": "No numeric candidate columns were available for optional standardization.", + } + } + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are the gatekeeper for point 9 in a time-series EDA backend. " + "Decide whether optional standardization should run at all for this dataset. " + "Favor should_standardize=false unless there is strong evidence that rescaling " + "or log-scaling is genuinely useful. 
Favor false for raw exploratory analysis, " + "for SCADA or sensor-style datasets where physical units matter, and for cases " + "where leaving values untouched preserves interpretability. Favor true only when " + "scale disparities or heavy tails are severe enough that not transforming would " + "materially hinder comparison or downstream modeling." + ), + response_format=StandardizationGateOutput, + ) + evidence = { + "series_type": state["type"], + "numeric_continuous_cols": state["numeric_continuous_cols"], + "numeric_count_cols": state["numeric_count_cols"], + "binary_flag_cols": state["binary_flag_cols"], + "scale_summary": state["standardization_profile"].get("scale_summary"), + "sample_profiles": per_column[:20], + } + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Decide whether optional standardization should run from this evidence: {evidence}" + ) + ] + } + ) + gate = out["structured_response"].model_dump() + return {"standardization_gate": gate} + + +def _normalize_standardization_plan(state: CompositeState, raw_plan: dict) -> dict: + """ + Ensure every candidate column gets a supported transform decision. 
+ + :param state: graph state + :param raw_plan: LLM-produced plan + :return: normalized plan + """ + per_column = state["standardization_profile"].get("per_column") or [] + eligible_by_col = { + item["col"]: set(item["eligible_actions"]) + for item in per_column + } + plan_by_col = {} + defaulted_cols = 0 + for item in raw_plan.get("actions") or []: + col = str(item.get("col") or "") + if col not in eligible_by_col: + continue + action = str(item.get("action") or "none") + if action not in eligible_by_col[col]: + action = "none" + plan_by_col[col] = { + "col": col, + "action": action, + "reason": str(item.get("reason") or ""), + } + normalized_actions = [] + for item in per_column: + col = item["col"] + if col not in plan_by_col: + defaulted_cols += 1 + normalized_actions.append( + plan_by_col.get( + col, + { + "col": col, + "action": "none", + "reason": "Defaulted conservatively because no valid transform was selected.", + }, + ) + ) + return { + "summary": _build_standardization_plan_summary( + normalized_actions, + defaulted_cols=defaulted_cols, + ), + "actions": normalized_actions, + } + + +def choose_standardization_plan(state: CompositeState) -> dict: + """ + Choose whether optional standardization is justified. + + :param state: graph state + :return: normalized standardization plan + """ + gate = state.get("standardization_gate") or {} + if not bool(gate.get("should_standardize")): + payload = { + "standardization_plan": { + "summary": ( + "Dataset-level standardization gate returned `no`. 
" + f"Reason: {str(gate.get('reason') or 'No reason provided.')}" + ), + "actions": [], + } + } + return payload + per_column = state["standardization_profile"].get("per_column") or [] + if not per_column: + payload = { + "standardization_plan": { + "summary": "No numeric candidate columns were available for optional standardization.", + "actions": [], + } + } + return payload + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are an optional standardization planner for a time-series EDA backend. " + "This stage is optional. Use action none unless there is a concrete reason " + "to transform a feature. Allowed actions are none, robust_scale, log1p, " + "and log1p_then_robust_scale. Favor none when evidence is weak. Favor " + "robust_scale for large cross-feature scale disparities. Favor log1p for " + "strongly right-skewed nonnegative features. Never invent new actions." + ), + response_format=StandardizationPlanOutput, + ) + evidence = { + "series_type": state["type"], + "scale_summary": state["standardization_profile"].get("scale_summary"), + "per_column": per_column, + } + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Choose optional standardization actions from this evidence: {evidence}" + ) + ] + } + ) + raw_plan = out["structured_response"].model_dump() + normalized_plan = _normalize_standardization_plan(state, raw_plan) + payload = {"standardization_plan": normalized_plan} + return payload + + +def apply_standardization_plan(state: CompositeState) -> dict: + """ + Apply the chosen standardization plan deterministically. 
+ + :param state: graph state + :return: transformation report and output path + """ + input_path = state["quality_dataset_path"] or state["path"] + if not state["standardization_plan"]["actions"]: + report = { + "input_path": input_path, + "output_path": input_path, + "skipped": True, + "reason": state["standardization_plan"]["summary"], + "actions_applied": [], + } + trace_payload = { + "input_path": input_path, + "standardization_profile": state["standardization_profile"], + "standardization_gate": state.get("standardization_gate") or {}, + "standardization_plan": state["standardization_plan"], + "standardization_report": report, + } + tinptool.write_stage_trace(state["path"], "standardize", trace_payload) + payload = { + "standardization_report": report, + "standardized_dataset_path": input_path, + } + return payload + report = tinptool.apply_standardization_actions.invoke( + { + "source_path": state["path"], + "input_path": input_path, + "actions": state["standardization_plan"]["actions"], + } + ) + trace_payload = { + "input_path": input_path, + "standardization_profile": state["standardization_profile"], + "standardization_gate": state.get("standardization_gate") or {}, + "standardization_plan": state["standardization_plan"], + "standardization_report": report, + } + tinptool.write_stage_trace(state["path"], "standardize", trace_payload) + payload = { + "standardization_report": report, + "standardized_dataset_path": report["output_path"], + } + return payload + + +standardization = lgraph.StateGraph(CompositeState) +standardization.add_node("handle_missingness_pipeline", call_handle_missingness) +standardization.add_node("profile_standardization", profile_standardization) +standardization.add_node("choose_standardization_gate", choose_standardization_gate) +standardization.add_node("choose_standardization_plan", choose_standardization_plan) +standardization.add_node("apply_standardization_plan", apply_standardization_plan) 
# Wire the pipeline as a linear chain: START -> each stage in order -> END.
_stage_sequence = [
    "handle_missingness_pipeline",
    "profile_standardization",
    "choose_standardization_gate",
    "choose_standardization_plan",
    "apply_standardization_plan",
]
for _prev, _next in zip([lgraph.START, *_stage_sequence], [*_stage_sequence, lgraph.END]):
    standardization.add_edge(_prev, _next)
graph = standardization.compile()


def run_standardize(path: str) -> dict:
    """
    Execute optional standardization end to end.

    Builds a fully initialized composite state (every field present with a
    neutral default), invokes the compiled graph, and logs the payload.

    :param path: dataset path
    :return: full composite graph payload
    """
    # Group state fields by their neutral default instead of spelling each
    # entry out; the resulting dict carries exactly the same keys/values.
    list_fields = (
        "done", "cols", "temporal_cols", "numeric_val_cols",
        "categorical_val_cols", "bad_rows", "candidates", "secondary_keys",
        "numeric_cols", "nonnegative_cols", "numeric_continuous_cols",
        "numeric_count_cols", "binary_flag_cols", "categorical_feature_cols",
        "known_exogenous_cols", "target_cols", "covariate_cols",
        "coverage_per_entity",
    )
    dict_fields = (
        "metadata", "winner_formatter", "report", "coverage_summary",
        "missingness_report", "missingness_plan",
        "missingness_handling_report", "standardization_profile",
        "standardization_gate", "standardization_plan",
        "standardization_report",
    )
    text_fields = (
        "error", "info", "time_col", "summary", "flag", "type",
        "primary_key", "resampling_decision", "quality_dataset_path",
        "standardized_dataset_path",
    )
    none_fields = (
        "entity_col", "min_time", "max_time", "typical_delta_mode",
        "typical_delta_median", "expected_frequency",
    )
    init_state: CompositeState = {
        "path": path,
        "has_header": True,
        "has_missing_values": False,
        "jump_mult": 20.0,
        "n_nat_time": 0,
        "dominant_frequency_fraction": 0.0,
        "is_irregular_sampling": False,
        **{field: [] for field in list_fields},
        **{field: {} for field in dict_fields},
        **{field: "" for field in text_fields},
        **{field: None for field in none_fields},
    }
    payload: CompositeState = graph.invoke(init_state)
    _LOG.info("Standardization output: %s", payload)
    return payload


def _parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments.

    :return: parsed arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", required=True, help="Path to dataset file.")
    return parser.parse_args()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run_standardize(_parse_args().path)
@@ -211,10 +223,8 @@ def write_stage_trace(path: str, stage: str, payload: dict[str, Any]) -> str: :return: absolute trace file path """ dataset_path = pathlib.Path(path) - trace_root = pathlib.Path(__file__).resolve().parents[1] / "traces" - trace_root.mkdir(parents=True, exist_ok=True) filename = f"{dataset_path.stem}.{stage}.json" - trace_path = trace_root / filename + trace_path = _trace_root() / filename trace_payload = { "dataset_path": str(dataset_path), "stage": stage, @@ -227,6 +237,43 @@ def write_stage_trace(path: str, stage: str, payload: dict[str, Any]) -> str: return str(trace_path) +def write_stage_dataset(path: str, stage: str, dataset: pd.DataFrame) -> str: + """ + Persist a stage-produced dataset artifact alongside trace files. + + :param path: source dataset path + :param stage: pipeline stage name + :param dataset: dataframe to serialize + :return: absolute output dataset path + """ + dataset_path = pathlib.Path(path) + filename = f"{dataset_path.stem}.{stage}.csv" + output_path = _trace_root() / filename + dataset.to_csv(output_path, index=False) + return str(output_path) + + +def write_stage_plot(path: str, stage: str, plot_name: str, fig: Any) -> str: + """ + Persist a stage-produced plot under the backend trace directory. 
+ + :param path: source dataset path + :param stage: pipeline stage name + :param plot_name: plot-specific filename stem + :param fig: matplotlib figure + :return: absolute output plot path + """ + dataset_path = pathlib.Path(path) + plot_dir = _trace_root() / f"{dataset_path.stem}.{stage}" + plot_dir.mkdir(parents=True, exist_ok=True) + safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", plot_name).strip("_") + if not safe_name: + safe_name = "plot" + output_path = plot_dir / f"{safe_name}.png" + fig.savefig(output_path, dpi=140, bbox_inches="tight") + return str(output_path) + + def _parse_time_series( dataset: pd.DataFrame, time_col: str, @@ -289,6 +336,292 @@ def _series_identifier(keys: list[str], values: tuple[Any, ...]) -> dict[str, An return {key: value for key, value in zip(keys, values, strict=True)} +def _ordered_dataset( + dataset: pd.DataFrame, + time_col: str, + secondary_keys: list[str] | None = None, + winner_formatter: dict[str, Any] | None = None, +) -> pd.DataFrame: + """ + Return a stable, time-aware ordering for sequential quality operations. 
+ + :param dataset: input dataframe + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param winner_formatter: optional datetime parsing kwargs + :return: ordered dataframe with helper columns + """ + ordered = dataset.copy() + ordered["_row_order"] = range(int(ordered.shape[0])) + if time_col in ordered.columns: + ordered["_ts"] = _parse_time_series(ordered, time_col, winner_formatter) + else: + ordered["_ts"] = pd.NaT + valid_secondary_keys = [ + key + for key in (secondary_keys or []) + if key in ordered.columns and key != time_col + ] + sort_cols = list(valid_secondary_keys) + if ordered["_ts"].notna().any(): + sort_cols.append("_ts") + sort_cols.append("_row_order") + ordered = ordered.sort_values(sort_cols, na_position="last").reset_index(drop=True) + return ordered + + +def _iter_series_frames( + dataset: pd.DataFrame, + secondary_keys: list[str] | None = None, +) -> list[tuple[dict[str, Any] | None, pd.DataFrame]]: + """ + Yield one frame per inferred series. + + :param dataset: ordered dataframe + :param secondary_keys: optional entity keys + :return: list of entity/frame pairs + """ + valid_secondary_keys = [ + key for key in (secondary_keys or []) if key in dataset.columns + ] + if not valid_secondary_keys: + return [(None, dataset)] + items: list[tuple[dict[str, Any] | None, pd.DataFrame]] = [] + grouped = dataset.groupby(valid_secondary_keys, dropna=False, sort=False) + for raw_key, frame in grouped: + key_tuple = raw_key if isinstance(raw_key, tuple) else (raw_key,) + items.append((_series_identifier(valid_secondary_keys, key_tuple), frame)) + return items + + +def _mask_run_lengths(mask: pd.Series) -> list[int]: + """ + Return lengths of consecutive true runs in a boolean mask. 
+ + :param mask: boolean mask + :return: run lengths + """ + run_lengths: list[int] = [] + current = 0 + for is_true in mask.fillna(False).astype(bool).tolist(): + if is_true: + current += 1 + elif current > 0: + run_lengths.append(current) + current = 0 + if current > 0: + run_lengths.append(current) + return run_lengths + + +def _safe_float(value: Any) -> float | None: + """ + Convert a numeric-like value into a JSON-friendly float. + + :param value: input value + :return: float or None + """ + if value is None or pd.isna(value): + return None + return float(value) + + +def _candidate_univariate_numeric_cols( + dataset: pd.DataFrame, + *, + time_col: str, + secondary_keys: list[str] | None = None, + numeric_continuous_cols: list[str] | None = None, + numeric_count_cols: list[str] | None = None, + binary_flag_cols: list[str] | None = None, +) -> list[str]: + """ + Return deterministic numeric columns suitable for univariate analysis. + + :param dataset: input dataframe + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param numeric_continuous_cols: inferred continuous numeric columns + :param numeric_count_cols: inferred count columns + :param binary_flag_cols: inferred binary columns + :return: ordered numeric analysis columns + """ + excluded = {time_col, *(secondary_keys or [])} + candidates = list( + dict.fromkeys( + [ + *[col for col in (numeric_continuous_cols or []) if col in dataset.columns], + *[col for col in (numeric_count_cols or []) if col in dataset.columns], + *[col for col in (binary_flag_cols or []) if col in dataset.columns], + ] + ) + ) + if not candidates: + candidates = [ + str(col) + for col in dataset.columns + if str(col) not in excluded and pd.to_numeric(dataset[col], errors="coerce").notna().any() + ] + return [col for col in candidates if col not in excluded] + + +def _tail_ratio(series: pd.Series) -> float | None: + """ + Compute a simple deterministic tail ratio. 
+ + :param series: numeric series + :return: tail ratio or None + """ + valid = pd.to_numeric(series, errors="coerce").dropna() + if valid.empty: + return None + p50 = valid.quantile(0.50) + p99 = valid.quantile(0.99) + if pd.isna(p50) or pd.isna(p99): + return None + if float(abs(p50)) <= 1e-12: + return None if float(abs(p99)) <= 1e-12 else float(abs(p99)) + return float(abs(p99) / abs(p50)) + + +def _univariate_summary(series: pd.Series) -> dict[str, Any]: + """ + Compute deterministic univariate summary statistics. + + :param series: numeric-like series + :return: summary stats + """ + numeric = pd.to_numeric(series, errors="coerce") + valid = numeric.dropna() + n_total = int(series.shape[0]) + n_non_null = int(valid.shape[0]) + n_missing = max(0, n_total - n_non_null) + if valid.empty: + return { + "n_total": n_total, + "n_non_null": 0, + "n_missing": n_missing, + "missing_pct": None if n_total == 0 else float(100.0 * n_missing / n_total), + "n_unique": 0, + "mean": None, + "std": None, + "min": None, + "p01": None, + "p05": None, + "p25": None, + "p50": None, + "p75": None, + "p95": None, + "p99": None, + "max": None, + "iqr": None, + "zero_fraction": None, + "skew": None, + "kurtosis": None, + "tail_ratio_p99_p50": None, + } + q = valid.quantile([0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]) + return { + "n_total": n_total, + "n_non_null": n_non_null, + "n_missing": n_missing, + "missing_pct": None if n_total == 0 else float(100.0 * n_missing / n_total), + "n_unique": int(valid.nunique(dropna=True)), + "mean": _safe_float(valid.mean()), + "std": _safe_float(valid.std()), + "min": _safe_float(valid.min()), + "p01": _safe_float(q.loc[0.01]), + "p05": _safe_float(q.loc[0.05]), + "p25": _safe_float(q.loc[0.25]), + "p50": _safe_float(q.loc[0.50]), + "p75": _safe_float(q.loc[0.75]), + "p95": _safe_float(q.loc[0.95]), + "p99": _safe_float(q.loc[0.99]), + "max": _safe_float(valid.max()), + "iqr": _safe_float(q.loc[0.75] - q.loc[0.25]), + "zero_fraction": float((valid 
== 0).mean()), + "skew": _safe_float(valid.skew()), + "kurtosis": _safe_float(valid.kurt()), + "tail_ratio_p99_p50": _tail_ratio(valid), + } + + +def _gaussian_kde_curve(series: pd.Series, *, n_points: int = 256) -> tuple[np.ndarray, np.ndarray] | None: + """ + Compute a simple Gaussian KDE curve without scipy. + + :param series: numeric series + :param n_points: number of evaluation points + :return: x/y arrays or None when KDE is not appropriate + """ + valid = pd.to_numeric(series, errors="coerce").dropna().to_numpy(dtype=float) + if valid.size < 30: + return None + unique = np.unique(valid) + if unique.size < 10: + return None + std = float(np.std(valid, ddof=1)) + iqr = float(np.subtract(*np.percentile(valid, [75, 25]))) + scale = min(std, iqr / 1.34) if iqr > 0.0 else std + if not np.isfinite(scale) or scale <= 0.0: + return None + bandwidth = 0.9 * scale * (valid.size ** (-1.0 / 5.0)) + if not np.isfinite(bandwidth) or bandwidth <= 0.0: + return None + x_grid = np.linspace(float(valid.min()), float(valid.max()), n_points) + diffs = (x_grid[:, None] - valid[None, :]) / bandwidth + density = np.exp(-0.5 * diffs**2).sum(axis=1) + density /= float(valid.size * bandwidth * np.sqrt(2.0 * np.pi)) + return x_grid, density + + +def _transform_candidates(series: pd.Series) -> dict[str, pd.Series]: + """ + Build deterministic transform candidates for one numeric series. 
+ + :param series: numeric series + :return: map of candidate name to transformed series + """ + numeric = pd.to_numeric(series, errors="coerce") + candidates: dict[str, pd.Series] = {"none": numeric} + valid = numeric.dropna() + if valid.empty: + return candidates + candidates["cuberoot"] = numeric.apply( + lambda value: np.cbrt(value) if pd.notna(value) else value + ) + if float(valid.min()) >= 0.0: + candidates["sqrt"] = numeric.apply( + lambda value: np.sqrt(value) if pd.notna(value) else value + ) + candidates["log1p"] = pd.Series(np.log1p(numeric), index=numeric.index) + return candidates + + +def _transform_score(series: pd.Series) -> dict[str, Any]: + """ + Score one transformed series using deterministic shape criteria. + + :param series: transformed numeric series + :return: score details + """ + summary = _univariate_summary(series) + valid = pd.to_numeric(series, errors="coerce").dropna() + if valid.empty: + return { + "summary": summary, + "score": None, + } + abs_skew = abs(float(summary["skew"])) if summary["skew"] is not None else 99.0 + abs_kurtosis = abs(float(summary["kurtosis"])) if summary["kurtosis"] is not None else 99.0 + tail_ratio = float(summary["tail_ratio_p99_p50"]) if summary["tail_ratio_p99_p50"] is not None else 99.0 + score = float(abs_skew + 0.25 * abs_kurtosis + 0.10 * tail_ratio) + return { + "summary": summary, + "score": score, + } + + class _TemporalStatsArgs(pydantic.BaseModel): """ Store arguments for deterministic temporal statistics. @@ -466,6 +799,849 @@ def compute_temporal_stats( } +class _MissingnessAuditArgs(pydantic.BaseModel): + """ + Store arguments for deterministic missingness auditing. 
+ """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + time_col: str + secondary_keys: list[str] | None = None + winner_formatter: dict[str, Any] | None = None + + +@ltools.tool(args_schema=_MissingnessAuditArgs) +def audit_missingness( + path: str, + time_col: str, + secondary_keys: list[str] | None = None, + winner_formatter: dict[str, Any] | None = None, +) -> dict: + """ + Audit missingness as two distinct problems: missing values and missing + timestamps. + + Theory: + Missing cells inside observed rows and missing timestamps in the implied + time grid are different failure modes. Value missingness tells us which + variables are incomplete at observed observation times. Timestamp + missingness tells us whether observations are absent from the expected + sampling cadence. The former guides imputation choices per feature; the + latter guides reindexing, coverage assessment, and gap-aware modeling. + + :param path: dataset path + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param winner_formatter: optional datetime parsing kwargs + :return: missingness audit payload + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + ordered = _ordered_dataset( + dataset, + time_col=time_col, + secondary_keys=secondary_keys, + winner_formatter=winner_formatter, + ) + temporal_report = compute_temporal_stats.invoke( + { + "path": path, + "time_col": time_col, + "secondary_keys": secondary_keys or [], + "winner_formatter": winner_formatter or {}, + } + ) + profiles = _build_column_profiles(dataset) + value_missingness_by_column: list[dict[str, Any]] = [] + total_missing_cells = int(dataset.isna().sum().sum()) + all_series_frames = _iter_series_frames(ordered, secondary_keys) + for col in [str(value) for value in dataset.columns]: + missing_mask = dataset[col].isna() + n_missing = int(missing_mask.sum()) + missing_pct = 0.0 if dataset.empty else float(100.0 * n_missing / len(dataset)) 
+ run_lengths: list[int] = [] + for _, frame in all_series_frames: + frame_run_lengths = _mask_run_lengths(frame[col].isna()) + run_lengths.extend(frame_run_lengths) + eligible_strategies = ["leave_as_nan", "drop_rows"] + profile = profiles[col] + if n_missing > 0 and col != time_col: + eligible_strategies.append("forward_fill") + if profile["is_numeric_like"]: + eligible_strategies.append("interpolate") + if ( + profile["is_numeric_like"] + and profile["is_integer_like"] + and profile["is_nonnegative_like"] + ): + eligible_strategies.append("zero_fill") + value_missingness_by_column.append( + { + "col": col, + "dtype": profile["dtype"], + "n_missing": n_missing, + "missing_pct": missing_pct, + "n_missing_runs": int(len(run_lengths)), + "longest_missing_run": int(max(run_lengths, default=0)), + "eligible_strategies": eligible_strategies, + "sample_values": profile["sample_values"], + } + ) + value_missingness_by_column.sort( + key=lambda item: (item["n_missing"], item["longest_missing_run"]), + reverse=True, + ) + worst_value_col = next( + (item for item in value_missingness_by_column if item["n_missing"] > 0), + None, + ) + timestamp_missingness_by_entity: list[dict[str, Any]] = [] + total_expected_timestamps = 0 + total_observed_timestamps = 0 + total_missing_timestamps = 0 + n_series_with_timestamp_gaps = 0 + for item in temporal_report["coverage_per_entity"]: + n_observed = int(item.get("n_observed_timestamps") or 0) + n_expected = int(item.get("n_expected_timestamps") or n_observed) + n_missing_timestamps = max(0, n_expected - n_observed) + total_expected_timestamps += n_expected + total_observed_timestamps += n_observed + total_missing_timestamps += n_missing_timestamps + if n_missing_timestamps > 0: + n_series_with_timestamp_gaps += 1 + timestamp_missingness_by_entity.append( + { + "entity": item.get("entity"), + "n_observed_timestamps": n_observed, + "n_expected_timestamps": n_expected, + "n_missing_timestamps": n_missing_timestamps, + "coverage_pct": 
item.get("coverage_pct"), + "n_gaps": int(item.get("n_gaps") or 0), + "longest_gap": item.get("longest_gap"), + } + ) + timestamp_missingness_by_entity.sort( + key=lambda item: ( + item["n_missing_timestamps"], + item["n_gaps"], + item["coverage_pct"] if item["coverage_pct"] is not None else -1.0, + ), + reverse=True, + ) + return { + "time_col": time_col, + "secondary_keys": [ + key for key in (secondary_keys or []) if key in dataset.columns and key != time_col + ], + "n_rows": int(dataset.shape[0]), + "n_cols": int(dataset.shape[1]), + "value_missingness_summary": { + "total_missing_cells": total_missing_cells, + "total_missing_fraction": ( + 0.0 + if dataset.empty + else float(100.0 * total_missing_cells / max(1, int(dataset.size))) + ), + "columns_with_missing_values": int(sum(item["n_missing"] > 0 for item in value_missingness_by_column)), + "worst_column": None if worst_value_col is None else worst_value_col["col"], + "worst_column_missing_pct": ( + None if worst_value_col is None else worst_value_col["missing_pct"] + ), + }, + "value_missingness_by_column": value_missingness_by_column, + "timestamp_missingness_summary": { + "expected_frequency": temporal_report["expected_frequency"], + "is_irregular_sampling": temporal_report["is_irregular_sampling"], + "resampling_decision": temporal_report["resampling_decision"], + "n_nat_time": temporal_report["n_nat_time"], + "total_expected_timestamps": total_expected_timestamps, + "total_observed_timestamps": total_observed_timestamps, + "total_missing_timestamps": total_missing_timestamps, + "n_series_with_timestamp_gaps": n_series_with_timestamp_gaps, + }, + "timestamp_missingness_by_entity": timestamp_missingness_by_entity, + "column_profiles": profiles, + } + + +class MissingnessActionSpec(pydantic.BaseModel): + """ + Store one bounded missingness action. 
+ """ + + model_config = pydantic.ConfigDict(extra="forbid") + col: str + strategy: str + create_missingness_flag: bool = True + reason: str = "" + + +class _ApplyMissingnessActionsArgs(pydantic.BaseModel): + """ + Store arguments for deterministic missingness handling. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + source_path: str + input_path: str + time_col: str + secondary_keys: list[str] | None = None + winner_formatter: dict[str, Any] | None = None + actions: list[MissingnessActionSpec] + + +@ltools.tool(args_schema=_ApplyMissingnessActionsArgs) +def apply_missingness_actions( + source_path: str, + input_path: str, + time_col: str, + secondary_keys: list[str] | None = None, + winner_formatter: dict[str, Any] | None = None, + actions: list[MissingnessActionSpec] | None = None, +) -> dict: + """ + Apply one bounded missingness strategy per selected column. + + Theory: + The policy choice for each column may be ambiguous, but the mechanics of + applying a chosen action should be deterministic and reproducible. By + sorting within entity/time order, optionally adding missingness flags, and + then applying simple bounded transforms, the stage can record exactly what + changed without allowing the LLM to mutate data directly. 
@ltools.tool(args_schema=_ApplyMissingnessActionsArgs)
def apply_missingness_actions(
    source_path: str,
    input_path: str,
    time_col: str,
    secondary_keys: list[str] | None = None,
    winner_formatter: dict[str, Any] | None = None,
    actions: list[MissingnessActionSpec] | None = None,
) -> dict:
    """
    Apply one bounded missingness strategy per selected column.

    Theory:
    The policy choice for each column may be ambiguous, but the mechanics of
    applying a chosen action should be deterministic and reproducible. By
    sorting within entity/time order, optionally adding missingness flags, and
    then applying simple bounded transforms, the stage can record exactly what
    changed without allowing the LLM to mutate data directly.

    :param source_path: original dataset path used for artifact naming
    :param input_path: dataset path to transform
    :param time_col: selected time column
    :param secondary_keys: optional entity key columns
    :param winner_formatter: optional datetime parsing kwargs
    :param actions: bounded per-column action plan
    :return: transformation report with output dataset path
    """
    dataset = load_dataset(pathlib.Path(input_path))
    # Order rows by entity/time first so fills and interpolation respect
    # series boundaries and chronology.
    working = _ordered_dataset(
        dataset,
        time_col=time_col,
        secondary_keys=secondary_keys,
        winner_formatter=winner_formatter,
    )
    valid_secondary_keys = [
        key
        for key in (secondary_keys or [])
        if key in working.columns and key != time_col
    ]
    # Accept either validated pydantic specs or plain dicts.
    action_items = [item.model_dump() if isinstance(item, pydantic.BaseModel) else item for item in (actions or [])]
    # Row drops are accumulated and applied once, after all column actions.
    drop_mask = pd.Series(False, index=working.index)
    applied_actions: list[dict[str, Any]] = []
    for action in action_items:
        col = str(action["col"])
        strategy = str(action["strategy"])
        create_missingness_flag = bool(action.get("create_missingness_flag", True))
        reason = str(action.get("reason") or "")
        if col not in working.columns:
            applied_actions.append(
                {
                    "col": col,
                    "strategy": strategy,
                    "status": "skipped_missing_column",
                    "reason": reason,
                }
            )
            continue
        before_mask = working[col].isna()
        n_missing_before = int(before_mask.sum())
        # Flag BEFORE filling so the indicator records original missingness.
        # NOTE(review): flags are added even for drop_rows; the flagged rows
        # are then removed below — confirm that is intended.
        if create_missingness_flag and n_missing_before > 0:
            flag_col = f"{col}__was_missing"
            if flag_col not in working.columns:
                working[flag_col] = before_mask.astype(int)
        status = "applied"
        if strategy == "leave_as_nan":
            pass
        elif strategy == "drop_rows":
            drop_mask = drop_mask | before_mask
        elif strategy == "forward_fill":
            # Group-wise ffill prevents values leaking across entities.
            if valid_secondary_keys:
                working[col] = working.groupby(valid_secondary_keys, dropna=False)[col].ffill()
            else:
                working[col] = working[col].ffill()
        elif strategy == "interpolate":
            # Coerces non-numeric values to NaN before interpolating;
            # limit_area="inside" never extrapolates beyond observed values.
            numeric = pd.to_numeric(working[col], errors="coerce")
            if valid_secondary_keys:
                working[col] = working.groupby(valid_secondary_keys, dropna=False)[numeric.name].transform(
                    lambda series: pd.to_numeric(series, errors="coerce").interpolate(
                        limit_area="inside"
                    )
                )
            else:
                working[col] = numeric.interpolate(limit_area="inside")
        elif strategy == "zero_fill":
            numeric = pd.to_numeric(working[col], errors="coerce")
            working[col] = numeric.fillna(0.0)
        else:
            status = "skipped_unsupported_strategy"
        # Guard mirrors the membership check above; col is present here.
        n_missing_after = int(working[col].isna().sum()) if col in working.columns else None
        applied_actions.append(
            {
                "col": col,
                "strategy": strategy,
                "status": status,
                "reason": reason,
                "create_missingness_flag": create_missingness_flag,
                "n_missing_before": n_missing_before,
                "n_missing_after": n_missing_after,
                "n_values_filled": None if n_missing_after is None else max(0, n_missing_before - n_missing_after),
                "n_rows_marked_for_drop": int(before_mask.sum()) if strategy == "drop_rows" else 0,
            }
        )
    # Apply the accumulated drops in one pass.
    n_rows_before = int(working.shape[0])
    if bool(drop_mask.any()):
        working = working.loc[~drop_mask].copy()
    n_rows_after = int(working.shape[0])
    n_rows_dropped = max(0, n_rows_before - n_rows_after)
    # Helper columns (underscore-prefixed) are excluded from the report.
    remaining_missing_by_column = {
        str(col): int(working[col].isna().sum())
        for col in working.columns
        if not str(col).startswith("_")
    }
    output_dataset = working.drop(columns=["_ts", "_row_order"], errors="ignore")
    output_path = write_stage_dataset(source_path, "handle_missingness", output_dataset)
    return {
        "input_path": input_path,
        "output_path": output_path,
        "n_rows_before": n_rows_before,
        "n_rows_after": n_rows_after,
        "n_rows_dropped": n_rows_dropped,
        "actions_applied": applied_actions,
        "remaining_missing_by_column": remaining_missing_by_column,
        "sorted_by": valid_secondary_keys + (["_ts"] if "_ts" in working.columns else []),
    }
+ """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + numeric_continuous_cols: list[str] | None = None + numeric_count_cols: list[str] | None = None + binary_flag_cols: list[str] | None = None + + +@ltools.tool(args_schema=_ScaleProfileArgs) +def profile_standardization_candidates( + path: str, + numeric_continuous_cols: list[str] | None = None, + numeric_count_cols: list[str] | None = None, + binary_flag_cols: list[str] | None = None, +) -> dict: + """ + Profile scale and tail behavior for numeric features. + + Theory: + Standardization is only justified when the observed numeric scales or tail + behaviors would otherwise distort comparisons or downstream models. Robust + scaling depends on median/IQR support, while `log1p` depends on nonnegative + support and heavy right tails. These properties can be measured + deterministically before the LLM decides whether the optional transform is + worth applying. + + :param path: dataset path + :param numeric_continuous_cols: inferred continuous numeric columns + :param numeric_count_cols: inferred count-like numeric columns + :param binary_flag_cols: inferred binary columns to exclude + :return: per-column scale profile + """ + dataset = load_dataset(pathlib.Path(path)) + continuous = [col for col in (numeric_continuous_cols or []) if col in dataset.columns] + counts = [col for col in (numeric_count_cols or []) if col in dataset.columns] + excluded = {col for col in (binary_flag_cols or []) if col in dataset.columns} + candidate_cols = [col for col in continuous + counts if col not in excluded] + candidate_cols = list(dict.fromkeys(candidate_cols)) + per_column: list[dict[str, Any]] = [] + iqr_values: list[float] = [] + for col in candidate_cols: + numeric = pd.to_numeric(dataset[col], errors="coerce").dropna() + if numeric.empty: + continue + median = numeric.median() + q1 = numeric.quantile(0.25) + q3 = numeric.quantile(0.75) + iqr = q3 - q1 + p01 = numeric.quantile(0.01) + p50 = 
numeric.quantile(0.50) + p99 = numeric.quantile(0.99) + positive_fraction = float((numeric >= 0).mean()) + zero_fraction = float((numeric == 0).mean()) + abs_median = abs(float(median)) if not pd.isna(median) else 0.0 + tail_ratio = None + if p50 > 0: + tail_ratio = float(p99 / p50) + if float(iqr) > 0.0: + iqr_values.append(float(iqr)) + feature_bucket = "numeric_continuous" if col in continuous else "numeric_count" + eligible_actions = ["none"] + if float(iqr) > 0.0: + eligible_actions.append("robust_scale") + if float(numeric.min()) >= 0.0: + eligible_actions.append("log1p") + if "robust_scale" in eligible_actions and "log1p" in eligible_actions: + eligible_actions.append("log1p_then_robust_scale") + per_column.append( + { + "col": col, + "feature_bucket": feature_bucket, + "n_non_null": int(numeric.shape[0]), + "min": _safe_float(numeric.min()), + "max": _safe_float(numeric.max()), + "mean": _safe_float(numeric.mean()), + "std": _safe_float(numeric.std()), + "median": _safe_float(median), + "iqr": _safe_float(iqr), + "p01": _safe_float(p01), + "p50": _safe_float(p50), + "p99": _safe_float(p99), + "zero_fraction": zero_fraction, + "positive_fraction": positive_fraction, + "skew": _safe_float(numeric.skew()), + "tail_ratio_p99_p50": None if tail_ratio is None else tail_ratio, + "scale_span": _safe_float(numeric.max() - numeric.min()), + "relative_iqr_to_median": None if abs_median <= 0.0 else float(iqr / abs_median), + "eligible_actions": eligible_actions, + } + ) + positive_iqrs = [value for value in iqr_values if value > 0.0] + return { + "path": path, + "candidate_cols": [item["col"] for item in per_column], + "n_candidate_cols": len(per_column), + "scale_summary": { + "max_iqr": None if not positive_iqrs else float(max(positive_iqrs)), + "min_positive_iqr": None if not positive_iqrs else float(min(positive_iqrs)), + "iqr_ratio_max_to_min": ( + None + if len(positive_iqrs) < 2 or min(positive_iqrs) <= 0.0 + else float(max(positive_iqrs) / min(positive_iqrs)) + 
class StandardizationActionSpec(pydantic.BaseModel):
    """
    Represent one bounded standardization action for a single column.
    """

    model_config = pydantic.ConfigDict(extra="forbid")
    # Target column name.
    col: str
    # One of: "none", "robust_scale", "log1p", "log1p_then_robust_scale".
    action: str
    # Optional free-text justification carried into the report.
    reason: str = ""


class _ApplyStandardizationArgs(pydantic.BaseModel):
    """
    Represent the argument schema for deterministic standardization.
    """

    model_config = pydantic.ConfigDict(extra="forbid")
    source_path: str
    input_path: str
    actions: list[StandardizationActionSpec]


@ltools.tool(args_schema=_ApplyStandardizationArgs)
def apply_standardization_actions(
    source_path: str,
    input_path: str,
    actions: list[StandardizationActionSpec] | None = None,
) -> dict:
    """
    Apply bounded numeric transforms deterministically.

    Theory:
        Whether a transform is desirable is an interpretive decision, but the
        transform itself should be a pure function of the observed column values
        and recorded parameters. Persisting medians, IQRs, and log usage makes
        the optional stage reproducible and auditable.

    :param source_path: original dataset path used for artifact naming
    :param input_path: dataset path to transform
    :param actions: bounded per-column transformation plan
    :return: transformation report with output dataset path
    """
    frame = load_dataset(pathlib.Path(input_path)).copy()
    # Normalize pydantic specs and raw dicts into one plain-dict plan.
    requested = []
    for item in actions or []:
        requested.append(item.model_dump() if isinstance(item, pydantic.BaseModel) else item)
    audit_log: list[dict[str, Any]] = []
    for spec in requested:
        col = str(spec["col"])
        transform = str(spec["action"])
        reason = str(spec.get("reason") or "")
        # Unknown columns are recorded, never raised, so one bad spec cannot
        # abort the whole plan.
        if col not in frame.columns:
            audit_log.append(
                {
                    "col": col,
                    "action": transform,
                    "status": "skipped_missing_column",
                    "reason": reason,
                }
            )
            continue
        numeric = pd.to_numeric(frame[col], errors="coerce")
        observed = numeric.dropna()
        if observed.empty:
            audit_log.append(
                {
                    "col": col,
                    "action": transform,
                    "status": "skipped_no_numeric_values",
                    "reason": reason,
                }
            )
            continue
        params: dict[str, Any] = {}
        result = numeric.copy()
        status = "applied"
        if transform == "none":
            # NOTE(review): even for "none" the coerced numeric copy is written
            # back below, so unparseable cells become NaN — confirm intended.
            pass
        elif transform == "robust_scale":
            center = observed.median()
            spread = observed.quantile(0.75) - observed.quantile(0.25)
            if float(spread) <= 0.0:
                # A degenerate IQR would divide by zero; record and skip.
                status = "skipped_zero_iqr"
            else:
                result = (numeric - center) / spread
                params = {"median": float(center), "iqr": float(spread)}
        elif transform == "log1p":
            if float(observed.min()) < 0.0:
                # log1p is undefined below -1; any negative value disqualifies.
                status = "skipped_negative_values"
            else:
                result = pd.Series(np.log1p(numeric), index=numeric.index)
                params = {"log1p": True}
        elif transform == "log1p_then_robust_scale":
            if float(observed.min()) < 0.0:
                status = "skipped_negative_values"
            else:
                logged = pd.Series(np.log1p(numeric), index=numeric.index)
                kept = logged.dropna()
                center = kept.median()
                spread = kept.quantile(0.75) - kept.quantile(0.25)
                if float(spread) <= 0.0:
                    status = "skipped_zero_iqr_after_log1p"
                else:
                    result = (logged - center) / spread
                    params = {
                        "log1p": True,
                        "median_after_log1p": float(center),
                        "iqr_after_log1p": float(spread),
                    }
        else:
            status = "skipped_unsupported_action"
        # Only successful transforms mutate the frame; skips leave data intact.
        if status == "applied":
            frame[col] = result
        audit_log.append(
            {
                "col": col,
                "action": transform,
                "status": status,
                "reason": reason,
                "params": params,
            }
        )
    output_path = write_stage_dataset(source_path, "standardize", frame)
    return {
        "input_path": input_path,
        "output_path": output_path,
        "actions_applied": audit_log,
    }
class _UnivariateAnalysisArgs(pydantic.BaseModel):
    """
    Store arguments for deterministic univariate analysis.
    """

    model_config = pydantic.ConfigDict(extra="forbid")
    source_path: str
    input_path: str
    time_col: str
    secondary_keys: list[str] | None = None
    numeric_continuous_cols: list[str] | None = None
    numeric_count_cols: list[str] | None = None
    binary_flag_cols: list[str] | None = None


@ltools.tool(args_schema=_UnivariateAnalysisArgs)
def compute_univariate_metrics_and_plots(
    source_path: str,
    input_path: str,
    time_col: str,
    secondary_keys: list[str] | None = None,
    numeric_continuous_cols: list[str] | None = None,
    numeric_count_cols: list[str] | None = None,
    binary_flag_cols: list[str] | None = None,
) -> dict:
    """
    Compute deterministic univariate metrics and produce per-column plots.

    Theory:
        Univariate EDA starts by measuring one feature at a time. Summary
        metrics expose support, spread, skew, missingness, and tail behavior,
        while histogram/ECDF/KDE plots show what "normal values" look like. For
        panel data, per-entity summaries are also useful because a few odd
        entities can hide inside an otherwise normal aggregate distribution.

    :param source_path: original dataset path used for artifact naming
    :param input_path: dataset path to analyze
    :param time_col: selected time column
    :param secondary_keys: optional entity key columns
    :param numeric_continuous_cols: inferred continuous numeric columns
    :param numeric_count_cols: inferred count columns
    :param binary_flag_cols: inferred binary columns
    :return: summary report and plot manifest
    """
    # Headless backend: plots are written to files, never shown interactively.
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    dataset = load_dataset(pathlib.Path(input_path))
    candidate_cols = _candidate_univariate_numeric_cols(
        dataset,
        time_col=time_col,
        secondary_keys=secondary_keys,
        numeric_continuous_cols=numeric_continuous_cols,
        numeric_count_cols=numeric_count_cols,
        binary_flag_cols=binary_flag_cols,
    )
    overall_feature_summaries: list[dict[str, Any]] = []
    per_entity_feature_summaries: list[dict[str, Any]] = []
    plot_manifest: list[dict[str, Any]] = []
    valid_secondary_keys = [
        key for key in (secondary_keys or []) if key in dataset.columns and key != time_col
    ]
    # Perf fix: the groupby is invariant across columns, so build it once
    # instead of once per column (a pandas GroupBy can be iterated repeatedly
    # with identical results).
    grouped = (
        dataset.groupby(valid_secondary_keys, dropna=False, sort=False)
        if valid_secondary_keys
        else None
    )
    for col in candidate_cols:
        summary = _univariate_summary(dataset[col])
        summary["col"] = col
        # Classify the column into the most specific known bucket.
        if col in (numeric_continuous_cols or []):
            summary["feature_bucket"] = "numeric_continuous"
        elif col in (numeric_count_cols or []):
            summary["feature_bucket"] = "numeric_count"
        elif col in (binary_flag_cols or []):
            summary["feature_bucket"] = "binary_flag"
        else:
            summary["feature_bucket"] = "numeric"
        overall_feature_summaries.append(summary)

        numeric = pd.to_numeric(dataset[col], errors="coerce").dropna()
        fig, axes = plt.subplots(1, 2, figsize=(10, 3.8))
        if numeric.empty:
            # Still emit a placeholder figure so the manifest covers every
            # candidate column.
            axes[0].text(0.5, 0.5, "No numeric observations", ha="center", va="center")
            axes[0].set_axis_off()
            axes[1].text(0.5, 0.5, "No numeric observations", ha="center", va="center")
            axes[1].set_axis_off()
            kde_plotted = False
        else:
            # Square-root bin rule, clamped to [10, 50].
            n_bins = int(min(50, max(10, np.sqrt(numeric.shape[0]))))
            axes[0].hist(numeric, bins=n_bins, color="#4472C4", alpha=0.75, density=True)
            kde_curve = _gaussian_kde_curve(numeric)
            kde_plotted = kde_curve is not None
            if kde_curve is not None:
                x_grid, density = kde_curve
                axes[0].plot(x_grid, density, color="#D62728", linewidth=1.5)
            # Empirical CDF as a right-continuous step function.
            sorted_vals = np.sort(numeric.to_numpy(dtype=float))
            y_ecdf = np.arange(1, sorted_vals.size + 1) / float(sorted_vals.size)
            axes[1].step(sorted_vals, y_ecdf, where="post", color="#2CA02C", linewidth=1.5)
            axes[1].set_ylim(0.0, 1.0)
        axes[0].set_title(f"{col} histogram")
        axes[1].set_title(f"{col} ECDF")
        fig.suptitle(
            f"{col} | skew={summary['skew']} | tail_ratio={summary['tail_ratio_p99_p50']}",
            fontsize=10,
        )
        plot_path = write_stage_plot(source_path, "univariate_metrics_plotting", f"{col}.distribution", fig)
        plt.close(fig)
        plot_manifest.append(
            {
                "col": col,
                "plot_path": plot_path,
                "kde_plotted": kde_plotted,
            }
        )

        if grouped is not None:
            for raw_key, frame in grouped:
                # Single-key groupby yields scalars; normalize to a tuple.
                key_tuple = raw_key if isinstance(raw_key, tuple) else (raw_key,)
                entity = _series_identifier(valid_secondary_keys, key_tuple)
                entity_summary = _univariate_summary(frame[col])
                entity_summary["col"] = col
                entity_summary["entity"] = entity
                per_entity_feature_summaries.append(entity_summary)

    # Surface the most problematic features first: high missingness, then
    # strong skew, then heavy tails; None sorts below any real value.
    overall_feature_summaries.sort(
        key=lambda item: (
            item["missing_pct"] if item["missing_pct"] is not None else -1.0,
            abs(item["skew"]) if item["skew"] is not None else -1.0,
            item["tail_ratio_p99_p50"] if item["tail_ratio_p99_p50"] is not None else -1.0,
        ),
        reverse=True,
    )
    return {
        "input_path": input_path,
        "analysis_numeric_cols": candidate_cols,
        "overall_feature_summaries": overall_feature_summaries,
        "per_entity_feature_summaries": per_entity_feature_summaries,
        "plot_manifest": plot_manifest,
    }
+ """ + + model_config = pydantic.ConfigDict(extra="forbid") + source_path: str + input_path: str + time_col: str + secondary_keys: list[str] | None = None + numeric_continuous_cols: list[str] | None = None + numeric_count_cols: list[str] | None = None + binary_flag_cols: list[str] | None = None + + +@ltools.tool(args_schema=_TransformTestArgs) +def test_univariate_transforms( + source_path: str, + input_path: str, + time_col: str, + secondary_keys: list[str] | None = None, + numeric_continuous_cols: list[str] | None = None, + numeric_count_cols: list[str] | None = None, + binary_flag_cols: list[str] | None = None, +) -> dict: + """ + Deterministically compare candidate transforms for skewed or heavy-tailed + numeric features. + + Theory: + Transform testing should only run when there is enough empirical evidence + that raw values may violate practical modeling assumptions or obscure + univariate structure. The decision can be made deterministically from + summary shape metrics such as skewness and tail ratios. Candidate transforms + are then compared by how much they reduce those distortions. 
+ + :param source_path: original dataset path used for trace naming + :param input_path: dataset path to analyze + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param numeric_continuous_cols: inferred continuous numeric columns + :param numeric_count_cols: inferred count columns + :param binary_flag_cols: inferred binary columns + :return: transform test report + """ + dataset = load_dataset(pathlib.Path(input_path)) + candidate_cols = _candidate_univariate_numeric_cols( + dataset, + time_col=time_col, + secondary_keys=secondary_keys, + numeric_continuous_cols=numeric_continuous_cols, + numeric_count_cols=numeric_count_cols, + binary_flag_cols=binary_flag_cols, + ) + tested_columns: list[dict[str, Any]] = [] + skipped_columns: list[dict[str, Any]] = [] + for col in candidate_cols: + numeric = pd.to_numeric(dataset[col], errors="coerce") + base_summary = _univariate_summary(numeric) + n_non_null = int(base_summary["n_non_null"]) + abs_skew = abs(float(base_summary["skew"])) if base_summary["skew"] is not None else 0.0 + tail_ratio = float(base_summary["tail_ratio_p99_p50"]) if base_summary["tail_ratio_p99_p50"] is not None else 0.0 + should_test = bool( + n_non_null >= 30 + and ( + abs_skew >= 1.0 + or tail_ratio >= 4.0 + ) + ) + if not should_test: + skipped_columns.append( + { + "col": col, + "reason": ( + "Insufficient deterministic evidence for transform testing. 
" + f"n_non_null={n_non_null}, abs_skew={abs_skew:.3f}, tail_ratio={tail_ratio:.3f}" + ), + "base_summary": base_summary, + } + ) + continue + candidate_scores: list[dict[str, Any]] = [] + for name, transformed in _transform_candidates(numeric).items(): + score_payload = _transform_score(transformed) + candidate_scores.append( + { + "transform": name, + "score": score_payload["score"], + "summary": score_payload["summary"], + } + ) + valid_scores = [item for item in candidate_scores if item["score"] is not None] + valid_scores.sort(key=lambda item: float(item["score"])) + best = valid_scores[0] + baseline = next(item for item in valid_scores if item["transform"] == "none") + improvement = float(baseline["score"] - best["score"]) + if best["transform"] == "none" or improvement < 0.25: + recommendation = "none" + reason = ( + "Candidate transforms did not materially improve deterministic shape metrics " + f"(best_improvement={improvement:.3f})." + ) + else: + recommendation = best["transform"] + reason = ( + f"{best['transform']} best reduced deterministic shape distortion " + f"(baseline_score={baseline['score']:.3f}, best_score={best['score']:.3f})." + ) + tested_columns.append( + { + "col": col, + "base_summary": base_summary, + "candidate_scores": valid_scores, + "recommended_transform": recommendation, + "improvement_over_none": improvement, + "reason": reason, + } + ) + payload = { + "input_path": input_path, + "n_candidate_cols": len(candidate_cols), + "n_tested_cols": len(tested_columns), + "n_skipped_cols": len(skipped_columns), + "tested_columns": tested_columns, + "skipped_columns": skipped_columns, + } + write_stage_trace(source_path, "test_transforms", payload) + return payload + + def analyze_header(state: dict) -> dict: """ Validate dataset headers. 
"""
Import as:

import src.univariate_analysis.test_transforms as stransforms
"""

from __future__ import annotations

import argparse
import logging
from typing import TypedDict

import langgraph.graph as lgraph

import src.univariate_analysis.univariate_metrics_plotting as sunivar
import src.tools.input_tools as tinptool

_LOG = logging.getLogger(__name__)


class CompositeState(TypedDict):
    """
    Store graph state for transform testing.

    This is the union of every field written by the upstream composite
    pipelines plus the one field this module adds (``transform_test_report``),
    so a single state dict can flow through the whole graph unchanged.
    """

    # Dataset location and stage bookkeeping.
    path: str
    done: list[str]
    # Ingestion / header-validation outputs.
    has_header: bool
    has_missing_values: bool
    error: str
    info: str
    cols: list[str]
    temporal_cols: list[str]
    numeric_val_cols: list[str]
    categorical_val_cols: list[str]
    bad_rows: list[dict]
    metadata: dict
    # Datetime-formatting outputs.
    time_col: str
    candidates: list[dict]
    winner_formatter: dict
    # Integrity-check outputs.
    entity_col: str | None
    numeric_cols: list[str]
    nonnegative_cols: list[str]
    jump_mult: float
    report: dict
    summary: str
    flag: str
    type: str
    # Schema-classification outputs.
    primary_key: str
    secondary_keys: list[str]
    numeric_continuous_cols: list[str]
    numeric_count_cols: list[str]
    binary_flag_cols: list[str]
    categorical_feature_cols: list[str]
    known_exogenous_cols: list[str]
    target_cols: list[str]
    covariate_cols: list[str]
    # Temporal-coverage outputs.
    n_nat_time: int
    min_time: str | None
    max_time: str | None
    typical_delta_mode: str | None
    typical_delta_median: str | None
    expected_frequency: str | None
    dominant_frequency_fraction: float
    is_irregular_sampling: bool
    resampling_decision: str
    coverage_summary: dict
    coverage_per_entity: list[dict]
    # Missingness-handling outputs.
    missingness_report: dict
    missingness_plan: dict
    missingness_handling_report: dict
    quality_dataset_path: str
    # Standardization outputs.
    standardization_profile: dict
    standardization_gate: dict
    standardization_plan: dict
    standardization_report: dict
    standardized_dataset_path: str
    # Univariate-analysis outputs.
    univariate_report: dict
    # Written by this module's test_transforms node.
    transform_test_report: dict


def call_univariate_metrics_plotting(state: CompositeState) -> dict:
    """
    Run the sequential pipeline up to univariate metrics/plots.

    :param state: graph state
    :return: composite payload from univariate metrics/plots
    """
    # Delegates the whole upstream chain to the univariate-plotting runner;
    # the payload it returns is the accumulated composite state.
    payload = sunivar.run_univariate_metrics_plotting(state["path"])
    return payload
def test_transforms(state: CompositeState) -> dict:
    """
    Compare candidate transforms deterministically for columns where it matters.

    :param state: graph state
    :return: transform test report
    """
    # Prefer the quality-handled dataset when a prior stage produced one;
    # fall back to the raw input path otherwise.
    analysis_path = state.get("quality_dataset_path") or state["path"]
    # The deterministic work lives in a LangChain tool, so it is invoked with
    # a dict payload rather than called directly.
    report = tinptool.test_univariate_transforms.invoke(
        {
            "source_path": state["path"],
            "input_path": analysis_path,
            "time_col": state["primary_key"],
            "secondary_keys": state["secondary_keys"],
            "numeric_continuous_cols": state["numeric_continuous_cols"],
            "numeric_count_cols": state["numeric_count_cols"],
            "binary_flag_cols": state["binary_flag_cols"],
        }
    )
    payload = {"transform_test_report": report}
    return payload


# Two-node linear graph: run the upstream univariate pipeline, then test
# transforms on its output.
transform_testing = lgraph.StateGraph(CompositeState)
transform_testing.add_node("univariate_metrics_plotting_pipeline", call_univariate_metrics_plotting)
transform_testing.add_node("test_transforms", test_transforms)
transform_testing.add_edge(lgraph.START, "univariate_metrics_plotting_pipeline")
transform_testing.add_edge("univariate_metrics_plotting_pipeline", "test_transforms")
transform_testing.add_edge("test_transforms", lgraph.END)
graph = transform_testing.compile()


def run_test_transforms(path: str) -> dict:
    """
    Execute transform testing end to end.

    :param path: dataset path
    :return: full composite graph payload
    """
    # Every CompositeState field must be present before invoking the graph,
    # so seed each one with a neutral default.
    init_state: CompositeState = {
        "path": path,
        "done": [],
        "has_header": True,
        "has_missing_values": False,
        "error": "",
        "info": "",
        "cols": [],
        "temporal_cols": [],
        "numeric_val_cols": [],
        "categorical_val_cols": [],
        "bad_rows": [],
        "metadata": {},
        "time_col": "",
        "candidates": [],
        "winner_formatter": {},
        "entity_col": None,
        "numeric_cols": [],
        "nonnegative_cols": [],
        "jump_mult": 20.0,
        "report": {},
        "summary": "",
        "flag": "",
        "type": "",
        "primary_key": "",
        "secondary_keys": [],
        "numeric_continuous_cols": [],
        "numeric_count_cols": [],
        "binary_flag_cols": [],
        "categorical_feature_cols": [],
        "known_exogenous_cols": [],
        "target_cols": [],
        "covariate_cols": [],
        "n_nat_time": 0,
        "min_time": None,
        "max_time": None,
        "typical_delta_mode": None,
        "typical_delta_median": None,
        "expected_frequency": None,
        "dominant_frequency_fraction": 0.0,
        "is_irregular_sampling": False,
        "resampling_decision": "",
        "coverage_summary": {},
        "coverage_per_entity": [],
        "missingness_report": {},
        "missingness_plan": {},
        "missingness_handling_report": {},
        "quality_dataset_path": "",
        "standardization_profile": {},
        "standardization_gate": {},
        "standardization_plan": {},
        "standardization_report": {},
        "standardized_dataset_path": "",
        "univariate_report": {},
        "transform_test_report": {},
    }
    out = graph.invoke(init_state)
    payload: CompositeState = out
    _LOG.info("Transform testing output: %s", payload)
    return payload


def _parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments.

    :return: parsed arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path",
        required=True,
        help="Path to dataset file.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    args = _parse_args()
    run_test_transforms(args.path)
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + missingness_plan: dict + missingness_handling_report: dict + quality_dataset_path: str + standardization_profile: dict + standardization_gate: dict + standardization_plan: dict + standardization_report: dict + standardized_dataset_path: str + univariate_report: dict + + +def call_standardize(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to optional standardization. + + :param state: graph state + :return: composite payload from standardize + """ + payload = sstandard.run_standardize(state["path"]) + return payload + + +def compute_univariate_metrics_and_plots(state: CompositeState) -> dict: + """ + Compute univariate summaries and write per-feature distribution plots. 
+ + :param state: graph state + :return: univariate report + """ + analysis_path = state.get("quality_dataset_path") or state["path"] + report = tinptool.compute_univariate_metrics_and_plots.invoke( + { + "source_path": state["path"], + "input_path": analysis_path, + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "numeric_continuous_cols": state["numeric_continuous_cols"], + "numeric_count_cols": state["numeric_count_cols"], + "binary_flag_cols": state["binary_flag_cols"], + } + ) + trace_payload = { + "analysis_path": analysis_path, + "univariate_report": report, + } + tinptool.write_stage_trace(state["path"], "univariate_metrics_plotting", trace_payload) + return {"univariate_report": report} + + +univariate_analysis = lgraph.StateGraph(CompositeState) +univariate_analysis.add_node("standardize_pipeline", call_standardize) +univariate_analysis.add_node("compute_univariate_metrics_and_plots", compute_univariate_metrics_and_plots) +univariate_analysis.add_edge(lgraph.START, "standardize_pipeline") +univariate_analysis.add_edge("standardize_pipeline", "compute_univariate_metrics_and_plots") +univariate_analysis.add_edge("compute_univariate_metrics_and_plots", lgraph.END) +graph = univariate_analysis.compile() + + +def run_univariate_metrics_plotting(path: str) -> dict: + """ + Execute univariate summaries and plotting end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + "missingness_report": {}, + "missingness_plan": {}, + "missingness_handling_report": {}, + "quality_dataset_path": "", + "standardization_profile": {}, + "standardization_gate": {}, + "standardization_plan": {}, + "standardization_report": {}, + "standardized_dataset_path": "", + "univariate_report": {}, + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Univariate analysis output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_univariate_metrics_plotting(args.path)