Merge pull request #20 from fastomop/feat/ss-databricks

sshenzha · web-flow · commit 72edc5dbda12 · 2025-12-08T12:38:03.000Z
Feat/ss databricks
diff --git a/src/omcp/db.py b/src/omcp/db.py
@@ -13,6 +13,7 @@
 from ibis.backends.databricks import Backend as DatabricksBackend
 
 from omcp.sql_validator import SQLValidator
+from omcp.transpiler import transpile_query
 
 logger = logging.getLogger(__name__)
 
@@ -113,6 +114,11 @@ def __init__(
         self.cdm_schema = cdm_schema
         self.vocab_schema = vocab_schema
 
+        # Determine database dialect from connection string
+        self.target_dialect = self._get_dialect_from_connection_string(
+            connection_string
+        )
+
         # Try initial connection
         logger.info(f"Initializing connection to: {connection_string}")
         try:
@@ -127,6 +133,29 @@ def __init__(
         except Exception as e:
             raise ConnectionError(f"Failed to connect to database: {str(e)}")
 
+    def _get_dialect_from_connection_string(self, connection_string: str) -> str:
+        """
+        Determine the SQL dialect from the prefix of the connection string.
+
+        Args:
+            connection_string: Database connection string
+
+        Returns:
+            'databricks', 'postgres' or 'duckdb'; 'postgres' (with a warning) if unrecognised
+        """
+        if connection_string.startswith("databricks://"):
+            return "databricks"
+        if connection_string.startswith("postgres"):
+            return "postgres"
+        if connection_string.startswith("duckdb://"):
+            return "duckdb"
+        # Log only the scheme: the full string may embed a password or access token.
+        scheme = connection_string.split("://", 1)[0] if "://" in connection_string else "<none>"
+        logger.warning(
+            f"Unknown dialect for connection scheme: {scheme!r}, defaulting to postgres"
+        )
+        return "postgres"
+
     def _ensure_connected(self):
         """Ensure we have a valid database connection."""
         with self._conn_lock:
@@ -304,12 +333,34 @@ def read_query(self, query: str) -> str:
                     errors,
                 )
 
+            # Transpile query if needed (postgres -> databricks, etc.)
+            # We assume Claude generates queries in postgres dialect by default
+            source_dialect = "postgres"
+            transpiled_query = query
+
+            if self.target_dialect != source_dialect:
+                logger.info(
+                    f"Transpiling query from {source_dialect} to {self.target_dialect}"
+                )
+                try:
+                    transpiled_query = transpile_query(
+                        query, source_dialect, self.target_dialect
+                    )
+                    logger.debug(f"Original query: {query}")
+                    logger.debug(f"Transpiled query: {transpiled_query}")
+                except Exception as transpile_error:
+                    logger.warning(
+                        f"Transpilation failed: {transpile_error}, using original query"
+                    )
+                    # If transpilation fails, fall back to original query
+                    transpiled_query = query
+
             # Ensure connected
             self._ensure_connected()
 
-            # Execute the validated query
+            # Execute the validated and transpiled query
             with self._conn_lock:
-                result = self._conn.sql(query).limit(self.row_limit)
+                result = self._conn.sql(transpiled_query).limit(self.row_limit)
                 df = result.execute()
                 # Convert dataframe to csv
                 return df.to_csv(index=False)
@@ -325,7 +376,7 @@ def read_query(self, query: str) -> str:
             try:
                 self._ensure_connected()
                 with self._conn_lock:
-                    result = self._conn.sql(query).limit(self.row_limit)
+                    result = self._conn.sql(transpiled_query).limit(self.row_limit)
                     df = result.execute()
                     return df.to_csv(index=False)
             except Exception as retry_error:
diff --git a/src/omcp/main.py b/src/omcp/main.py
@@ -345,37 +345,45 @@ def signal_handler(signum, frame):
 
 @mcp_app.tool(
     name="Get_Information_Schema",
-    description="Get the information schema of the OMOP database.",
+    description="Get the database schema name and type. Returns the schema prefix to use for table references (e.g., 'gold') and the database type (e.g., 'databricks').",
 )
 @capture_context(tool_name="Get_Information_Schema")
 def get_information_schema() -> mcp.types.CallToolResult:
-    """Get the information schema of the OMOP database.
+    """Get the database schema name and type.
+
+    This function returns only the essential schema information needed for SQL generation:
+    - schema_name: The schema/database prefix to use (e.g., 'gold', 'omop', 'public')
+    - database_type: The SQL dialect to use (e.g., 'databricks', 'postgres', 'duckdb')
 
-    This function retrieves information from the information schema of the OMOP database.
-    Information is restricted to only tables and columns allowed by the users configuration.
     Args:
         None
     Returns:
-        List of schemas, tables, columns and data types formatted as a CSV string.
+        Simple text with schema name and database type
     """
     try:
-        logger.debug("Getting information schema...")
-        # Note: @capture_context decorator already handles Langfuse tracing
-        result = db.get_information_schema()
-        logger.debug("Information schema retrieved successfully")
+        logger.debug("Getting schema information...")
+
+        # Return only the essential information
+        schema_name = db.cdm_schema
+        database_type = db.target_dialect
+
+        result = f"Schema: {schema_name}\nDatabase Type: {database_type}"
+
+        logger.debug(f"Schema info: {result}")
         return mcp.types.CallToolResult(
             content=[
                 mcp.types.TextContent(type="text", text=result),
-            ]
+            ],
+            _meta={"database_type": database_type, "schema_name": schema_name},
         )
     except Exception as e:
-        logger.error(f"Failed to retrieve information schema: {e}")
+        logger.error(f"Failed to retrieve schema information: {e}")
         return mcp.types.CallToolResult(
             isError=True,
             content=[
                 mcp.types.TextContent(
                     type="text",
-                    text=f"Failed to retrieve information schema: {str(e)}",
+                    text=f"Failed to retrieve schema information: {str(e)}",
                 )
             ],
         )
@@ -431,6 +439,104 @@ def read_query(query: str) -> mcp.types.CallToolResult:
         )
 
 
+@mcp_app.tool(
+    name="Lookup_Drug",
+    description="Look up drug concepts by name in the OMOP concept table. Returns standardized drug concepts with concept_id, concept_name, concept_code, vocabulary_id, and domain_id.",
+)
+@capture_context(tool_name="Lookup_Drug")
+def lookup_drug(term: str, limit: int = 10) -> mcp.types.CallToolResult:
+    """Look up drug concepts by name.
+
+    Searches the OMOP concept table for standard, valid Drug concepts whose name contains
+    `term`, shortest names first. Single quotes in `term` are escaped before it is put into the SQL.
+
+    Args:
+        term: Drug name to search for (case-insensitive partial match; % and _ act as LIKE wildcards)
+        limit: Maximum number of results to return (default: 10); must be convertible to int
+
+    Returns:
+        CSV text with: concept_id, concept_name, concept_code, vocabulary_id, domain_id (isError result on failure)
+    """
+    try:
+        # term is spliced into a SQL string literal, so double any single quotes.
+        safe_term = term.replace("'", "''")
+        query = f"""
+        SELECT concept_id, concept_name, concept_code, vocabulary_id, domain_id
+        FROM {db.cdm_schema}.concept
+        WHERE LOWER(concept_name) LIKE LOWER('%{safe_term}%')
+          AND domain_id = 'Drug'
+          AND standard_concept = 'S'
+          AND invalid_reason IS NULL
+        ORDER BY LENGTH(concept_name), concept_name
+        LIMIT {int(limit)}
+        """
+        logger.info(f"Looking up drug: {term}")
+        result = db.read_query(query)
+        logger.info(f"Drug lookup completed for: {term}")
+        return mcp.types.CallToolResult(
+            content=[mcp.types.TextContent(type="text", text=result)]
+        )
+    except Exception as e:
+        logger.error(f"Failed to lookup drug '{term}': {e}")
+        return mcp.types.CallToolResult(
+            isError=True,
+            content=[
+                mcp.types.TextContent(
+                    type="text", text=f"Failed to lookup drug: {str(e)}"
+                )
+            ],
+        )
+
+
+@mcp_app.tool(
+    name="Lookup_Condition",
+    description="Look up condition concepts by name in the OMOP concept table. Returns standardized condition concepts with concept_id, concept_name, concept_code, vocabulary_id, and domain_id.",
+)
+@capture_context(tool_name="Lookup_Condition")
+def lookup_condition(term: str, limit: int = 10) -> mcp.types.CallToolResult:
+    """Look up condition concepts by name.
+
+    Searches the OMOP concept table for standard, valid Condition concepts whose name contains
+    `term`, shortest names first. Single quotes in `term` are escaped before it is put into the SQL.
+
+    Args:
+        term: Condition name to search for (case-insensitive partial match; % and _ act as LIKE wildcards)
+        limit: Maximum number of results to return (default: 10); must be convertible to int
+
+    Returns:
+        CSV text with: concept_id, concept_name, concept_code, vocabulary_id, domain_id (isError result on failure)
+    """
+    try:
+        # term is spliced into a SQL string literal, so double any single quotes.
+        safe_term = term.replace("'", "''")
+        query = f"""
+        SELECT concept_id, concept_name, concept_code, vocabulary_id, domain_id
+        FROM {db.cdm_schema}.concept
+        WHERE LOWER(concept_name) LIKE LOWER('%{safe_term}%')
+          AND domain_id = 'Condition'
+          AND standard_concept = 'S'
+          AND invalid_reason IS NULL
+        ORDER BY LENGTH(concept_name), concept_name
+        LIMIT {int(limit)}
+        """
+        logger.info(f"Looking up condition: {term}")
+        result = db.read_query(query)
+        logger.info(f"Condition lookup completed for: {term}")
+        return mcp.types.CallToolResult(
+            content=[mcp.types.TextContent(type="text", text=result)]
+        )
+    except Exception as e:
+        logger.error(f"Failed to lookup condition '{term}': {e}")
+        return mcp.types.CallToolResult(
+            isError=True,
+            content=[
+                mcp.types.TextContent(
+                    type="text", text=f"Failed to lookup condition: {str(e)}"
+                )
+            ],
+        )
+
+
 def main():
     """Main function to run the MCP server."""
     logger.info(f"Starting OMOP MCP Server with {transport_type.upper()} transport...")
diff --git a/src/omcp/transpiler.py b/src/omcp/transpiler.py