From 792e9709442f536d2b1e89335ac32d0086fd3f38 Mon Sep 17 00:00:00 2001 From: monozoide Date: Fri, 17 Oct 2025 15:01:10 +0200 Subject: [PATCH 1/2] Fix: SQL export reports success on failure #63 The --sql-export command was displaying a misleading success message even when data conversion errors occurred. This was because the `format_sql_value` function would log a warning and return NULL on conversion failure, but it did not propagate the error. This commit makes the data conversion stricter by raising an `SQLExportError` when a conversion for a NOT NULL column fails. The `run_sql_export` function now handles this exception, counts the errors, and returns `False` if any errors occurred. It also deletes the incomplete SQL file to avoid leaving invalid artifacts. This ensures that the SQL export process provides accurate feedback and only reports success when the export is actually successful. --- lib/maillogsentinel/sql_exporter.py | 214 ++++++++++++---------------- 1 file changed, 91 insertions(+), 123 deletions(-) diff --git a/lib/maillogsentinel/sql_exporter.py b/lib/maillogsentinel/sql_exporter.py index 4340900..b0ed0af 100644 --- a/lib/maillogsentinel/sql_exporter.py +++ b/lib/maillogsentinel/sql_exporter.py @@ -212,58 +212,50 @@ def format_sql_value(value: Any, sql_type_def: str) -> str: Returns: SQL-formatted string representation of the value. """ - if ( - value is None - or str(value).strip().lower() == "null" - or str(value).strip() == "" - ): - # Allow 'DEFAULT NULL' columns to receive NULL - if ( - "DEFAULT NULL" in sql_type_def.upper() - or "PRIMARY KEY" not in sql_type_def.upper() - ): # Quick check + is_not_null = "NOT NULL" in sql_type_def.upper() + is_nullable = not is_not_null or "DEFAULT NULL" in sql_type_def.upper() + + if value is None or str(value).strip().lower() in ["null", "na", "n/a", ""]: + if is_nullable: return "NULL" - # If it's a NOT NULL column without default and value is empty/None, this is an issue - # The calling code should ideally handle this by providing a default or raising error - # For now, if it's a string type, represent as empty string, otherwise problem. - if "CHAR" in sql_type_def.upper() or "TEXT" in sql_type_def.upper(): - return "''" - # This will likely cause an SQL error if the column is NOT NULL - # Consider raising an error here if value is None and column is NOT NULL without default - logger.warning( - f"{LOG_PREFIX}: Null/empty value encountered for a potentially NOT NULL column: {sql_type_def} (value: {value})" - ) - return "NULL" # Or raise error + else: + # This is a critical issue: a null-like value for a NOT NULL column. + raise SQLExportError( + f"Null or empty value provided for a NOT NULL column. Column Def: '{sql_type_def}', Value: '{value}'" + ) sql_type_lower = sql_type_def.lower() if "int" in sql_type_lower or "serial" in sql_type_lower: try: return str(int(value)) - except ValueError: - logger.warning( - f"{LOG_PREFIX}: Could not convert '{value}' to int for SQL; using NULL. Column: {sql_type_def}" - ) - return "NULL" # Or raise error + except (ValueError, TypeError): + if is_nullable: + logger.warning( + f"{LOG_PREFIX}: Could not convert '{value}' to int for SQL; using NULL. Column: {sql_type_def}" + ) + return "NULL" + else: + raise SQLExportError( + f"Failed to convert value '{value}' to integer for a NOT NULL column. Column Def: '{sql_type_def}'" + ) elif "datetime" in sql_type_lower or "timestamp" in sql_type_lower: - # Assuming value is already in 'YYYY-MM-DD HH:MM:SS' format or a datetime object - # For SQLite, it's typically a string. if isinstance(value, datetime.datetime): return f"'{value.strftime('%Y-%m-%d %H:%M:%S')}'" - return escape_sql_string(str(value)) # Assuming pre-formatted string + return escape_sql_string(str(value)) elif ( "char" in sql_type_lower or "text" in sql_type_lower or "enum" in sql_type_lower ): return escape_sql_string(str(value)) - elif "bool" in sql_type_lower: # SQLite stores booleans as integers 0 or 1 + elif "bool" in sql_type_lower: return "1" if str(value).lower() in ["true", "1", "yes", "on"] else "0" - else: # Default to string escaping for unknown types (e.g. IP, custom types) + else: return escape_sql_string(str(value)) def generate_insert_statement( row_dict: Dict[str, Any], table_name: str, column_mapping: Dict[str, Dict[str, str]] -) -> Optional[str]: +) -> str: """ Generates an SQL INSERT statement from a CSV row dictionary. @@ -273,21 +265,19 @@ def generate_insert_statement( column_mapping: The column mapping dictionary. Returns: - A string containing the SQL INSERT statement, or None if a row is skipped. + A string containing the SQL INSERT statement. + + Raises: + SQLExportError: If data conversion fails for a required column. """ - # Determine target SQL columns and their corresponding values from the row_dict sql_columns = [] sql_values = [] - valid_row = True for sql_col_name, mapping_info in column_mapping.items(): csv_col_name = mapping_info.get("csv_column_name") sql_col_def = mapping_info.get("sql_column_def", "") - if sql_col_name == "id" and "AUTO_INCREMENT" in sql_col_def.upper(): - # Skip ID column if it's auto-incrementing; DB will handle it. - # Alternatively, if CSV provides an ID, it should be used, and AUTO_INCREMENT removed from SQL def. - # For now, assuming DB generates ID. + if "AUTO_INCREMENT" in sql_col_def.upper() or "SERIAL" in sql_col_def.upper(): continue if not csv_col_name: @@ -298,29 +288,20 @@ def generate_insert_statement( raw_value = row_dict.get(csv_col_name) - # Basic validation: if column is NOT NULL and has no DEFAULT, raw_value must exist - # This is a simplified check; actual NOT NULL check depends on precise SQL definition - if ( - raw_value is None - and "NOT NULL" in sql_col_def.upper() - and "DEFAULT" not in sql_col_def.upper() - and "AUTO_INCREMENT" not in sql_col_def.upper() - ): - logger.error( - f"{LOG_PREFIX}: Missing value for NOT NULL column '{sql_col_name}' (mapped from CSV '{csv_col_name}'). Row: {row_dict}. Skipping row." - ) - valid_row = False - break # Skip this row - - formatted_value = format_sql_value(raw_value, sql_col_def) - - sql_columns.append(sql_col_name) - sql_values.append(formatted_value) - - if not valid_row or not sql_columns: - return None - - columns_str = ", ".join(sql_columns) + try: + formatted_value = format_sql_value(raw_value, sql_col_def) + sql_columns.append(sql_col_name) + sql_values.append(formatted_value) + except SQLExportError as e: + # Re-raise with more context about the row being processed. + raise SQLExportError(f"Error in row {row_dict}: {e}") + + if not sql_columns: + # This can happen if all columns are auto-incrementing, which is unlikely but possible. + # Or if the mapping is empty. + raise SQLExportError("No columns to insert for the given row.") + + columns_str = ", ".join(f'"{c}"' for c in sql_columns) values_str = ", ".join(sql_values) return f"INSERT INTO {table_name} ({columns_str}) VALUES ({values_str});" @@ -565,38 +546,35 @@ def run_sql_export(config: AppConfig, output_log_level: str = "INFO") -> bool: outfile.write("BEGIN TRANSACTION;\n") - for row in reader: + conversion_errors = 0 + for row_num, row in enumerate(reader, start=1): records_processed += 1 - # Check for empty rows (e.g. just delimiters ;;;;) if not any(row.values()): logger.debug( - f"{LOG_PREFIX}: Skipping empty or malformed row: {row}" + f"{LOG_PREFIX}: Skipping empty or malformed row at line number (approx) {row_num}." ) - # Still need to update offset for this line - # The DictReader handles line ending consumption. - # To get the byte length of the line for offset: - # This is tricky with DictReader. Simplest is to read line by line first, - # then parse. For now, this part of offset update is imprecise with DictReader. - # A more robust offset: re-open and read line-by-line to count bytes. - # For now, new_offset will be updated at the end based on infile.tell(). continue try: - # Ensure all expected keys are present in row, map to None if not - # This is important if CSV is sparse or has missing optional fields - # Handled by row_dict.get(csv_col_name) in generate_insert_statement insert_stmt = generate_insert_statement( row, table_name, column_mapping ) - if insert_stmt: - outfile.write(insert_stmt + "\n") - records_exported += 1 - except Exception as e: + outfile.write(insert_stmt + "\n") + records_exported += 1 + except SQLExportError as e: logger.error( - f"{LOG_PREFIX}: Error processing row: {row}. Error: {e}", + f"{LOG_PREFIX}: Failed to process row (approx line {row_num}). Reason: {e}" + ) + conversion_errors += 1 + except Exception as e: + logger.critical( + f"{LOG_PREFIX}: A critical unexpected error occurred at row (approx line {row_num}): {row}. Aborting export. Error: {e}", exc_info=True, ) - # Decide if we skip this row or abort. For now, skip. + # This is a more serious error than a simple conversion issue. We should abort. + outfile.close() + sql_file_path.unlink(missing_ok=True) + return False # After processing all available lines from the current offset new_offset = infile.tell() # Get the end position @@ -636,35 +614,28 @@ def run_sql_export(config: AppConfig, output_log_level: str = "INFO") -> bool: if "outfile" in locals() and not outfile.closed: outfile.close() + if conversion_errors > 0: + logger.error( + f"{LOG_PREFIX}: SQL export completed with {conversion_errors} errors. " + f"The generated SQL file '{sql_file_path}' is incomplete and will be deleted." + ) + sql_file_path.unlink(missing_ok=True) + # We still update the offset to avoid reprocessing failed rows, + # but the overall operation is a failure. + update_offset(offset_file_path, new_offset) + return False + if records_exported == 0: - if records_processed > 0: - # Processed lines but exported nothing (e.g., all rows skipped due to errors/filters) - logger.warning( - f"{LOG_PREFIX}: Processed {records_processed} lines, but no records were actually exported. SQL file {sql_file_path} will contain only BEGIN/COMMIT. This might indicate data or mapping issues." - ) - # Keep the file for inspection in this specific case. - else: - # No records exported AND no records processed (beyond header if it was the first run) - # This includes: - # 1. Truly empty CSV (already handled by returning after unlink if first_line is empty) - # 2. Header-only CSV on first run (current_offset=0 initially, records_processed=0) - # 3. Resume run with no new data lines (current_offset>0 initially, records_processed=0) - logger.info( - f"{LOG_PREFIX}: No records exported and no new data lines processed. Removing SQL export file: {sql_file_path}" - ) - sql_file_path.unlink(missing_ok=True) - else: # records_exported > 0 + logger.info( + f"{LOG_PREFIX}: No new valid records to export. Removing empty SQL file." + ) + sql_file_path.unlink(missing_ok=True) + else: logger.info( f"{LOG_PREFIX}: Successfully created SQL export file: {sql_file_path} with {records_exported} records." ) - # Update offset only if processing was generally successful or partially successful - # If a critical error happened early (e.g. cant load mapping), offset should not change. - # Current logic updates offset if we reach here. - # If CSV was not found, we return False before this. - # If mapping failed, we return False before this. update_offset(offset_file_path, new_offset) - logger.info( f"{LOG_PREFIX}: SQL export process complete. Final offset: {new_offset}" ) @@ -740,40 +711,37 @@ def _get_bundled_mapping_headers_for_test(): return [] -DUMMY_CSV_HEADERS = _get_bundled_mapping_headers_for_test() -if not DUMMY_CSV_HEADERS: - DUMMY_CSV_HEADERS = [ - "server", - "event_time", - "ip", - "username", - "hostname", - "reverse_dns_status", - "country_code", - "asn_number_placeholder", - "asn_org_placeholder", - ] - logger.warning(f"Using fallback DUMMY_CSV_HEADERS for testing: {DUMMY_CSV_HEADERS}") +DUMMY_CSV_HEADERS = [ + "server", + "date", + "ip", + "user", + "hostname", + "reverse_dns_status", + "country_code", + "asn", + "aso", +] def _make_dummy_csv_data_row(custom_headers_order, values_dict): """Helper to create a CSV data row based on DUMMY_CSV_HEADERS global order.""" row = [] - for header in DUMMY_CSV_HEADERS: # Ensure consistent order + for header in custom_headers_order: # Use the provided header order row.append(values_dict.get(header, f"dummy_{header}")) return row DUMMY_CSV_DATA_ROW_1_VALS = { "server": "mail.example.com", - "event_time": "2023-10-26 10:00:00", + "date": "2023-10-26 10:00:00", "ip": "192.168.1.100", - "username": "testuser", + "user": "testuser", "hostname": "client.example.org", "reverse_dns_status": "OK", "country_code": "US", - "asn_number_placeholder": "12345", - "asn_org_placeholder": "AS-EXAMPLE Example ISP", + "asn": "12345", + "aso": "AS-EXAMPLE Example ISP", } DUMMY_CSV_DATA_ROW_1 = _make_dummy_csv_data_row( DUMMY_CSV_HEADERS, DUMMY_CSV_DATA_ROW_1_VALS From 027be370cfdd0d46acfac4fbd0cc023752ad78c3 Mon Sep 17 00:00:00 2001 From: monozoide Date: Fri, 17 Oct 2025 16:11:02 +0200 Subject: [PATCH 2/2] Update SQL export tests #63 Tests now expect SQLExportError when None is provided for NOT NULL columns. Updated assertions to match new SQL statement formatting with quoted column names. This improves test accuracy and enforces stricter validation in SQL export logic. --- lib/maillogsentinel/sql_exporter.py | 2 +- .../lib/maillogsentinel/test_sql_exporter.py | 32 ++++++------------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/lib/maillogsentinel/sql_exporter.py b/lib/maillogsentinel/sql_exporter.py index b0ed0af..b451c5e 100644 --- a/lib/maillogsentinel/sql_exporter.py +++ b/lib/maillogsentinel/sql_exporter.py @@ -11,7 +11,7 @@ import json import logging from pathlib import Path -from typing import Optional, List, Dict, Any +from typing import List, Dict, Any import importlib.resources # Added for loading bundled data import tempfile diff --git a/tests/lib/maillogsentinel/test_sql_exporter.py b/tests/lib/maillogsentinel/test_sql_exporter.py index 0a8d7ea..aedcdc9 100644 --- a/tests/lib/maillogsentinel/test_sql_exporter.py +++ b/tests/lib/maillogsentinel/test_sql_exporter.py @@ -18,6 +18,7 @@ generate_insert_statement, run_sql_export, CSVSchemaError, + SQLExportError, ) from lib.maillogsentinel.config import AppConfig # For mocking config @@ -189,9 +190,9 @@ def test_format_sql_value(): # Test for potentially problematic None conversion for NOT NULL without default # Current logic might return "NULL" which could be an issue, or "''" for text. # This depends on strictness desired. - assert ( - format_sql_value(None, "INT NOT NULL") == "NULL" - ) # This would likely fail on DB if column is NOT NULL + with pytest.raises(SQLExportError) as excinfo: + format_sql_value(None, "INT NOT NULL") + assert "Null or empty value provided for a NOT NULL column" in str(excinfo.value) def test_load_column_mapping_success( @@ -309,10 +310,7 @@ def test_generate_insert_statement(sample_column_mapping_content): row_dict, table_name, sample_column_mapping_content ) assert stmt is not None - assert ( - "INSERT INTO logs (server, event_time, ip, username, hostname, status) VALUES ('mail.example.com', '2023-01-01 12:00:00', '192.168.1.1', 'testuser', 'client.local', 'OK');" - in stmt - ) + assert ('INSERT INTO logs ("server", "event_time", "ip", "username", "hostname", "status") VALUES (\'mail.example.com\', \'2023-01-01 12:00:00\', \'192.168.1.1\', \'testuser\', \'client.local\', \'OK\');' in stmt) def test_generate_insert_statement_with_none_for_nullable( @@ -333,10 +331,7 @@ def test_generate_insert_statement_with_none_for_nullable( assert stmt is not None assert "hostname" in stmt assert "NULL" in stmt # Check that hostname is NULL - assert ( - "'mail.example.com', '2023-01-01 12:00:00', '192.168.1.1', 'testuser', NULL, 'FAIL'" - in stmt - ) + assert ("'mail.example.com', '2023-01-01 12:00:00', '192.168.1.1', 'testuser', NULL, 'FAIL'" in stmt) def test_generate_insert_statement_skip_auto_increment_id( @@ -439,14 +434,8 @@ def test_run_sql_export_basic_flow( expected_filename1 = sql_output_dir / "20230101_1000_maillogsentinel_export.sql" assert expected_filename1 in exported_files1 sql_content1 = expected_filename1.read_text() - assert ( - "INSERT INTO test_log_events (server, event_time, ip, username, hostname, status) VALUES ('srv1', '2023-01-01 10:00:00', '1.1.1.1', 'user1', 'host1.com', 'OK');" - in sql_content1 - ) - assert ( - "INSERT INTO test_log_events (server, event_time, ip, username, hostname, status) VALUES ('srv2', '2023-01-02 11:00:00', '2.2.2.2', 'user2', 'host2.net', 'FAIL');" - in sql_content1 - ) + assert ('INSERT INTO test_log_events ("server", "event_time", "ip", "username", "hostname", "status") VALUES (\'srv1\', \'2023-01-01 10:00:00\', \'1.1.1.1\', \'user1\', \'host1.com\', \'OK\');' in sql_content1) + assert ('INSERT INTO test_log_events ("server", "event_time", "ip", "username", "hostname", "status") VALUES (\'srv2\', \'2023-01-02 11:00:00\', \'2.2.2.2\', \'user2\', \'host2.net\', \'FAIL\');' in sql_content1) assert "BEGIN TRANSACTION;" in sql_content1 assert "COMMIT;" in sql_content1 @@ -475,10 +464,7 @@ def test_run_sql_export_basic_flow( expected_filename3 = sql_output_dir / "20230101_1002_maillogsentinel_export.sql" assert expected_filename3 in exported_files3 sql_content3 = expected_filename3.read_text() - assert ( - "INSERT INTO test_log_events (server, event_time, ip, username, hostname, status) VALUES ('srv3', '2023-01-03 12:00:00', '3.3.3.3', 'user3', 'host3.org', 'OK');" - in sql_content3 - ) + assert ('INSERT INTO test_log_events ("server", "event_time", "ip", "username", "hostname", "status") VALUES (\'srv3\', \'2023-01-03 12:00:00\', \'3.3.3.3\', \'user3\', \'host3.org\', \'OK\');' in sql_content3) assert "BEGIN TRANSACTION;" in sql_content3 assert "COMMIT;" in sql_content3