fix: add retries for failed first statements

olavloite · olavloite · commit 997bab09cf7f · 2026-03-13T13:05:36.000+01:00
Add retries if the first statement in a read/write transaction fails, as the
statement then does not return a transaction ID. In order to ensure that we
get a transaction ID, we first execute an explicit BeginTransaction RPC and
then retry the original statement. We return the response of the retry to
the application, regardless of whether the retry fails or succeeds.

The reason that we retry with an explicit BeginTransaction AND include the
first statement is to guarantee transaction consistency. If we were to
leave the first statement out of the transaction, then it would not be
guaranteed that the error condition that caused the failure in the first
place is actually still true when the transaction commits. This would break
the transaction guarantees.

Example (pseudo-code):

```sql
-- The following statement fails with ALREADY_EXISTS
insert into some_table (id, value) values (1, 'One');

-- Execute an explicit BeginTransaction RPC.
begin;
-- Retry the initial statement. This ensures that
-- whatever the response is, this response will be
-- valid for the entire transaction.
insert into some_table (id, value) values (1, 'One');

-- This is guaranteed to return a row.
select * from some_table where id=1;

-- ... execute the rest of the transaction ...
commit;
```

If we had not included the initial insert statement in the retried transaction,
then there is no guarantee that the select statement would actually return any
rows, as other transactions could in theory have deleted that row in the meantime.
diff --git a/google/cloud/spanner_dbapi/batch_dml_executor.py b/google/cloud/spanner_dbapi/batch_dml_executor.py
@@ -104,6 +104,11 @@ def run_batch_dml(cursor: "Cursor", statements: List[Statement]):
                     connection._transaction = None
                     raise Aborted(status.message)
                 elif status.code != OK:
+                    if not transaction._transaction_id:
+                        # This should normally not happen,
+                        # but we safeguard against it just to be sure.
+                        transaction._reset_and_begin()
+                        continue
                     raise OperationalError(status.message)
 
                 cursor._batch_dml_rows_count = res
@@ -116,6 +121,11 @@ def run_batch_dml(cursor: "Cursor", statements: List[Statement]):
                     raise
                 else:
                     connection._transaction_helper.retry_transaction()
+            except Exception as ex:
+                if not transaction._transaction_id:
+                    transaction._reset_and_begin()
+                    continue
+                raise ex
 
 
 def _do_batch_update_autocommit(transaction, statements):
diff --git a/google/cloud/spanner_dbapi/cursor.py b/google/cloud/spanner_dbapi/cursor.py
@@ -366,6 +366,16 @@ def _execute_in_rw_transaction(self):
                         raise
                     else:
                         self.transaction_helper.retry_transaction()
+                except Exception as ex:
+                    # In case of inline-begin failure, the transaction isn't started.
+                    # We immediately retry with an explicit BeginTransaction.
+                    transaction = getattr(self.connection, "_transaction", None)
+                    if transaction and not transaction._transaction_id:
+                        transaction._reset_and_begin()
+
+                        # Let the existing retry loop handle the retry of the statement
+                        continue
+                    raise ex
         else:
             self.connection.database.run_in_transaction(
                 self._do_execute_update_in_autocommit,
diff --git a/google/cloud/spanner_v1/testing/mock_spanner.py b/google/cloud/spanner_v1/testing/mock_spanner.py
@@ -15,9 +15,11 @@
 import inspect
 import grpc
 from concurrent import futures
+from dataclasses import dataclass
 
-from google.protobuf import empty_pb2
 from grpc_status.rpc_status import _Status
+from google.rpc.code_pb2 import OK
+from google.protobuf import empty_pb2
 
 from google.cloud.spanner_v1 import (
     TransactionOptions,
@@ -53,10 +55,23 @@ def get_result(self, sql: str) -> result_set.ResultSet:
         return result
 
     def add_error(self, method: str, error: _Status):
+        if not hasattr(self, "_errors_list"):
+            self._errors_list = {}
+        if method not in self._errors_list:
+            self._errors_list[method] = []
+        self._errors_list[method].append(error)
         self.errors[method] = error
 
     def pop_error(self, context):
         name = inspect.currentframe().f_back.f_code.co_name
+        if hasattr(self, "_errors_list") and name in self._errors_list:
+            if self._errors_list[name]:
+                error = self._errors_list[name].pop(0)
+                context.abort_with_status(error)
+                return
+            return  # Queue is empty, return normally (no error)
+
+        # Fallback to single error
         error: _Status | None = self.errors.pop(name, None)
         if error:
             context.abort_with_status(error)
@@ -94,6 +109,12 @@ def get_result_as_partial_result_sets(
         return partials
 
 
+@dataclass
+class BatchDmlResponseConfig:
+    status: _Status
+    include_transaction_id: bool = True
+
+
 # An in-memory mock Spanner server that can be used for testing.
 class SpannerServicer(spanner_grpc.SpannerServicer):
     def __init__(self):
@@ -103,6 +124,7 @@ def __init__(self):
         self.transaction_counter = 0
         self.transactions = {}
         self._mock_spanner = MockSpanner()
+        self._batch_dml_response_configs = []
 
     @property
     def mock_spanner(self):
@@ -115,6 +137,15 @@ def requests(self):
     def clear_requests(self):
         self._requests = []
 
+    def add_batch_dml_response_status(self, status, include_transaction_id=True):
+        if not hasattr(self, "_batch_dml_response_configs"):
+            self._batch_dml_response_configs = []
+        self._batch_dml_response_configs.append(
+            BatchDmlResponseConfig(
+                status=status, include_transaction_id=include_transaction_id
+            )
+        )
+
     def CreateSession(self, request, context):
         self._requests.append(request)
         return self.__create_session(request.database, request.session)
@@ -176,6 +207,14 @@ def ExecuteBatchDml(self, request, context):
         self.mock_spanner.pop_error(context)
         response = spanner.ExecuteBatchDmlResponse()
         started_transaction = self.__maybe_create_transaction(request)
+
+        config = None
+        if (
+            hasattr(self, "_batch_dml_response_configs")
+            and self._batch_dml_response_configs
+        ):
+            config = self._batch_dml_response_configs.pop(0)
+
         first = True
         for statement in request.statements:
             result = self.mock_spanner.get_result(statement.sql)
@@ -184,8 +223,16 @@ def ExecuteBatchDml(self, request, context):
                     self.mock_spanner.get_result(statement.sql)
                 )
                 result.metadata = result_set.ResultSetMetadata(result.metadata)
-                result.metadata.transaction = started_transaction
+                if config is None or config.include_transaction_id:
+                    result.metadata.transaction = started_transaction
+                first = False
             response.result_sets.append(result)
+
+        if config is not None:
+            response.status.CopyFrom(config.status)
+        else:
+            response.status.code = OK
+
         return response
 
     def Read(self, request, context):
diff --git a/google/cloud/spanner_v1/transaction.py b/google/cloud/spanner_v1/transaction.py
@@ -214,6 +214,12 @@ def wrapped_method(*args, **kwargs):
 
         self.rolled_back = True
 
+    def _reset_and_begin(self):
+        """This function can be used to reset the transaction and execute an explicit BeginTransaction RPC if the first statement in the transaction failed, and that statement included an inlined BeginTransaction option."""
+        self._read_request_count = 0
+        self._execute_sql_request_count = 0
+        self.begin()
+
     def commit(
         self, return_commit_stats=False, request_options=None, max_commit_delay=None
     ):
diff --git a/tests/mockserver_tests/mock_server_test_base.py b/tests/mockserver_tests/mock_server_test_base.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import os
 import unittest
 
 import grpc
@@ -65,6 +66,19 @@ def aborted_status() -> _Status:
     return status
 
 
+def invalid_argument_status() -> _Status:
+    error = status_pb2.Status(
+        code=code_pb2.INVALID_ARGUMENT,
+        message="Invalid argument.",
+    )
+    status = _Status(
+        code=code_to_grpc_status_code(error.code),
+        details=error.message,
+        trailing_metadata=(("grpc-status-details-bin", error.SerializeToString()),),
+    )
+    return status
+
+
 def _make_partial_result_sets(
     fields: list[tuple[str, TypeCode]], results: list[dict]
 ) -> list[result_set.PartialResultSet]:
@@ -174,6 +188,9 @@ class MockServerTestBase(unittest.TestCase):
 
     def __init__(self, *args, **kwargs):
         super(MockServerTestBase, self).__init__(*args, **kwargs)
+        # Disable built-in metrics for tests to avoid Unauthenticated errors
+        os.environ["SPANNER_DISABLE_BUILTIN_METRICS"] = "true"
+
         self._client = None
         self._instance = None
         self._database = None
diff --git a/tests/mockserver_tests/test_dbapi_inline_begin.py b/tests/mockserver_tests/test_dbapi_inline_begin.py
diff --git a/tests/mockserver_tests/test_dbapi_isolation_level.py b/tests/mockserver_tests/test_dbapi_isolation_level.py