diff --git a/dm/pkg/election/election.go b/dm/pkg/election/election.go index c8b3b7607b..484ca4c32e 100644 --- a/dm/pkg/election/election.go +++ b/dm/pkg/election/election.go @@ -482,10 +482,15 @@ forLoop: } } - // add more options if needed. - // NOTE: I think use the client's context is better than something like `concurrency.WithContext(ctx)`, - // so we can close the session when the client is still valid. - session, err = concurrency.NewSession(e.cli, concurrency.WithTTL(e.sessionTTL)) + // Bind the session ctx to the election ctx so Session.Close()'s Revoke + // call aborts when the election is closed, instead of waiting out the + // lease TTL on a dying etcd cluster. Without this, when all dm-masters + // receive SIGHUP together and tear down concurrently (ha_cases + // cleanup), quorum vanishes mid-Revoke and each non-leader blocks up + // to sessionTTL (60s) in session.Close(), exceeding the cleanup + // deadline. On normal session expiry (line ~232), the lease is + // already gone on the server side so Revoke returns quickly anyway. + session, err = concurrency.NewSession(e.cli, concurrency.WithTTL(e.sessionTTL), concurrency.WithContext(ctx)) if err == nil || errors.Cause(err) == e.cli.Ctx().Err() { break forLoop } diff --git a/dm/tests/lightning_mode/run.sh b/dm/tests/lightning_mode/run.sh index d414ef7937..dccff345e6 100755 --- a/dm/tests/lightning_mode/run.sh +++ b/dm/tests/lightning_mode/run.sh @@ -51,6 +51,43 @@ function run() { run_dm_master_info_log $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT + # Pre-create Lightning's internal metadata tables. Each dm-worker spawns its + # own Lightning instance with the physical backend, and they race to + # bootstrap `lightning_metadata.task_meta_v2` / `table_meta` concurrently. + # TiDB's DDL schema cache can lag between the concurrent `CREATE TABLE IF + # NOT EXISTS` and the follow-up `SELECT`, yielding `Error 1146: Table ... + # doesn't exist` on one worker. Creating the tables ahead of time removes + # the race. The schemas mirror + # br/pkg/lightning/importer/import.go:CreateTaskMetaTable / CreateTableMetadataTable. + run_sql_tidb "CREATE DATABASE IF NOT EXISTS lightning_metadata;" + run_sql_tidb "CREATE TABLE IF NOT EXISTS lightning_metadata.task_meta_v2 ( + task_id BIGINT(20) UNSIGNED NOT NULL, + pd_cfgs VARCHAR(2048) NOT NULL DEFAULT '', + status VARCHAR(32) NOT NULL, + state TINYINT(1) NOT NULL DEFAULT 0 COMMENT '0: normal, 1: exited before finish', + tikv_source_bytes BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + tiflash_source_bytes BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + tikv_avail BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + tiflash_avail BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + PRIMARY KEY (task_id) + );" + run_sql_tidb "CREATE TABLE IF NOT EXISTS lightning_metadata.table_meta ( + task_id BIGINT(20) UNSIGNED, + table_id BIGINT(64) NOT NULL, + table_name VARCHAR(64) NOT NULL, + row_id_base BIGINT(20) NOT NULL DEFAULT 0, + row_id_max BIGINT(20) NOT NULL DEFAULT 0, + total_kvs_base BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + total_bytes_base BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + checksum_base BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + total_kvs BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + total_bytes BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + checksum BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + status VARCHAR(32) NOT NULL, + has_duplicates BOOL NOT NULL DEFAULT 0, + PRIMARY KEY (table_id, task_id) + );" + # start DM task dmctl_start_task "$cur/conf/dm-task-dup.yaml" "--remove-meta" run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \