From 2af5690a61115009a3274ea679f34348958f779a Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 22 Sep 2022 21:14:24 +0800 Subject: [PATCH 01/49] add chaos test --- .github/workflows/dataflow_engine_chaos.yaml | 59 +++- dm/simulator/{internal => }/config/config.go | 36 ++ dm/simulator/config/config_test.go | 52 +++ dm/simulator/{internal => }/mcp/errors.go | 0 dm/simulator/{internal => }/mcp/mcp.go | 0 dm/simulator/{internal => }/mcp/mcp_test.go | 0 dm/simulator/{internal => }/mcp/uk.go | 19 + dm/simulator/{internal => }/mcp/uk_test.go | 0 dm/simulator/{internal => }/sqlgen/errors.go | 0 dm/simulator/{internal => }/sqlgen/impl.go | 8 +- .../{internal => }/sqlgen/impl_test.go | 4 +- dm/simulator/{internal => }/sqlgen/sqlgen.go | 6 +- engine/chaos/cases/case_dm_job.go | 43 +++ engine/chaos/cases/cases.go | 2 +- engine/chaos/cases/conf/dmjob.yaml | 31 ++ engine/chaos/cases/config.go | 3 + engine/chaos/cases/dm/case.go | 326 ++++++++++++++++++ engine/chaos/cases/dm/db.go | 106 ++++++ engine/chaos/cases/main.go | 2 +- engine/chaos/manifests/Dockerfile | 1 + engine/chaos/manifests/cases.yaml | 1 + 21 files changed, 690 insertions(+), 9 deletions(-) rename dm/simulator/{internal => }/config/config.go (59%) create mode 100644 dm/simulator/config/config_test.go rename dm/simulator/{internal => }/mcp/errors.go (100%) rename dm/simulator/{internal => }/mcp/mcp.go (100%) rename dm/simulator/{internal => }/mcp/mcp_test.go (100%) rename dm/simulator/{internal => }/mcp/uk.go (91%) rename dm/simulator/{internal => }/mcp/uk_test.go (100%) rename dm/simulator/{internal => }/sqlgen/errors.go (100%) rename dm/simulator/{internal => }/sqlgen/impl.go (97%) rename dm/simulator/{internal => }/sqlgen/impl_test.go (98%) rename dm/simulator/{internal => }/sqlgen/sqlgen.go (89%) create mode 100644 engine/chaos/cases/case_dm_job.go create mode 100644 engine/chaos/cases/conf/dmjob.yaml create mode 100644 engine/chaos/cases/dm/case.go create mode 100644 engine/chaos/cases/dm/db.go diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 621b3c1372..5b1f907e74 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -71,7 +71,9 @@ jobs: helm version - name: Build dataflow engine binary - run: make tiflow tiflow-chaos-case + run: | + make tiflow tiflow-chaos-case + cp -r $GITHUB_WORKSPACE/engine/chaos/cases/conf/ $GITHUB_WORKSPACE/bin/engine-conf - name: Build Dataflow engine docker image run: | @@ -188,6 +190,61 @@ jobs: kubectl logs chaos-executor-2 -p || true kubectl logs chaos-executor-0 -c wait-server-master || true + + # Set up upstream instances + - name: Set up sources + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + - name: Wait for sources ready # kubectl wait --all not working + run: | + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true + sleep 10 + echo show pvc + kubectl get pvc -l app=sources -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=sources -o wide + echo show sts + kubectl get sts -l app=sources -o wide + echo show po + kubectl get po -l app=sources -o wide + echo describe po + kubectl describe po -l app=sources + echo 
describe pvc + kubectl describe pvc -l app=sources + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s + + # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) + - name: Set up TiDB + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + - name: Wait for TiDB ready + run: | + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=300s || true + echo show pvc + kubectl get pvc -l app=tidb -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=tidb -o wide + echo show sts + kubectl get sts -l app=tidb -o wide + echo show po + kubectl get po -l app=tidb -o wide + echo describe po + kubectl describe po -l app=tidb + echo describe pvc + kubectl describe pvc -l app=tidb + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s - name: Set up chaos test cases run: | diff --git a/dm/simulator/internal/config/config.go b/dm/simulator/config/config.go similarity index 59% rename from dm/simulator/internal/config/config.go rename to dm/simulator/config/config.go index 9f8508d21f..b2f6b61912 100644 --- a/dm/simulator/internal/config/config.go +++ b/dm/simulator/config/config.go @@ -14,6 +14,13 @@ // Package config is the configuration definitions used by the simulator. package config +import ( + "strconv" + "strings" + + "github.com/pingcap/tidb/util/dbutil" +) + // TableConfig is the sub config for describing a simulating table in the data source. type TableConfig struct { TableID string `yaml:"id"` @@ -29,3 +36,32 @@ type ColumnDefinition struct { DataType string `yaml:"type"` DataLen int `yaml:"length"` } + +func (t *TableConfig) GenCreateTable() string { + var buf strings.Builder + buf.WriteString("CREATE TABLE ") + buf.WriteString(dbutil.TableName(t.DatabaseName, t.TableName)) + buf.WriteByte('(') + for i, col := range t.Columns { + if i != 0 { + buf.WriteByte(',') + } + buf.WriteString(dbutil.ColumnName(col.ColumnName)) + buf.WriteByte(' ') + buf.WriteString(col.DataType) + if col.DataLen > 0 { + buf.WriteByte('(') + buf.WriteString(strconv.Itoa(col.DataLen)) + buf.WriteByte(')') + } + } + for _, ukColName := range t.UniqueKeyColumnNames { + buf.WriteString(",UNIQUE KEY ") + buf.WriteString(dbutil.ColumnName(ukColName)) + buf.WriteString("(") + buf.WriteString(dbutil.ColumnName(ukColName)) + buf.WriteByte(')') + } + buf.WriteByte(')') + return buf.String() +} diff --git a/dm/simulator/config/config_test.go b/dm/simulator/config/config_test.go new file mode 100644 index 0000000000..f8b6443710 --- /dev/null +++ b/dm/simulator/config/config_test.go @@ -0,0 +1,52 @@ +// Copyright 2022 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package config is the configuration definitions used by the simulator. 
+package config + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestConfig(t *testing.T) { + tableConfig := &TableConfig{ + DatabaseName: "games", + TableName: "members", + Columns: []*ColumnDefinition{ + { + ColumnName: "id", + DataType: "int", + DataLen: 11, + }, + { + ColumnName: "name", + DataType: "varchar", + DataLen: 255, + }, + { + ColumnName: "age", + DataType: "int", + DataLen: 11, + }, + { + ColumnName: "team_id", + DataType: "int", + DataLen: 11, + }, + }, + UniqueKeyColumnNames: []string{"id"}, + } + require.Equal(t, "CREATE TABLE `games`.`members`(`id` int(11),`name` varchar(255),`age` int(11),`team_id` int(11),UNIQUE KEY `id`(`id`))", tableConfig.GenCreateTable()) +} diff --git a/dm/simulator/internal/mcp/errors.go b/dm/simulator/mcp/errors.go similarity index 100% rename from dm/simulator/internal/mcp/errors.go rename to dm/simulator/mcp/errors.go diff --git a/dm/simulator/internal/mcp/mcp.go b/dm/simulator/mcp/mcp.go similarity index 100% rename from dm/simulator/internal/mcp/mcp.go rename to dm/simulator/mcp/mcp.go diff --git a/dm/simulator/internal/mcp/mcp_test.go b/dm/simulator/mcp/mcp_test.go similarity index 100% rename from dm/simulator/internal/mcp/mcp_test.go rename to dm/simulator/mcp/mcp_test.go diff --git a/dm/simulator/internal/mcp/uk.go b/dm/simulator/mcp/uk.go similarity index 91% rename from dm/simulator/internal/mcp/uk.go rename to dm/simulator/mcp/uk.go index ce7ee72ef5..210fcd65fa 100644 --- a/dm/simulator/internal/mcp/uk.go +++ b/dm/simulator/mcp/uk.go @@ -15,6 +15,7 @@ package mcp import ( "fmt" + "sort" "strings" "sync" ) @@ -74,6 +75,24 @@ func (uk *UniqueKey) GetValue() map[string]interface{} { return result } +// GetValueHash return hash for values +func (uk *UniqueKey) GetValueHash() string { + uk.RLock() + defer uk.RUnlock() + + keys := make([]string, 0) + for k := range uk.value { + keys = append(keys, k) + } + sort.Strings(keys) + var b strings.Builder + for _, k := range keys { + v := uk.value[k] + fmt.Fprintf(&b, "%s = %v; ", k, v) + } + return b.String() +} + // SetValue sets the UK value map. // The input values are cloned into the UK, // and further modifications on the input map won't affect the values inside the UK. 
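
The sorted key iteration in GetValueHash above makes the serialized value deterministic: Go map iteration order is randomized, so without the sort two unique keys holding identical column/value pairs could hash differently. A minimal sketch of the property the chaos case later in this series relies on (the mcp.NewUniqueKey constructor and its signature are assumed here for illustration):

	uk1 := mcp.NewUniqueKey(-1, map[string]interface{}{"id": 1, "name": "a"})
	uk2 := mcp.NewUniqueKey(-1, map[string]interface{}{"name": "a", "id": 1})
	// Identical pairs serialize identically, so a map keyed on the hash
	// behaves as a row-identity set.
	seen := map[string]struct{}{uk1.GetValueHash(): {}}
	_, isDup := seen[uk2.GetValueHash()]
	fmt.Println(isDup) // true
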
diff --git a/dm/simulator/internal/mcp/uk_test.go b/dm/simulator/mcp/uk_test.go similarity index 100% rename from dm/simulator/internal/mcp/uk_test.go rename to dm/simulator/mcp/uk_test.go diff --git a/dm/simulator/internal/sqlgen/errors.go b/dm/simulator/sqlgen/errors.go similarity index 100% rename from dm/simulator/internal/sqlgen/errors.go rename to dm/simulator/sqlgen/errors.go diff --git a/dm/simulator/internal/sqlgen/impl.go b/dm/simulator/sqlgen/impl.go similarity index 97% rename from dm/simulator/internal/sqlgen/impl.go rename to dm/simulator/sqlgen/impl.go index 20f51c4de8..c908ae3130 100644 --- a/dm/simulator/internal/sqlgen/impl.go +++ b/dm/simulator/sqlgen/impl.go @@ -26,8 +26,8 @@ import ( "go.uber.org/zap" "github.com/pingcap/tiflow/dm/pkg/log" - "github.com/pingcap/tiflow/dm/simulator/internal/config" - "github.com/pingcap/tiflow/dm/simulator/internal/mcp" + "github.com/pingcap/tiflow/dm/simulator/config" + "github.com/pingcap/tiflow/dm/simulator/mcp" ) type sqlGeneratorImpl struct { @@ -270,3 +270,7 @@ func (g *sqlGeneratorImpl) GenLoadUniqueKeySQL() (string, []*config.ColumnDefini } return sql, cols, nil } + +func (g *sqlGeneratorImpl) GenCreateTable() string { + return g.tableConfig.GenCreateTable() +} diff --git a/dm/simulator/internal/sqlgen/impl_test.go b/dm/simulator/sqlgen/impl_test.go similarity index 98% rename from dm/simulator/internal/sqlgen/impl_test.go rename to dm/simulator/sqlgen/impl_test.go index 363a77a886..26ac429339 100644 --- a/dm/simulator/internal/sqlgen/impl_test.go +++ b/dm/simulator/sqlgen/impl_test.go @@ -23,8 +23,8 @@ import ( "github.com/stretchr/testify/suite" "github.com/pingcap/tiflow/dm/pkg/log" - "github.com/pingcap/tiflow/dm/simulator/internal/config" - "github.com/pingcap/tiflow/dm/simulator/internal/mcp" + "github.com/pingcap/tiflow/dm/simulator/config" + "github.com/pingcap/tiflow/dm/simulator/mcp" ) type testSQLGenImplSuite struct { diff --git a/dm/simulator/internal/sqlgen/sqlgen.go b/dm/simulator/sqlgen/sqlgen.go similarity index 89% rename from dm/simulator/internal/sqlgen/sqlgen.go rename to dm/simulator/sqlgen/sqlgen.go index 68dd0dd5c5..83b4533076 100644 --- a/dm/simulator/internal/sqlgen/sqlgen.go +++ b/dm/simulator/sqlgen/sqlgen.go @@ -15,8 +15,8 @@ package sqlgen import ( - "github.com/pingcap/tiflow/dm/simulator/internal/config" - "github.com/pingcap/tiflow/dm/simulator/internal/mcp" + "github.com/pingcap/tiflow/dm/simulator/config" + "github.com/pingcap/tiflow/dm/simulator/mcp" ) // SQLGenerator contains all the operations for generating SQLs. @@ -35,4 +35,6 @@ type SQLGenerator interface { GenUpdateRow(*mcp.UniqueKey) (string, error) // GenDeleteRow generates a DELETE SQL for the given unique key. GenDeleteRow(*mcp.UniqueKey) (string, error) + // GenCreateTable generates a CreateTable SQL by table config. + GenCreateTable() string } diff --git a/engine/chaos/cases/case_dm_job.go b/engine/chaos/cases/case_dm_job.go new file mode 100644 index 0000000000..ab5ad5597b --- /dev/null +++ b/engine/chaos/cases/case_dm_job.go @@ -0,0 +1,43 @@ +// Copyright 2022 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "path/filepath" + + dmchaos "github.com/pingcap/tiflow/engine/chaos/cases/dm" + "golang.org/x/sync/errgroup" +) + +var ( + filenames = []string{"dmjob.yaml"} + doSchemas = []string{"dm_engine_chaos"} +) + +func runDMJobCases(ctx context.Context, cfg *config) error { + eg, ctx2 := errgroup.WithContext(ctx) + for _, f := range filenames { + file := f + eg.Go(func() error { + testCase, err := dmchaos.NewCase(ctx2, cfg.Addr, "dmjob", filepath.Join(cfg.ConfigDir, file)) + if err != nil { + return err + } + + return testCase.Run(ctx2) + }) + } + return eg.Wait() +} diff --git a/engine/chaos/cases/cases.go b/engine/chaos/cases/cases.go index fda6b006b9..f1b692189f 100644 --- a/engine/chaos/cases/cases.go +++ b/engine/chaos/cases/cases.go @@ -21,7 +21,7 @@ import ( type caseFn func(context.Context, *config) error -var cases = []caseFn{runFakeJobCase} +var cases = []caseFn{runDMJobCases} func runCases(ctx context.Context, cfg *config) error { errg, ctx := errgroup.WithContext(ctx) diff --git a/engine/chaos/cases/conf/dmjob.yaml b/engine/chaos/cases/conf/dmjob.yaml new file mode 100644 index 0000000000..0abe3ec605 --- /dev/null +++ b/engine/chaos/cases/conf/dmjob.yaml @@ -0,0 +1,31 @@ +task-mode: all +target-database: + host: tidb-0.tidb + port: 4000 + user: root + password: "" +upstreams: + - db-config: + host: "mysql57-0.sources" + port: 3306 + user: root + password: "" + source-id: replica-01 + block-allow-list: balist-01 + - db-config: + host: "mysql8-0.sources" + port: 3306 + user: root + password: "" + source-id: replica-02 + block-allow-list: balist-01 + - db-config: + host: "mariadb-0.sources" + port: 3306 + user: root + password: "" + source-id: replica-03 + block-allow-list: balist-01 +block-allow-list: + balist-01: + do-dbs: ["dmjob"] \ No newline at end of file diff --git a/engine/chaos/cases/config.go b/engine/chaos/cases/config.go index 9c0eb6f718..5985f20c26 100644 --- a/engine/chaos/cases/config.go +++ b/engine/chaos/cases/config.go @@ -30,6 +30,8 @@ type config struct { MasterCount int `toml:"master-count" yaml:"master-count" json:"master-count"` WorkerCount int `toml:"worker-count" yaml:"worker-count" json:"worker-count"` + + ConfigDir string `toml:"config-dir" yaml:"config-dir" json:"config-dir"` } // newConfig creates a config for this chaos testing suite. @@ -46,6 +48,7 @@ func newConfig() *config { fs.IntVar(&cfg.MasterCount, "master-count", 3, "expect count of server-master") fs.IntVar(&cfg.WorkerCount, "worker-count", 4, "expect count of executor") + fs.StringVar(&cfg.ConfigDir, "config-dir", "/", "path of the source and task config files") return cfg } diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go new file mode 100644 index 0000000000..817a74e47f --- /dev/null +++ b/engine/chaos/cases/dm/case.go @@ -0,0 +1,326 @@ +// Copyright 2022 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package dm + +import ( + "context" + "errors" + "fmt" + "math/rand" + "os" + "strings" + "time" + + "github.com/pingcap/log" + "github.com/pingcap/tidb/util/dbutil" + sqlconfig "github.com/pingcap/tiflow/dm/simulator/config" + "github.com/pingcap/tiflow/dm/simulator/mcp" + sqlgen "github.com/pingcap/tiflow/dm/simulator/sqlgen" + pb "github.com/pingcap/tiflow/engine/enginepb" + "github.com/pingcap/tiflow/engine/jobmaster/dm/config" + "github.com/pingcap/tiflow/engine/test/e2e" + "go.uber.org/zap" +) + +const ( + tableNum = 5 + rowNum = 1000 + batch = 100 + diffTimes = 10 + diffInterval = 10 * time.Second +) + +// Case is a data migration Case test case with one or more sources. +type Case struct { + ctx context.Context + + addr string + cfgBytes []byte + sources []*dbConn + target *dbConn + jobID string + name string + + // source -> table -> mcp + mcps []map[string]*mcp.ModificationCandidatePool + // source -> table -> generator + generators []map[string]sqlgen.SQLGenerator + // table -> key -> struct{} + keySet map[string]map[string]struct{} + + result []int +} + +// NewCase creates a new test case. +func NewCase(ctx context.Context, addr string, name string, cfgPath string) (*Case, error) { + cfgBytes, err := os.ReadFile(cfgPath) + if err != nil { + return nil, err + } + + var jobCfg config.JobCfg + if err := jobCfg.Decode(cfgBytes); err != nil { + return nil, err + } + + c := &Case{ + ctx: ctx, + sources: make([]*dbConn, 0, len(jobCfg.Upstreams)), + cfgBytes: cfgBytes, + addr: addr, + name: name, + mcps: make([]map[string]*mcp.ModificationCandidatePool, 0, 3), + generators: make([]map[string]sqlgen.SQLGenerator, 0, 3), + keySet: make(map[string]map[string]struct{}, tableNum), + result: make([]int, 3), + } + for _, upstream := range jobCfg.Upstreams { + source, err := newDBConn(ctx, upstream.DBCfg, name) + if err != nil { + return nil, err + } + c.sources = append(c.sources, source) + } + target, err := newDBConn(ctx, jobCfg.TargetDB, name) + if err != nil { + return nil, err + } + c.target = target + + for range c.sources { + generators := make(map[string]sqlgen.SQLGenerator) + mcps := make(map[string]*mcp.ModificationCandidatePool) + for i := 1; i <= tableNum; i++ { + tableName := fmt.Sprintf("tb%d", i) + tableConfig := &sqlconfig.TableConfig{ + DatabaseName: c.name, + TableName: tableName, + Columns: []*sqlconfig.ColumnDefinition{ + { + ColumnName: "id", + DataType: "int", + DataLen: 11, + }, + { + ColumnName: "name", + DataType: "varchar", + DataLen: 255, + }, + { + ColumnName: "age", + DataType: "int", + DataLen: 11, + }, + { + ColumnName: "team_id", + DataType: "int", + DataLen: 11, + }, + }, + UniqueKeyColumnNames: []string{"id"}, + } + generators[tableName] = sqlgen.NewSQLGeneratorImpl(tableConfig) + mcps[tableName] = mcp.NewModificationCandidatePool(1000000) + c.keySet[tableName] = make(map[string]struct{}) + } + c.generators = append(c.generators, generators) + c.mcps = append(c.mcps, mcps) + } + + return c, nil +} + +// Run runs a test case. 
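+// It submits the DM job via the engine HTTP API, generates and verifies the
+// full-load data, and then alternates between writing batches of random DML
+// and re-checking upstream/downstream consistency until the context is done.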
+func (c *Case) Run(ctx context.Context) error { + defer func() { + log.L().Info("finish run case", zap.Int("insert", c.result[0]), zap.Int("update", c.result[1]), zap.Int("delete", c.result[2])) + }() + if err := c.createJob(ctx); err != nil { + return err + } + if err := c.genFullData(); err != nil { + return err + } + if err := c.diffDataLoop(ctx); err != nil { + return err + } + return c.incrLoop(ctx) +} + +func (c *Case) createJob(ctx context.Context) error { + jobID, err := e2e.CreateJobViaHTTP(ctx, c.addr, "chaos-dm-test", "project-dm", pb.Job_DM, c.cfgBytes) + if err != nil { + return err + } + c.jobID = jobID + return nil +} + +func (c *Case) genFullData() error { + log.L().Info("start generate full data") + for source, generators := range c.generators { + for table, generator := range generators { + if _, err := c.sources[source].ExecuteSQLs("CREATE DATABASE IF NOT EXISTS "+c.name+" CHARSET latin1", "USE "+c.name); err != nil { + return err + } + if _, err := c.sources[source].ExecuteSQLs(generator.GenCreateTable()); err != nil { + return err + } + sqls := make([]string, 0, rowNum) + for j := 0; j < rowNum; j++ { + sql, uk, err := generator.GenInsertRow() + if err != nil { + return err + } + // key already exists + if _, ok := c.keySet[table][uk.GetValueHash()]; ok { + continue + } + if err := c.mcps[source][table].AddUK(uk); err != nil { + return err + } + c.keySet[table][uk.GetValueHash()] = struct{}{} + sqls = append(sqls, sql) + } + if _, err := c.sources[source].ExecuteSQLs(sqls...); err != nil { + return err + } + } + } + return nil +} + +// TODO: use sync-diff-inspector instead. +func (c *Case) diffData(ctx context.Context) (bool, error) { + log.L().Info("start diff data") + for i := 1; i <= tableNum; i++ { + tableName := fmt.Sprintf("tb%d", i) + row := c.target.db.DB.QueryRowContext(ctx, fmt.Sprintf("SELECT count(1) FROM %s", dbutil.TableName(c.target.currDB, tableName))) + if row.Err() != nil { + return false, row.Err() + } + var count int + if err := row.Scan(&count); err != nil { + return false, err + } + var totalCount int + for _, mcps := range c.mcps { + totalCount += mcps[tableName].Len() + } + if count != totalCount { + log.Error("data is not same", zap.Int("downstream", count), zap.Int("upstream", totalCount)) + return false, nil + } + } +} + +func (c *Case) diffDataLoop(ctx context.Context) error { + for i := 0; i < diffTimes; i++ { + select { + case <-ctx.Done(): + return nil + case <-time.After(diffInterval): + if same, err := c.diffData(ctx); err != nil { + if strings.Contains(err.Error(), "not found") { + continue + } + return err + } else if same { + return nil + } + } + } + return errors.New("data is not same") +} + +// randDML generates DML (INSERT, UPDATE or DELETE). 
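+// The verb is drawn uniformly at random. An INSERT is regenerated until its
+// unique key is unused; keySet and the per-table MCP are updated on INSERT
+// and DELETE so the in-memory state mirrors the rows actually sent upstream.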
+func (c *Case) randDML(source int, table string) (string, error) {
+	generator := c.generators[source][table]
+	mcp := c.mcps[source][table]
+	t := rand.Intn(3)
+	c.result[t]++
+	switch t {
+	case 0:
+		sql, uk, err := generator.GenInsertRow()
+		if err != nil {
+			return "", err
+		}
+		// regenerate until the unique key is unused, re-checking the key of
+		// each regenerated row so the loop can terminate
+		for _, ok := c.keySet[table][uk.GetValueHash()]; ok; _, ok = c.keySet[table][uk.GetValueHash()] {
+			sql, uk, err = generator.GenInsertRow()
+			if err != nil {
+				return "", err
+			}
+		}
+		if err := c.mcps[source][table].AddUK(uk); err != nil {
+			return "", err
+		}
+		c.keySet[table][uk.GetValueHash()] = struct{}{}
+		return sql, nil
+	case 1:
+		return generator.GenUpdateRow(mcp.NextUK())
+	default:
+		key := mcp.NextUK()
+		sql, err := generator.GenDeleteRow(key)
+		if err != nil {
+			return "", err
+		}
+		delete(c.keySet[table], key.GetValueHash())
+		err = mcp.DeleteUK(key)
+		return sql, err
+	}
+}
+
+func (c *Case) genIncrData(ctx context.Context) error {
+	log.L().Info("start generate incremental data")
+	for {
+		select {
+		case <-ctx.Done():
+			return nil
+		default:
+		}
+		source := rand.Intn(len(c.sources))
+		tableName := fmt.Sprintf("tb%d", rand.Intn(tableNum)+1)
+
+		sqls := make([]string, 0, batch)
+		for i := 0; i < batch; i++ {
+			sql, err := c.randDML(source, tableName)
+			if err != nil {
+				return err
+			}
+			sqls = append(sqls, sql)
+		}
+		if _, err := c.sources[source].ExecuteSQLs(sqls...); err != nil {
+			return err
+		}
+	}
+}
+
+func (c *Case) incrLoop(ctx context.Context) error {
+	for {
+		select {
+		case <-ctx.Done():
+			return nil
+		default:
+		}
+		ctx2, cancel := context.WithTimeout(ctx, time.Second*10)
+		err := c.genIncrData(ctx2)
+		cancel()
+		if err != nil {
+			return err
+		}
+		if err := c.diffDataLoop(ctx); err != nil {
+			return err
+		}
+	}
+}
diff --git a/engine/chaos/cases/dm/db.go b/engine/chaos/cases/dm/db.go
new file mode 100644
index 0000000000..6f48b9a969
--- /dev/null
+++ b/engine/chaos/cases/dm/db.go
@@ -0,0 +1,106 @@
+// Copyright 2022 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
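
db.go below wraps dm/pkg/conn with chaos-friendly semantics: connection-level failures trigger a reconnect (resetConn re-runs USE to restore the session database) followed by a bounded retry, and duplicate-entry errors are tolerated because a statement may have been applied just before its connection was killed. A rough usage sketch follows; the DBConfig literal and the surrounding function are illustrative, not part of this patch:

	// Apply a batch that should survive a pod kill between statements.
	cfg := &dmconfig.DBConfig{Host: "mysql57-0.sources", Port: 3306, User: "root"}
	dbc, err := newDBConn(ctx, cfg, "dmjob")
	if err != nil {
		return err
	}
	if _, err := dbc.ExecuteSQLs(
		"INSERT INTO tb1 (id, name) VALUES (1, 'a')",
		"UPDATE tb1 SET name = 'b' WHERE id = 1",
	); err != nil {
		return err
	}
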
+ +package dm + +import ( + "context" + "fmt" + "time" + + "github.com/go-sql-driver/mysql" + "github.com/pingcap/errors" + "github.com/pingcap/tidb/errno" + dmconfig "github.com/pingcap/tiflow/dm/config" + "github.com/pingcap/tiflow/dm/pkg/conn" + tcontext "github.com/pingcap/tiflow/dm/pkg/context" + "github.com/pingcap/tiflow/dm/pkg/log" + "github.com/pingcap/tiflow/dm/pkg/retry" + "go.uber.org/zap" +) + +type dbConn struct { + db *conn.BaseDB + con *conn.BaseConn + currDB string +} + +func newDBConn(ctx context.Context, cfg *dmconfig.DBConfig, currDB string) (*dbConn, error) { + db, err := conn.DefaultDBProvider.Apply(cfg) + if err != nil { + return nil, err + } + con, err := db.GetBaseConn(ctx) + if err != nil { + return nil, err + } + + return &dbConn{ + db: db, + con: con, + currDB: currDB, + }, nil +} + +func (dc *dbConn) resetConn(ctx context.Context) error { + err := dc.db.CloseBaseConn(dc.con) + if err != nil { + log.L().Warn("fail to close connection", zap.Error(err)) + } + dc.con, err = dc.db.GetBaseConn(ctx) + if err != nil { + return err + } + _, err = dc.con.ExecuteSQL(tcontext.NewContext(ctx, log.L()), nil, "chaos-cases", []string{fmt.Sprintf("USE %s", dc.currDB)}) + return err +} + +func ignoreExecSQLError(err error) bool { + err = errors.Cause(err) // check the original error + mysqlErr, ok := err.(*mysql.MySQLError) + if !ok { + return false + } + + switch mysqlErr.Number { + case errno.ErrDupEntry: // HACK: we tolerate `invalid connection`, then `Duplicate entry` may be reported. + return true + default: + return false + } +} + +func (dc *dbConn) ExecuteSQLs(queries ...string) (int, error) { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + params := retry.Params{ + RetryCount: 3, + FirstRetryDuration: time.Second, + BackoffStrategy: retry.Stable, + IsRetryableFn: func(_ int, err error) bool { + if retry.IsConnectionError(err) { + // HACK: for some errors like `invalid connection`, `sql: connection is already closed`, we can ignore them just for testing. 
+ err = dc.resetConn(ctx) + return err == nil + } + return false + }, + } + + ret, _, err := dc.con.ApplyRetryStrategy(tcontext.NewContext(ctx, log.L()), params, + func(tctx *tcontext.Context) (interface{}, error) { + ret, err2 := dc.con.ExecuteSQLWithIgnoreError(tctx, nil, "chaos-cases", ignoreExecSQLError, queries) + return ret, err2 + }) + return ret.(int), err +} diff --git a/engine/chaos/cases/main.go b/engine/chaos/cases/main.go index 097888b300..0f302f62c5 100644 --- a/engine/chaos/cases/main.go +++ b/engine/chaos/cases/main.go @@ -66,7 +66,7 @@ func main() { rand.Seed(time.Now().UnixNano()) - ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), cfg.Duration) defer cancel() sc := make(chan os.Signal, 1) diff --git a/engine/chaos/manifests/Dockerfile b/engine/chaos/manifests/Dockerfile index 7bd4f31f3e..b8976de2e2 100644 --- a/engine/chaos/manifests/Dockerfile +++ b/engine/chaos/manifests/Dockerfile @@ -2,6 +2,7 @@ FROM alpine:3.14 ADD tiflow /tiflow ADD tiflow-chaos-case /tiflow-chaos-case +ADD engine-conf /engine-conf RUN chmod a+x /tiflow /tiflow-chaos-case diff --git a/engine/chaos/manifests/cases.yaml b/engine/chaos/manifests/cases.yaml index e5fed30804..4cfa08ccc5 100644 --- a/engine/chaos/manifests/cases.yaml +++ b/engine/chaos/manifests/cases.yaml @@ -11,6 +11,7 @@ spec: imagePullPolicy: IfNotPresent command: - "/tiflow-chaos-case" + - "--config-dir=/engine-conf" - "--duration=20m" restartPolicy: Never backoffLimit: 0 # fail immediately From 341e0e76e561025753a33eea6f4c86ae4ee30ed7 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 26 Sep 2022 14:10:35 +0800 Subject: [PATCH 02/49] fix fmt --- engine/chaos/cases/dm/case.go | 1 + 1 file changed, 1 insertion(+) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 817a74e47f..2ebb560b19 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -223,6 +223,7 @@ func (c *Case) diffData(ctx context.Context) (bool, error) { return false, nil } } + return true, nil } func (c *Case) diffDataLoop(ctx context.Context) error { From deb7b1e22a9c19f8d8af358940a4408515005ce8 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 14:40:36 +0800 Subject: [PATCH 03/49] update --- engine/chaos/cases/case_dm_job.go | 5 +- engine/chaos/cases/dm/case.go | 106 ++++++++++++++++++++++++------ 2 files changed, 89 insertions(+), 22 deletions(-) diff --git a/engine/chaos/cases/case_dm_job.go b/engine/chaos/cases/case_dm_job.go index ab5ad5597b..c6d10c36fe 100644 --- a/engine/chaos/cases/case_dm_job.go +++ b/engine/chaos/cases/case_dm_job.go @@ -22,8 +22,7 @@ import ( ) var ( - filenames = []string{"dmjob.yaml"} - doSchemas = []string{"dm_engine_chaos"} + filenames = []string{"dmjob"} ) func runDMJobCases(ctx context.Context, cfg *config) error { @@ -31,7 +30,7 @@ func runDMJobCases(ctx context.Context, cfg *config) error { for _, f := range filenames { file := f eg.Go(func() error { - testCase, err := dmchaos.NewCase(ctx2, cfg.Addr, "dmjob", filepath.Join(cfg.ConfigDir, file)) + testCase, err := dmchaos.NewCase(ctx2, cfg.Addr, file, filepath.Join(cfg.ConfigDir, file+".yaml")) if err != nil { return err } diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 2ebb560b19..2bc62d800b 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -15,14 +15,17 @@ package dm import ( "context" - "errors" + "database/sql" "fmt" "math/rand" "os" - "strings" "time" + "github.com/go-sql-driver/mysql" 
+ "github.com/pingcap/errors" "github.com/pingcap/log" + "github.com/pingcap/tidb-tools/pkg/diff" + "github.com/pingcap/tidb/errno" "github.com/pingcap/tidb/util/dbutil" sqlconfig "github.com/pingcap/tiflow/dm/simulator/config" "github.com/pingcap/tiflow/dm/simulator/mcp" @@ -37,18 +40,17 @@ const ( tableNum = 5 rowNum = 1000 batch = 100 - diffTimes = 10 - diffInterval = 10 * time.Second + diffTimes = 60 + diffInterval = 2 * time.Second ) // Case is a data migration Case test case with one or more sources. type Case struct { - ctx context.Context - addr string cfgBytes []byte sources []*dbConn target *dbConn + tables []string jobID string name string @@ -75,7 +77,6 @@ func NewCase(ctx context.Context, addr string, name string, cfgPath string) (*Ca } c := &Case{ - ctx: ctx, sources: make([]*dbConn, 0, len(jobCfg.Upstreams)), cfgBytes: cfgBytes, addr: addr, @@ -98,10 +99,11 @@ func NewCase(ctx context.Context, addr string, name string, cfgPath string) (*Ca } c.target = target + // init table config for range c.sources { generators := make(map[string]sqlgen.SQLGenerator) mcps := make(map[string]*mcp.ModificationCandidatePool) - for i := 1; i <= tableNum; i++ { + for i := 0; i < tableNum; i++ { tableName := fmt.Sprintf("tb%d", i) tableConfig := &sqlconfig.TableConfig{ DatabaseName: c.name, @@ -133,6 +135,7 @@ func NewCase(ctx context.Context, addr string, name string, cfgPath string) (*Ca generators[tableName] = sqlgen.NewSQLGeneratorImpl(tableConfig) mcps[tableName] = mcp.NewModificationCandidatePool(1000000) c.keySet[tableName] = make(map[string]struct{}) + c.tables = append(c.tables, tableName) } c.generators = append(c.generators, generators) c.mcps = append(c.mcps, mcps) @@ -144,7 +147,7 @@ func NewCase(ctx context.Context, addr string, name string, cfgPath string) (*Ca // Run runs a test case. func (c *Case) Run(ctx context.Context) error { defer func() { - log.L().Info("finish run case", zap.Int("insert", c.result[0]), zap.Int("update", c.result[1]), zap.Int("delete", c.result[2])) + log.L().Info("finish run case", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Int("insert", c.result[0]), zap.Int("update", c.result[1]), zap.Int("delete", c.result[2])) }() if err := c.createJob(ctx); err != nil { return err @@ -168,7 +171,7 @@ func (c *Case) createJob(ctx context.Context) error { } func (c *Case) genFullData() error { - log.L().Info("start generate full data") + log.L().Info("start generate full data", zap.String("name", c.name), zap.String("job_id", c.jobID)) for source, generators := range c.generators { for table, generator := range generators { if _, err := c.sources[source].ExecuteSQLs("CREATE DATABASE IF NOT EXISTS "+c.name+" CHARSET latin1", "USE "+c.name); err != nil { @@ -201,11 +204,9 @@ func (c *Case) genFullData() error { return nil } -// TODO: use sync-diff-inspector instead. 
func (c *Case) diffData(ctx context.Context) (bool, error) { - log.L().Info("start diff data") - for i := 1; i <= tableNum; i++ { - tableName := fmt.Sprintf("tb%d", i) + log.L().Info("start diff data", zap.String("name", c.name), zap.String("job_id", c.jobID)) + for _, tableName := range c.tables { row := c.target.db.DB.QueryRowContext(ctx, fmt.Sprintf("SELECT count(1) FROM %s", dbutil.TableName(c.target.currDB, tableName))) if row.Err() != nil { return false, row.Err() @@ -219,7 +220,7 @@ func (c *Case) diffData(ctx context.Context) (bool, error) { totalCount += mcps[tableName].Len() } if count != totalCount { - log.Error("data is not same", zap.Int("downstream", count), zap.Int("upstream", totalCount)) + log.Error("data is not same", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Int("downstream", count), zap.Int("upstream", totalCount)) return false, nil } } @@ -233,7 +234,7 @@ func (c *Case) diffDataLoop(ctx context.Context) error { return nil case <-time.After(diffInterval): if same, err := c.diffData(ctx); err != nil { - if strings.Contains(err.Error(), "not found") { + if ignoreErrNoSuchTable(err) { continue } return err @@ -242,7 +243,11 @@ func (c *Case) diffDataLoop(ctx context.Context) error { } } } - return errors.New("data is not same") + sourceDBs := make([]*sql.DB, 0, len(c.sources)) + for _, s := range c.sources { + sourceDBs = append(sourceDBs, s.db.DB) + } + return syncDiffInspector(ctx, c.name, c.tables, c.target.db.DB, sourceDBs...) } // randDML generates DML (INSERT, UPDATE or DELETE). @@ -283,7 +288,7 @@ func (c *Case) randDML(source int, table string) (string, error) { } func (c *Case) genIncrData(ctx context.Context) error { - log.L().Info("start generate incremental data") + log.L().Info("start generate incremental data", zap.String("name", c.name), zap.String("job_id", c.jobID)) for { select { case <-ctx.Done(): @@ -291,7 +296,7 @@ func (c *Case) genIncrData(ctx context.Context) error { default: } source := rand.Intn(len(c.sources)) - tableName := fmt.Sprintf("tb%d", rand.Intn(tableNum)+1) + tableName := c.tables[rand.Intn(tableNum)] sqls := make([]string, 0, batch) for i := 0; i < batch; i++ { @@ -325,3 +330,66 @@ func (c *Case) incrLoop(ctx context.Context) error { } } } + +func ignoreErrNoSuchTable(err error) bool { + err = errors.Cause(err) + mysqlErr, ok := err.(*mysql.MySQLError) + if !ok { + return false + } + + switch mysqlErr.Number { + case errno.ErrNoSuchTable: + return true + default: + return false + } +} + +func syncDiffInspector(ctx context.Context, schema string, tables []string, targetDB *sql.DB, sourceDBs ...*sql.DB) error { + for _, table := range tables { + sourceTables := make([]*diff.TableInstance, 0, len(sourceDBs)) + for i, sourceDB := range sourceDBs { + sourceTables = append(sourceTables, &diff.TableInstance{ + Conn: sourceDB, + Schema: schema, + Table: table, + InstanceID: fmt.Sprintf("source-%d", i), + }) + } + + targetTable := &diff.TableInstance{ + Conn: targetDB, + Schema: schema, + Table: table, + InstanceID: "target", + } + + td := &diff.TableDiff{ + SourceTables: sourceTables, + TargetTable: targetTable, + ChunkSize: 1000, + Sample: 100, + CheckThreadCount: 1, + UseChecksum: true, + TiDBStatsSource: targetTable, + CpDB: targetDB, + } + + structEqual, dataEqual, err := td.Equal(ctx, func(dml string) error { + return nil + }) + + if errors.Cause(err) == context.Canceled || errors.Cause(err) == context.DeadlineExceeded { + return nil + } + if !structEqual { + return errors.Errorf("different struct for table %s", 
dbutil.TableName(schema, table)) + } else if !dataEqual { + return errors.Errorf("different data for table %s", dbutil.TableName(schema, table)) + } + log.L().Info("data equal for table", zap.String("schema", schema), zap.String("table", table)) + } + + return nil +} From 6106ab64f2d9303441bef72d5b102e9a94133b24 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 14:46:02 +0800 Subject: [PATCH 04/49] update --- .github/workflows/dataflow_engine_chaos.yaml | 3 +++ engine/chaos/cases/cases.go | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 5b1f907e74..4b052e861a 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -9,6 +9,9 @@ on: description: 'Which PR do you want to trigger (use PR number, such as 6127)' required: true default: '' + pull_request: + branches: + - master # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency. concurrency: diff --git a/engine/chaos/cases/cases.go b/engine/chaos/cases/cases.go index f1b692189f..dc185dc54e 100644 --- a/engine/chaos/cases/cases.go +++ b/engine/chaos/cases/cases.go @@ -21,7 +21,7 @@ import ( type caseFn func(context.Context, *config) error -var cases = []caseFn{runDMJobCases} +var cases = []caseFn{runFakeJobCase, runDMJobCases} func runCases(ctx context.Context, cfg *config) error { errg, ctx := errgroup.WithContext(ctx) From 2b073193c283701a1392d52edc4dcade4a3f3ff1 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 15:07:29 +0800 Subject: [PATCH 05/49] update --- .github/workflows/dataflow_engine_chaos.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 4b052e861a..88066f0b33 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -232,7 +232,7 @@ jobs: kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - name: Wait for TiDB ready run: | - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true echo show pvc kubectl get pvc -l app=tidb -o wide echo show pv From e55555fe7dd42dc2ec293d0297db02bb22ddacf2 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 15:14:13 +0800 Subject: [PATCH 06/49] update --- .github/workflows/dataflow_engine_chaos.yaml | 112 +++++++++---------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 88066f0b33..9bfbdf2886 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -87,6 +87,61 @@ jobs: run: | kind load docker-image dataflow:chaos --name dataflow-engine-cluster + # Set up upstream instances + - name: Set up sources + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + - name: Wait for sources ready # kubectl wait --all not working + run: | + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true + sleep 10 + echo 
show pvc + kubectl get pvc -l app=sources -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=sources -o wide + echo show sts + kubectl get sts -l app=sources -o wide + echo show po + kubectl get po -l app=sources -o wide + echo describe po + kubectl describe po -l app=sources + echo describe pvc + kubectl describe pvc -l app=sources + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s + + # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) + - name: Set up TiDB + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + - name: Wait for TiDB ready + run: | + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true + echo show pvc + kubectl get pvc -l app=tidb -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=tidb -o wide + echo show sts + kubectl get sts -l app=tidb -o wide + echo show po + kubectl get po -l app=tidb -o wide + echo describe po + kubectl describe po -l app=tidb + echo describe pvc + kubectl describe pvc -l app=tidb + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s + # Set up metastore and basic services - name: Set up metastore and basic services run: | @@ -193,62 +248,7 @@ jobs: kubectl logs chaos-executor-2 -p || true kubectl logs chaos-executor-0 -c wait-server-master || true - - # Set up upstream instances - - name: Set up sources - run: | - kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - - name: Wait for sources ready # kubectl wait --all not working - run: | - kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true - kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true - kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true - sleep 10 - echo show pvc - kubectl get pvc -l app=sources -o wide - echo show pv - kubectl get pv -o wide - echo show svc - kubectl get svc -l app=sources -o wide - echo show sts - kubectl get sts -l app=sources -o wide - echo show po - kubectl get po -l app=sources -o wide - echo describe po - kubectl describe po -l app=sources - echo describe pvc - kubectl describe pvc -l app=sources - kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s - kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s - kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s - - # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) - - name: Set up TiDB - run: | - kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - - name: Wait for TiDB ready - run: | - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true - echo show pvc - kubectl get pvc -l app=tidb -o wide - echo show pv - kubectl get pv -o wide - echo show svc - kubectl get svc -l app=tidb -o wide - echo show sts - kubectl get sts -l app=tidb -o wide - echo show po - kubectl get po -l app=tidb -o wide - echo describe po - kubectl describe po 
-l app=tidb - echo describe pvc - kubectl describe pvc -l app=tidb - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s - + - name: Set up chaos test cases run: | kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml From 5bd247d26e47518d23cdedb0a0bedf100d96174e Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 16:08:53 +0800 Subject: [PATCH 07/49] debug ci --- dm/chaos/manifests/tidb.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/chaos/manifests/tidb.yaml b/dm/chaos/manifests/tidb.yaml index 3914aece28..fad8d55b60 100644 --- a/dm/chaos/manifests/tidb.yaml +++ b/dm/chaos/manifests/tidb.yaml @@ -31,7 +31,7 @@ spec: spec: containers: - name: tidb - image: pingcap/tidb:latest # latest release version + image: pingcap/tidb:v6.1.0 # latest release version imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data From bc5c12a094418785278668fb08fc708d80f88877 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 16:28:57 +0800 Subject: [PATCH 08/49] update kind config --- engine/chaos/manifests/kind-cluster.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/engine/chaos/manifests/kind-cluster.yaml b/engine/chaos/manifests/kind-cluster.yaml index 250966401f..8ab4df8e03 100644 --- a/engine/chaos/manifests/kind-cluster.yaml +++ b/engine/chaos/manifests/kind-cluster.yaml @@ -5,3 +5,6 @@ nodes: - role: worker - role: worker - role: worker +- role: worker +- role: worker +- role: worker From 162edf5bb4ac42526af68a94465e440f5858dadd Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 16:31:56 +0800 Subject: [PATCH 09/49] Revert "debug ci" This reverts commit 5bd247d26e47518d23cdedb0a0bedf100d96174e. --- dm/chaos/manifests/tidb.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dm/chaos/manifests/tidb.yaml b/dm/chaos/manifests/tidb.yaml index fad8d55b60..3914aece28 100644 --- a/dm/chaos/manifests/tidb.yaml +++ b/dm/chaos/manifests/tidb.yaml @@ -31,7 +31,7 @@ spec: spec: containers: - name: tidb - image: pingcap/tidb:v6.1.0 # latest release version + image: pingcap/tidb:latest # latest release version imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data From 003cf68841f17202f39804123be263c41c8e6b72 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 17:22:56 +0800 Subject: [PATCH 10/49] debug: reorder --- .github/workflows/dataflow_engine_chaos.yaml | 52 ++++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 9bfbdf2886..092ba84fa2 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -87,6 +87,31 @@ jobs: run: | kind load docker-image dataflow:chaos --name dataflow-engine-cluster + # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) + - name: Set up TiDB + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + - name: Wait for TiDB ready + run: | + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true + echo show pvc + kubectl get pvc -l app=tidb -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=tidb -o wide + echo show sts + kubectl get sts -l app=tidb -o wide + echo show po + kubectl get po -l app=tidb -o wide + echo describe po + 
kubectl describe po -l app=tidb + echo describe pvc + kubectl describe pvc -l app=tidb + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s + # Set up upstream instances - name: Set up sources run: | @@ -116,32 +141,7 @@ jobs: kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s - - # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) - - name: Set up TiDB - run: | - kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - - name: Wait for TiDB ready - run: | - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true - echo show pvc - kubectl get pvc -l app=tidb -o wide - echo show pv - kubectl get pv -o wide - echo show svc - kubectl get svc -l app=tidb -o wide - echo show sts - kubectl get sts -l app=tidb -o wide - echo show po - kubectl get po -l app=tidb -o wide - echo describe po - kubectl describe po -l app=tidb - echo describe pvc - kubectl describe pvc -l app=tidb - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s - + # Set up metastore and basic services - name: Set up metastore and basic services run: | From 405199539070319afda522e7d4cec183533d2f71 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 18:04:48 +0800 Subject: [PATCH 11/49] debug: describe --- .github/workflows/dataflow_engine_chaos.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 092ba84fa2..6cb5d4fd50 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -95,6 +95,8 @@ jobs: kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - name: Wait for TiDB ready run: | + sleep 120 + kubectl describe pods tidb-0 kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true echo show pvc kubectl get pvc -l app=tidb -o wide From 8fb5f114af8d4dc863f018a6512160042a293b8c Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 18:22:49 +0800 Subject: [PATCH 12/49] debug: see log --- .github/workflows/dataflow_engine_chaos.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 6cb5d4fd50..4b5a753e07 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -95,8 +95,6 @@ jobs: kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - name: Wait for TiDB ready run: | - sleep 120 - kubectl describe pods tidb-0 kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true echo show pvc kubectl get pvc -l app=tidb -o wide @@ -112,6 +110,8 @@ jobs: kubectl describe po -l app=tidb echo describe pvc kubectl describe pvc -l app=tidb + echo get log + kubectl logs pod/tidb-0 kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s # Set up upstream instances From d24468b1bc05fdf2a1c206cecf338ecf1dbc40f6 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 27 Sep 2022 18:24:24 +0800 Subject: [PATCH 13/49] debug: remove worker --- engine/chaos/manifests/kind-cluster.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/engine/chaos/manifests/kind-cluster.yaml b/engine/chaos/manifests/kind-cluster.yaml index 
8ab4df8e03..250966401f 100644 --- a/engine/chaos/manifests/kind-cluster.yaml +++ b/engine/chaos/manifests/kind-cluster.yaml @@ -5,6 +5,3 @@ nodes: - role: worker - role: worker - role: worker -- role: worker -- role: worker -- role: worker From 05b26ea5421fb4d92a6810d67fefde39e242978a Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 13:25:10 +0800 Subject: [PATCH 14/49] debug: upgrade --- .github/workflows/dataflow_engine_chaos.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 4b5a753e07..34fba55ef5 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -47,6 +47,11 @@ jobs: with: ref: refs/pull/${{ github.event.inputs.pr }}/head + - name: upgrade + run: | + sudo apt update && \ + sudo apt -y upgrade + - uses: actions/setup-go@v3 with: go-version: 1.18 From fa0c1c9fbabe0f9c961e4e202815f64ab3845529 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 13:49:44 +0800 Subject: [PATCH 15/49] debug: upgrade kind --- .github/workflows/dataflow_engine_chaos.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 34fba55ef5..a014ecf043 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -47,11 +47,6 @@ jobs: with: ref: refs/pull/${{ github.event.inputs.pr }}/head - - name: upgrade - run: | - sudo apt update && \ - sudo apt -y upgrade - - uses: actions/setup-go@v3 with: go-version: 1.18 @@ -63,7 +58,7 @@ jobs: key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }} - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.2.0 + uses: helm/kind-action@v1.3.0 with: cluster_name: dataflow-engine-cluster config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml From 3f21e454ca58139f55e04690be49f340a981c8ad Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 14:13:27 +0800 Subject: [PATCH 16/49] debug: upgrade chaos mesh --- .github/workflows/dataflow_engine_chaos.yaml | 119 +++++++++--------- .../manifests/network-emulation-dataflow.yaml | 115 +++-------------- .../manifests/network-partition-dataflow.yaml | 77 +++--------- .../chaos/manifests/pod-failure-dataflow.yaml | 21 ++-- engine/chaos/manifests/pod-kill-dataflow.yaml | 21 ++-- .../chaos/manifests/time-shift-dataflow.yaml | 25 ++-- 6 files changed, 127 insertions(+), 251 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index a014ecf043..94132d2a1b 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -58,7 +58,7 @@ jobs: key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }} - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.3.0 + uses: helm/kind-action@master with: cluster_name: dataflow-engine-cluster config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml @@ -86,64 +86,7 @@ jobs: - name: Load docker image to kind cluster run: | kind load docker-image dataflow:chaos --name dataflow-engine-cluster - - # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) - - name: Set up TiDB - run: | - kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl describe -f 
$GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - - name: Wait for TiDB ready - run: | - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true - echo show pvc - kubectl get pvc -l app=tidb -o wide - echo show pv - kubectl get pv -o wide - echo show svc - kubectl get svc -l app=tidb -o wide - echo show sts - kubectl get sts -l app=tidb -o wide - echo show po - kubectl get po -l app=tidb -o wide - echo describe po - kubectl describe po -l app=tidb - echo describe pvc - kubectl describe pvc -l app=tidb - echo get log - kubectl logs pod/tidb-0 - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s - - # Set up upstream instances - - name: Set up sources - run: | - kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - - name: Wait for sources ready # kubectl wait --all not working - run: | - kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true - kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true - kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true - sleep 10 - echo show pvc - kubectl get pvc -l app=sources -o wide - echo show pv - kubectl get pv -o wide - echo show svc - kubectl get svc -l app=sources -o wide - echo show sts - kubectl get sts -l app=sources -o wide - echo show po - kubectl get po -l app=sources -o wide - echo describe po - kubectl describe po -l app=sources - echo describe pvc - kubectl describe pvc -l app=sources - kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s - kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s - kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s - + # Set up metastore and basic services - name: Set up metastore and basic services run: | @@ -250,6 +193,63 @@ jobs: kubectl logs chaos-executor-2 -p || true kubectl logs chaos-executor-0 -c wait-server-master || true + + # Set up upstream instances + - name: Set up sources + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + - name: Wait for sources ready # kubectl wait --all not working + run: | + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true + sleep 10 + echo show pvc + kubectl get pvc -l app=sources -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=sources -o wide + echo show sts + kubectl get sts -l app=sources -o wide + echo show po + kubectl get po -l app=sources -o wide + echo describe po + kubectl describe po -l app=sources + echo describe pvc + kubectl describe pvc -l app=sources + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s + + # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) + - name: Set up TiDB + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + - name: Wait for TiDB ready + run: | + kubectl wait 
--for=condition=Ready pod/tidb-0 --timeout=10m || true + echo show pvc + kubectl get pvc -l app=tidb -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=tidb -o wide + echo show sts + kubectl get sts -l app=tidb -o wide + echo show po + kubectl get po -l app=tidb -o wide + echo describe po + kubectl describe po -l app=tidb + echo describe pvc + kubectl describe pvc -l app=tidb + echo get log + kubectl logs pod/tidb-0 + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s - name: Set up chaos test cases run: | @@ -266,7 +266,6 @@ jobs: uses: chaos-mesh/chaos-mesh-action@master env: CFG_BASE64: ${{ env.CFG_BASE64 }} - CHAOS_MESH_VERSION: v1.0.0 # check whether complete with 1m * 20 times. - name: Wait for chaos test case complete diff --git a/engine/chaos/manifests/network-emulation-dataflow.yaml b/engine/chaos/manifests/network-emulation-dataflow.yaml index cda46b7051..f69de5a1a9 100644 --- a/engine/chaos/manifests/network-emulation-dataflow.yaml +++ b/engine/chaos/manifests/network-emulation-dataflow.yaml @@ -1,17 +1,20 @@ ---- -# A Network Loss action causes network packets to drop randomly apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos +kind: Schedule metadata: name: network-loss-dataflow labels: app: network-loss-dataflow spec: - action: loss - mode: one - selector: - pods: - default: # default namespace + schedule: 2-59/6 * * * * + type: NetworkChaos + historyLimit: 5 + concurrencyPolicy: Forbid + networkChaos: + action: loss + mode: one + selector: + pods: + default: - chaos-server-master-0 - chaos-server-master-1 - chaos-server-master-2 @@ -19,95 +22,7 @@ spec: - chaos-executor-1 - chaos-executor-2 - chaos-executor-3 - loss: - loss: "25" - correlation: "25" - duration: "30s" - scheduler: - cron: "2-59/6 * * * *" # At every 6th minute from 2 through 59, (2, 8, 14, ...) - - -# A Network Delay action causes delays in message sending ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-delay-dataflow - labels: - app: network-delay-dataflow -spec: - action: delay - mode: one - selector: - pods: - default: # default namespace - - chaos-server-master-0 - - chaos-server-master-1 - - chaos-server-master-2 - - chaos-executor-0 - - chaos-executor-1 - - chaos-executor-2 - - chaos-executor-3 - delay: - latency: "150ms" - correlation: "25" - jitter: "150ms" - duration: "30s" - scheduler: - cron: "3-59/6 * * * *" # At every 6th minute from 3 through 59, (3, 9, 15, ...) - ---- -# A Network Duplicate action causes packet duplication -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-duplicate-dataflow - labels: - app: network-duplicate-dataflow -spec: - action: duplicate - mode: one - selector: - pods: - default: # default namespace - - chaos-server-master-0 - - chaos-server-master-1 - - chaos-server-master-2 - - chaos-executor-0 - - chaos-executor-1 - - chaos-executor-2 - - chaos-executor-3 - duplicate: - duplicate: "40" - correlation: "25" - duration: "30s" - scheduler: - cron: "4-59/6 * * * *" # At every 6th minute from 4 through 59, (4, 10, 16, ...) 
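
A note on the manifest rewrite happening in this patch: newer Chaos Mesh releases drop the per-resource `scheduler.cron` field, so each experiment is wrapped in a `Schedule` object whose `spec.type` names the embedded template key (`networkChaos`, `podChaos`, `ioChaos`, `timeChaos`). Below is a minimal Go sketch of the new shape, parsed with local illustrative structs; these are NOT the chaos-mesh API types (real code would use github.com/chaos-mesh/chaos-mesh/api/v1alpha1), and gopkg.in/yaml.v3 is an assumed dependency.

// Sketch only: round-trip the Schedule-wrapped manifest shape with local structs.
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

type networkChaosTemplate struct {
	Action   string `yaml:"action"`
	Mode     string `yaml:"mode"`
	Duration string `yaml:"duration"`
}

type scheduleSpec struct {
	Schedule          string                `yaml:"schedule"`          // cron expression, replaces scheduler.cron
	Type              string                `yaml:"type"`              // e.g. NetworkChaos
	HistoryLimit      int                   `yaml:"historyLimit"`      // finished experiments retained
	ConcurrencyPolicy string                `yaml:"concurrencyPolicy"` // Forbid: skip a tick if the last run is active
	NetworkChaos      *networkChaosTemplate `yaml:"networkChaos,omitempty"`
}

type schedule struct {
	APIVersion string            `yaml:"apiVersion"`
	Kind       string            `yaml:"kind"`
	Metadata   map[string]string `yaml:"metadata"`
	Spec       scheduleSpec      `yaml:"spec"`
}

func main() {
	manifest := `
apiVersion: chaos-mesh.org/v1alpha1
kind: Schedule
metadata:
  name: network-loss-dataflow
spec:
  schedule: 2-59/6 * * * *
  type: NetworkChaos
  historyLimit: 5
  concurrencyPolicy: Forbid
  networkChaos:
    action: loss
    mode: one
    duration: 30s
`
	var s schedule
	if err := yaml.Unmarshal([]byte(manifest), &s); err != nil {
		panic(err)
	}
	fmt.Printf("%s fires on %q with action %q\n", s.Metadata["name"], s.Spec.Schedule, s.Spec.NetworkChaos.Action)
}
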
- ---- -# A Network Corrupt action causes packet corruption -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-corrupt-dataflow - labels: - app: network-corrupt-dataflow -spec: - action: corrupt - mode: one - selector: - pods: - default: # default namespace - - chaos-server-master-0 - - chaos-server-master-1 - - chaos-server-master-2 - - chaos-executor-0 - - chaos-executor-1 - - chaos-executor-2 - - chaos-executor-3 - corrupt: - corrupt: "40" - correlation: "25" - duration: "30s" - scheduler: - cron: "5-59/6 * * * *" # At every 5th minute from 5 through 59, (5, 11, 17, ...) + loss: + loss: "25" + correlation: "25" + duration: 30s diff --git a/engine/chaos/manifests/network-partition-dataflow.yaml b/engine/chaos/manifests/network-partition-dataflow.yaml index 8bf3416563..786c3abb90 100644 --- a/engine/chaos/manifests/network-partition-dataflow.yaml +++ b/engine/chaos/manifests/network-partition-dataflow.yaml @@ -1,71 +1,24 @@ ---- -# network partition between server-master and executor apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos +kind: Schedule metadata: name: network-partition-dataflow-master-executor labels: app: network-partition-dataflow-master-executor spec: - action: partition - mode: one - selector: - labelSelectors: - "app": "chaos-server-master" - direction: both - target: - selector: - labelSelectors: - "app": "chaos-executor" - mode: one - duration: "20s" - scheduler: - cron: "2-59/4 * * * *" # At every 4th minute from 2 through 59, (2, 6, 10, ...) - ---- -# network partition between dataflow server-master members -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-partition-dataflow-master-master - labels: - app: network-partition-dataflow-master-master -spec: - action: partition - mode: one - selector: - labelSelectors: - "app": "chaos-server-master" - direction: both - target: - selector: - labelSelectors: - "app": "chaos-server-master" + schedule: 2-59/4 * * * * + type: NetworkChaos + historyLimit: 5 + concurrencyPolicy: Forbid + networkChaos: + action: partition mode: one - duration: "20s" - scheduler: - cron: "3-59/4 * * * *" # At every 4th minute from 3 through 59, (3, 7, 11, ...) - ---- -# network partition between dataflow executor members -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-partition-dataflow-executor-executor - labels: - app: network-partition-dataflow-executor-executor -spec: - action: partition - mode: one - selector: - labelSelectors: - "app": "chaos-executor" - direction: both - target: selector: labelSelectors: - "app": "chaos-executor" - mode: one - duration: "20s" - scheduler: - cron: "1-59/4 * * * *" # At every 4th minute from 1 through 59, (1, 5, 9, ...) 
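
The deleted comments above document the staggering scheme these manifests rely on: each experiment gets its own minute offset over a shared step, so no two experiments fire in the same minute. A quick sketch that evaluates those expressions, assuming the well-known github.com/robfig/cron/v3 parser (the cron strings themselves are copied from the manifests):

// Sketch: show that the staggered offsets ("2-59/6", "3-59/6", ...) never
// land on the same minute.
package main

import (
	"fmt"
	"time"

	"github.com/robfig/cron/v3"
)

func main() {
	specs := []string{
		"2-59/6 * * * *", // network loss:      minutes 2, 8, 14, ...
		"3-59/6 * * * *", // network delay:     minutes 3, 9, 15, ...
		"4-59/6 * * * *", // network duplicate: minutes 4, 10, 16, ...
		"5-59/6 * * * *", // network corrupt:   minutes 5, 11, 17, ...
	}
	now := time.Now().Truncate(time.Hour)
	for _, spec := range specs {
		sched, err := cron.ParseStandard(spec)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%-16s next fire: %s\n", spec, sched.Next(now).Format("15:04"))
	}
}
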
+ app: chaos-server-master + direction: both + target: + selector: + labelSelectors: + app: chaos-executor + mode: one + duration: 20s diff --git a/engine/chaos/manifests/pod-failure-dataflow.yaml b/engine/chaos/manifests/pod-failure-dataflow.yaml index 952b68b817..2a2934b30c 100644 --- a/engine/chaos/manifests/pod-failure-dataflow.yaml +++ b/engine/chaos/manifests/pod-failure-dataflow.yaml @@ -1,16 +1,21 @@ apiVersion: chaos-mesh.org/v1alpha1 -kind: PodChaos +kind: Schedule metadata: name: pod-failure-dataflow labels: app: pod-failure-dataflow spec: - action: pod-failure - mode: one - duration: "30s" - selector: - pods: - default: # default namespace + schedule: '@every 2m' + type: PodChaos + historyLimit: 5 + concurrencyPolicy: Forbid + podChaos: + action: pod-failure + mode: one + duration: 30s + selector: + pods: + default: - chaos-server-master-0 - chaos-server-master-1 - chaos-server-master-2 @@ -18,5 +23,3 @@ spec: - chaos-executor-1 - chaos-executor-2 - chaos-executor-3 - scheduler: - cron: "@every 2m" diff --git a/engine/chaos/manifests/pod-kill-dataflow.yaml b/engine/chaos/manifests/pod-kill-dataflow.yaml index d3d12259a1..d18be3016c 100644 --- a/engine/chaos/manifests/pod-kill-dataflow.yaml +++ b/engine/chaos/manifests/pod-kill-dataflow.yaml @@ -1,16 +1,21 @@ apiVersion: chaos-mesh.org/v1alpha1 -kind: PodChaos +kind: Schedule metadata: name: pod-kill-dataflow labels: app: pod-kill-dataflow spec: - action: pod-kill - mode: one - gracePeriod: 30 - selector: - pods: - default: # default namespace + schedule: '@every 2m' + type: PodChaos + historyLimit: 5 + concurrencyPolicy: Forbid + podChaos: + action: pod-kill + mode: one + gracePeriod: 30 + selector: + pods: + default: - chaos-server-master-0 - chaos-server-master-1 - chaos-server-master-2 @@ -18,5 +23,3 @@ spec: - chaos-executor-1 - chaos-executor-2 - chaos-executor-3 - scheduler: - cron: "@every 2m" diff --git a/engine/chaos/manifests/time-shift-dataflow.yaml b/engine/chaos/manifests/time-shift-dataflow.yaml index 239cd8622a..58ba336480 100644 --- a/engine/chaos/manifests/time-shift-dataflow.yaml +++ b/engine/chaos/manifests/time-shift-dataflow.yaml @@ -1,16 +1,21 @@ apiVersion: chaos-mesh.org/v1alpha1 -kind: TimeChaos +kind: Schedule metadata: name: time-shift-dataflow labels: app: time-shift-dataflow spec: - mode: "random-max-percent" - value: "60" - duration: "30s" - selector: - pods: - default: # default namespace + schedule: '@every 2m' + type: TimeChaos + historyLimit: 5 + concurrencyPolicy: Forbid + timeChaos: + mode: random-max-percent + value: "60" + duration: 30s + selector: + pods: + default: - chaos-server-master-0 - chaos-server-master-1 - chaos-server-master-2 @@ -18,8 +23,6 @@ spec: - chaos-executor-1 - chaos-executor-2 - chaos-executor-3 - timeOffset: '-10m' - clockIds: + timeOffset: -10m + clockIds: - CLOCK_REALTIME - scheduler: - cron: "@every 2m" From 998cb91c537e9226f6ae592cac9a99232db1e0e3 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 14:16:56 +0800 Subject: [PATCH 17/49] debug: remove master --- .github/workflows/dataflow_engine_chaos.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 94132d2a1b..fe3ee0c852 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -58,7 +58,7 @@ jobs: key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }} - name: Create k8s Kind Cluster - uses: helm/kind-action@master + uses: 
helm/kind-action with: cluster_name: dataflow-engine-cluster config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml From e1f45f029ec3db005ced7a0dceb81e9f8e367c46 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 14:17:54 +0800 Subject: [PATCH 18/49] debug: use kind v1.4.0 --- .github/workflows/dataflow_engine_chaos.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index fe3ee0c852..efd2a77d52 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -58,7 +58,7 @@ jobs: key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }} - name: Create k8s Kind Cluster - uses: helm/kind-action + uses: helm/kind-action@1.4.0 with: cluster_name: dataflow-engine-cluster config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml From 9d9cf1d9783bb790ba8c6e83dbdf492eb56d4c93 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 14:20:59 +0800 Subject: [PATCH 19/49] debug: fix --- .github/workflows/dataflow_engine_chaos.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index efd2a77d52..2e35ee8d42 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -58,7 +58,7 @@ jobs: key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }} - name: Create k8s Kind Cluster - uses: helm/kind-action@1.4.0 + uses: helm/kind-action@v1.4.0 with: cluster_name: dataflow-engine-cluster config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml From 5c9aea63c78432c0923a5a7125e0e9341c15f6a8 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 15:45:44 +0800 Subject: [PATCH 20/49] update dm chaos mesh --- .github/workflows/dm_chaos.yaml | 3 +- dm/chaos/manifests/io-chaos-dm.yaml | 29 +++-- dm/chaos/manifests/network-emulation-dm.yaml | 112 +++--------------- dm/chaos/manifests/network-partition-dm.yaml | 53 +++------ dm/chaos/manifests/pod-failure-dm.yaml | 21 ++-- dm/chaos/manifests/pod-kill-dm.yaml | 21 ++-- engine/chaos/manifests/pod-kill-dataflow.yaml | 2 +- 7 files changed, 72 insertions(+), 169 deletions(-) diff --git a/.github/workflows/dm_chaos.yaml b/.github/workflows/dm_chaos.yaml index f7a23d5ce4..9aaf24a9f5 100644 --- a/.github/workflows/dm_chaos.yaml +++ b/.github/workflows/dm_chaos.yaml @@ -68,7 +68,7 @@ jobs: key: ${{ runner.os }}-ticdc-tools-${{ hashFiles('tools/check/go.sum') }} - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.2.0 + uses: helm/kind-action@v1.4.0 - name: Print cluster information run: | @@ -247,7 +247,6 @@ jobs: uses: chaos-mesh/chaos-mesh-action@master env: CFG_BASE64: ${{ env.CFG_BASE64 }} - CHAOS_MESH_VERSION: v1.0.0 # check whether complete with 1m * 20 times. 
- name: Wait for chaos test case complete diff --git a/dm/chaos/manifests/io-chaos-dm.yaml b/dm/chaos/manifests/io-chaos-dm.yaml index 0e4b993bd1..1f5eb0382a 100644 --- a/dm/chaos/manifests/io-chaos-dm.yaml +++ b/dm/chaos/manifests/io-chaos-dm.yaml @@ -1,15 +1,20 @@ apiVersion: chaos-mesh.org/v1alpha1 -kind: IoChaos +kind: Schedule metadata: name: io-delay-dm labels: app: io-delay-dm spec: - action: latency - mode: one - selector: - pods: - default: # default namespace + schedule: '@every 2m' + type: IOChaos + historyLimit: 5 + concurrencyPolicy: Forbid + ioChaos: + action: latency + mode: one + selector: + pods: + default: - dm-master-0 - dm-master-1 - dm-master-2 @@ -17,10 +22,8 @@ spec: - dm-worker-1 - dm-worker-2 - dm-worker-3 - volumePath: /data - path: "/data/**/*" - delay: "100ms" - percent: 50 - duration: "60s" - scheduler: - cron: "@every 2m" + volumePath: /data + path: /data/**/* + delay: 100ms + percent: 50 + duration: 60s diff --git a/dm/chaos/manifests/network-emulation-dm.yaml b/dm/chaos/manifests/network-emulation-dm.yaml index d1e060d954..a63a6f1bdd 100644 --- a/dm/chaos/manifests/network-emulation-dm.yaml +++ b/dm/chaos/manifests/network-emulation-dm.yaml @@ -1,17 +1,20 @@ ---- -# A Network Loss action causes network packets to drop randomly apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos +kind: Schedule metadata: name: network-loss-dm labels: app: network-loss-dm spec: - action: loss - mode: one - selector: - pods: - default: # default namespace + schedule: 2-59/5 * * * * + type: NetworkChaos + historyLimit: 5 + concurrencyPolicy: Forbid + networkChaos: + action: loss + mode: one + selector: + pods: + default: - dm-master-0 - dm-master-1 - dm-master-2 @@ -19,92 +22,7 @@ spec: - dm-worker-1 - dm-worker-2 - dm-worker-3 - loss: - loss: "25" - correlation: "25" - duration: "30s" - scheduler: - cron: "2-59/5 * * * *" # At every 5th minute from 2 through 59, (2, 7, 12, ...) - - -# A Network Delay action causes delays in message sending ---- -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-delay-dm - labels: - app: network-delay-dm -spec: - action: delay - mode: one - selector: - pods: - default: # default namespace - - dm-master-0 - - dm-master-1 - - dm-master-2 - - dm-worker-0 - - dm-worker-1 - - dm-worker-2 - delay: - latency: "90ms" - correlation: "25" - jitter: "90ms" - duration: "30s" - scheduler: - cron: "3-59/5 * * * *" # At every 5th minute from 3 through 59, (3, 8, 13, ...) - ---- -# A Network Duplicate action causes packet duplication -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-duplicate-dm - labels: - app: network-duplicate-dm -spec: - action: duplicate - mode: one - selector: - pods: - default: # default namespace - - dm-master-0 - - dm-master-1 - - dm-master-2 - - dm-worker-0 - - dm-worker-1 - - dm-worker-2 - duplicate: - duplicate: "40" - correlation: "25" - duration: "30s" - scheduler: - cron: "4-59/5 * * * *" # At every 5th minute from 4 through 59, (4, 9, 14, ...) 
- ---- -# A Network Corrupt action causes packet corruption -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-corrupt-dm - labels: - app: network-corrupt-dm -spec: - action: corrupt - mode: one - selector: - pods: - default: # default namespace - - dm-master-0 - - dm-master-1 - - dm-master-2 - - dm-worker-0 - - dm-worker-1 - - dm-worker-2 - corrupt: - corrupt: "40" - correlation: "25" - duration: "30s" - scheduler: - cron: "5-59/5 * * * *" # At every 5th minute from 5 through 59, (5, 10, 15, ...) \ No newline at end of file + loss: + loss: "25" + correlation: "25" + duration: 30s diff --git a/dm/chaos/manifests/network-partition-dm.yaml b/dm/chaos/manifests/network-partition-dm.yaml index 99f7c33cf1..21f3fc6995 100644 --- a/dm/chaos/manifests/network-partition-dm.yaml +++ b/dm/chaos/manifests/network-partition-dm.yaml @@ -1,47 +1,24 @@ ---- -# network partition between DM-worker and DM-master apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos +kind: Schedule metadata: name: network-partition-dm-worker-master labels: app: network-partition-dm-worker-master spec: - action: partition - mode: one - selector: - labelSelectors: - "app": "dm-worker" - direction: both - target: - selector: - labelSelectors: - "app": "dm-master" + schedule: 2-59/3 * * * * + type: NetworkChaos + historyLimit: 5 + concurrencyPolicy: Forbid + networkChaos: + action: partition mode: one - duration: "30s" - scheduler: - cron: "2-59/3 * * * *" # At every 3rd minute from 2 through 59, (2, 5, 8, ...) - ---- -# network partition between DM-master members -apiVersion: chaos-mesh.org/v1alpha1 -kind: NetworkChaos -metadata: - name: network-partition-dm-master-master - labels: - app: network-partition-dm-master-master -spec: - action: partition - mode: one - selector: - labelSelectors: - "app": "dm-master" - direction: both - target: selector: labelSelectors: - "app": "dm-master" - mode: one - duration: "30s" - scheduler: - cron: "3-59/3 * * * *" # At every 3rd minute from 3 through 59, (3, 6, 9, ...) 
+ app: dm-worker + direction: both + target: + selector: + labelSelectors: + app: dm-master + mode: one + duration: 30s diff --git a/dm/chaos/manifests/pod-failure-dm.yaml b/dm/chaos/manifests/pod-failure-dm.yaml index ee6b845c88..58b0b63dab 100644 --- a/dm/chaos/manifests/pod-failure-dm.yaml +++ b/dm/chaos/manifests/pod-failure-dm.yaml @@ -1,16 +1,21 @@ apiVersion: chaos-mesh.org/v1alpha1 -kind: PodChaos +kind: Schedule metadata: name: pod-failure-dm labels: app: pod-failure-dm spec: - action: pod-failure - mode: one - duration: "30s" - selector: - pods: - default: # default namespace + schedule: '@every 2m' + type: PodChaos + historyLimit: 5 + concurrencyPolicy: Forbid + podChaos: + action: pod-failure + mode: one + duration: 30s + selector: + pods: + default: - dm-master-0 - dm-master-1 - dm-master-2 @@ -18,5 +23,3 @@ spec: - dm-worker-1 - dm-worker-2 - dm-worker-3 - scheduler: - cron: "@every 2m" diff --git a/dm/chaos/manifests/pod-kill-dm.yaml b/dm/chaos/manifests/pod-kill-dm.yaml index b016f5f43f..959fc03bcb 100644 --- a/dm/chaos/manifests/pod-kill-dm.yaml +++ b/dm/chaos/manifests/pod-kill-dm.yaml @@ -1,16 +1,21 @@ apiVersion: chaos-mesh.org/v1alpha1 -kind: PodChaos +kind: Schedule metadata: name: pod-kill-dm labels: app: pod-kill-dm spec: - action: pod-kill - mode: one - gracePeriod: 30 - selector: - pods: - default: # default namespace + schedule: '@every 1m' + type: PodChaos + historyLimit: 5 + concurrencyPolicy: Forbid + podChaos: + action: pod-kill + mode: one + gracePeriod: 30 + selector: + pods: + default: - dm-master-0 - dm-master-1 - dm-master-2 @@ -18,5 +23,3 @@ spec: - dm-worker-1 - dm-worker-2 - dm-worker-3 - scheduler: - cron: "@every 1m" diff --git a/engine/chaos/manifests/pod-kill-dataflow.yaml b/engine/chaos/manifests/pod-kill-dataflow.yaml index d18be3016c..924f149cf3 100644 --- a/engine/chaos/manifests/pod-kill-dataflow.yaml +++ b/engine/chaos/manifests/pod-kill-dataflow.yaml @@ -5,7 +5,7 @@ metadata: labels: app: pod-kill-dataflow spec: - schedule: '@every 2m' + schedule: '@every 1m' type: PodChaos historyLimit: 5 concurrencyPolicy: Forbid From 9d466f594b4b3d31c815ab378c7839ebf4368365 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 15:49:11 +0800 Subject: [PATCH 21/49] trigger --- .github/workflows/dm_chaos.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/dm_chaos.yaml b/.github/workflows/dm_chaos.yaml index 9aaf24a9f5..50d9babb18 100644 --- a/.github/workflows/dm_chaos.yaml +++ b/.github/workflows/dm_chaos.yaml @@ -9,6 +9,9 @@ on: description: 'Which PR do you want to trigger' required: true default: '' + pull_request: + branches: + - master # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency. 
concurrency: From b59202a4126cec8742457dea94dcdbc69acff1a1 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 16:35:25 +0800 Subject: [PATCH 22/49] add debug log --- engine/chaos/cases/case_dm_job.go | 4 +--- engine/chaos/cases/dm/case.go | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/engine/chaos/cases/case_dm_job.go b/engine/chaos/cases/case_dm_job.go index c6d10c36fe..c2b9de6c11 100644 --- a/engine/chaos/cases/case_dm_job.go +++ b/engine/chaos/cases/case_dm_job.go @@ -21,9 +21,7 @@ import ( "golang.org/x/sync/errgroup" ) -var ( - filenames = []string{"dmjob"} -) +var filenames = []string{"dmjob"} func runDMJobCases(ctx context.Context, cfg *config) error { eg, ctx2 := errgroup.WithContext(ctx) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 2bc62d800b..c8c68196f4 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -292,6 +292,7 @@ func (c *Case) genIncrData(ctx context.Context) error { for { select { case <-ctx.Done(): + log.L().Info("ctx done in incr data", zap.String("name", c.name), zap.String("job_id", c.jobID)) return nil default: } @@ -302,11 +303,14 @@ func (c *Case) genIncrData(ctx context.Context) error { for i := 0; i < batch; i++ { sql, err := c.randDML(source, tableName) if err != nil { + log.L().Info("error in rand dml", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Error(err)) return err } sqls = append(sqls, sql) } + log.L().Info("start execute sql", zap.String("name", c.name), zap.String("job_id", c.jobID)) if _, err := c.sources[source].ExecuteSQLs(sqls...); err != nil { + log.L().Info("error in execute sql", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Error(err)) return err } } From 12b125f926ab55764b200d4d0519b456066d16b8 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 17:13:01 +0800 Subject: [PATCH 23/49] add more debug log --- engine/chaos/cases/dm/case.go | 1 + engine/chaos/cases/dm/db.go | 1 + 2 files changed, 2 insertions(+) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index c8c68196f4..47d26ef9cb 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -313,6 +313,7 @@ func (c *Case) genIncrData(ctx context.Context) error { log.L().Info("error in execute sql", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Error(err)) return err } + log.L().Info("finished execute sql", zap.String("name", c.name), zap.String("job_id", c.jobID)) } } diff --git a/engine/chaos/cases/dm/db.go b/engine/chaos/cases/dm/db.go index 6f48b9a969..4316259b4d 100644 --- a/engine/chaos/cases/dm/db.go +++ b/engine/chaos/cases/dm/db.go @@ -88,6 +88,7 @@ func (dc *dbConn) ExecuteSQLs(queries ...string) (int, error) { FirstRetryDuration: time.Second, BackoffStrategy: retry.Stable, IsRetryableFn: func(_ int, err error) bool { + log.L().Info("in retryable func", zap.Error(err)) if retry.IsConnectionError(err) { // HACK: for some errors like `invalid connection`, `sql: connection is already closed`, we can ignore them just for testing. 
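
The `IsRetryableFn` hunk above is the seam these debug patches probe: `ExecuteSQLs` goes through the tiflow retry helper with a stable one-second backoff and, on connection-class errors, resets the connection before the next attempt. Here is a self-contained sketch of that pattern as a hand-rolled loop; it is not the real `retry.Do` API, and `isConnectionError` is an illustrative stand-in for `retry.IsConnectionError` matching only the error shapes named in the HACK comment.

package dm

import (
	"context"
	"database/sql"
	"database/sql/driver"
	"errors"
	"strings"
	"time"
)

// isConnectionError is an illustrative stand-in for retry.IsConnectionError.
func isConnectionError(err error) bool {
	return errors.Is(err, driver.ErrBadConn) ||
		strings.Contains(err.Error(), "invalid connection") ||
		strings.Contains(err.Error(), "connection is already closed")
}

// executeWithRetry hand-rolls the pattern ExecuteSQLs delegates to the
// tiflow retry helper: a fixed one-second wait between attempts, and a
// connection reset whenever the failure looks like a dead connection.
func executeWithRetry(ctx context.Context, conn *sql.Conn, reset func(context.Context) error, queries ...string) error {
	const maxAttempts = 3
	var err error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if attempt > 0 {
			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-time.After(time.Second): // retry.Stable: constant interval, no growth
			}
		}
		err = nil
		for _, q := range queries {
			if _, err = conn.ExecContext(ctx, q); err != nil {
				break
			}
		}
		if err == nil {
			return nil
		}
		if !isConnectionError(err) {
			return err // not retryable
		}
		// Mirror of the HACK above: rebuild the connection and try again.
		if rerr := reset(ctx); rerr != nil {
			return rerr
		}
	}
	return err
}
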
err = dc.resetConn(ctx) From f5c0f3a173594f6fa50cfa7116248a83f44ed43c Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 28 Sep 2022 17:50:19 +0800 Subject: [PATCH 24/49] add more debug log --- engine/chaos/cases/dm/case.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 47d26ef9cb..592410e8d0 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -258,6 +258,7 @@ func (c *Case) randDML(source int, table string) (string, error) { c.result[t]++ switch t { case 0: + log.L().Info("gen insert row") sql, uk, err := generator.GenInsertRow() if err != nil { return "", err @@ -274,8 +275,10 @@ func (c *Case) randDML(source int, table string) (string, error) { c.keySet[table][uk.GetValueHash()] = struct{}{} return sql, nil case 1: + log.L().Info("gen update row") return generator.GenUpdateRow(mcp.NextUK()) default: + log.L().Info("gen delete row") key := mcp.NextUK() sql, err := generator.GenDeleteRow(key) if err != nil { @@ -301,7 +304,9 @@ func (c *Case) genIncrData(ctx context.Context) error { sqls := make([]string, 0, batch) for i := 0; i < batch; i++ { + log.L().Info("start rand dml", zap.String("name", c.name), zap.String("job_id", c.jobID)) sql, err := c.randDML(source, tableName) + log.L().Info("finished rand dml", zap.String("name", c.name), zap.String("job_id", c.jobID)) if err != nil { log.L().Info("error in rand dml", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Error(err)) return err From 90054b95484c18c22ead4bbd7f3b4f3f315fbbac Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 29 Sep 2022 13:56:08 +0800 Subject: [PATCH 25/49] debug: add more log --- engine/chaos/cases/dm/case.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 592410e8d0..d078904d90 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -260,18 +260,23 @@ func (c *Case) randDML(source int, table string) (string, error) { case 0: log.L().Info("gen insert row") sql, uk, err := generator.GenInsertRow() + log.L().Info("finish insert row") if err != nil { return "", err } for _, ok := c.keySet[table][uk.GetValueHash()]; ok; { + log.L().Info("again gen insert row") sql, uk, err = generator.GenInsertRow() + log.L().Info("again finish gen insert row") if err != nil { return "", err } } + log.L().Info("add insert row") if err := c.mcps[source][table].AddUK(uk); err != nil { return "", err } + log.L().Info("finished add insert row") c.keySet[table][uk.GetValueHash()] = struct{}{} return sql, nil case 1: From f63bf6c8e8e0412950135ce5df821029d7e647c0 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 29 Sep 2022 14:38:48 +0800 Subject: [PATCH 26/49] debug: add more time --- engine/chaos/scripts/check-case.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/chaos/scripts/check-case.sh b/engine/chaos/scripts/check-case.sh index d7cf739541..7d82d6c1ab 100755 --- a/engine/chaos/scripts/check-case.sh +++ b/engine/chaos/scripts/check-case.sh @@ -1,7 +1,7 @@ #!/bin/bash completed=false -for i in {1..20}; do +for i in {1..22}; do kubectl wait --for=condition=complete job/chaos-test-case --timeout=1m if [ $? 
-eq 0 ]; then completed=true From c401498528f2e84804470843399b7d22ca85de59 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 29 Sep 2022 15:30:30 +0800 Subject: [PATCH 27/49] debug: add more log --- engine/chaos/cases/dm/case.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index d078904d90..73e010d3f7 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -265,9 +265,9 @@ func (c *Case) randDML(source int, table string) (string, error) { return "", err } for _, ok := c.keySet[table][uk.GetValueHash()]; ok; { - log.L().Info("again gen insert row") + log.L().Info("again gen insert row", zap.String("hash", uk.GetValueHash()), zap.Int("len", len(c.keySet[table]))) sql, uk, err = generator.GenInsertRow() - log.L().Info("again finish gen insert row") + log.L().Info("again finish gen insert row", zap.String("hash", uk.GetValueHash()), zap.Int("len", len(c.keySet[table]))) if err != nil { return "", err } From b77335df185a6730bd11d310616ae634ad8ef797 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 29 Sep 2022 16:36:07 +0800 Subject: [PATCH 28/49] fix --- .github/workflows/dataflow_engine_chaos.yaml | 2 -- engine/chaos/cases/dm/case.go | 19 +++---------------- engine/chaos/cases/dm/db.go | 1 - 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 2e35ee8d42..b4f011cd05 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -247,8 +247,6 @@ jobs: kubectl describe po -l app=tidb echo describe pvc kubectl describe pvc -l app=tidb - echo get log - kubectl logs pod/tidb-0 kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s - name: Set up chaos test cases diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 73e010d3f7..08bfdc6824 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -258,32 +258,26 @@ func (c *Case) randDML(source int, table string) (string, error) { c.result[t]++ switch t { case 0: - log.L().Info("gen insert row") sql, uk, err := generator.GenInsertRow() - log.L().Info("finish insert row") if err != nil { return "", err } - for _, ok := c.keySet[table][uk.GetValueHash()]; ok; { - log.L().Info("again gen insert row", zap.String("hash", uk.GetValueHash()), zap.Int("len", len(c.keySet[table]))) + _, ok := c.keySet[table][uk.GetValueHash()] + for ok { sql, uk, err = generator.GenInsertRow() - log.L().Info("again finish gen insert row", zap.String("hash", uk.GetValueHash()), zap.Int("len", len(c.keySet[table]))) if err != nil { return "", err } + _, ok = c.keySet[table][uk.GetValueHash()] } - log.L().Info("add insert row") if err := c.mcps[source][table].AddUK(uk); err != nil { return "", err } - log.L().Info("finished add insert row") c.keySet[table][uk.GetValueHash()] = struct{}{} return sql, nil case 1: - log.L().Info("gen update row") return generator.GenUpdateRow(mcp.NextUK()) default: - log.L().Info("gen delete row") key := mcp.NextUK() sql, err := generator.GenDeleteRow(key) if err != nil { @@ -300,7 +294,6 @@ func (c *Case) genIncrData(ctx context.Context) error { for { select { case <-ctx.Done(): - log.L().Info("ctx done in incr data", zap.String("name", c.name), zap.String("job_id", c.jobID)) return nil default: } @@ -309,21 +302,15 @@ func (c *Case) genIncrData(ctx context.Context) error { sqls := make([]string, 0, batch) for i := 0; i < batch; 
i++ { - log.L().Info("start rand dml", zap.String("name", c.name), zap.String("job_id", c.jobID)) sql, err := c.randDML(source, tableName) - log.L().Info("finished rand dml", zap.String("name", c.name), zap.String("job_id", c.jobID)) if err != nil { - log.L().Info("error in rand dml", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Error(err)) return err } sqls = append(sqls, sql) } - log.L().Info("start execute sql", zap.String("name", c.name), zap.String("job_id", c.jobID)) if _, err := c.sources[source].ExecuteSQLs(sqls...); err != nil { - log.L().Info("error in execute sql", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Error(err)) return err } - log.L().Info("finished execute sql", zap.String("name", c.name), zap.String("job_id", c.jobID)) } } diff --git a/engine/chaos/cases/dm/db.go b/engine/chaos/cases/dm/db.go index 4316259b4d..6f48b9a969 100644 --- a/engine/chaos/cases/dm/db.go +++ b/engine/chaos/cases/dm/db.go @@ -88,7 +88,6 @@ func (dc *dbConn) ExecuteSQLs(queries ...string) (int, error) { FirstRetryDuration: time.Second, BackoffStrategy: retry.Stable, IsRetryableFn: func(_ int, err error) bool { - log.L().Info("in retryable func", zap.Error(err)) if retry.IsConnectionError(err) { // HACK: for some errors like `invalid connection`, `sql: connection is already closed`, we can ignore them just for testing. err = dc.resetConn(ctx) From c846530a59b0bd198aa34708c61ba9b278c1a219 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Fri, 30 Sep 2022 12:45:49 +0800 Subject: [PATCH 29/49] change log level --- deployments/engine/helm/tiflow/values.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deployments/engine/helm/tiflow/values.yaml b/deployments/engine/helm/tiflow/values.yaml index 714e62c438..e2fcb952bd 100644 --- a/deployments/engine/helm/tiflow/values.yaml +++ b/deployments/engine/helm/tiflow/values.yaml @@ -31,6 +31,9 @@ master: max-interval = "15s" max-try-time = 100 + [log] + level = "debug" + executor: replicas: 4 logStorage: 1Gi @@ -39,6 +42,9 @@ executor: keepalive-interval = "500ms" session-ttl = 20 + [log] + level = "debug" + metastore: frameworkStorage: 5Gi businessStorage: 5Gi From 9cd33abc9d2f84f2d38ae2bf429ef66707433aa4 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Sun, 9 Oct 2022 12:20:35 +0800 Subject: [PATCH 30/49] update --- .github/workflows/dataflow_engine_chaos.yaml | 2 +- engine/chaos/cases/dm/case.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index b4f011cd05..98f78cfee2 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -24,7 +24,7 @@ jobs: base: # The type of runner that the job will run on runs-on: ubuntu-20.04 - timeout-minutes: 30 + timeout-minutes: 50 strategy: fail-fast: false matrix: diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 08bfdc6824..750ef1e243 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -133,7 +133,7 @@ func NewCase(ctx context.Context, addr string, name string, cfgPath string) (*Ca UniqueKeyColumnNames: []string{"id"}, } generators[tableName] = sqlgen.NewSQLGeneratorImpl(tableConfig) - mcps[tableName] = mcp.NewModificationCandidatePool(1000000) + mcps[tableName] = mcp.NewModificationCandidatePool(100000000) c.keySet[tableName] = make(map[string]struct{}) c.tables = append(c.tables, tableName) } From 79a6c433194b5f04b5be5dfa5da0a578cdd637a0 Mon Sep 17 00:00:00 
2001 From: gmhdbjd Date: Mon, 10 Oct 2022 14:57:13 +0800 Subject: [PATCH 31/49] update --- .github/workflows/dm_chaos.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dm_chaos.yaml b/.github/workflows/dm_chaos.yaml index 50d9babb18..19ff7f8425 100644 --- a/.github/workflows/dm_chaos.yaml +++ b/.github/workflows/dm_chaos.yaml @@ -24,7 +24,7 @@ jobs: base: # The type of runner that the job will run on runs-on: ubuntu-20.04 - timeout-minutes: 30 + timeout-minutes: 50 strategy: fail-fast: false matrix: From 8124aa0166ad45cf0d437297d2d4167c8894341e Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 10 Oct 2022 18:25:02 +0800 Subject: [PATCH 32/49] update --- engine/executor/dm/api_test.go | 6 ++--- engine/executor/dm/worker.go | 28 ++++++++++---------- engine/executor/dm/worker_test.go | 2 +- engine/jobmaster/dm/api_test.go | 18 +++++++------ engine/jobmaster/dm/checkpoint/agent.go | 4 +-- engine/jobmaster/dm/config/config.go | 30 ++++++++++++++++++---- engine/jobmaster/dm/dm_jobmaster.go | 2 +- engine/jobmaster/dm/worker_manager.go | 5 +++- engine/jobmaster/dm/worker_manager_test.go | 6 +++-- 9 files changed, 65 insertions(+), 36 deletions(-) diff --git a/engine/executor/dm/api_test.go b/engine/executor/dm/api_test.go index 5e0710dc27..5a759f512b 100644 --- a/engine/executor/dm/api_test.go +++ b/engine/executor/dm/api_test.go @@ -111,7 +111,7 @@ func TestQueryStatusAPI(t *testing.T) { })) dctx = dctx.WithDeps(dp) - dmWorker := newDMWorker(dctx, "", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0) + dmWorker := newDMWorker(dctx, "", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0, false) unitHolder := &mockUnitHolder{} dmWorker.unitHolder = unitHolder @@ -148,7 +148,7 @@ func TestStopWorker(t *testing.T) { })) dctx = dctx.WithDeps(dp) - dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0) + dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0, false) dmWorker.BaseWorker = framework.MockBaseWorker("worker-id", "master-id", dmWorker) dmWorker.BaseWorker.Init(context.Background()) dmWorker.unitHolder = &mockUnitHolder{} @@ -169,7 +169,7 @@ func TestOperateTask(t *testing.T) { })) dctx = dctx.WithDeps(dp) - dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0) + dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0, false) dmWorker.BaseWorker = framework.MockBaseWorker("worker-id", "master-id", dmWorker) dmWorker.BaseWorker.Init(context.Background()) mockUnitHolder := &mockUnitHolder{} diff --git a/engine/executor/dm/worker.go b/engine/executor/dm/worker.go index e3f7013599..02b4fc275b 100644 --- a/engine/executor/dm/worker.go +++ b/engine/executor/dm/worker.go @@ -73,9 +73,9 @@ func (f workerFactory) DeserializeConfig(configBytes []byte) (registry.WorkerCon // NewWorkerImpl implements WorkerFactory.NewWorkerImpl func (f workerFactory) NewWorkerImpl(ctx *dcontext.Context, workerID frameModel.WorkerID, masterID frameModel.MasterID, conf framework.WorkerConfig) (framework.WorkerImpl, error) { cfg := conf.(*config.TaskCfg) - log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Uint64("config_modify_revision", cfg.ModRevision)) + log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), 
zap.Stringer("worker_type", f.workerType), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Uint64("config_modify_revision", cfg.ModRevision)) dmSubtaskCfg := cfg.ToDMSubTaskCfg(masterID) - return newDMWorker(ctx, masterID, f.workerType, dmSubtaskCfg, cfg.ModRevision), nil + return newDMWorker(ctx, masterID, f.workerType, dmSubtaskCfg, cfg.ModRevision, cfg.NoNeedExtStorage), nil } // IsRetryableError implements WorkerFactory.IsRetryableError @@ -100,23 +100,25 @@ type dmWorker struct { masterID frameModel.MasterID messageHandlerManager p2p.MessageHandlerManager - cfgModRevision uint64 + cfgModRevision uint64 + noNeedExtStorage bool } -func newDMWorker(ctx *dcontext.Context, masterID frameModel.MasterID, workerType framework.WorkerType, cfg *dmconfig.SubTaskConfig, cfgModRevision uint64) *dmWorker { +func newDMWorker(ctx *dcontext.Context, masterID frameModel.MasterID, workerType framework.WorkerType, cfg *dmconfig.SubTaskConfig, cfgModRevision uint64, noNeedExtStorage bool) *dmWorker { // TODO: support config later // nolint:errcheck bf, _ := backoff.NewBackoff(dmconfig.DefaultBackoffFactor, dmconfig.DefaultBackoffJitter, dmconfig.DefaultBackoffMin, dmconfig.DefaultBackoffMax) autoResume := &worker.AutoResumeInfo{Backoff: bf, LatestPausedTime: time.Now(), LatestResumeTime: time.Now()} w := &dmWorker{ - cfg: cfg, - stage: metadata.StageInit, - workerType: workerType, - taskID: cfg.SourceID, - masterID: masterID, - unitHolder: newUnitHolderImpl(workerType, cfg), - autoResume: autoResume, - cfgModRevision: cfgModRevision, + cfg: cfg, + stage: metadata.StageInit, + workerType: workerType, + taskID: cfg.SourceID, + masterID: masterID, + unitHolder: newUnitHolderImpl(workerType, cfg), + autoResume: autoResume, + cfgModRevision: cfgModRevision, + noNeedExtStorage: noNeedExtStorage, } // nolint:errcheck @@ -135,7 +137,7 @@ func (w *dmWorker) InitImpl(ctx context.Context) error { if err := w.messageAgent.UpdateClient(w.masterID, w); err != nil { return err } - if w.cfg.Mode != dmconfig.ModeIncrement { + if w.cfg.Mode != dmconfig.ModeIncrement && !w.noNeedExtStorage { if err := w.setupStorage(ctx); err != nil { return err } diff --git a/engine/executor/dm/worker_test.go b/engine/executor/dm/worker_test.go index c21007f4d5..6d15e26d8d 100644 --- a/engine/executor/dm/worker_test.go +++ b/engine/executor/dm/worker_test.go @@ -109,7 +109,7 @@ func TestWorker(t *testing.T) { require.NoError(t, dp.Provide(func() p2p.MessageHandlerManager { return p2p.NewMockMessageHandlerManager() })) - dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &dmconfig.SubTaskConfig{}, 0) + dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &dmconfig.SubTaskConfig{}, 0, false) unitHolder := &mockUnitHolder{} dmWorker.unitHolder = unitHolder dmWorker.BaseWorker = framework.MockBaseWorker("worker-id", "master-id", dmWorker) diff --git a/engine/jobmaster/dm/api_test.go b/engine/jobmaster/dm/api_test.go index 6499996472..8fbacdcecf 100644 --- a/engine/jobmaster/dm/api_test.go +++ b/engine/jobmaster/dm/api_test.go @@ -47,15 +47,17 @@ func TestQueryStatusAPI(t *testing.T) { BaseJobMaster: mockBaseJobmaster, metadata: metadata.NewMetaData(metaKVClient, log.L()), } - job = &metadata.Job{ + jobCfg = &config.JobCfg{ModRevision: 4} + taskCfg = jobCfg.ToTaskCfg() + job = &metadata.Job{ Tasks: map[string]*metadata.Task{ - "task1": {Stage: metadata.StagePaused, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task2": {Stage: metadata.StageFinished, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task3": {Stage: 
metadata.StageFinished, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task4": {Stage: metadata.StageRunning, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task5": {Stage: metadata.StageRunning, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task6": {Stage: metadata.StageRunning, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task7": {Stage: metadata.StageFinished, Cfg: &config.TaskCfg{ModRevision: 4}}, + "task1": {Stage: metadata.StagePaused, Cfg: taskCfg}, + "task2": {Stage: metadata.StageFinished, Cfg: taskCfg}, + "task3": {Stage: metadata.StageFinished, Cfg: taskCfg}, + "task4": {Stage: metadata.StageRunning, Cfg: taskCfg}, + "task5": {Stage: metadata.StageRunning, Cfg: taskCfg}, + "task6": {Stage: metadata.StageRunning, Cfg: taskCfg}, + "task7": {Stage: metadata.StageFinished, Cfg: taskCfg}, }, } dumpStatus = &pb.DumpStatus{ diff --git a/engine/jobmaster/dm/checkpoint/agent.go b/engine/jobmaster/dm/checkpoint/agent.go index 144f8ce959..5fa66250ac 100644 --- a/engine/jobmaster/dm/checkpoint/agent.go +++ b/engine/jobmaster/dm/checkpoint/agent.go @@ -211,7 +211,7 @@ func onlineDDLName(jobID string, cfg *config.JobCfg) string { func isLoadFresh(ctx context.Context, jobID string, taskCfg *config.TaskCfg, db *conn.BaseDB) (bool, error) { // nolint:gosec - query := fmt.Sprintf("SELECT status FROM %s WHERE `task_name` = ? AND `source_name` = ?", loadTableName(jobID, (*config.JobCfg)(taskCfg))) + query := fmt.Sprintf("SELECT status FROM %s WHERE `task_name` = ? AND `source_name` = ?", loadTableName(jobID, taskCfg.ToJobCfg())) var status string err := db.DB.QueryRowContext(ctx, query, jobID, taskCfg.Upstreams[0].SourceID).Scan(&status) switch { @@ -226,7 +226,7 @@ func isLoadFresh(ctx context.Context, jobID string, taskCfg *config.TaskCfg, db func isSyncFresh(ctx context.Context, jobID string, taskCfg *config.TaskCfg, db *conn.BaseDB) (bool, error) { // nolint:gosec - query := fmt.Sprintf("SELECT 1 FROM %s WHERE `id` = ? AND `is_global` = true", syncTableName(jobID, (*config.JobCfg)(taskCfg))) + query := fmt.Sprintf("SELECT 1 FROM %s WHERE `id` = ? AND `is_global` = true", syncTableName(jobID, taskCfg.ToJobCfg())) var status string err := db.DB.QueryRowContext(ctx, query, taskCfg.Upstreams[0].SourceID).Scan(&status) switch { diff --git a/engine/jobmaster/dm/config/config.go b/engine/jobmaster/dm/config/config.go index a8a46dca1e..6df2dbd03d 100644 --- a/engine/jobmaster/dm/config/config.go +++ b/engine/jobmaster/dm/config/config.go @@ -161,9 +161,7 @@ func (c *JobCfg) Clone() (*JobCfg, error) { func (c *JobCfg) ToTaskCfgs() map[string]*TaskCfg { taskCfgs := make(map[string]*TaskCfg, len(c.Upstreams)) for _, mysqlInstance := range c.Upstreams { - // nolint:errcheck - jobCfg, _ := c.Clone() - taskCfg := (*TaskCfg)(jobCfg) + taskCfg := c.ToTaskCfg() taskCfg.Upstreams = []*UpstreamCfg{mysqlInstance} taskCfgs[mysqlInstance.SourceID] = taskCfg } @@ -176,7 +174,7 @@ func FromTaskCfgs(taskCfgs []*TaskCfg) *JobCfg { return nil } - jobCfg := (*JobCfg)(taskCfgs[0]) + jobCfg := taskCfgs[0].ToJobCfg() // nolint:errcheck jobCfg, _ = jobCfg.Clone() for i := 1; i < len(taskCfgs); i++ { @@ -248,9 +246,31 @@ func (c *JobCfg) verifySourceID() error { return nil } +// ToTaskCfg converts JobCfg to TaskCfg. +func (c *JobCfg) ToTaskCfg() *TaskCfg { + // nolint:errcheck + clone, _ := c.Clone() + return &TaskCfg{ + JobCfg: *clone, + } +} + // TaskCfg shares same struct as JobCfg, but it only serves one upstream. // TaskCfg can be converted to an equivalent DM subtask by ToDMSubTaskCfg. 
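
The type change that follows is the heart of this patch. `type TaskCfg JobCfg` is a defined type, which permits the free conversion `(*config.JobCfg)(taskCfg)` seen in the removed lines but cannot carry extra fields; embedding `JobCfg` admits worker-only fields such as `NoNeedExtStorage`, at the cost of explicit `ToTaskCfg`/`ToJobCfg` helpers. A toy, self-contained illustration of the trade-off (the value copy stands in for the real `Clone()`):

package main

import "fmt"

type JobCfg struct{ Name string }

// Before: type TaskCfg JobCfg
//   jc := (*JobCfg)(tc) // legal: identical underlying types, but no room for new fields

// After: embedding, plus task-only fields.
type TaskCfg struct {
	JobCfg
	NoNeedExtStorage bool // worker-only knob; impossible on the defined type
}

func (c *TaskCfg) ToJobCfg() *JobCfg {
	clone := c.JobCfg // shallow value copy stands in for JobCfg.Clone()
	return &clone
}

func main() {
	tc := &TaskCfg{JobCfg: JobCfg{Name: "job"}, NoNeedExtStorage: true}
	fmt.Println(tc.Name, tc.ToJobCfg().Name) // promoted field + explicit conversion
}
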
-type TaskCfg JobCfg +// TaskCfg add some internal config for jobmaster/worker. +type TaskCfg struct { + JobCfg + + // FIXME: remove this item after fix https://github.com/pingcap/tiflow/issues/7304 + NoNeedExtStorage bool +} + +// ToJobCfg converts TaskCfg to JobCfg. +func (c *TaskCfg) ToJobCfg() *JobCfg { + // nolint:errcheck + clone, _ := c.JobCfg.Clone() + return clone +} // ToDMSubTaskCfg adapts a TaskCfg to a SubTaskCfg for worker now. // TODO: fully support all fields diff --git a/engine/jobmaster/dm/dm_jobmaster.go b/engine/jobmaster/dm/dm_jobmaster.go index 004327433d..ae7cafbef4 100644 --- a/engine/jobmaster/dm/dm_jobmaster.go +++ b/engine/jobmaster/dm/dm_jobmaster.go @@ -411,7 +411,7 @@ func (jm *JobMaster) removeCheckpoint(ctx context.Context) error { } job := state.(*metadata.Job) for _, task := range job.Tasks { - cfg := (*config.JobCfg)(task.Cfg) + cfg := task.Cfg.ToJobCfg() return jm.checkpointAgent.Remove(ctx, cfg) } return errors.New("no task found in job") diff --git a/engine/jobmaster/dm/worker_manager.go b/engine/jobmaster/dm/worker_manager.go index f1c8a9acc2..55295fc84f 100644 --- a/engine/jobmaster/dm/worker_manager.go +++ b/engine/jobmaster/dm/worker_manager.go @@ -249,15 +249,18 @@ func (wm *WorkerManager) checkAndScheduleWorkers(ctx context.Context, job *metad } var resources []resModel.ResourceID + taskCfg := persistentTask.Cfg // first worker don't need local resource. // unfresh sync unit don't need local resource.(if we need to save table checkpoint for loadTableStructureFromDump in future, we can save it before saving global checkpoint.) // TODO: storage should be created/discarded in jobmaster instead of worker. if workerIdxInSeq(persistentTask.Cfg.TaskMode, nextUnit) != 0 && !(nextUnit == frameModel.WorkerDMSync && !isFresh) { resources = append(resources, NewDMResourceID(wm.jobID, persistentTask.Cfg.Upstreams[0].SourceID)) + } else { + taskCfg.NoNeedExtStorage = true } // createWorker should be an asynchronous operation - if err := wm.createWorker(ctx, taskID, nextUnit, persistentTask.Cfg, resources...); err != nil { + if err := wm.createWorker(ctx, taskID, nextUnit, taskCfg, resources...); err != nil { recordError = err continue } diff --git a/engine/jobmaster/dm/worker_manager_test.go b/engine/jobmaster/dm/worker_manager_test.go index ab8f598b9e..45fc59cd1d 100644 --- a/engine/jobmaster/dm/worker_manager_test.go +++ b/engine/jobmaster/dm/worker_manager_test.go @@ -157,10 +157,12 @@ func (t *testDMJobmasterSuite) TestClearWorkerStatus() { job.Tasks[source2] = metadata.NewTask(&config.TaskCfg{}) require.NoError(t.T(), workerManager.stopOutdatedWorkers(context.Background(), job)) messageAgent.On("SendMessage").Return(destroyError).Once() - job.Tasks[source2] = metadata.NewTask(&config.TaskCfg{ModRevision: 1}) + jobCfg := &config.JobCfg{ModRevision: 1} + taskCfg := jobCfg.ToTaskCfg() + job.Tasks[source2] = metadata.NewTask(taskCfg) require.EqualError(t.T(), workerManager.stopOutdatedWorkers(context.Background(), job), destroyError.Error()) messageAgent.On("SendMessage").Return(nil).Once() - job.Tasks[source2] = metadata.NewTask(&config.TaskCfg{ModRevision: 1}) + job.Tasks[source2] = metadata.NewTask(taskCfg) require.NoError(t.T(), workerManager.stopOutdatedWorkers(context.Background(), job)) job = metadata.NewJob(&config.JobCfg{}) From 74f05963f3dcbda4587edc2a20c6e28193f1942b Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 10 Oct 2022 20:21:10 +0800 Subject: [PATCH 33/49] update --- engine/executor/dm/worker.go | 2 +- engine/jobmaster/dm/worker_manager.go 
| 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/engine/executor/dm/worker.go b/engine/executor/dm/worker.go index 02b4fc275b..0079d1ad12 100644 --- a/engine/executor/dm/worker.go +++ b/engine/executor/dm/worker.go @@ -73,7 +73,7 @@ func (f workerFactory) DeserializeConfig(configBytes []byte) (registry.WorkerCon // NewWorkerImpl implements WorkerFactory.NewWorkerImpl func (f workerFactory) NewWorkerImpl(ctx *dcontext.Context, workerID frameModel.WorkerID, masterID frameModel.MasterID, conf framework.WorkerConfig) (framework.WorkerImpl, error) { cfg := conf.(*config.TaskCfg) - log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), zap.Stringer("worker_type", f.workerType), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Uint64("config_modify_revision", cfg.ModRevision)) + log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), zap.Stringer("worker_type", f.workerType), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Uint64("config_modify_revision", cfg.ModRevision), zap.Bool("no need storage", cfg.NoNeedExtStorage)) dmSubtaskCfg := cfg.ToDMSubTaskCfg(masterID) return newDMWorker(ctx, masterID, f.workerType, dmSubtaskCfg, cfg.ModRevision, cfg.NoNeedExtStorage), nil } diff --git a/engine/jobmaster/dm/worker_manager.go b/engine/jobmaster/dm/worker_manager.go index 55295fc84f..adb6a41876 100644 --- a/engine/jobmaster/dm/worker_manager.go +++ b/engine/jobmaster/dm/worker_manager.go @@ -255,7 +255,10 @@ func (wm *WorkerManager) checkAndScheduleWorkers(ctx context.Context, job *metad // TODO: storage should be created/discarded in jobmaster instead of worker. if workerIdxInSeq(persistentTask.Cfg.TaskMode, nextUnit) != 0 && !(nextUnit == frameModel.WorkerDMSync && !isFresh) { resources = append(resources, NewDMResourceID(wm.jobID, persistentTask.Cfg.Upstreams[0].SourceID)) - } else { + } + + // FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304 + if nextUnit == frameModel.WorkerDMSync && !isFresh { taskCfg.NoNeedExtStorage = true } From 0c3b2b2bcc98d95e5aa9ee7c9ce4bb6d5f7875ac Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Mon, 10 Oct 2022 20:21:53 +0800 Subject: [PATCH 34/49] update --- engine/executor/dm/worker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/executor/dm/worker.go b/engine/executor/dm/worker.go index 0079d1ad12..02b4fc275b 100644 --- a/engine/executor/dm/worker.go +++ b/engine/executor/dm/worker.go @@ -73,7 +73,7 @@ func (f workerFactory) DeserializeConfig(configBytes []byte) (registry.WorkerCon // NewWorkerImpl implements WorkerFactory.NewWorkerImpl func (f workerFactory) NewWorkerImpl(ctx *dcontext.Context, workerID frameModel.WorkerID, masterID frameModel.MasterID, conf framework.WorkerConfig) (framework.WorkerImpl, error) { cfg := conf.(*config.TaskCfg) - log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), zap.Stringer("worker_type", f.workerType), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Uint64("config_modify_revision", cfg.ModRevision), zap.Bool("no need storage", cfg.NoNeedExtStorage)) + log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), zap.Stringer("worker_type", f.workerType), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Uint64("config_modify_revision", cfg.ModRevision)) dmSubtaskCfg := cfg.ToDMSubTaskCfg(masterID) return newDMWorker(ctx, masterID, f.workerType, dmSubtaskCfg, cfg.ModRevision, cfg.NoNeedExtStorage), nil } From ff6b001e21f49653cd8ed52f9a993be6db3d4b60 Mon Sep 17 
00:00:00 2001 From: gmhdbjd Date: Mon, 10 Oct 2022 21:44:51 +0800 Subject: [PATCH 35/49] increase diff time --- engine/chaos/cases/dm/case.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 750ef1e243..480a0fe77c 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -40,7 +40,8 @@ const ( tableNum = 5 rowNum = 1000 batch = 100 - diffTimes = 60 + // 5 minutes + diffTimes = 150 diffInterval = 2 * time.Second ) From 25384dde396726ae1a9e17861869465782bcab42 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 11 Oct 2022 11:33:50 +0800 Subject: [PATCH 36/49] no need extStorage if sync is not fresh --- engine/executor/dm/api_test.go | 57 ++++++++++++++++++++-- engine/executor/dm/worker.go | 30 ++++++------ engine/executor/dm/worker_test.go | 18 ++++++- engine/jobmaster/dm/api_test.go | 18 ++++--- engine/jobmaster/dm/checkpoint/agent.go | 4 +- engine/jobmaster/dm/config/config.go | 30 ++++++++++-- engine/jobmaster/dm/dm_jobmaster.go | 2 +- engine/jobmaster/dm/worker_manager.go | 8 ++- engine/jobmaster/dm/worker_manager_test.go | 6 ++- 9 files changed, 135 insertions(+), 38 deletions(-) diff --git a/engine/executor/dm/api_test.go b/engine/executor/dm/api_test.go index 5e0710dc27..9a6c20224a 100644 --- a/engine/executor/dm/api_test.go +++ b/engine/executor/dm/api_test.go @@ -20,10 +20,11 @@ import ( "testing" "github.com/gogo/protobuf/jsonpb" - "github.com/pingcap/tiflow/dm/config" + dmconfig "github.com/pingcap/tiflow/dm/config" "github.com/pingcap/tiflow/dm/pb" "github.com/pingcap/tiflow/engine/framework" frameModel "github.com/pingcap/tiflow/engine/framework/model" + "github.com/pingcap/tiflow/engine/jobmaster/dm/config" "github.com/pingcap/tiflow/engine/jobmaster/dm/metadata" dcontext "github.com/pingcap/tiflow/engine/pkg/context" "github.com/pingcap/tiflow/engine/pkg/deps" @@ -102,6 +103,22 @@ func TestQueryStatusAPI(t *testing.T) { Result: &dmpkg.ProcessResult{Errors: []*dmpkg.ProcessError{processError}}, Status: []byte(syncStatusBytes), } + taskCfg = &config.TaskCfg{ + JobCfg: config.JobCfg{ + TargetDB: &dmconfig.DBConfig{}, + Upstreams: []*config.UpstreamCfg{ + { + MySQLInstance: dmconfig.MySQLInstance{ + Mydumper: &dmconfig.MydumperConfig{}, + Loader: &dmconfig.LoaderConfig{}, + Syncer: &dmconfig.SyncerConfig{}, + SourceID: "task-id", + }, + DBCfg: &dmconfig.DBConfig{}, + }, + }, + }, + } ) dctx := dcontext.Background() @@ -111,7 +128,7 @@ func TestQueryStatusAPI(t *testing.T) { })) dctx = dctx.WithDeps(dp) - dmWorker := newDMWorker(dctx, "", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0) + dmWorker := newDMWorker(dctx, "", frameModel.WorkerDMDump, taskCfg) unitHolder := &mockUnitHolder{} dmWorker.unitHolder = unitHolder @@ -148,7 +165,23 @@ func TestStopWorker(t *testing.T) { })) dctx = dctx.WithDeps(dp) - dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0) + taskCfg := &config.TaskCfg{ + JobCfg: config.JobCfg{ + TargetDB: &dmconfig.DBConfig{}, + Upstreams: []*config.UpstreamCfg{ + { + MySQLInstance: dmconfig.MySQLInstance{ + Mydumper: &dmconfig.MydumperConfig{}, + Loader: &dmconfig.LoaderConfig{}, + Syncer: &dmconfig.SyncerConfig{}, + SourceID: "task-id", + }, + DBCfg: &dmconfig.DBConfig{}, + }, + }, + }, + } + dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, taskCfg) dmWorker.BaseWorker = framework.MockBaseWorker("worker-id", "master-id", dmWorker) 
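
The same minimal TaskCfg literal now appears in TestQueryStatusAPI, TestStopWorker, TestOperateTask, and TestWorker. A hypothetical helper, not part of the patch, that these _test.go files could share; it uses only the fields and packages the tests above already import:

// newTestTaskCfg is a hypothetical fixture constructor: one upstream with
// empty Mydumper/Loader/Syncer sections and the given source ID, matching
// the literals repeated in the tests above.
func newTestTaskCfg(sourceID string) *config.TaskCfg {
	return &config.TaskCfg{
		JobCfg: config.JobCfg{
			TargetDB: &dmconfig.DBConfig{},
			Upstreams: []*config.UpstreamCfg{
				{
					MySQLInstance: dmconfig.MySQLInstance{
						Mydumper: &dmconfig.MydumperConfig{},
						Loader:   &dmconfig.LoaderConfig{},
						Syncer:   &dmconfig.SyncerConfig{},
						SourceID: sourceID,
					},
					DBCfg: &dmconfig.DBConfig{},
				},
			},
		},
	}
}
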
dmWorker.BaseWorker.Init(context.Background()) dmWorker.unitHolder = &mockUnitHolder{} @@ -169,7 +202,23 @@ func TestOperateTask(t *testing.T) { })) dctx = dctx.WithDeps(dp) - dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &config.SubTaskConfig{SourceID: "task-id"}, 0) + taskCfg := &config.TaskCfg{ + JobCfg: config.JobCfg{ + TargetDB: &dmconfig.DBConfig{}, + Upstreams: []*config.UpstreamCfg{ + { + MySQLInstance: dmconfig.MySQLInstance{ + Mydumper: &dmconfig.MydumperConfig{}, + Loader: &dmconfig.LoaderConfig{}, + Syncer: &dmconfig.SyncerConfig{}, + SourceID: "task-id", + }, + DBCfg: &dmconfig.DBConfig{}, + }, + }, + }, + } + dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, taskCfg) dmWorker.BaseWorker = framework.MockBaseWorker("worker-id", "master-id", dmWorker) dmWorker.BaseWorker.Init(context.Background()) mockUnitHolder := &mockUnitHolder{} diff --git a/engine/executor/dm/worker.go b/engine/executor/dm/worker.go index e3f7013599..482827fc49 100644 --- a/engine/executor/dm/worker.go +++ b/engine/executor/dm/worker.go @@ -73,9 +73,8 @@ func (f workerFactory) DeserializeConfig(configBytes []byte) (registry.WorkerCon // NewWorkerImpl implements WorkerFactory.NewWorkerImpl func (f workerFactory) NewWorkerImpl(ctx *dcontext.Context, workerID frameModel.WorkerID, masterID frameModel.MasterID, conf framework.WorkerConfig) (framework.WorkerImpl, error) { cfg := conf.(*config.TaskCfg) - log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Uint64("config_modify_revision", cfg.ModRevision)) - dmSubtaskCfg := cfg.ToDMSubTaskCfg(masterID) - return newDMWorker(ctx, masterID, f.workerType, dmSubtaskCfg, cfg.ModRevision), nil + log.Info("new dm worker", zap.String(logutil.ConstFieldJobKey, masterID), zap.Stringer("worker_type", f.workerType), zap.String(logutil.ConstFieldWorkerKey, workerID), zap.Any("task_config", cfg)) + return newDMWorker(ctx, masterID, f.workerType, cfg), nil } // IsRetryableError implements WorkerFactory.IsRetryableError @@ -100,23 +99,26 @@ type dmWorker struct { masterID frameModel.MasterID messageHandlerManager p2p.MessageHandlerManager - cfgModRevision uint64 + cfgModRevision uint64 + noNeedExtStorage bool } -func newDMWorker(ctx *dcontext.Context, masterID frameModel.MasterID, workerType framework.WorkerType, cfg *dmconfig.SubTaskConfig, cfgModRevision uint64) *dmWorker { +func newDMWorker(ctx *dcontext.Context, masterID frameModel.MasterID, workerType framework.WorkerType, cfg *config.TaskCfg) *dmWorker { // TODO: support config later // nolint:errcheck bf, _ := backoff.NewBackoff(dmconfig.DefaultBackoffFactor, dmconfig.DefaultBackoffJitter, dmconfig.DefaultBackoffMin, dmconfig.DefaultBackoffMax) autoResume := &worker.AutoResumeInfo{Backoff: bf, LatestPausedTime: time.Now(), LatestResumeTime: time.Now()} + dmSubtaskCfg := cfg.ToDMSubTaskCfg(masterID) w := &dmWorker{ - cfg: cfg, - stage: metadata.StageInit, - workerType: workerType, - taskID: cfg.SourceID, - masterID: masterID, - unitHolder: newUnitHolderImpl(workerType, cfg), - autoResume: autoResume, - cfgModRevision: cfgModRevision, + cfg: dmSubtaskCfg, + stage: metadata.StageInit, + workerType: workerType, + taskID: dmSubtaskCfg.SourceID, + masterID: masterID, + unitHolder: newUnitHolderImpl(workerType, dmSubtaskCfg), + autoResume: autoResume, + cfgModRevision: cfg.ModRevision, + noNeedExtStorage: cfg.NoNeedExtStorage, } // nolint:errcheck @@ -135,7 +137,7 @@ func (w *dmWorker) InitImpl(ctx context.Context) 
error { if err := w.messageAgent.UpdateClient(w.masterID, w); err != nil { return err } - if w.cfg.Mode != dmconfig.ModeIncrement { + if w.cfg.Mode != dmconfig.ModeIncrement && !w.noNeedExtStorage { if err := w.setupStorage(ctx); err != nil { return err } diff --git a/engine/executor/dm/worker_test.go b/engine/executor/dm/worker_test.go index 38c88b860d..47fb3002e4 100644 --- a/engine/executor/dm/worker_test.go +++ b/engine/executor/dm/worker_test.go @@ -110,7 +110,23 @@ func TestWorker(t *testing.T) { require.NoError(t, dp.Provide(func() p2p.MessageHandlerManager { return p2p.NewMockMessageHandlerManager() })) - dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, &dmconfig.SubTaskConfig{}, 0) + taskCfg := &config.TaskCfg{ + JobCfg: config.JobCfg{ + TargetDB: &dmconfig.DBConfig{}, + Upstreams: []*config.UpstreamCfg{ + { + MySQLInstance: dmconfig.MySQLInstance{ + Mydumper: &dmconfig.MydumperConfig{}, + Loader: &dmconfig.LoaderConfig{}, + Syncer: &dmconfig.SyncerConfig{}, + SourceID: "task-id", + }, + DBCfg: &dmconfig.DBConfig{}, + }, + }, + }, + } + dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, taskCfg) unitHolder := &mockUnitHolder{} dmWorker.unitHolder = unitHolder dmWorker.BaseWorker = framework.MockBaseWorker("worker-id", "master-id", dmWorker) diff --git a/engine/jobmaster/dm/api_test.go b/engine/jobmaster/dm/api_test.go index 6499996472..8fbacdcecf 100644 --- a/engine/jobmaster/dm/api_test.go +++ b/engine/jobmaster/dm/api_test.go @@ -47,15 +47,17 @@ func TestQueryStatusAPI(t *testing.T) { BaseJobMaster: mockBaseJobmaster, metadata: metadata.NewMetaData(metaKVClient, log.L()), } - job = &metadata.Job{ + jobCfg = &config.JobCfg{ModRevision: 4} + taskCfg = jobCfg.ToTaskCfg() + job = &metadata.Job{ Tasks: map[string]*metadata.Task{ - "task1": {Stage: metadata.StagePaused, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task2": {Stage: metadata.StageFinished, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task3": {Stage: metadata.StageFinished, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task4": {Stage: metadata.StageRunning, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task5": {Stage: metadata.StageRunning, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task6": {Stage: metadata.StageRunning, Cfg: &config.TaskCfg{ModRevision: 4}}, - "task7": {Stage: metadata.StageFinished, Cfg: &config.TaskCfg{ModRevision: 4}}, + "task1": {Stage: metadata.StagePaused, Cfg: taskCfg}, + "task2": {Stage: metadata.StageFinished, Cfg: taskCfg}, + "task3": {Stage: metadata.StageFinished, Cfg: taskCfg}, + "task4": {Stage: metadata.StageRunning, Cfg: taskCfg}, + "task5": {Stage: metadata.StageRunning, Cfg: taskCfg}, + "task6": {Stage: metadata.StageRunning, Cfg: taskCfg}, + "task7": {Stage: metadata.StageFinished, Cfg: taskCfg}, }, } dumpStatus = &pb.DumpStatus{ diff --git a/engine/jobmaster/dm/checkpoint/agent.go b/engine/jobmaster/dm/checkpoint/agent.go index 144f8ce959..5fa66250ac 100644 --- a/engine/jobmaster/dm/checkpoint/agent.go +++ b/engine/jobmaster/dm/checkpoint/agent.go @@ -211,7 +211,7 @@ func onlineDDLName(jobID string, cfg *config.JobCfg) string { func isLoadFresh(ctx context.Context, jobID string, taskCfg *config.TaskCfg, db *conn.BaseDB) (bool, error) { // nolint:gosec - query := fmt.Sprintf("SELECT status FROM %s WHERE `task_name` = ? AND `source_name` = ?", loadTableName(jobID, (*config.JobCfg)(taskCfg))) + query := fmt.Sprintf("SELECT status FROM %s WHERE `task_name` = ? 
AND `source_name` = ?", loadTableName(jobID, taskCfg.ToJobCfg()))
 	var status string
 	err := db.DB.QueryRowContext(ctx, query, jobID, taskCfg.Upstreams[0].SourceID).Scan(&status)
 	switch {
@@ -226,7 +226,7 @@ func isSyncFresh(ctx context.Context, jobID string, taskCfg *config.TaskCfg, db *conn.BaseDB) (bool, error) {
 	// nolint:gosec
-	query := fmt.Sprintf("SELECT 1 FROM %s WHERE `id` = ? AND `is_global` = true", syncTableName(jobID, (*config.JobCfg)(taskCfg)))
+	query := fmt.Sprintf("SELECT 1 FROM %s WHERE `id` = ? AND `is_global` = true", syncTableName(jobID, taskCfg.ToJobCfg()))
 	var status string
 	err := db.DB.QueryRowContext(ctx, query, taskCfg.Upstreams[0].SourceID).Scan(&status)
 	switch {
diff --git a/engine/jobmaster/dm/config/config.go b/engine/jobmaster/dm/config/config.go
index a8a46dca1e..6df2dbd03d 100644
--- a/engine/jobmaster/dm/config/config.go
+++ b/engine/jobmaster/dm/config/config.go
@@ -161,9 +161,7 @@ func (c *JobCfg) Clone() (*JobCfg, error) {
 func (c *JobCfg) ToTaskCfgs() map[string]*TaskCfg {
 	taskCfgs := make(map[string]*TaskCfg, len(c.Upstreams))
 	for _, mysqlInstance := range c.Upstreams {
-		// nolint:errcheck
-		jobCfg, _ := c.Clone()
-		taskCfg := (*TaskCfg)(jobCfg)
+		taskCfg := c.ToTaskCfg()
 		taskCfg.Upstreams = []*UpstreamCfg{mysqlInstance}
 		taskCfgs[mysqlInstance.SourceID] = taskCfg
 	}
@@ -176,7 +174,7 @@ func FromTaskCfgs(taskCfgs []*TaskCfg) *JobCfg {
 		return nil
 	}
 
-	jobCfg := (*JobCfg)(taskCfgs[0])
+	jobCfg := taskCfgs[0].ToJobCfg()
 	// nolint:errcheck
 	jobCfg, _ = jobCfg.Clone()
 	for i := 1; i < len(taskCfgs); i++ {
@@ -248,9 +246,31 @@ func (c *JobCfg) verifySourceID() error {
 	return nil
 }
 
+// ToTaskCfg converts JobCfg to TaskCfg.
+func (c *JobCfg) ToTaskCfg() *TaskCfg {
+	// nolint:errcheck
+	clone, _ := c.Clone()
+	return &TaskCfg{
+		JobCfg: *clone,
+	}
+}
+
 // TaskCfg shares same struct as JobCfg, but it only serves one upstream.
 // TaskCfg can be converted to an equivalent DM subtask by ToDMSubTaskCfg.
-type TaskCfg JobCfg
+// TaskCfg adds some internal config for jobmaster/worker.
+type TaskCfg struct {
+	JobCfg
+
+	// FIXME: remove this item after fix https://github.com/pingcap/tiflow/issues/7304
+	NoNeedExtStorage bool
+}
+
+// ToJobCfg converts TaskCfg to JobCfg.
+func (c *TaskCfg) ToJobCfg() *JobCfg {
+	// nolint:errcheck
+	clone, _ := c.JobCfg.Clone()
+	return clone
+}
 
 // ToDMSubTaskCfg adapts a TaskCfg to a SubTaskCfg for worker now.
 // TODO: fully support all fields
diff --git a/engine/jobmaster/dm/dm_jobmaster.go b/engine/jobmaster/dm/dm_jobmaster.go
index 004327433d..ae7cafbef4 100644
--- a/engine/jobmaster/dm/dm_jobmaster.go
+++ b/engine/jobmaster/dm/dm_jobmaster.go
@@ -411,7 +411,7 @@ func (jm *JobMaster) removeCheckpoint(ctx context.Context) error {
 	}
 	job := state.(*metadata.Job)
 	for _, task := range job.Tasks {
-		cfg := (*config.JobCfg)(task.Cfg)
+		cfg := task.Cfg.ToJobCfg()
 		return jm.checkpointAgent.Remove(ctx, cfg)
 	}
 	return errors.New("no task found in job")
diff --git a/engine/jobmaster/dm/worker_manager.go b/engine/jobmaster/dm/worker_manager.go
index f1c8a9acc2..adb6a41876 100644
--- a/engine/jobmaster/dm/worker_manager.go
+++ b/engine/jobmaster/dm/worker_manager.go
@@ -249,6 +249,7 @@ func (wm *WorkerManager) checkAndScheduleWorkers(ctx context.Context, job *metad
 	}
 
 	var resources []resModel.ResourceID
+	taskCfg := persistentTask.Cfg
 	// first worker don't need local resource.
// unfresh sync unit don't need local resource.(if we need to save table checkpoint for loadTableStructureFromDump in future, we can save it before saving global checkpoint.) // TODO: storage should be created/discarded in jobmaster instead of worker. @@ -256,8 +257,13 @@ func (wm *WorkerManager) checkAndScheduleWorkers(ctx context.Context, job *metad resources = append(resources, NewDMResourceID(wm.jobID, persistentTask.Cfg.Upstreams[0].SourceID)) } + // FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304 + if nextUnit == frameModel.WorkerDMSync && !isFresh { + taskCfg.NoNeedExtStorage = true + } + // createWorker should be an asynchronous operation - if err := wm.createWorker(ctx, taskID, nextUnit, persistentTask.Cfg, resources...); err != nil { + if err := wm.createWorker(ctx, taskID, nextUnit, taskCfg, resources...); err != nil { recordError = err continue } diff --git a/engine/jobmaster/dm/worker_manager_test.go b/engine/jobmaster/dm/worker_manager_test.go index ab8f598b9e..45fc59cd1d 100644 --- a/engine/jobmaster/dm/worker_manager_test.go +++ b/engine/jobmaster/dm/worker_manager_test.go @@ -157,10 +157,12 @@ func (t *testDMJobmasterSuite) TestClearWorkerStatus() { job.Tasks[source2] = metadata.NewTask(&config.TaskCfg{}) require.NoError(t.T(), workerManager.stopOutdatedWorkers(context.Background(), job)) messageAgent.On("SendMessage").Return(destroyError).Once() - job.Tasks[source2] = metadata.NewTask(&config.TaskCfg{ModRevision: 1}) + jobCfg := &config.JobCfg{ModRevision: 1} + taskCfg := jobCfg.ToTaskCfg() + job.Tasks[source2] = metadata.NewTask(taskCfg) require.EqualError(t.T(), workerManager.stopOutdatedWorkers(context.Background(), job), destroyError.Error()) messageAgent.On("SendMessage").Return(nil).Once() - job.Tasks[source2] = metadata.NewTask(&config.TaskCfg{ModRevision: 1}) + job.Tasks[source2] = metadata.NewTask(taskCfg) require.NoError(t.T(), workerManager.stopOutdatedWorkers(context.Background(), job)) job = metadata.NewJob(&config.JobCfg{}) From eceb9c652f8cdf158dd958975eacc59713fb7627 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 11 Oct 2022 14:01:39 +0800 Subject: [PATCH 37/49] genFullData before createJob --- engine/chaos/cases/dm/case.go | 10 +++++----- engine/jobmaster/dm/worker_manager.go | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 480a0fe77c..fa0a0f45d0 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -37,9 +37,9 @@ import ( ) const ( - tableNum = 5 - rowNum = 1000 - batch = 100 + tableNum = 5 + rowNum = 1000 + batch = 100 // 5 minutes diffTimes = 150 diffInterval = 2 * time.Second @@ -150,10 +150,10 @@ func (c *Case) Run(ctx context.Context) error { defer func() { log.L().Info("finish run case", zap.String("name", c.name), zap.String("job_id", c.jobID), zap.Int("insert", c.result[0]), zap.Int("update", c.result[1]), zap.Int("delete", c.result[2])) }() - if err := c.createJob(ctx); err != nil { + if err := c.genFullData(); err != nil { return err } - if err := c.genFullData(); err != nil { + if err := c.createJob(ctx); err != nil { return err } if err := c.diffDataLoop(ctx); err != nil { diff --git a/engine/jobmaster/dm/worker_manager.go b/engine/jobmaster/dm/worker_manager.go index adb6a41876..1d7a515d41 100644 --- a/engine/jobmaster/dm/worker_manager.go +++ b/engine/jobmaster/dm/worker_manager.go @@ -245,7 +245,7 @@ func (wm *WorkerManager) checkAndScheduleWorkers(ctx context.Context, job *metad } else if 
!runningWorker.RunAsExpected() { wm.logger.Info("unexpected worker status", zap.String("task_id", taskID), zap.Stringer("worker_stage", runningWorker.Stage), zap.Stringer("unit", runningWorker.Unit), zap.Stringer("next_unit", nextUnit)) } else { - wm.logger.Info("switch to next unit", zap.String("task_id", taskID), zap.Stringer("next_unit", runningWorker.Unit)) + wm.logger.Info("switch to next unit", zap.String("task_id", taskID), zap.Stringer("next_unit", nextUnit)) } var resources []resModel.ResourceID From 14ea080ae250a9cb5686d00e8b07740ed321c567 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 11 Oct 2022 14:23:41 +0800 Subject: [PATCH 38/49] wait source at first --- .github/workflows/dataflow_engine_chaos.yaml | 110 +++++++++---------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index f52df5079c..d6718480d5 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -86,6 +86,61 @@ jobs: - name: Load docker image to kind cluster run: | kind load docker-image dataflow:chaos --name dataflow-engine-cluster + + # Set up upstream instances + - name: Set up sources + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml + - name: Wait for sources ready # kubectl wait --all not working + run: | + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true + sleep 10 + echo show pvc + kubectl get pvc -l app=sources -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=sources -o wide + echo show sts + kubectl get sts -l app=sources -o wide + echo show po + kubectl get po -l app=sources -o wide + echo describe po + kubectl describe po -l app=sources + echo describe pvc + kubectl describe pvc -l app=sources + kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s + kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s + + # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) + - name: Set up TiDB + run: | + kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml + - name: Wait for TiDB ready + run: | + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true + echo show pvc + kubectl get pvc -l app=tidb -o wide + echo show pv + kubectl get pv -o wide + echo show svc + kubectl get svc -l app=tidb -o wide + echo show sts + kubectl get sts -l app=tidb -o wide + echo show po + kubectl get po -l app=tidb -o wide + echo describe po + kubectl describe po -l app=tidb + echo describe pvc + kubectl describe pvc -l app=tidb + kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s # Set up metastore and basic services - name: Set up metastore and basic services @@ -194,61 +249,6 @@ jobs: kubectl logs chaos-executor-0 -c wait-server-master || true - # Set up upstream instances - - name: Set up sources - run: | - kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - kubectl get -f 
$GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml - - name: Wait for sources ready # kubectl wait --all not working - run: | - kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true - kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true - kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true - sleep 10 - echo show pvc - kubectl get pvc -l app=sources -o wide - echo show pv - kubectl get pv -o wide - echo show svc - kubectl get svc -l app=sources -o wide - echo show sts - kubectl get sts -l app=sources -o wide - echo show po - kubectl get po -l app=sources -o wide - echo describe po - kubectl describe po -l app=sources - echo describe pvc - kubectl describe pvc -l app=sources - kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s - kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s - kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s - - # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator) - - name: Set up TiDB - run: | - kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml - - name: Wait for TiDB ready - run: | - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true - echo show pvc - kubectl get pvc -l app=tidb -o wide - echo show pv - kubectl get pv -o wide - echo show svc - kubectl get svc -l app=tidb -o wide - echo show sts - kubectl get sts -l app=tidb -o wide - echo show po - kubectl get po -l app=tidb -o wide - echo describe po - kubectl describe po -l app=tidb - echo describe pvc - kubectl describe pvc -l app=tidb - kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s - - name: Set up chaos test cases run: | kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml From 32a74d2b02fc5b67c6b16947bcaecb7be1821fd8 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 11 Oct 2022 15:19:58 +0800 Subject: [PATCH 39/49] wait dm enter sync stage --- .github/workflows/dataflow_engine_chaos.yaml | 11 +++++++++++ engine/chaos/cases/dm/case.go | 1 + 2 files changed, 12 insertions(+) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index d6718480d5..9ca1396cc9 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -256,6 +256,17 @@ jobs: kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml kubectl get pods + # FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304 + - name: Wait DM enter sync stage + run: | + for idx in $(seq 0 300); do + echo "wait dm enter sync stage" + if kubectl logs job.batch/chaos-test-case | grep "full mode of the task has completed" ; then + break + fi + sleep 1 + done + - name: Encode chaos-mesh action run: | echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index fa0a0f45d0..f3753a40b2 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -159,6 +159,7 @@ func (c *Case) Run(ctx context.Context) error { if err := c.diffDataLoop(ctx); err != nil { return err } + log.L().Info("full mode of the task has completed", zap.String("name", c.name), zap.String("job_id", c.jobID)) return 
c.incrLoop(ctx) } From 78df0b17ae68534853785f62b64782438c1d2ba3 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 11 Oct 2022 16:20:49 +0800 Subject: [PATCH 40/49] fix no key --- engine/chaos/cases/dm/case.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index f3753a40b2..0ebb2cd80e 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -257,6 +257,11 @@ func (c *Case) randDML(source int, table string) (string, error) { generator := c.generators[source][table] mcp := c.mcps[source][table] t := rand.Intn(3) + key := mcp.NextUK() + // no rows + if key == nil { + t = 0 + } c.result[t]++ switch t { case 0: @@ -278,9 +283,8 @@ func (c *Case) randDML(source int, table string) (string, error) { c.keySet[table][uk.GetValueHash()] = struct{}{} return sql, nil case 1: - return generator.GenUpdateRow(mcp.NextUK()) + return generator.GenUpdateRow(key) default: - key := mcp.NextUK() sql, err := generator.GenDeleteRow(key) if err != nil { return "", err From d330c958ad4f8a6965e950cda19c93e6c760dec7 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Tue, 11 Oct 2022 16:31:37 +0800 Subject: [PATCH 41/49] fix deadline exceeded --- engine/chaos/cases/dm/case.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 0ebb2cd80e..2ff7268822 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -211,6 +211,9 @@ func (c *Case) diffData(ctx context.Context) (bool, error) { for _, tableName := range c.tables { row := c.target.db.DB.QueryRowContext(ctx, fmt.Sprintf("SELECT count(1) FROM %s", dbutil.TableName(c.target.currDB, tableName))) if row.Err() != nil { + if row.Err() == context.DeadlineExceeded { + return false, nil + } return false, row.Err() } var count int From a508ac4d3ee3a979b01cd8437cb846a7a8d3e948 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Wed, 12 Oct 2022 11:09:29 +0800 Subject: [PATCH 42/49] address comment --- engine/executor/dm/worker.go | 24 ++++++++++++------------ engine/executor/dm/worker_test.go | 1 + engine/jobmaster/dm/config/config.go | 2 +- engine/jobmaster/dm/worker_manager.go | 4 ++-- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/engine/executor/dm/worker.go b/engine/executor/dm/worker.go index 482827fc49..7ad2d5b090 100644 --- a/engine/executor/dm/worker.go +++ b/engine/executor/dm/worker.go @@ -99,8 +99,8 @@ type dmWorker struct { masterID frameModel.MasterID messageHandlerManager p2p.MessageHandlerManager - cfgModRevision uint64 - noNeedExtStorage bool + cfgModRevision uint64 + needExtStorage bool } func newDMWorker(ctx *dcontext.Context, masterID frameModel.MasterID, workerType framework.WorkerType, cfg *config.TaskCfg) *dmWorker { @@ -110,15 +110,15 @@ func newDMWorker(ctx *dcontext.Context, masterID frameModel.MasterID, workerType autoResume := &worker.AutoResumeInfo{Backoff: bf, LatestPausedTime: time.Now(), LatestResumeTime: time.Now()} dmSubtaskCfg := cfg.ToDMSubTaskCfg(masterID) w := &dmWorker{ - cfg: dmSubtaskCfg, - stage: metadata.StageInit, - workerType: workerType, - taskID: dmSubtaskCfg.SourceID, - masterID: masterID, - unitHolder: newUnitHolderImpl(workerType, dmSubtaskCfg), - autoResume: autoResume, - cfgModRevision: cfg.ModRevision, - noNeedExtStorage: cfg.NoNeedExtStorage, + cfg: dmSubtaskCfg, + stage: metadata.StageInit, + workerType: workerType, + taskID: dmSubtaskCfg.SourceID, + masterID: masterID, + unitHolder: newUnitHolderImpl(workerType, 
dmSubtaskCfg),
+		autoResume:     autoResume,
+		cfgModRevision: cfg.ModRevision,
+		needExtStorage: cfg.NeedExtStorage,
 	}
 
 	// nolint:errcheck
@@ -137,7 +137,7 @@ func (w *dmWorker) InitImpl(ctx context.Context) error {
 	if err := w.messageAgent.UpdateClient(w.masterID, w); err != nil {
 		return err
 	}
-	if w.cfg.Mode != dmconfig.ModeIncrement && !w.noNeedExtStorage {
+	if w.cfg.Mode != dmconfig.ModeIncrement && w.needExtStorage {
 		if err := w.setupStorage(ctx); err != nil {
 			return err
 		}
diff --git a/engine/executor/dm/worker_test.go b/engine/executor/dm/worker_test.go
index 47fb3002e4..825ac72b6d 100644
--- a/engine/executor/dm/worker_test.go
+++ b/engine/executor/dm/worker_test.go
@@ -125,6 +125,7 @@ func TestWorker(t *testing.T) {
 				},
 			},
 		},
+		NeedExtStorage: true,
 	}
 	dmWorker := newDMWorker(dctx, "master-id", frameModel.WorkerDMDump, taskCfg)
 	unitHolder := &mockUnitHolder{}
diff --git a/engine/jobmaster/dm/config/config.go b/engine/jobmaster/dm/config/config.go
index 6df2dbd03d..6df83ea8fc 100644
--- a/engine/jobmaster/dm/config/config.go
+++ b/engine/jobmaster/dm/config/config.go
@@ -262,7 +262,7 @@ type TaskCfg struct {
 	JobCfg
 
 	// FIXME: remove this item after fix https://github.com/pingcap/tiflow/issues/7304
-	NoNeedExtStorage bool
+	NeedExtStorage bool
 }
 
 // ToJobCfg converts TaskCfg to JobCfg.
diff --git a/engine/jobmaster/dm/worker_manager.go b/engine/jobmaster/dm/worker_manager.go
index adb6a41876..c8b350b9c4 100644
--- a/engine/jobmaster/dm/worker_manager.go
+++ b/engine/jobmaster/dm/worker_manager.go
@@ -258,8 +258,8 @@ func (wm *WorkerManager) checkAndScheduleWorkers(ctx context.Context, job *metad
 	}
 
 	// FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304
-	if nextUnit == frameModel.WorkerDMSync && !isFresh {
-		taskCfg.NoNeedExtStorage = true
+	if nextUnit != frameModel.WorkerDMSync || isFresh {
+		taskCfg.NeedExtStorage = true
 	}
 
 	// createWorker should be an asynchronous operation

From a26d80713f9528ab90c5f3e382ab78025eb5bc5e Mon Sep 17 00:00:00 2001
From: gmhdbjd
Date: Wed, 12 Oct 2022 13:19:19 +0800
Subject: [PATCH 43/49] remove trigger

---
 .github/workflows/dataflow_engine_chaos.yaml | 3 ---
 .github/workflows/dm_chaos.yaml              | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml
index 9ca1396cc9..cc35cdd6bc 100644
--- a/.github/workflows/dataflow_engine_chaos.yaml
+++ b/.github/workflows/dataflow_engine_chaos.yaml
@@ -9,9 +9,6 @@ on:
       description: 'Which PR do you want to trigger (use PR number, such as 6127)'
       required: true
      default: ''
-  pull_request:
-    branches:
-      - master
 
 # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
 concurrency:
diff --git a/.github/workflows/dm_chaos.yaml b/.github/workflows/dm_chaos.yaml
index d2b7ef9a09..49c7d54969 100644
--- a/.github/workflows/dm_chaos.yaml
+++ b/.github/workflows/dm_chaos.yaml
@@ -9,9 +9,6 @@ on:
       description: 'Which PR do you want to trigger'
       required: true
       default: ''
-  pull_request:
-    branches:
-      - master
 
 # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
 concurrency:
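With the pull_request trigger removed by patch 43 (re-added in patch 46 and removed again in patch 49), the chaos workflows can only be started manually through workflow_dispatch. As a usage sketch, and assuming the GitHub CLI is available (the invocation is an illustration, not part of the patch; only the `pr` input name and the sample value 6127 come from the workflow definition), a maintainer could start a run with `gh workflow run dataflow_engine_chaos.yaml -f pr=6127`.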
From 7f2517a3007110bf820cc18d00e4c327f19dcb98 Mon Sep 17 00:00:00 2001
From: gmhdbjd
Date: Wed, 12 Oct 2022 16:07:14 +0800
Subject: [PATCH 44/49] fix lint

---
 dm/simulator/mcp/uk.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dm/simulator/mcp/uk.go b/dm/simulator/mcp/uk.go
index 210fcd65fa..73f300a149 100644
--- a/dm/simulator/mcp/uk.go
+++ b/dm/simulator/mcp/uk.go
@@ -75,7 +75,7 @@ func (uk *UniqueKey) GetValue() map[string]interface{} {
 	return result
 }
 
-// GetValueHash return hash for values
+// GetValueHash returns the hash of the values.
 func (uk *UniqueKey) GetValueHash() string {
 	uk.RLock()
 	defer uk.RUnlock()

From ee37202bd38e701f7e47d223649f0e43081d2dbf Mon Sep 17 00:00:00 2001
From: gmhdbjd
Date: Wed, 12 Oct 2022 17:29:29 +0800
Subject: [PATCH 45/49] address comment

---
 dm/simulator/config/config.go      | 13 +++++++++----
 dm/simulator/config/config_test.go |  4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/dm/simulator/config/config.go b/dm/simulator/config/config.go
index b2f6b61912..9c5ff73b18 100644
--- a/dm/simulator/config/config.go
+++ b/dm/simulator/config/config.go
@@ -55,11 +55,16 @@ func (t *TableConfig) GenCreateTable() string {
 			buf.WriteByte(')')
 		}
 	}
-	for _, ukColName := range t.UniqueKeyColumnNames {
+	if len(t.UniqueKeyColumnNames) > 0 {
 		buf.WriteString(",UNIQUE KEY ")
-		buf.WriteString(dbutil.ColumnName(ukColName))
-		buf.WriteString("(")
-		buf.WriteString(dbutil.ColumnName(ukColName))
+		buf.WriteString(dbutil.ColumnName(strings.Join(t.UniqueKeyColumnNames, "_")))
+		buf.WriteByte('(')
+		for i, ukColName := range t.UniqueKeyColumnNames {
+			if i != 0 {
+				buf.WriteString(",")
+			}
+			buf.WriteString(dbutil.ColumnName(ukColName))
+		}
 		buf.WriteByte(')')
 	}
 	buf.WriteByte(')')
diff --git a/dm/simulator/config/config_test.go b/dm/simulator/config/config_test.go
index f8b6443710..cecb3a24d0 100644
--- a/dm/simulator/config/config_test.go
+++ b/dm/simulator/config/config_test.go
@@ -46,7 +46,7 @@ func TestConfig(t *testing.T) {
 				DataLen: 11,
 			},
 		},
-		UniqueKeyColumnNames: []string{"id"},
+		UniqueKeyColumnNames: []string{"id", "name"},
 	}
-	require.Equal(t, "CREATE TABLE `games`.`members`(`id` int(11),`name` varchar(255),`age` int(11),`team_id` int(11),UNIQUE KEY `id`(`id`))", tableConfig.GenCreateTable())
+	require.Equal(t, "CREATE TABLE `games`.`members`(`id` int(11),`name` varchar(255),`age` int(11),`team_id` int(11),UNIQUE KEY `id_name`(`id`,`name`))", tableConfig.GenCreateTable())
 }

From 58985352ff1a3aa91062a17be18481b6472b7dd3 Mon Sep 17 00:00:00 2001
From: gmhdbjd
Date: Wed, 12 Oct 2022 17:29:41 +0800
Subject: [PATCH 46/49] Revert "remove trigger"

This reverts commit a26d80713f9528ab90c5f3e382ab78025eb5bc5e.
---
 .github/workflows/dataflow_engine_chaos.yaml | 3 +++
 .github/workflows/dm_chaos.yaml              | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml
index cc35cdd6bc..9ca1396cc9 100644
--- a/.github/workflows/dataflow_engine_chaos.yaml
+++ b/.github/workflows/dataflow_engine_chaos.yaml
@@ -9,6 +9,9 @@ on:
       description: 'Which PR do you want to trigger (use PR number, such as 6127)'
       required: true
       default: ''
+  pull_request:
+    branches:
+      - master
 
 # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
concurrency: diff --git a/.github/workflows/dm_chaos.yaml b/.github/workflows/dm_chaos.yaml index 49c7d54969..d2b7ef9a09 100644 --- a/.github/workflows/dm_chaos.yaml +++ b/.github/workflows/dm_chaos.yaml @@ -9,6 +9,9 @@ on: description: 'Which PR do you want to trigger' required: true default: '' + pull_request: + branches: + - master # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency. concurrency: From 782eca614d7e55029fce48145a90754b5b361434 Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 13 Oct 2022 14:30:26 +0800 Subject: [PATCH 47/49] retry create job --- engine/chaos/cases/dm/case.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/engine/chaos/cases/dm/case.go b/engine/chaos/cases/dm/case.go index 2ff7268822..f4b4ea32f5 100644 --- a/engine/chaos/cases/dm/case.go +++ b/engine/chaos/cases/dm/case.go @@ -33,6 +33,7 @@ import ( pb "github.com/pingcap/tiflow/engine/enginepb" "github.com/pingcap/tiflow/engine/jobmaster/dm/config" "github.com/pingcap/tiflow/engine/test/e2e" + "github.com/pingcap/tiflow/pkg/retry" "go.uber.org/zap" ) @@ -164,12 +165,19 @@ func (c *Case) Run(ctx context.Context) error { } func (c *Case) createJob(ctx context.Context) error { - jobID, err := e2e.CreateJobViaHTTP(ctx, c.addr, "chaos-dm-test", "project-dm", pb.Job_DM, c.cfgBytes) - if err != nil { - return err - } - c.jobID = jobID - return nil + return retry.Do(ctx, func() error { + jobID, err := e2e.CreateJobViaHTTP(ctx, c.addr, "chaos-dm-test", "project-dm", pb.Job_DM, c.cfgBytes) + if err != nil { + log.L().Error("create job failed", zap.String("name", c.name), zap.Error(err)) + return err + } + c.jobID = jobID + return nil + }, + retry.WithBackoffBaseDelay(1000 /* 1 second */), + retry.WithBackoffMaxDelay(8000 /* 8 seconds */), + retry.WithMaxTries(15 /* fail after 103 seconds */), + ) } func (c *Case) genFullData() error { From b7c78bc375e995a20fad8807343a45743578a1cd Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 13 Oct 2022 16:31:32 +0800 Subject: [PATCH 48/49] change log level --- deployments/engine/helm/tiflow/values.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/deployments/engine/helm/tiflow/values.yaml b/deployments/engine/helm/tiflow/values.yaml index e2fcb952bd..714e62c438 100644 --- a/deployments/engine/helm/tiflow/values.yaml +++ b/deployments/engine/helm/tiflow/values.yaml @@ -31,9 +31,6 @@ master: max-interval = "15s" max-try-time = 100 - [log] - level = "debug" - executor: replicas: 4 logStorage: 1Gi @@ -42,9 +39,6 @@ executor: keepalive-interval = "500ms" session-ttl = 20 - [log] - level = "debug" - metastore: frameworkStorage: 5Gi businessStorage: 5Gi From 4998d7aed6bfa467a82f5270cc389af85b1b8abd Mon Sep 17 00:00:00 2001 From: gmhdbjd Date: Thu, 13 Oct 2022 17:53:30 +0800 Subject: [PATCH 49/49] remove trigger --- .github/workflows/dataflow_engine_chaos.yaml | 3 --- .github/workflows/dm_chaos.yaml | 3 --- 2 files changed, 6 deletions(-) diff --git a/.github/workflows/dataflow_engine_chaos.yaml b/.github/workflows/dataflow_engine_chaos.yaml index 9ca1396cc9..cc35cdd6bc 100644 --- a/.github/workflows/dataflow_engine_chaos.yaml +++ b/.github/workflows/dataflow_engine_chaos.yaml @@ -9,9 +9,6 @@ on: description: 'Which PR do you want to trigger (use PR number, such as 6127)' required: true default: '' - pull_request: - branches: - - master # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency. 
concurrency: diff --git a/.github/workflows/dm_chaos.yaml b/.github/workflows/dm_chaos.yaml index d2b7ef9a09..49c7d54969 100644 --- a/.github/workflows/dm_chaos.yaml +++ b/.github/workflows/dm_chaos.yaml @@ -9,9 +9,6 @@ on: description: 'Which PR do you want to trigger' required: true default: '' - pull_request: - branches: - - master # See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency. concurrency:
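The retry wrapper introduced in patch 47 is the most reusable piece of the series. Below is a minimal, self-contained sketch of the same pattern, assuming only the pkg/retry options already shown above; createJobOnce is a hypothetical stand-in for e2e.CreateJobViaHTTP, not a real API.

package main

import (
	"context"
	"fmt"

	"github.com/pingcap/tiflow/pkg/retry"
)

// createJobOnce is a hypothetical single attempt at submitting a job;
// it stands in for e2e.CreateJobViaHTTP from the chaos case above.
func createJobOnce(ctx context.Context) (string, error) {
	return "job-1", nil // placeholder: would issue the HTTP request
}

func main() {
	ctx := context.Background()
	var jobID string
	// Mirror patch 47: exponential backoff starting at 1s, capped at
	// 8s per wait, giving up after 15 tries.
	err := retry.Do(ctx, func() error {
		id, err := createJobOnce(ctx)
		if err != nil {
			return err // returned errors are retried with backoff
		}
		jobID = id
		return nil
	},
		retry.WithBackoffBaseDelay(1000),
		retry.WithBackoffMaxDelay(8000),
		retry.WithMaxTries(15),
	)
	if err != nil {
		fmt.Println("create job failed after retries:", err)
		return
	}
	fmt.Println("created job", jobID)
}

Note that retry.Do re-invokes the whole closure, so the wrapped operation should be safe to repeat; the chaos case simply logs each failed attempt and tries again.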