[Commits] 880f227: MDEV-7847: "Slave worker thread retried transaction 10 time(s) in vain, giving up", followed by replication hanging

Kristian Nielsen knielsen at knielsen-hq.org
Mon Mar 30 15:33:55 EEST 2015


revision-id: 880f2273fdc39cd1a2ab28f448cdfbf3d6581af2
parent(s): a4082918c8942b4c72b8f2a65cb237aeaaf10b3e
committer: Kristian Nielsen
branch nick: mariadb
timestamp: 2015-03-30 14:33:44 +0200
message:

MDEV-7847: "Slave worker thread retried transaction 10 time(s) in vain, giving up", followed by replication hanging

This patch fixes a bug in the error handling in parallel replication, when one
worker thread gets a failure and other worker threads processing later
transactions have to rollback and abort.

The problem was with the lifetime of group_commit_orderer objects (GCOs).
A GCO is freed when we register that its last event group has committed. This
relies on register_wait_for_prior_commit() and wait_for_prior_commit() to
ensure that the fact that T2 has committed implies that any earlier T1 has
also committed, and can thus no longer execute mark_start_commit().

However, in the error case, the code was skipping the
register_wait_for_prior_commit() and wait_for_prior_commit() calls. Thus
commit ordering was not guaranteed, and a GCO could be freed too early. Then a
later mark_start_commit() would reference deallocated GCO, which could lead to
lost wakeup (causing slave threads to hang) or other corruption.

This patch makes also the error case respect commit order. This way, also the
error case gets the GCO lifetime correct, and the hang no longer occurs.

---
 mysql-test/suite/rpl/r/rpl_parallel.result | 288 ++++++++++++++++++++++++++++-
 mysql-test/suite/rpl/t/rpl_parallel.test   | 204 +++++++++++++++++++-
 sql/log.cc                                 |   9 +
 sql/rpl_parallel.cc                        |  24 ++-
 sql/rpl_parallel.h                         |   2 +-
 5 files changed, 515 insertions(+), 12 deletions(-)

diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result
index 7ceb5ee..666978a 100644
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -1136,11 +1136,297 @@ SET GLOBAL debug_dbug=@old_dbug;
 SET GLOBAL slave_parallel_threads=0;
 SET GLOBAL slave_parallel_threads=10;
 include/start_slave.inc
+*** MDEV-7847: "Slave worker thread retried transaction 10 time(s) in vain, giving up", followed by replication hanging ***
+*** MDEV-7882: Excessive transaction retry in parallel replication ***
+CREATE TABLE t7 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+CREATE TABLE t8 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=40;
+SELECT @old_retries:=@@GLOBAL.slave_transaction_retries;
+ at old_retries:=@@GLOBAL.slave_transaction_retries
+10
+SET GLOBAL slave_transaction_retries= 5;
+INSERT INTO t7 VALUES (1,1), (2,2), (3,3), (4,4), (5,5);
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @commit_id= 42;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+INSERT INTO t8 VALUES (1,1);
+INSERT INTO t8 VALUES (2,2);
+INSERT INTO t8 VALUES (3,3);
+INSERT INTO t8 VALUES (4,4);
+INSERT INTO t8 VALUES (5,5);
+INSERT INTO t8 VALUES (6,6);
+INSERT INTO t8 VALUES (7,7);
+INSERT INTO t8 VALUES (8,8);
+UPDATE t7 SET b=9 WHERE a=3;
+UPDATE t7 SET b=10 WHERE a=3;
+UPDATE t7 SET b=11 WHERE a=3;
+INSERT INTO t8 VALUES (12,12);
+INSERT INTO t8 VALUES (13,13);
+UPDATE t7 SET b=14 WHERE a=3;
+UPDATE t7 SET b=15 WHERE a=3;
+INSERT INTO t8 VALUES (16,16);
+UPDATE t7 SET b=17 WHERE a=3;
+INSERT INTO t8 VALUES (18,18);
+INSERT INTO t8 VALUES (19,19);
+UPDATE t7 SET b=20 WHERE a=3;
+INSERT INTO t8 VALUES (21,21);
+UPDATE t7 SET b=22 WHERE a=3;
+INSERT INTO t8 VALUES (23,24);
+INSERT INTO t8 VALUES (24,24);
+UPDATE t7 SET b=25 WHERE a=3;
+INSERT INTO t8 VALUES (26,26);
+UPDATE t7 SET b=27 WHERE a=3;
+BEGIN;
+INSERT INTO t8 VALUES (28,28);
+INSERT INTO t8 VALUES (29,28), (30,28);
+INSERT INTO t8 VALUES (31,28);
+INSERT INTO t8 VALUES (32,28);
+INSERT INTO t8 VALUES (33,28);
+INSERT INTO t8 VALUES (34,28);
+INSERT INTO t8 VALUES (35,28);
+INSERT INTO t8 VALUES (36,28);
+INSERT INTO t8 VALUES (37,28);
+INSERT INTO t8 VALUES (38,28);
+INSERT INTO t8 VALUES (39,28);
+INSERT INTO t8 VALUES (40,28);
+INSERT INTO t8 VALUES (41,28);
+INSERT INTO t8 VALUES (42,28);
+COMMIT;
+SET @commit_id=43;
+INSERT INTO t8 VALUES (43,43);
+INSERT INTO t8 VALUES (44,44);
+UPDATE t7 SET b=45 WHERE a=3;
+INSERT INTO t8 VALUES (46,46);
+INSERT INTO t8 VALUES (47,47);
+UPDATE t7 SET b=48 WHERE a=3;
+INSERT INTO t8 VALUES (49,49);
+INSERT INTO t8 VALUES (50,50);
+SET @commit_id=44;
+INSERT INTO t8 VALUES (51,51);
+INSERT INTO t8 VALUES (52,52);
+UPDATE t7 SET b=53 WHERE a=3;
+INSERT INTO t8 VALUES (54,54);
+INSERT INTO t8 VALUES (55,55);
+UPDATE t7 SET b=56 WHERE a=3;
+INSERT INTO t8 VALUES (57,57);
+UPDATE t7 SET b=58 WHERE a=3;
+INSERT INTO t8 VALUES (58,58);
+INSERT INTO t8 VALUES (59,59);
+INSERT INTO t8 VALUES (60,60);
+INSERT INTO t8 VALUES (61,61);
+UPDATE t7 SET b=62 WHERE a=3;
+INSERT INTO t8 VALUES (63,63);
+INSERT INTO t8 VALUES (64,64);
+INSERT INTO t8 VALUES (65,65);
+INSERT INTO t8 VALUES (66,66);
+UPDATE t7 SET b=67 WHERE a=3;
+INSERT INTO t8 VALUES (68,68);
+UPDATE t7 SET b=69 WHERE a=3;
+UPDATE t7 SET b=70 WHERE a=3;
+UPDATE t7 SET b=71 WHERE a=3;
+INSERT INTO t8 VALUES (72,72);
+UPDATE t7 SET b=73 WHERE a=3;
+UPDATE t7 SET b=74 WHERE a=3;
+UPDATE t7 SET b=75 WHERE a=3;
+UPDATE t7 SET b=76 WHERE a=3;
+INSERT INTO t8 VALUES (77,77);
+UPDATE t7 SET b=78 WHERE a=3;
+INSERT INTO t8 VALUES (79,79);
+UPDATE t7 SET b=80 WHERE a=3;
+INSERT INTO t8 VALUES (81,81);
+UPDATE t7 SET b=82 WHERE a=3;
+INSERT INTO t8 VALUES (83,83);
+UPDATE t7 SET b=84 WHERE a=3;
+SET @commit_id=45;
+INSERT INTO t8 VALUES (85,85);
+UPDATE t7 SET b=86 WHERE a=3;
+INSERT INTO t8 VALUES (87,87);
+SET @commit_id=46;
+INSERT INTO t8 VALUES (88,88);
+INSERT INTO t8 VALUES (89,89);
+INSERT INTO t8 VALUES (90,90);
+SET SESSION debug_dbug=@old_dbug;
+INSERT INTO t8 VALUES (91,91);
+INSERT INTO t8 VALUES (92,92);
+INSERT INTO t8 VALUES (93,93);
+INSERT INTO t8 VALUES (94,94);
+INSERT INTO t8 VALUES (95,95);
+INSERT INTO t8 VALUES (96,96);
+INSERT INTO t8 VALUES (97,97);
+INSERT INTO t8 VALUES (98,98);
+INSERT INTO t8 VALUES (99,99);
+SELECT * FROM t7 ORDER BY a;
+a	b
+1	1
+2	2
+3	86
+4	4
+5	5
+SELECT * FROM t8 ORDER BY a;
+a	b
+1	1
+2	2
+3	3
+4	4
+5	5
+6	6
+7	7
+8	8
+12	12
+13	13
+16	16
+18	18
+19	19
+21	21
+23	24
+24	24
+26	26
+28	28
+29	28
+30	28
+31	28
+32	28
+33	28
+34	28
+35	28
+36	28
+37	28
+38	28
+39	28
+40	28
+41	28
+42	28
+43	43
+44	44
+46	46
+47	47
+49	49
+50	50
+51	51
+52	52
+54	54
+55	55
+57	57
+58	58
+59	59
+60	60
+61	61
+63	63
+64	64
+65	65
+66	66
+68	68
+72	72
+77	77
+79	79
+81	81
+83	83
+85	85
+87	87
+88	88
+89	89
+90	90
+91	91
+92	92
+93	93
+94	94
+95	95
+96	96
+97	97
+98	98
+99	99
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t7 ORDER BY a;
+a	b
+1	1
+2	2
+3	86
+4	4
+5	5
+SELECT * FROM t8 ORDER BY a;
+a	b
+1	1
+2	2
+3	3
+4	4
+5	5
+6	6
+7	7
+8	8
+12	12
+13	13
+16	16
+18	18
+19	19
+21	21
+23	24
+24	24
+26	26
+28	28
+29	28
+30	28
+31	28
+32	28
+33	28
+34	28
+35	28
+36	28
+37	28
+38	28
+39	28
+40	28
+41	28
+42	28
+43	43
+44	44
+46	46
+47	47
+49	49
+50	50
+51	51
+52	52
+54	54
+55	55
+57	57
+58	58
+59	59
+60	60
+61	61
+63	63
+64	64
+65	65
+66	66
+68	68
+72	72
+77	77
+79	79
+81	81
+83	83
+85	85
+87	87
+88	88
+89	89
+90	90
+91	91
+92	92
+93	93
+94	94
+95	95
+96	96
+97	97
+98	98
+99	99
+include/stop_slave.inc
+SET GLOBAL slave_transaction_retries= @old_retries;
+include/start_slave.inc
 include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
 include/start_slave.inc
 SET DEBUG_SYNC= 'RESET';
 DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7,t8;
 SET DEBUG_SYNC= 'RESET';
 include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test
index d4b99d4..d6d3969 100644
--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -1843,6 +1843,208 @@ SET GLOBAL slave_parallel_threads=10;
 --source include/start_slave.inc
 
 
+--echo *** MDEV-7847: "Slave worker thread retried transaction 10 time(s) in vain, giving up", followed by replication hanging ***
+--echo *** MDEV-7882: Excessive transaction retry in parallel replication ***
+
+--connection server_1
+CREATE TABLE t7 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+CREATE TABLE t8 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=40;
+SELECT @old_retries:=@@GLOBAL.slave_transaction_retries;
+SET GLOBAL slave_transaction_retries= 5;
+
+
+# Using dbug error injection, we artificially create event groups with a lot of
+# conflicting transactions in each event group. The bugs were originally seen
+# "in the wild" with transactions that did not conflict on the master, and only
+# conflicted very rarely on the slave (maybe some edge case with InnoDB btree
+# page splits or something like that). The event groups here loosely reflect
+# the structure of the original failure's group commits.
+
+
+--connection server_1
+INSERT INTO t7 VALUES (1,1), (2,2), (3,3), (4,4), (5,5);
+SET @old_dbug= @@SESSION.debug_dbug;
+SET @commit_id= 42;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+INSERT INTO t8 VALUES (1,1);
+INSERT INTO t8 VALUES (2,2);
+INSERT INTO t8 VALUES (3,3);
+INSERT INTO t8 VALUES (4,4);
+INSERT INTO t8 VALUES (5,5);
+INSERT INTO t8 VALUES (6,6);
+INSERT INTO t8 VALUES (7,7);
+INSERT INTO t8 VALUES (8,8);
+
+UPDATE t7 SET b=9 WHERE a=3;
+UPDATE t7 SET b=10 WHERE a=3;
+UPDATE t7 SET b=11 WHERE a=3;
+
+INSERT INTO t8 VALUES (12,12);
+INSERT INTO t8 VALUES (13,13);
+
+UPDATE t7 SET b=14 WHERE a=3;
+UPDATE t7 SET b=15 WHERE a=3;
+
+INSERT INTO t8 VALUES (16,16);
+
+UPDATE t7 SET b=17 WHERE a=3;
+
+INSERT INTO t8 VALUES (18,18);
+INSERT INTO t8 VALUES (19,19);
+
+UPDATE t7 SET b=20 WHERE a=3;
+
+INSERT INTO t8 VALUES (21,21);
+
+UPDATE t7 SET b=22 WHERE a=3;
+
+INSERT INTO t8 VALUES (23,24);
+INSERT INTO t8 VALUES (24,24);
+
+UPDATE t7 SET b=25 WHERE a=3;
+
+INSERT INTO t8 VALUES (26,26);
+
+UPDATE t7 SET b=27 WHERE a=3;
+
+BEGIN;
+INSERT INTO t8 VALUES (28,28);
+INSERT INTO t8 VALUES (29,28), (30,28);
+INSERT INTO t8 VALUES (31,28);
+INSERT INTO t8 VALUES (32,28);
+INSERT INTO t8 VALUES (33,28);
+INSERT INTO t8 VALUES (34,28);
+INSERT INTO t8 VALUES (35,28);
+INSERT INTO t8 VALUES (36,28);
+INSERT INTO t8 VALUES (37,28);
+INSERT INTO t8 VALUES (38,28);
+INSERT INTO t8 VALUES (39,28);
+INSERT INTO t8 VALUES (40,28);
+INSERT INTO t8 VALUES (41,28);
+INSERT INTO t8 VALUES (42,28);
+COMMIT;
+
+
+SET @commit_id=43;
+INSERT INTO t8 VALUES (43,43);
+INSERT INTO t8 VALUES (44,44);
+
+UPDATE t7 SET b=45 WHERE a=3;
+
+INSERT INTO t8 VALUES (46,46);
+INSERT INTO t8 VALUES (47,47);
+
+UPDATE t7 SET b=48 WHERE a=3;
+
+INSERT INTO t8 VALUES (49,49);
+INSERT INTO t8 VALUES (50,50);
+
+
+SET @commit_id=44;
+INSERT INTO t8 VALUES (51,51);
+INSERT INTO t8 VALUES (52,52);
+
+UPDATE t7 SET b=53 WHERE a=3;
+
+INSERT INTO t8 VALUES (54,54);
+INSERT INTO t8 VALUES (55,55);
+
+UPDATE t7 SET b=56 WHERE a=3;
+
+INSERT INTO t8 VALUES (57,57);
+
+UPDATE t7 SET b=58 WHERE a=3;
+
+INSERT INTO t8 VALUES (58,58);
+INSERT INTO t8 VALUES (59,59);
+INSERT INTO t8 VALUES (60,60);
+INSERT INTO t8 VALUES (61,61);
+
+UPDATE t7 SET b=62 WHERE a=3;
+
+INSERT INTO t8 VALUES (63,63);
+INSERT INTO t8 VALUES (64,64);
+INSERT INTO t8 VALUES (65,65);
+INSERT INTO t8 VALUES (66,66);
+
+UPDATE t7 SET b=67 WHERE a=3;
+
+INSERT INTO t8 VALUES (68,68);
+
+UPDATE t7 SET b=69 WHERE a=3;
+UPDATE t7 SET b=70 WHERE a=3;
+UPDATE t7 SET b=71 WHERE a=3;
+
+INSERT INTO t8 VALUES (72,72);
+
+UPDATE t7 SET b=73 WHERE a=3;
+UPDATE t7 SET b=74 WHERE a=3;
+UPDATE t7 SET b=75 WHERE a=3;
+UPDATE t7 SET b=76 WHERE a=3;
+
+INSERT INTO t8 VALUES (77,77);
+
+UPDATE t7 SET b=78 WHERE a=3;
+
+INSERT INTO t8 VALUES (79,79);
+
+UPDATE t7 SET b=80 WHERE a=3;
+
+INSERT INTO t8 VALUES (81,81);
+
+UPDATE t7 SET b=82 WHERE a=3;
+
+INSERT INTO t8 VALUES (83,83);
+
+UPDATE t7 SET b=84 WHERE a=3;
+
+
+SET @commit_id=45;
+INSERT INTO t8 VALUES (85,85);
+UPDATE t7 SET b=86 WHERE a=3;
+INSERT INTO t8 VALUES (87,87);
+
+
+SET @commit_id=46;
+INSERT INTO t8 VALUES (88,88);
+INSERT INTO t8 VALUES (89,89);
+INSERT INTO t8 VALUES (90,90);
+
+SET SESSION debug_dbug=@old_dbug;
+
+INSERT INTO t8 VALUES (91,91);
+INSERT INTO t8 VALUES (92,92);
+INSERT INTO t8 VALUES (93,93);
+INSERT INTO t8 VALUES (94,94);
+INSERT INTO t8 VALUES (95,95);
+INSERT INTO t8 VALUES (96,96);
+INSERT INTO t8 VALUES (97,97);
+INSERT INTO t8 VALUES (98,98);
+INSERT INTO t8 VALUES (99,99);
+
+
+SELECT * FROM t7 ORDER BY a;
+SELECT * FROM t8 ORDER BY a;
+--source include/save_master_gtid.inc
+
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t7 ORDER BY a;
+SELECT * FROM t8 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL slave_transaction_retries= @old_retries;
+--source include/start_slave.inc
+
+
 # Clean up.
 --connection server_2
 --source include/stop_slave.inc
@@ -1852,7 +2054,7 @@ SET DEBUG_SYNC= 'RESET';
 
 --connection server_1
 DROP function foo;
-DROP TABLE t1,t2,t3,t4,t5,t6;
+DROP TABLE t1,t2,t3,t4,t5,t6,t7,t8;
 SET DEBUG_SYNC= 'RESET';
 
 --source include/rpl_end.inc
diff --git a/sql/log.cc b/sql/log.cc
index 38fe106..5801b3a 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -7224,6 +7224,15 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
   if (likely(is_open()))                       // Should always be true
   {
     commit_id= (last_in_queue == leader ? 0 : (uint64)leader->thd->query_id);
+    DBUG_EXECUTE_IF("binlog_force_commit_id",
+      {
+        const LEX_STRING name= { C_STRING_WITH_LEN("commit_id") };
+        bool null_value;
+        user_var_entry *entry=
+          (user_var_entry*) my_hash_search(&leader->thd->user_vars,
+                                           (uchar*) name.str, name.length);
+        commit_id= entry->val_int(&null_value);
+      });
     /*
       Commit every transaction in the queue.
 
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 3fbc529..7df1ea3 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -113,6 +113,7 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
   wait_for_commit *wfc= &rgi->commit_orderer;
   int err;
 
+  thd->get_stmt_da()->set_overwrite_status(true);
   /*
     Remove any left-over registration to wait for a prior commit to
     complete. Normally, such wait would already have been removed at
@@ -129,14 +130,14 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
     for us to complete and rely on this also ensuring that any other
     event in the group has completed.
 
-    But in the error case, we have to abort anyway, and it seems best
-    to just complete as quickly as possible with unregister. Anyone
-    waiting for us will in any case receive the error back from their
-    wait_for_prior_commit() call.
+    And in the error case, correct GCO lifetime relies on the fact that once
+    the last event group in the GCO has executed wait_for_prior_commit(),
+    all earlier event groups have also committed; this way no more
+    mark_start_commit() calls can be made and it is safe to de-allocate
+    the GCO.
   */
-  if (rgi->worker_error)
-    wfc->unregister_wait_for_prior_commit();
-  else if ((err= wfc->wait_for_prior_commit(thd)))
+  err= wfc->wait_for_prior_commit(thd);
+  if (unlikely(err) && !rgi->worker_error)
     signal_error_to_sql_driver_thread(thd, rgi, err);
   thd->wait_for_commit_ptr= NULL;
 
@@ -193,6 +194,10 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
 
   thd->clear_error();
   thd->reset_killed();
+  /*
+    Would do thd->get_stmt_da()->set_overwrite_status(false) here, but
+    reset_diagnostics_area() already does that.
+  */
   thd->get_stmt_da()->reset_diagnostics_area();
   wfc->wakeup_subsequent_commits(rgi->worker_error);
 }
@@ -761,8 +766,7 @@ handle_rpl_parallel_thread(void *arg)
 
         if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id))
           skip_event_group= true;
-        else
-          register_wait_for_prior_event_group_commit(rgi, entry);
+        register_wait_for_prior_event_group_commit(rgi, entry);
 
         unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry,
                             &did_enter_cond, &old_stage);
@@ -849,7 +853,9 @@ handle_rpl_parallel_thread(void *arg)
       else
       {
         delete qev->ev;
+        thd->get_stmt_da()->set_overwrite_status(true);
         err= thd->wait_for_prior_commit();
+        thd->get_stmt_da()->set_overwrite_status(false);
       }
 
       end_of_group=
diff --git a/sql/rpl_parallel.h b/sql/rpl_parallel.h
index 2604cd9..256dffe 100644
--- a/sql/rpl_parallel.h
+++ b/sql/rpl_parallel.h
@@ -53,7 +53,7 @@ struct group_commit_orderer {
   group_commit_orderer *prev_gco;
   group_commit_orderer *next_gco;
   /*
-    The sub_id of last event group in this the previous GCO.
+    The sub_id of last event group in the previous GCO.
     Only valid if prev_gco != NULL.
   */
   uint64 prior_sub_id;


More information about the commits mailing list