[Commits] a227cf8: MDEV-7335: Potential parallel slave deadlock with specific binlog corruption

Kristian Nielsen knielsen at knielsen-hq.org
Tue Feb 24 16:15:53 EET 2015


revision-id: a227cf8046c1dbf4c8689c9923d9b3c9e0abc165
parent(s): 79e9ff44d18f5fb986870cfac17fc661fb098098
committer: Kristian Nielsen
branch nick: server
timestamp: 2015-02-24 14:39:15 +0100
message:

MDEV-7335: Potential parallel slave deadlock with specific binlog corruption

If the COMMIT or XID event that ends an event group was somehow missing, the
parallel replication code meant to handle this case was insufficient, leading
to a server deadlock.

---
 mysql-test/suite/rpl/r/rpl_parallel.result |   74 ++++++++++++++++++++++++++++
 mysql-test/suite/rpl/t/rpl_parallel.test   |   56 +++++++++++++++++++++
 sql/rpl_parallel.cc                        |   31 ++++++------
 sql/slave.cc                               |   12 +++++
 4 files changed, 159 insertions(+), 14 deletions(-)

diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result
index 7ceb5ee..3c66a54 100644
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -1136,6 +1136,80 @@ SET GLOBAL debug_dbug=@old_dbug;
 SET GLOBAL slave_parallel_threads=0;
 SET GLOBAL slave_parallel_threads=10;
 include/start_slave.inc
+*** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption ***
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=1;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
+INSERT INTO t2 VALUES (101);
+INSERT INTO t2 VALUES (102);
+INSERT INTO t2 VALUES (103);
+INSERT INTO t2 VALUES (104);
+INSERT INTO t2 VALUES (105);
+SET gtid_seq_no=1000;
+INSERT INTO t2 VALUES (106);
+INSERT INTO t2 VALUES (107);
+INSERT INTO t2 VALUES (108);
+INSERT INTO t2 VALUES (109);
+INSERT INTO t2 VALUES (110);
+INSERT INTO t2 VALUES (111);
+INSERT INTO t2 VALUES (112);
+INSERT INTO t2 VALUES (113);
+INSERT INTO t2 VALUES (114);
+INSERT INTO t2 VALUES (115);
+INSERT INTO t2 VALUES (116);
+INSERT INTO t2 VALUES (117);
+INSERT INTO t2 VALUES (118);
+INSERT INTO t2 VALUES (119);
+INSERT INTO t2 VALUES (120);
+INSERT INTO t2 VALUES (121);
+INSERT INTO t2 VALUES (122);
+INSERT INTO t2 VALUES (123);
+INSERT INTO t2 VALUES (124);
+INSERT INTO t2 VALUES (125);
+INSERT INTO t2 VALUES (126);
+INSERT INTO t2 VALUES (127);
+INSERT INTO t2 VALUES (128);
+INSERT INTO t2 VALUES (129);
+INSERT INTO t2 VALUES (130);
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t2 WHERE a >= 100 ORDER BY a;
+a
+101
+102
+103
+104
+105
+107
+108
+109
+110
+111
+112
+113
+114
+115
+116
+117
+118
+119
+120
+121
+122
+123
+124
+125
+126
+127
+128
+129
+130
+include/stop_slave.inc
+SET GLOBAL debug_dbug=@old_dbug;
+SET GLOBAL slave_parallel_threads=10;
+include/start_slave.inc
 include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
 include/start_slave.inc
diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test
index d4b99d4..7397ede 100644
--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -1843,6 +1843,62 @@ SET GLOBAL slave_parallel_threads=10;
 --source include/start_slave.inc
 
 
+--echo *** MDEV-7335: Potential parallel slave deadlock with specific binlog corruption ***
+
+--connection server_2
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=1;
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
+
+--connection server_1
+INSERT INTO t2 VALUES (101);
+INSERT INTO t2 VALUES (102);
+INSERT INTO t2 VALUES (103);
+INSERT INTO t2 VALUES (104);
+INSERT INTO t2 VALUES (105);
+# Inject a partial event group (missing XID at the end). The bug was that such
+# a partial group was not handled appropriately, leading to server deadlock.
+SET gtid_seq_no=1000;
+INSERT INTO t2 VALUES (106);
+INSERT INTO t2 VALUES (107);
+INSERT INTO t2 VALUES (108);
+INSERT INTO t2 VALUES (109);
+INSERT INTO t2 VALUES (110);
+INSERT INTO t2 VALUES (111);
+INSERT INTO t2 VALUES (112);
+INSERT INTO t2 VALUES (113);
+INSERT INTO t2 VALUES (114);
+INSERT INTO t2 VALUES (115);
+INSERT INTO t2 VALUES (116);
+INSERT INTO t2 VALUES (117);
+INSERT INTO t2 VALUES (118);
+INSERT INTO t2 VALUES (119);
+INSERT INTO t2 VALUES (120);
+INSERT INTO t2 VALUES (121);
+INSERT INTO t2 VALUES (122);
+INSERT INTO t2 VALUES (123);
+INSERT INTO t2 VALUES (124);
+INSERT INTO t2 VALUES (125);
+INSERT INTO t2 VALUES (126);
+INSERT INTO t2 VALUES (127);
+INSERT INTO t2 VALUES (128);
+INSERT INTO t2 VALUES (129);
+INSERT INTO t2 VALUES (130);
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+# The partial event group (a=106) should be rolled back and thus missing.
+SELECT * FROM t2 WHERE a >= 100 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug=@old_dbug;
+SET GLOBAL slave_parallel_threads=10;
+--source include/start_slave.inc
+
+
 # Clean up.
 --connection server_2
 --source include/stop_slave.inc
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 46c3e4a..c6bb974 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -640,7 +640,7 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi,
       }
       DBUG_ASSERT(qev->typ==rpl_parallel_thread::queued_event::QUEUED_EVENT);
 
-      thd->rgi_slave= group_rgi= rgi;
+      thd->rgi_slave= rgi;
       gco= rgi->gco;
       /* Handle a new event group, which will be initiated by a GTID event. */
       if ((event_type= qev->ev->get_type_code()) == GTID_EVENT)
@@ -657,6 +657,21 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi,
             }
           });
 
+        if(unlikely(thd->wait_for_commit_ptr) && group_rgi != NULL)
+        {
+          /*
+            This indicates that we get a new GTID event in the middle of
+            an incomplete event group. This is a corrupt binlog (the master
+            will never write such a binlog), so it does not happen unless
+            someone tries to inject a wrongly crafted binlog, but let us still
+            try to handle it somewhat nicely.
+          */
+          group_rgi->cleanup_context(thd, true);
+          finish_event_group(rpt, group_rgi->gtid_sub_id,
+                             group_rgi->parallel_entry, group_rgi);
+          rpt->loc_free_rgi(group_rgi);
+        }
+
         in_event_group= true;
         /*
           If the standalone flag is set, then this event group consists of a
@@ -742,19 +757,6 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi,
         unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry,
                             &did_enter_cond, &old_stage);
 
-        if(thd->wait_for_commit_ptr)
-        {
-          /*
-            This indicates that we get a new GTID event in the middle of
-            a not completed event group. This is corrupt binlog (the master
-            will never write such binlog), so it does not happen unless
-            someone tries to inject wrong crafted binlog, but let us still
-            try to handle it somewhat nicely.
-          */
-          rgi->cleanup_context(thd, true);
-          thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
-          thd->wait_for_commit_ptr->wakeup_subsequent_commits(rgi->worker_error);
-        }
         thd->wait_for_commit_ptr= &rgi->commit_orderer;
 
         if (opt_gtid_ignore_duplicates)
@@ -780,6 +782,7 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi,
         }
       }
 
+      group_rgi= rgi;
       group_ending= is_group_ending(qev->ev, event_type);
       if (group_ending && likely(!rgi->worker_error))
       {
diff --git a/sql/slave.cc b/sql/slave.cc
index ba56ff5..4635b57 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -5648,6 +5648,18 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
   }
   break;
 
+#ifndef DBUG_OFF
+  case XID_EVENT:
+    DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
+    {
+      /* Inject an event group that is missing its XID commit event. */
+      if (mi->last_queued_gtid.domain_id == 0 &&
+          mi->last_queued_gtid.seq_no == 1000)
+        goto skip_relay_logging;
+    });
+    /* Fall through to default case ... */
+#endif
+
   default:
   default_action:
     if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)


More information about the commits mailing list