[Commits] Rev 4513: MDEV-6903: gtid_slave_pos is incorrect after master crash in http://bazaar.launchpad.net/~maria-captains/maria/10.0

knielsen at knielsen-hq.org knielsen at knielsen-hq.org
Tue Nov 25 13:19:48 EET 2014


At http://bazaar.launchpad.net/~maria-captains/maria/10.0

------------------------------------------------------------
revno: 4513
revision-id: knielsen at knielsen-hq.org-20141125111948-6x80wdjifq215jla
parent: jplindst at mariadb.org-20141125063103-r7q5vxxw0mfpc55p
committer: Kristian Nielsen <knielsen at knielsen-hq.org>
branch nick: work-10.0
timestamp: Tue 2014-11-25 12:19:48 +0100
message:
  MDEV-6903: gtid_slave_pos is incorrect after master crash
  
  When a master server restarts, it logs a special restart format description
  event in its binlog. When the slave sees this event, it knows it needs to roll
  back any active partial transaction, in case the master crashed previously in
  the middle of writing such a transaction to its binlog.
  
  However, there was a bug where this rollback did not reset rgi->gtid_pending.
  This caused @@gtid_slave_pos to be updated incorrectly with the GTID of
  the partial transaction that was rolled back.
  
  Fix this by always clearing rgi->gtid_pending in cleanup_context(), hopefully
  preventing similar bugs from turning up in other special cases where a
  transaction is rolled back during replication.
  
  Thanks to Pavel Ivanov for tracking down the issue and providing a test case.
=== modified file 'mysql-test/suite/rpl/r/rpl_gtid_crash.result'
--- a/mysql-test/suite/rpl/r/rpl_gtid_crash.result	2014-11-17 07:53:42 +0000
+++ b/mysql-test/suite/rpl/r/rpl_gtid_crash.result	2014-11-25 11:19:48 +0000
@@ -133,9 +133,17 @@ SELECT @@GLOBAL.server_id;
 3
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 a
+gtid_check
+Binlog pos ok
 # Wait 30 seconds for SQL thread to catch up with IO thread
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 a
+gtid_check
+Binlog pos ok
+gtid_check
+Slave pos ok
+gtid_check
+Current pos ok
 # Repeat this with additional transactions on the master
 SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
 BEGIN;
@@ -175,11 +183,21 @@ SELECT * from t1 WHERE a > 10 ORDER BY a
 a
 13
 14
+gtid_check
+Binlog pos ok
+gtid_check
+Current pos ok
 # Wait 30 seconds for SQL thread to catch up with IO thread
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 a
 13
 14
+gtid_check
+Binlog pos ok
+gtid_check
+Slave pos ok
+gtid_check
+Current pos ok
 # Repeat this with additional transactions on the master
 SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
 BEGIN;
@@ -205,5 +223,48 @@ SELECT * from t1 WHERE a > 10 ORDER BY a
 14
 23
 24
+# Repeat this with slave restart
+SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
+BEGIN;
+INSERT INTO t1 VALUES (25);
+COMMIT;
+ERROR HY000: Error writing file 'master-bin' (errno: 28 "No space left on device")
+SET GLOBAL debug_dbug="+d,crash_dispatch_command_before";
+COMMIT;
+Got one of the listed errors
+# Wait 30 seconds for IO thread to connect and SQL thread to catch up
+# with IO thread.
+include/stop_slave.inc
+gtid_check
+Binlog pos ok
+gtid_check
+Current pos ok
+INSERT INTO t1 VALUES (26);
+INSERT INTO t1 VALUES (27);
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+a
+13
+14
+23
+24
+26
+27
+include/save_master_gtid.inc
+gtid_check
+Binlog pos ok
+gtid_check
+Slave pos ok
+gtid_check
+Current pos ok
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+a
+13
+14
+23
+24
+26
+27
 DROP TABLE t1;
 include/rpl_end.inc

=== modified file 'mysql-test/suite/rpl/t/rpl_gtid_crash.test'
--- a/mysql-test/suite/rpl/t/rpl_gtid_crash.test	2014-09-02 12:07:01 +0000
+++ b/mysql-test/suite/rpl/t/rpl_gtid_crash.test	2014-11-25 11:19:48 +0000
@@ -269,6 +269,7 @@ SET GLOBAL debug_dbug="+d,crash_before_w
 
 --connection server_1
 INSERT INTO t1 VALUES (9), (10);
+--let $saved_gtid=`SELECT @@last_gtid`
 --save_master_pos
 
 --connection server_2
@@ -333,6 +334,9 @@ EOF
 
 SELECT @@GLOBAL.server_id;
 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
 
 --echo # Wait 30 seconds for SQL thread to catch up with IO thread
 --connection server_2
@@ -357,6 +361,11 @@ if ($read_log_pos != $exec_log_pos)
 }
 
 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_slave_pos, '$saved_gtid'), "Slave pos ok", CONCAT("Unexpected slave pos: ", @@gtid_slave_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
 
 --echo # Repeat this with additional transactions on the master
 
@@ -387,6 +396,7 @@ EOF
 SELECT @@GLOBAL.server_id;
 INSERT INTO t1 VALUES (13);
 INSERT INTO t1 VALUES (14);
+--let $saved_gtid=`SELECT @@last_gtid`
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 --source include/save_master_gtid.inc
 
@@ -420,6 +430,10 @@ EOF
 
 SELECT @@GLOBAL.server_id;
 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
 
 --echo # Wait 30 seconds for SQL thread to catch up with IO thread
 --connection server_2
@@ -444,6 +458,11 @@ if ($read_log_pos != $exec_log_pos)
 }
 
 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_slave_pos, '$saved_gtid'), "Slave pos ok", CONCAT("Unexpected slave pos: ", @@gtid_slave_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
 
 --echo # Repeat this with additional transactions on the master
 
@@ -472,10 +491,91 @@ EOF
 
 INSERT INTO t1 VALUES (23);
 INSERT INTO t1 VALUES (24);
+--let $saved_gtid=`SELECT @@last_gtid`
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/sync_with_master_gtid.inc
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+
+--echo # Repeat this with slave restart
+
+--connection server_1
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+wait
+EOF
+
+SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
+BEGIN;
+INSERT INTO t1 VALUES (25);
+--error ER_ERROR_ON_WRITE
+COMMIT;
+SET GLOBAL debug_dbug="+d,crash_dispatch_command_before";
+--error 2006,2013
+COMMIT;
+
+--source include/wait_until_disconnected.inc
+
+--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart
+EOF
+
+--connection server_1
+--enable_reconnect
+--source include/wait_until_connected_again.inc
+
+--connection server_2
+--echo # Wait 30 seconds for IO thread to connect and SQL thread to catch up
+--echo # with IO thread.
+--let $wait_timeout= 300
+while ($wait_timeout != 0)
+{
+  --let $connected=`SELECT COUNT(*) > 0 FROM information_schema.processlist WHERE State = 'Waiting for master to send event'`
+  if ($connected)
+  {
+    --let $read_log_pos= query_get_value('SHOW SLAVE STATUS', Read_Master_Log_Pos, 1)
+    --let $exec_log_pos= query_get_value('SHOW SLAVE STATUS', Exec_Master_Log_Pos, 1)
+    if ($read_log_pos == $exec_log_pos)
+    {
+      --let $wait_timeout= 0
+    }
+    if ($read_log_pos != $exec_log_pos)
+    {
+      --sleep 0.1
+      --dec $wait_timeout
+    }
+  }
+  if (!$connected)
+  {
+    --sleep 0.1
+    --dec $wait_timeout
+  }
+}
+if (`SELECT NOT $connected OR $read_log_pos != $exec_log_pos`)
+{
+  --die Timeout wait for IO thread to connect and SQL thread to catch up with IO thread
+}
+
+--source include/stop_slave.inc
+
+--connection server_1
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
+INSERT INTO t1 VALUES (26);
+INSERT INTO t1 VALUES (27);
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 --source include/save_master_gtid.inc
 
 --connection server_2
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_slave_pos, '$saved_gtid'), "Slave pos ok", CONCAT("Unexpected slave pos: ", @@gtid_slave_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
+--source include/start_slave.inc
 --source include/sync_with_master_gtid.inc
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 

=== modified file 'sql/rpl_rli.cc'
--- a/sql/rpl_rli.cc	2014-11-13 09:20:48 +0000
+++ b/sql/rpl_rli.cc	2014-11-25 11:19:48 +0000
@@ -1717,6 +1717,11 @@ void rpl_group_info::cleanup_context(THD
     trans_rollback_stmt(thd); // if a "statement transaction"
     /* trans_rollback() also resets OPTION_GTID_BEGIN */
     trans_rollback(thd);      // if a "real transaction"
+    /*
+      Now that we have rolled back the transaction, make sure we do not
+      erroneously update the GTID position.
+    */
+    gtid_pending= false;
   }
   m_table_map.clear_tables();
   slave_close_thread_tables(thd);



More information about the commits mailing list