[Commits] Rev 3492: MDEV-450: Deadlock between starting a slave and reading system variables in http://bazaar.launchpad.net/~maria-captains/maria/5.5

knielsen at knielsen-hq.org knielsen at knielsen-hq.org
Tue Aug 14 13:40:41 EEST 2012


At http://bazaar.launchpad.net/~maria-captains/maria/5.5

------------------------------------------------------------
revno: 3492
revision-id: knielsen at knielsen-hq.org-20120814104040-hos41ce8qjtj643i
parent: sergii at pisem.net-20120813091443-ubw9kq0slp4uib21
committer: knielsen at knielsen-hq.org
branch nick: mariadb-5.5
timestamp: Tue 2012-08-14 12:40:40 +0200
message:
  MDEV-450: Deadlock between starting a slave and reading system variables
  
  Starting the SQL thread might deadlock with reading the values of the
  replication filtering options.
  
  The deadlock is due to a lock order violation when the variables are
  read or set. For example, reading replicate_ignore_table first
  acquires LOCK_global_system_variables in sys_var::value_ptr and later
  acquires LOCK_active_mi in Sys_var_rpl_filter::global_value_ptr. This
  violates the order established when starting a SQL thread, where
  LOCK_active_mi is acquired before start_slave, and ends up creating a
  thread (handle_slave_sql) that allocates a THD handle whose
  constructor acquires LOCK_global_system_variables in THD::init.
  
  The solution is to unlock LOCK_global_system_variables before the
  replication filtering options are set or read. This way the lock
  order is preserved and the data being read/set is still protected
  given that it acquires LOCK_active_mi.
=== added file 'mysql-test/suite/rpl/r/rpl_start_slave_deadlock_sys_vars.result'
--- a/mysql-test/suite/rpl/r/rpl_start_slave_deadlock_sys_vars.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/r/rpl_start_slave_deadlock_sys_vars.result	2012-08-14 10:40:40 +0000
@@ -0,0 +1,31 @@
+include/master-slave.inc
+[connection master]
+# connection: slave
+SET @save_slave_net_timeout = @@GLOBAL.slave_net_timeout;
+STOP SLAVE;
+include/wait_for_slave_to_stop.inc
+# open an extra connection to the slave
+# connection: slave2
+# set debug synchronization point
+SET DEBUG_SYNC='fix_slave_net_timeout SIGNAL parked WAIT_FOR go';
+# attempt to set slave_net_timeout, will wait on sync point
+SET @@GLOBAL.slave_net_timeout = 100;
+# connection: slave
+SET DEBUG_SYNC='now WAIT_FOR parked';
+# connection: slave1
+# attempt to start the SQL thread
+START SLAVE SQL_THREAD;
+# connection: slave
+# wait until SQL thread has been started
+# sleep a bit so that the SQL thread THD handle is initialized
+# signal the set slave_net_timeout to continue
+SET DEBUG_SYNC='now SIGNAL go';
+# connection: slave2
+# reap result of set slave_net_timeout
+# connection: slave1
+# reap result of starting the SQL thread
+# disconnect: slave2
+# connection: slave
+# cleanup
+SET @@GLOBAL.slave_net_timeout = @save_slave_net_timeout;
+include/rpl_end.inc

=== added file 'mysql-test/suite/rpl/t/rpl_start_slave_deadlock_sys_vars.test'
--- a/mysql-test/suite/rpl/t/rpl_start_slave_deadlock_sys_vars.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/rpl/t/rpl_start_slave_deadlock_sys_vars.test	2012-08-14 10:40:40 +0000
@@ -0,0 +1,57 @@
+source include/master-slave.inc;
+source include/have_debug_sync.inc;
+
+--echo # connection: slave
+connection slave;
+SET @save_slave_net_timeout = @@GLOBAL.slave_net_timeout;
+STOP SLAVE;
+source include/wait_for_slave_to_stop.inc;
+
+--echo # open an extra connection to the slave
+connect(slave2,127.0.0.1,root,,test,$SLAVE_MYPORT,);
+--echo # connection: slave2
+--echo # set debug synchronization point
+SET DEBUG_SYNC='fix_slave_net_timeout SIGNAL parked WAIT_FOR go';
+--echo # attempt to set slave_net_timeout, will wait on sync point
+--send SET @@GLOBAL.slave_net_timeout = 100
+
+--echo # connection: slave
+connection slave;
+SET DEBUG_SYNC='now WAIT_FOR parked';
+
+--echo # connection: slave1
+connection slave1;
+--echo # attempt to start the SQL thread
+--send START SLAVE SQL_THREAD
+
+--echo # connection: slave
+connection slave;
+--echo # wait until SQL thread has been started
+let $wait_condition=
+  select count(*) = 1 from information_schema.processlist
+  where state = "Waiting for slave thread to start" and info = "START SLAVE SQL_THREAD";
+--source include/wait_condition.inc
+--echo # sleep a bit so that the SQL thread THD handle is initialized
+sleep 2;
+--echo # signal the set slave_net_timeout to continue
+SET DEBUG_SYNC='now SIGNAL go';
+
+--echo # connection: slave2
+connection slave2;
+--echo # reap result of set slave_net_timeout
+--reap
+
+--echo # connection: slave1
+connection slave1;
+--echo # reap result of starting the SQL thread
+--reap
+
+--echo # disconnect: slave2
+disconnect slave2;
+
+--echo # connection: slave
+connection slave;
+--echo # cleanup
+SET @@GLOBAL.slave_net_timeout = @save_slave_net_timeout;
+
+source include/rpl_end.inc;

=== modified file 'sql/slave.cc'
--- a/sql/slave.cc	2012-08-09 15:22:00 +0000
+++ b/sql/slave.cc	2012-08-14 10:40:40 +0000
@@ -3005,6 +3005,8 @@ pthread_handler_t handle_slave_io(void *
   mysql= NULL ;
   retry_count= 0;
 
+  thd= new THD; // note that contructor of THD uses DBUG_ !
+
   mysql_mutex_lock(&mi->run_lock);
   /* Inform waiting threads that slave has started */
   mi->slave_run_id++;
@@ -3013,7 +3015,6 @@ pthread_handler_t handle_slave_io(void *
   mi->events_till_disconnect = disconnect_slave_event_count;
 #endif
 
-  thd= new THD; // note that contructor of THD uses DBUG_ !
   THD_CHECK_SENTRY(thd);
   mi->io_thd = thd;
 

=== modified file 'sql/sys_vars.cc'
--- a/sql/sys_vars.cc	2012-08-09 15:22:00 +0000
+++ b/sql/sys_vars.cc	2012-08-14 10:40:40 +0000
@@ -47,6 +47,7 @@
 #include "sql_base.h"                           // close_cached_tables
 #include <myisam.h>
 #include "log_slow.h"
+#include "debug_sync.h"                         // DEBUG_SYNC
 
 #include "log_event.h"
 #ifdef WITH_PERFSCHEMA_STORAGE_ENGINE
@@ -3256,6 +3257,13 @@ bool Sys_var_rpl_filter::do_check(THD *t
 {
   bool status;
 
+  /*
+    We must not be holding LOCK_global_system_variables here, otherwise we can
+    deadlock with THD::init() which is invoked from within the slave threads
+    with opposite locking order.
+  */
+  mysql_mutex_assert_not_owner(&LOCK_global_system_variables);
+
   mysql_mutex_lock(&LOCK_active_mi);
   mysql_mutex_lock(&active_mi->rli.run_lock);
 
@@ -3272,22 +3280,43 @@ bool Sys_var_rpl_filter::do_check(THD *t
   return status;
 }
 
-bool Sys_var_rpl_filter::global_update(THD *thd, set_var *var)
+void Sys_var_rpl_filter::lock(void)
 {
-  bool slave_running, status= false;
+  /*
+    Starting a slave thread causes the new thread to attempt to
+    acquire LOCK_global_system_variables (in THD::init) while
+    LOCK_active_mi is being held by the thread that initiated
+    the process. In order to not violate the lock order, unlock
+    LOCK_global_system_variables before grabbing LOCK_active_mi.
+  */
+  mysql_mutex_unlock(&LOCK_global_system_variables);
 
   mysql_mutex_lock(&LOCK_active_mi);
   mysql_mutex_lock(&active_mi->rli.run_lock);
+}
 
-  if (! (slave_running= active_mi->rli.slave_running))
-    status= set_filter_value(var->save_result.string_value.str);
-
+void Sys_var_rpl_filter::unlock(void)
+{
   mysql_mutex_unlock(&active_mi->rli.run_lock);
   mysql_mutex_unlock(&LOCK_active_mi);
 
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+bool Sys_var_rpl_filter::global_update(THD *thd, set_var *var)
+{
+  bool slave_running, status= false;
+
+  lock();
+
+  if (! (slave_running= active_mi->rli.slave_running))
+    status= set_filter_value(var->save_result.string_value.str);
+
   if (slave_running)
     my_error(ER_SLAVE_MUST_STOP, MYF(0));
 
+  unlock();
+
   return slave_running || status;
 }
 
@@ -3326,8 +3355,7 @@ uchar *Sys_var_rpl_filter::global_value_
 
   tmp.length(0);
 
-  mysql_mutex_lock(&LOCK_active_mi);
-  mysql_mutex_lock(&active_mi->rli.run_lock);
+  lock();
 
   switch (opt_id) {
   case OPT_REPLICATE_DO_DB:
@@ -3350,8 +3378,7 @@ uchar *Sys_var_rpl_filter::global_value_
     break;
   }
 
-  mysql_mutex_unlock(&active_mi->rli.run_lock);
-  mysql_mutex_unlock(&LOCK_active_mi);
+  unlock();
 
   return (uchar *) thd->strmake(tmp.ptr(), tmp.length());
 }
@@ -3404,6 +3431,9 @@ static Sys_var_charptr Sys_slave_load_tm
 
 static bool fix_slave_net_timeout(sys_var *self, THD *thd, enum_var_type type)
 {
+  DEBUG_SYNC(thd, "fix_slave_net_timeout");
+
+  mysql_mutex_unlock(&LOCK_global_system_variables);
   mysql_mutex_lock(&LOCK_active_mi);
   DBUG_PRINT("info", ("slave_net_timeout=%u mi->heartbeat_period=%.3f",
                      slave_net_timeout,
@@ -3413,6 +3443,7 @@ static bool fix_slave_net_timeout(sys_va
                         ER_SLAVE_HEARTBEAT_VALUE_OUT_OF_RANGE_MAX,
                         ER(ER_SLAVE_HEARTBEAT_VALUE_OUT_OF_RANGE_MAX));
   mysql_mutex_unlock(&LOCK_active_mi);
+  mysql_mutex_lock(&LOCK_global_system_variables);
   return false;
 }
 static Sys_var_uint Sys_slave_net_timeout(
@@ -3439,6 +3470,7 @@ static bool check_slave_skip_counter(sys
 }
 static bool fix_slave_skip_counter(sys_var *self, THD *thd, enum_var_type type)
 {
+  mysql_mutex_unlock(&LOCK_global_system_variables);
   mysql_mutex_lock(&LOCK_active_mi);
   mysql_mutex_lock(&active_mi->rli.run_lock);
   /*
@@ -3454,6 +3486,7 @@ static bool fix_slave_skip_counter(sys_v
   }
   mysql_mutex_unlock(&active_mi->rli.run_lock);
   mysql_mutex_unlock(&LOCK_active_mi);
+  mysql_mutex_lock(&LOCK_global_system_variables);
   return 0;
 }
 static Sys_var_uint Sys_slave_skip_counter(

=== modified file 'sql/sys_vars.h'
--- a/sql/sys_vars.h	2012-08-09 15:22:00 +0000
+++ b/sql/sys_vars.h	2012-08-14 10:40:40 +0000
@@ -588,6 +588,8 @@ class Sys_var_rpl_filter: public sys_var
 protected:
   uchar *global_value_ptr(THD *thd, LEX_STRING *base);
   bool set_filter_value(const char *value);
+  void lock(void);
+  void unlock(void);
 };
 
 /**



More information about the commits mailing list