[Commits] d799f5a: MDEV-7249: Performance problem in parallel replication with multi-level slaves

Kristian Nielsen knielsen at knielsen-hq.org
Fri Mar 13 11:46:28 EET 2015


revision-id: d799f5a0eae04db7633ba8d7e44e77e810dbab65
parent(s): ed04c40b01c122436eda6552c550d62ce8a3920b
committer: Kristian Nielsen
branch nick: server
timestamp: 2015-03-13 10:46:00 +0100
message:

MDEV-7249: Performance problem in parallel replication with multi-level slaves

Parallel replication (in 10.0 / "conservative" mode) relies on binlog group
commits to group transactions that can be safely run in parallel on the
slave. The --binlog-commit-wait-count and --binlog-commit-wait-usec options
exist to increase the number of commits per group. But in case of conflicts
between transactions, this can cause unnecessary delay and reduced througput,
especially on a slave where commit order is fixed.

This patch adds a heuristics to reduce this problem. When transaction T1 goes
to commit, it will first wait for N transactions to queue up for a group
commit. However, if we detect that another transaction T2 is waiting for a row
lock held by T1, then we will skip the wait and let T1 commit immediately,
releasing locks and let T2 continue.

On a slave, this avoids the unfortunate situation where T1 is waiting for T2
to join the group commit, but T2 is waiting for T1 to release locks, causing
no work to be done for the duration of the --binlog-commit-wait-usec timeout.

(The heuristic seems reasonable on the master as well, so it is enabled for
all transactions, not just replication transactions).

---
 .../suite/binlog/r/binlog_commit_wait.result       |   57 +++++++++
 mysql-test/suite/binlog/t/binlog_commit_wait.test  |  123 ++++++++++++++++++++
 sql/handler.cc                                     |    6 +
 sql/log.cc                                         |   60 +++++++++-
 sql/log.h                                          |    2 +
 sql/sql_class.cc                                   |    6 +-
 sql/sql_class.h                                    |   12 ++
 sql/transaction.cc                                 |    2 +
 storage/innobase/lock/lock0lock.cc                 |    2 +-
 storage/xtradb/lock/lock0lock.cc                   |    2 +-
 10 files changed, 267 insertions(+), 5 deletions(-)

diff --git a/mysql-test/suite/binlog/r/binlog_commit_wait.result b/mysql-test/suite/binlog/r/binlog_commit_wait.result
new file mode 100644
index 0000000..e3d4b9e
--- /dev/null
+++ b/mysql-test/suite/binlog/r/binlog_commit_wait.result
@@ -0,0 +1,57 @@
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+SET @old_count= @@GLOBAL.binlog_commit_wait_count;
+SET GLOBAL binlog_commit_wait_count= 3;
+SET @old_usec= @@GLOBAL.binlog_commit_wait_usec;
+SET GLOBAL binlog_commit_wait_usec= 20000000;
+SET @a= current_timestamp();
+BEGIN;
+INSERT INTO t1 VALUES (1,0);
+COMMIT;
+INSERT INTO t1 VALUES (1,1);
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
+Ok
+ERROR 23000: Duplicate entry '1' for key 'PRIMARY'
+SET @a= current_timestamp();
+INSERT INTO t1 VALUES (2,0);
+INSERT INTO t1 VALUES (3,0);
+INSERT INTO t1 VALUES (4,0);
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
+Ok
+SET @a= current_timestamp();
+INSERT INTO t1 VALUES (6,0);
+BEGIN;
+UPDATE t1 SET b=b+1 WHERE a=1;
+UPDATE t1 SET b=b+10 WHERE a=1;
+SELECT SLEEP(0.25);
+SLEEP(0.25)
+0
+UPDATE t1 SET b=b+1 WHERE a=3;
+COMMIT;
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
+Ok
+SET @a= current_timestamp();
+INSERT INTO t1 VALUES (7,0);
+INSERT INTO t1 VALUES (8,0);
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
+Ok
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	11
+2	0
+3	1
+4	0
+6	0
+7	0
+8	0
+DROP TABLE t1;
+SET GLOBAL binlog_commit_wait_count= @old_count;
+SET GLOBAL binlog_commit_wait_usec= @old_usec;
diff --git a/mysql-test/suite/binlog/t/binlog_commit_wait.test b/mysql-test/suite/binlog/t/binlog_commit_wait.test
new file mode 100644
index 0000000..5b3fb0a
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_commit_wait.test
@@ -0,0 +1,123 @@
+--source include/have_innodb.inc
+--source include/have_log_bin.inc
+
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+
+SET @old_count= @@GLOBAL.binlog_commit_wait_count;
+SET GLOBAL binlog_commit_wait_count= 3;
+SET @old_usec= @@GLOBAL.binlog_commit_wait_usec;
+SET GLOBAL binlog_commit_wait_usec= 20000000;
+
+connect(con1,localhost,root,,test);
+connect(con2,localhost,root,,test);
+connect(con3,localhost,root,,test);
+
+# Check that if T2 goes to wait for a row lock of T1 while T1 is waiting for
+# more transactions to arrive for group commit, the commit of T1 will complete
+# immediately.
+# We test this by setting a very high timeout (20 seconds), and testing that
+# that much time does not elapse.
+
+--connection default
+SET @a= current_timestamp();
+
+--connection con1
+BEGIN;
+INSERT INTO t1 VALUES (1,0);
+send COMMIT;
+
+--connection con2
+send INSERT INTO t1 VALUES (1,1);
+
+--connection con1
+reap;
+
+--connection default
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+
+--connection con2
+--error ER_DUP_ENTRY
+reap;
+
+
+# Test that the commit triggers when sufficient commits have queued up.
+--connection default
+SET @a= current_timestamp();
+
+--connection con1
+send INSERT INTO t1 VALUES (2,0);
+
+--connection con2
+send INSERT INTO t1 VALUES (3,0);
+
+--connection con3
+INSERT INTO t1 VALUES (4,0);
+
+--connection con1
+reap;
+--connection con2
+reap;
+
+--connection default
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+
+
+# Test that commit triggers immediately if there is already a transaction
+# waiting on another transaction that reaches its commit.
+
+--connection default
+SET @a= current_timestamp();
+
+--connection con1
+send INSERT INTO t1 VALUES (6,0);
+
+--connection con2
+BEGIN;
+UPDATE t1 SET b=b+1 WHERE a=1;
+
+--connection con3
+send UPDATE t1 SET b=b+10 WHERE a=1;
+
+--connection con2
+# A small sleep to let con3 have time to wait on con2.
+# The sleep might be too small on loaded host, but that is not a big problem;
+# it only means we will trigger a different code path (con3 waits after con2
+# is ready to commit rather than before); and either path should work the same.
+# So we will not get false positive in case of different timing; at worst false
+# negative.
+SELECT SLEEP(0.25);
+UPDATE t1 SET b=b+1 WHERE a=3;
+COMMIT;
+
+--connection con1
+reap;
+
+--connection default
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+
+--connection default
+SET @a= current_timestamp();
+
+# Now con3 will be waiting for a following group commit to trigger.
+--connection con1
+send INSERT INTO t1 VALUES (7,0);
+--connection con2
+INSERT INTO t1 VALUES (8,0);
+--connection con3
+reap;
+
+--connection default
+SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
+SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
+
+--connection default
+SELECT * FROM t1 ORDER BY a;
+
+--connection default
+DROP TABLE t1;
+SET GLOBAL binlog_commit_wait_count= @old_count;
+SET GLOBAL binlog_commit_wait_usec= @old_usec;
diff --git a/sql/handler.cc b/sql/handler.cc
index b92db12..f28fb30 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -1545,7 +1545,10 @@ int ha_commit_one_phase(THD *thd, bool all)
   }
   /* Free resources and perform other cleanup even for 'empty' transactions. */
   if (is_real_trans)
+  {
+    thd->has_waiter= false;
     thd->transaction.cleanup();
+  }
 
   DBUG_RETURN(error);
 }
@@ -1626,7 +1629,10 @@ int ha_rollback_trans(THD *thd, bool all)
 
   /* Always cleanup. Even if nht==0. There may be savepoints. */
   if (is_real_trans)
+  {
+    thd->has_waiter= false;
     thd->transaction.cleanup();
+  }
   if (all)
     thd->transaction_rollback_request= FALSE;
 
diff --git a/sql/log.cc b/sql/log.cc
index 6fa6156..30ecd51 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -7029,6 +7029,14 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd)
       }
     }
 
+    /*
+      Handle the heuristics that if another transaction is waiting for this
+      transaction (or if it does so later), then we want to trigger group
+      commit immediately, without waiting for the binlog_commit_wait_usec
+      timeout to expire.
+    */
+    entry->thd->waiting_on_group_commit= true;
+
     /* Add the entry to the group commit queue. */
     next_entry= entry->next;
     entry->next= group_commit_queue;
@@ -7044,7 +7052,7 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd)
     cur= entry->thd->wait_for_commit_ptr;
   }
 
-  if (opt_binlog_commit_wait_count > 0)
+  if (opt_binlog_commit_wait_count > 0 && orig_queue != NULL)
     mysql_cond_signal(&COND_prepare_ordered);
   mysql_mutex_unlock(&LOCK_prepare_ordered);
   DEBUG_SYNC(orig_entry->thd, "commit_after_release_LOCK_prepare_ordered");
@@ -7218,6 +7226,11 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd)
     while (current)
     {
       group_commit_entry *next= current->next;
+      /*
+        Now that group commit is started, we can clear the flag; there is no
+        longer any use in waiters on this commit trying to trigger it early.
+      */
+      current->thd->waiting_on_group_commit= false;
       current->next= queue;
       queue= current;
       current= next;
@@ -7530,7 +7543,7 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd)
   mysql_mutex_assert_owner(&LOCK_prepare_ordered);
 
   for (e= last_head= group_commit_queue, count= 0; e; e= e->next)
-    if (++count >= opt_binlog_commit_wait_count)
+    if (++count >= opt_binlog_commit_wait_count || unlikely(e->thd->has_waiter))
       return;
 
   mysql_mutex_unlock(&LOCK_log);
@@ -7545,13 +7558,20 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd)
                               &wait_until);
     if (err == ETIMEDOUT)
       break;
+    if (unlikely(last_head->thd->has_waiter))
+      break;
     head= group_commit_queue;
     for (e= head; e && e != last_head; e= e->next)
+    {
       ++count;
+      if (unlikely(e->thd->has_waiter))
+        goto after_loop;
+    }
     if (count >= opt_binlog_commit_wait_count)
       break;
     last_head= head;
   }
+after_loop:
 
   /*
     We must not wait for LOCK_log while holding LOCK_prepare_ordered.
@@ -7575,6 +7595,42 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd)
 }
 
 
+void
+MYSQL_BIN_LOG::binlog_trigger_immediate_group_commit()
+{
+  group_commit_entry *head;
+  mysql_mutex_lock(&LOCK_prepare_ordered);
+  head= group_commit_queue;
+  if (head)
+  {
+    head->thd->has_waiter= true;
+    mysql_cond_signal(&COND_prepare_ordered);
+  }
+  mysql_mutex_unlock(&LOCK_prepare_ordered);
+}
+
+
+/*
+  This function is called when a transaction T1 goes to wait for another
+  transaction T2. It is used to cut short any binlog group commit delay from
+  --binlog-commit-wait-count in the case where another transaction is stalled
+  on the wait due to conflicting row locks.
+
+  If T2 is already ready to group commit, any waiting group commit will be
+  signalled to proceed immediately. Otherwise, a flag will be set in T2, and
+  when T2 later becomes ready, immediate group commit will be triggered.
+*/
+void
+binlog_report_wait_for(THD *thd1, THD *thd2)
+{
+  if (opt_binlog_commit_wait_count == 0)
+    return;
+  thd2->has_waiter= true;
+  if (thd2->waiting_on_group_commit)
+    mysql_bin_log.binlog_trigger_immediate_group_commit();
+}
+
+
 /**
   Wait until we get a signal that the relay log has been updated.
 
diff --git a/sql/log.h b/sql/log.h
index a93e2e1..78a55f9 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -690,6 +690,7 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
   void set_max_size(ulong max_size_arg);
   void signal_update();
   void wait_for_sufficient_commits();
+  void binlog_trigger_immediate_group_commit();
   void wait_for_update_relay_log(THD* thd);
   int  wait_for_update_bin_log(THD* thd, const struct timespec * timeout);
   void init(ulong max_size);
@@ -1005,6 +1006,7 @@ bool general_log_print(THD *thd, enum enum_server_command command,
 bool general_log_write(THD *thd, enum enum_server_command command,
                        const char *query, uint query_length);
 
+void binlog_report_wait_for(THD *thd, THD *other_thd);
 void sql_perror(const char *message);
 bool flush_error_log();
 
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 9cdf5ce..8c52d5d 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -887,6 +887,7 @@ bool Drop_table_error_handler::handle_condition(THD *thd,
    in_lock_tables(0),
    bootstrap(0),
    derived_tables_processing(FALSE),
+   waiting_on_group_commit(FALSE), has_waiter(FALSE),
    spcont(NULL),
    m_parser_state(NULL),
 #if defined(ENABLED_DEBUG_SYNC)
@@ -4233,6 +4234,8 @@ static void thd_send_progress(THD *thd)
 {
   rpl_group_info *rgi;
 
+  if (mysql_bin_log.is_open() && opt_binlog_commit_wait_count > 0)
+    return true;
   if (!thd)
     return false;
   rgi= thd->rgi_slave;
@@ -4267,13 +4270,14 @@ static void thd_send_progress(THD *thd)
   not harmful, but could lead to unnecessary kill and retry, so best avoided).
 */
 extern "C" void
-thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd)
+thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
 {
   rpl_group_info *rgi;
   rpl_group_info *other_rgi;
 
   if (!thd || !other_thd)
     return;
+  binlog_report_wait_for(thd, other_thd);
   rgi= thd->rgi_slave;
   other_rgi= other_thd->rgi_slave;
   if (!rgi || !other_rgi)
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 5d64837..3252bc1 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -2675,6 +2675,18 @@ class THD :public Statement,
   */
   bool       is_slave_error;
   /*
+    True when a transaction is queued up for binlog group commit.
+    Used so that if another transaction needs to wait for a row lock held by
+    this transaction, it can signal to trigger the group commit immediately,
+    skipping the normal --binlog-commit-wait-count wait.
+  */
+  bool waiting_on_group_commit;
+  /*
+    Set true when another transaction goes to wait on a row lock held by this
+    transaction. Used together with waiting_on_group_commit.
+  */
+  bool has_waiter;
+  /*
     In case of a slave, set to the error code the master got when executing
     the query. 0 if no error on the master.
   */
diff --git a/sql/transaction.cc b/sql/transaction.cc
index 51d8a08e..a70c075 100644
--- a/sql/transaction.cc
+++ b/sql/transaction.cc
@@ -149,6 +149,8 @@ bool trans_begin(THD *thd, uint flags)
     when we come here.  We should at some point change this to an assert.
   */
   thd->transaction.all.modified_non_trans_table= FALSE;
+  thd->has_waiter= false;
+  thd->waiting_on_group_commit= false;
 
   if (res)
     DBUG_RETURN(TRUE);
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index b822b40..850d81c 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -375,7 +375,7 @@ struct lock_stack_t {
 	ulint		heap_no;		/*!< heap number if rec lock */
 };
 
-extern "C" void thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd);
+extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
 extern "C" int thd_need_wait_for(const MYSQL_THD thd);
 extern "C"
 int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc
index aa36202..f9ea524 100644
--- a/storage/xtradb/lock/lock0lock.cc
+++ b/storage/xtradb/lock/lock0lock.cc
@@ -375,7 +375,7 @@ struct lock_stack_t {
 	ulint		heap_no;		/*!< heap number if rec lock */
 };
 
-extern "C" void thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd);
+extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
 extern "C" int thd_need_wait_for(const MYSQL_THD thd);
 extern "C"
 int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);


More information about the commits mailing list