[Commits] Rev 4141: MDEV-5992: EITS: Selectivity of non-indexed condition is counted twice in table's fanout in file:///home/psergey/dev2/10.0-commit/

Sergey Petrunya psergey at askmonty.org
Tue Apr 1 19:59:53 EEST 2014


At file:///home/psergey/dev2/10.0-commit/

------------------------------------------------------------
revno: 4141
revision-id: psergey at askmonty.org-20140401165951-wo69jy0f7pwqefju
parent: sergii at pisem.net-20140329163246-mzfkzyodua370lx5
committer: Sergey Petrunya <psergey at askmonty.org>
branch nick: 10.0-commit
timestamp: Tue 2014-04-01 09:59:51 -0700
message:
  MDEV-5992: EITS: Selectivity of non-indexed condition is counted twice in table's fanout
  MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
  - Fix calculate_cond_selectivity_for_table() to work correctly with range accesses 
    over multi-component keys:
    = First, take selectivity of all possible range scans into account. Remember which 
      fields were used bt the range scans.
    = Then, calculate selectivity produced by sargable predicates on fields. If a 
      field was used in a possible range access, assume its selectivity is already
      taken into account.
  - Fix table_cond_selectivity(): when quick select is used, selectivity of
    COND(table) is taken into account in matching_candidates_in_table(). In
    table_cond_selectivity() we should not apply it for the second time.
=== modified file 'mysql-test/r/selectivity_no_engine.result'
--- a/mysql-test/r/selectivity_no_engine.result	2014-03-27 20:32:53 +0000
+++ b/mysql-test/r/selectivity_no_engine.result	2014-04-01 16:59:51 +0000
@@ -118,6 +118,27 @@ id	select_type	table	type	possible_keys
 Note	1003	select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= <cache>(-(1)))
 drop table t1, t2;
 # 
+# MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
+# 
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b));
+insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D;
+set histogram_size=100;
+set optimizer_use_condition_selectivity=4;
+set use_stat_tables='preferably';
+analyze table t2 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t2	analyze	status	Engine-independent statistics collected
+test.t2	analyze	status	Table is already up to date
+# This must show filtered=100%:
+explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t2	range	a	a	10	NULL	9	100.00	Using index condition
+Warnings:
+Note	1003	select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
+drop table t2, t1;
+# 
 # End of the test file
 # 
 set use_stat_tables= @save_use_stat_tables;

=== modified file 'mysql-test/t/selectivity_no_engine.test'
--- a/mysql-test/t/selectivity_no_engine.test	2014-03-27 09:08:00 +0000
+++ b/mysql-test/t/selectivity_no_engine.test	2014-04-01 16:59:51 +0000
@@ -84,7 +84,23 @@ explain extended select * from t1 where
 explain extended select * from t1 where col1<=-1;
 drop table t1, t2;
 
+--echo # 
+--echo # MDEV-5984: EITS: Incorrect filtered% value for single-table select with range access
+--echo # 
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t2 (a int, b int, col1 varchar(64), col2 varchar(64), key(a,b));
+insert into t2 select A.a+10*B.a, C.a+10*D.a, 'filler-data1', 'filler-data2' from t1 A, t1 B, t1 C, t1 D;
+
+set histogram_size=100;
+set optimizer_use_condition_selectivity=4;
+set use_stat_tables='preferably';
+analyze table t2 persistent for all;
+--echo # This must show filtered=100%:
+explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
 
+drop table t2, t1;
 --echo # 
 --echo # End of the test file
 --echo # 

=== modified file 'sql/opt_range.cc'
--- a/sql/opt_range.cc	2014-03-26 21:25:38 +0000
+++ b/sql/opt_range.cc	2014-04-01 16:59:51 +0000
@@ -3390,6 +3390,17 @@ double records_in_column_ranges(PARAM *p
     on the rows of 'table' in the processed query.
     The calculated selectivity is assigned to the field table->cond_selectivity.
     
+    Selectivity is calculated as a product of selectivities imposed by:
+
+    1. possible range accesses. (if multiple range accesses use the same
+       restrictions on the same field, we make adjustments for that)
+    2. Sargable conditions on fields for which we have column statistics (if 
+       a field is used in a possible range access, we assume that selectivity
+       is already provided by the range access' estimates)
+    3. Reading a few records from the table pages and checking the condition
+       selectivity (this is used for conditions like "column LIKE '%val%'" 
+       where approaches #1 and #2 do not provide selectivity data).
+
   NOTE
     Currently the selectivities of range conditions over different columns are
     considered independent. 
@@ -3415,14 +3426,90 @@ bool calculate_cond_selectivity_for_tabl
   if (table->pos_in_table_list->schema_table)
     DBUG_RETURN(FALSE);
   
+  MY_BITMAP handled_columns;
+  my_bitmap_map* buf;
+  if (!(buf= (my_bitmap_map*)thd->alloc(table->s->column_bitmap_size)))
+    DBUG_RETURN(TRUE);
+  my_bitmap_init(&handled_columns, buf, table->s->fields, FALSE);
+
+  /*
+    First, take into account possible range accesses. 
+    range access estimates are the most precise, we prefer them to any other
+    estimate sources.
+  */
+
+  for (keynr= 0;  keynr < table->s->keys; keynr++)
+  {
+    if (table->quick_keys.is_set(keynr))
+      set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]);
+  }
+
+  /* 
+    Walk through all indexes, indexes where range access uses more keyparts 
+    go first.
+  */
+  for (uint quick_key_parts= max_quick_key_parts;
+       quick_key_parts; quick_key_parts--)
+  {
+    for (keynr= 0;  keynr < table->s->keys; keynr++)
+    {
+      if (table->quick_keys.is_set(keynr) &&
+          table->quick_key_parts[keynr] == quick_key_parts)
+      {
+        uint i;
+        uint used_key_parts= table->quick_key_parts[keynr];
+        double quick_cond_selectivity= table->quick_rows[keynr] / 
+	                               table_records;
+        KEY *key_info= table->key_info + keynr;
+        KEY_PART_INFO* key_part= key_info->key_part;
+        /*
+          Suppose, there are range conditions on two keys
+            KEY1 (col1, col2)
+            KEY2 (col3, col2)
+          
+          we don't want to count selectivity of condition on col2 twice.
+          
+          First, find the longest key prefix that's made of columns whose
+          selectivity wasn't already accounted for.
+        */
+        for (i= 0; i < used_key_parts; i++, key_part++)
+        {
+          if (bitmap_is_set(&handled_columns, key_part->fieldnr-1))
+	    break; 
+          bitmap_set_bit(&handled_columns, key_part->fieldnr-1);
+        }
+        if (i)
+        {
+          /* 
+            There is at least 1-column prefix of columns whose selectivity has
+            not yet been accounted for.
+          */
+          table->cond_selectivity*= quick_cond_selectivity;
+          if (i != used_key_parts)
+	  {
+            /*
+              Range access got us estimate for #used_key_parts.
+              We need estimate for #(i-1) key parts.
+            */
+            double f1= key_info->actual_rec_per_key(i-1);
+            double f2= key_info->actual_rec_per_key(i);
+            table->cond_selectivity*= f1 / f2;
+          }
+        }
+      }
+    }
+  }
+   
+  /* 
+    Second step: calculate the selectivity of the range conditions not 
+    supported by any index
+  */
+  bitmap_subtract(used_fields, &handled_columns);
+  /* no need to do: my_bitmap_free(&handled_columns); */
+
   if (thd->variables.optimizer_use_condition_selectivity > 2 &&
       !bitmap_is_clear_all(used_fields))
   {
-    /* 
-      Calculate the selectivity of the range conditions not supported
-      by any index
-    */
-
     PARAM param;
     MEM_ROOT alloc;
     SEL_TREE *tree;
@@ -3509,47 +3596,8 @@ bool calculate_cond_selectivity_for_tabl
 
   bitmap_clear_all(used_fields);
 
-  for (keynr= 0;  keynr < table->s->keys; keynr++)
-  {
-    if (table->quick_keys.is_set(keynr))
-      set_if_bigger(max_quick_key_parts, table->quick_key_parts[keynr]);
-  }
-
-  for (uint quick_key_parts= max_quick_key_parts;
-       quick_key_parts; quick_key_parts--)
-  {
-    for (keynr= 0;  keynr < table->s->keys; keynr++)
-    {
-      if (table->quick_keys.is_set(keynr) &&
-          table->quick_key_parts[keynr] == quick_key_parts)
-      {
-        uint i;
-        uint used_key_parts= table->quick_key_parts[keynr];
-        double quick_cond_selectivity= table->quick_rows[keynr] / 
-	                               table_records;
-        KEY *key_info= table->key_info + keynr;
-        KEY_PART_INFO* key_part= key_info->key_part;
-        for (i= 0; i < used_key_parts; i++, key_part++)
-        {
-          if (bitmap_is_set(used_fields, key_part->fieldnr-1))
-	    break; 
-          bitmap_set_bit(used_fields, key_part->fieldnr-1);
-        }
-        if (i)
-        {
-          table->cond_selectivity*= quick_cond_selectivity;
-          if (i != used_key_parts)
-	  {
-            double f1= key_info->actual_rec_per_key(i-1);
-            double f2= key_info->actual_rec_per_key(i);
-            table->cond_selectivity*= f1 / f2;
-          }
-        }
-      } 
-    }
-  }
 
-  /* Calculate selectivity of probably highly selective predicates */
+  /* Check if we can improve selectivity estimates by using sampling */
   ulong check_rows=
     MY_MIN(thd->variables.optimizer_selectivity_sampling_limit,
         (ulong) (table_records * SELECTIVITY_SAMPLING_SHARE));

=== modified file 'sql/sql_select.cc'
--- a/sql/sql_select.cc	2014-03-26 21:25:38 +0000
+++ b/sql/sql_select.cc	2014-04-01 16:59:51 +0000
@@ -5557,7 +5557,20 @@ void set_position(JOIN *join,uint idx,JO
 }
 
 
-/* Estimate of the number matching candidates in the joined table */
+/*
+  Estimate how many records we will get if we read just this table and apply
+  a part of WHERE that can be checked for it.
+
+  @detail
+  Estimate how many records we will get if we
+   - read the given table with its "independent" access method (either quick 
+     select or full table/index scan),
+   - apply the part of WHERE that refers only to this table.
+
+  @seealso
+    table_cond_selectivity() produces selectivity of condition that is checked
+    after joining rows from this table to rows from preceding tables.
+*/
 
 inline
 double matching_candidates_in_table(JOIN_TAB *s, bool with_found_constraint,
@@ -7236,14 +7249,25 @@ double table_multi_eq_cond_selectivity(J
 
 /**
   @brief
-  Get the selectivity of conditions when joining a table
+    Get the selectivity of conditions when joining a table
 
   @param join       The optimized join
   @param s          The table to be joined for evaluation
   @param rem_tables The bitmap of tables to be joined later
 
+  @detail
+    Get selectivity of conditions that can be applied when joining this table
+    with previous tables.
+
+    For quick selects and full table scans, selectivity of COND(this_table)
+    is accounted for in matching_candidates_in_table(). Here, we only count
+    selectivity of COND(this_table, previous_tables). 
+
+    For other access methods, we need to calculate selectivity of the whole
+    condition, "COND(this_table) AND COND(this_table, previous_tables)".
+
   @retval
-  selectivity of the conditions imposed on the rows of s
+    selectivity of the conditions imposed on the rows of s
 */
 
 static
@@ -7255,22 +7279,21 @@ double table_cond_selectivity(JOIN *join
   TABLE *table= s->table;
   MY_BITMAP *read_set= table->read_set;
   double sel= s->table->cond_selectivity;
-  double table_records= table->stat_records();
   POSITION *pos= &join->positions[idx];
   uint keyparts= 0;
   uint found_part_ref_or_null= 0;
 
-  /* Discount the selectivity of the access method used to join table s */
-  if (s->quick && s->quick->index != MAX_KEY)
-  {
-    if (pos->key == 0 && table_records > 0)
-    {
-      sel/= table->quick_rows[s->quick->index]/table_records;
-    }
-  }
-  else if (pos->key != 0)
+  if (pos->key != 0)
   {
-    /* A ref/ access or hash join is used to join table */
+    /* 
+      A ref access or hash join is used for this table.
+
+      It could have some parts with "t.key_part=const". Using ref access
+      means that we will only get records where the condition holds, so we
+      should remove its selectivity from the condition selectivity.
+      
+      (TODO: more details about the "t.key=othertable.col" case)
+    */
     KEYUSE *keyuse= pos->key;
     KEYUSE *prev_ref_keyuse= keyuse;
     uint key= keyuse->key;
@@ -7317,9 +7340,14 @@ double table_cond_selectivity(JOIN *join
   }
   else
   {
+    /*
+      The table is accessed with full table scan, or quick select.
+      Selectivity of COND(table) is already accounted for in 
+      matching_candidates_in_table().
+    */
     sel= 1;
   }
-    
+
   /* 
     If the field f from the table is equal to a field from one the
     earlier joined tables then the selectivity of the range conditions



More information about the commits mailing list