[Commits] Rev 4151: MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered% in file:///home/psergey/dev2/10.0/

Sergey Petrunya psergey at askmonty.org
Mon Apr 21 14:37:57 EEST 2014


At file:///home/psergey/dev2/10.0/

------------------------------------------------------------
revno: 4151
revision-id: psergey at askmonty.org-20140421113755-0xkafcnaamk6gw01
parent: psergey at askmonty.org-20140411210132-2awihjjspe4kyrot
committer: Sergey Petrunya <psergey at askmonty.org>
branch nick: 10.0
timestamp: Mon 2014-04-21 15:37:55 +0400
message:
  MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
  MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
  MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
  - Made a number of fixes in table_cond_selectivity() so that it returns
    correct selectivity estimates.
  - Added comments in related code.
=== modified file 'mysql-test/r/selectivity_no_engine.result'
--- a/mysql-test/r/selectivity_no_engine.result	2014-04-01 16:59:51 +0000
+++ b/mysql-test/r/selectivity_no_engine.result	2014-04-21 11:37:55 +0000
@@ -139,6 +139,118 @@ id	select_type	table	type	possible_keys
 Note	1003	select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`col1` AS `col1`,`test`.`t2`.`col2` AS `col2` from `test`.`t2` where ((`test`.`t2`.`a` in (1,2,3)) and (`test`.`t2`.`b` in (1,2,3)))
 drop table t2, t1;
 # 
+# MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
+# 
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(key1 int, col1 int, key(key1));
+insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	Table is already up to date
+# 10% is ok
+explain extended select * from t1 where col1=2;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` = 2)
+# Must show 100%, not 10%
+explain extended select * from t1 where key1=2;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ref	key1	key1	5	const	98	100.00	
+Warnings:
+Note	1003	select `test`.`t1`.`key1` AS `key1`,`test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`key1` = 2)
+drop table t0, t1;
+# 
+# MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
+# 
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t2(a int);
+insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
+create table t3 as select * from t2;
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t2 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t2	analyze	status	Engine-independent statistics collected
+test.t2	analyze	status	OK
+analyze table t3 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t3	analyze	status	Engine-independent statistics collected
+test.t3	analyze	status	OK
+explain extended select * from t2 A where A.a < 40;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
+Warnings:
+Note	1003	select `test`.`A`.`a` AS `a` from `test`.`t2` `A` where (`test`.`A`.`a` < 40)
+explain extended select * from t3 B where B.a < 100;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where
+Warnings:
+Note	1003	select `test`.`B`.`a` AS `a` from `test`.`t3` `B` where (`test`.`B`.`a` < 100)
+explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
+1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	9.90	Using where; Using join buffer (flat, BNL join)
+Warnings:
+Note	1003	select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`A`.`a` < 40) and (`test`.`B`.`a` < 100))
+explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	A	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where
+1	SIMPLE	B	ALL	NULL	NULL	NULL	NULL	1000	4.95	Using where; Using join buffer (flat, BNL join)
+Warnings:
+Note	1003	select `test`.`A`.`a` AS `a`,`test`.`B`.`a` AS `a` from `test`.`t2` `A` join `test`.`t3` `B` where ((`test`.`B`.`a` = `test`.`A`.`a`) and (`test`.`A`.`a` < 40) and (`test`.`A`.`a` < 100))
+drop table t1, t2, t3;
+select 1;
+1
+1
+# 
+# MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
+# 
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int, kp2 int, 
+filler1 char(100),
+filler2 char(100),
+key(kp1, kp2)
+);
+insert into t1 
+select 
+A.a,
+B.a,
+'filler-data-1',
+'filler-data-2'
+from t0 A, t0 B, t0 C;
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	Table is already up to date
+# NOTE: 10*100%, 10*100% rows is ok
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
+1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,func	10	100.00	Using index condition
+Warnings:
+Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = (`test`.`t0`.`a` + 1)))
+# NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t0	ALL	NULL	NULL	NULL	NULL	10	100.00	Using where
+1	SIMPLE	t1	ref	kp1	kp1	10	test.t0.a,const	10	100.00	
+Warnings:
+Note	1003	select `test`.`t0`.`a` AS `a`,`test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`filler1` AS `filler1`,`test`.`t1`.`filler2` AS `filler2` from `test`.`t0` join `test`.`t1` where ((`test`.`t1`.`kp1` = `test`.`t0`.`a`) and (`test`.`t1`.`kp2` = 4))
+drop table t0, t1;
+# 
 # End of the test file
 # 
 set use_stat_tables= @save_use_stat_tables;

=== modified file 'mysql-test/t/selectivity_no_engine.test'
--- a/mysql-test/t/selectivity_no_engine.test	2014-04-01 16:59:51 +0000
+++ b/mysql-test/t/selectivity_no_engine.test	2014-04-21 11:37:55 +0000
@@ -101,6 +101,78 @@ analyze table t2 persistent for all;
 explain extended select * from t2 where a in (1,2,3) and b in (1,2,3);
 
 drop table t2, t1;
+
+--echo # 
+--echo # MDEV-5980: EITS: if condition is used for REF access, its selectivity is still in filtered%
+--echo # 
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(key1 int, col1 int, key(key1));
+insert into t1 select A.a, A.a from t0 A, t0 B, t0 C;
+
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t1 persistent for all;
+--echo # 10% is ok
+explain extended select * from t1 where col1=2;
+--echo # Must show 100%, not 10%
+explain extended select * from t1 where key1=2;
+drop table t0, t1;
+
+##--disable_parsing
+--echo # 
+--echo # MDEV-5985: EITS: selectivity estimates look illogical for join and non-key equalities
+--echo # 
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t2(a int);
+insert into t2 select A.a + B.a* 10 + C.a * 100 from t1 A, t1 B, t1 C;
+create table t3 as select * from t2;
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t2 persistent for all;
+analyze table t3 persistent for all;
+explain extended select * from t2 A where A.a < 40;
+explain extended select * from t3 B where B.a < 100;
+explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100;
+explain extended select * from t2 A, t3 B where A.a < 40 and B.a < 100 and B.a=A.a;
+drop table t1, t2, t3;
+## --enable_parsing
+select 1;
+##--disable_parsing
+--echo # 
+--echo # MDEV-6003: EITS: ref access, keypart2=const vs keypart2=expr - inconsistent filtered% value
+--echo # 
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+  kp1 int, kp2 int, 
+  filler1 char(100),
+  filler2 char(100),
+  key(kp1, kp2)
+);
+
+insert into t1 
+select 
+  A.a,
+  B.a,
+  'filler-data-1',
+  'filler-data-2'
+from t0 A, t0 B, t0 C;
+set histogram_size=100;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+analyze table t1 persistent for all;
+--echo # NOTE: 10*100%, 10*100% rows is ok
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=t0.a+1;
+
+--echo # NOTE: t0: 10*100% is ok,  t1: 10*9.90% is bad. t1 should have 10*100%.
+explain extended select * from t0, t1 where t1.kp1=t0.a and t1.kp2=4;
+drop table t0, t1;
+##--enable_parsing
+
 --echo # 
 --echo # End of the test file
 --echo # 

=== modified file 'sql/opt_range.cc'
--- a/sql/opt_range.cc	2014-04-01 16:59:51 +0000
+++ b/sql/opt_range.cc	2014-04-21 11:37:55 +0000
@@ -3401,6 +3401,10 @@ double records_in_column_ranges(PARAM *p
        selectivity (this is used for conditions like "column LIKE '%val%'" 
        where approaches #1 and #2 do not provide selectivity data).
 
+  SEE ALSO
+     table_cond_selectivity()
+     matching_candidates_in_table()
+
   NOTE
     Currently the selectivities of range conditions over different columns are
     considered independent. 

=== modified file 'sql/sql_select.cc'
--- a/sql/sql_select.cc	2014-04-11 21:01:32 +0000
+++ b/sql/sql_select.cc	2014-04-21 11:37:55 +0000
@@ -7165,8 +7165,18 @@ double table_multi_eq_cond_selectivity(J
       the current value of sel by this selectivity
     */
     table_map used_tables= item_equal->used_tables();
+
+    /* 
+      Equalities that do not include fields in this table do not matter 
+    */
     if (!(used_tables & table_bit))
       continue;
+
+    /* 
+      Equalities that include a constant are taken into account in
+      table->cond_selectivity. Selectivity from there is taken into account 
+      in matching_candidates_in_table() and/or table_cond_selectivity(). 
+    */
     if (item_equal->get_const())
       continue;
 
@@ -7178,14 +7188,23 @@ double table_multi_eq_cond_selectivity(J
       Field *fld= fi.get_curr_field();
       if (fld->table->map != table_bit)
         continue;
+
       if (pos->key == 0)
+      {
+        /* 
+          No ref access used (and no const in the multi-equality). We will
+          need to adjust the selectivity.
+        */
         adjust_sel= TRUE;
+      }
       else
       {
+        /* Ok, [eq]ref access is used */
         uint i;
         KEYUSE *keyuse= pos->key;
         uint key= keyuse->key;
 
+        /* Find which keypart participates in the equality */
         for (i= 0; i < keyparts; i++)
 	{
           uint fldno;
@@ -7196,6 +7215,7 @@ double table_multi_eq_cond_selectivity(J
           if (fld->field_index == fldno)
             break;
         }
+
         if (i == keyparts)
 	{
           /* 
@@ -7221,6 +7241,7 @@ double table_multi_eq_cond_selectivity(J
         }          
       }
     }
+
     if (adjust_sel)
     {
       /* 
@@ -7266,6 +7287,10 @@ double table_multi_eq_cond_selectivity(J
 
     For other access methods, we need to calculate selectivity of the whole
     condition, "COND(this_table) AND COND(this_table, previous_tables)".
+  
+  @seealso
+    calculate_cond_selectivity_for_table()
+    matching_candidates_in_table()
 
   @retval
     selectivity of the conditions imposed on the rows of s
@@ -7276,9 +7301,7 @@ double table_cond_selectivity(JOIN *join
                               table_map rem_tables)
 {
   uint16 ref_keyuse_steps[MAX_REF_PARTS - 1];
-  Field *field;
   TABLE *table= s->table;
-  MY_BITMAP *read_set= table->read_set;
   double sel= s->table->cond_selectivity;
   POSITION *pos= &join->positions[idx];
   uint keyparts= 0;
@@ -7287,23 +7310,79 @@ double table_cond_selectivity(JOIN *join
   if (pos->key != 0)
   {
     /* 
-      A ref access or hash join is used for this table.
+      A ref access or hash join is used for this table. ref access is created
+      from
+
+        tbl.keypart1=expr1 AND tbl.keypart2=expr2 AND ...
+      
+      and it will only return rows for which this condition is satisified.
+      Suppose, certain expr{i} is a constant. Since ref access only returns
+      rows that satisfy
+        
+         tbl.keypart{i}=const       (*)
+
+      then selectivity of this equality should not be counted in return value 
+      of this function. This function uses the value of 
+       
+         table->cond_selectivity=selectivity(COND(tbl))
+      
+      as a starting point. This value includes selectivity of equality (*). We
+      should somehow discount it. 
+      
+      Looking at calculate_cond_selectivity_for_table(), one can see that that
+      value is not necessarily a direct multiplicand in table->cond_selectivity
 
-      It could have some parts with "t.key_part=const". Using ref access
-      means that we will only get records where the condition holds, so we
-      should remove its selectivity from the condition selectivity.
+      There are three possible ways to discount
+      1. There is a range access on t.keypart{i}=const. 
+         (an important special case: multi-keypart ref(const) access)
+      
+      2. The field has a histogram. field[x]->cond_selectivity has the data.
       
+      3. Use index stats on this index:
+         rec_per_key[key_part+1]/rec_per_key[key_part]
+
       (TODO: more details about the "t.key=othertable.col" case)
     */
     KEYUSE *keyuse= pos->key;
     KEYUSE *prev_ref_keyuse= keyuse;
     uint key= keyuse->key;
-    do
+    
+    /*
+      Check if we have a prefix of key=const that matches a quick select.
+    */
+    if (!is_hash_join_key_no(key))
+    {
+      table_map quick_key_map= (table_map(1) << table->quick_key_parts[key]) - 1;
+      if (table->quick_rows[key] && 
+          !(quick_key_map & ~table->const_key_parts[key]))
+      {
+        /* 
+          Ok, there is an equality for each of the key parts used by the
+          quick select. This means, quick select's estimate can be reused to
+          discount the selectivity of a prefix of a ref access.
+        */
+        for (; quick_key_map & 1 ; quick_key_map>>= 1)
+        {
+          while (keyuse->keypart == keyparts)
+            keyuse++;
+          keyparts++;
+        }
+        sel /= table->quick_rows[key] / table->stat_records();
+      }
+    }
+    
+    /*
+      Go through the "keypart{N}=..." equalities and find those that were
+      already taken into account in table->cond_selectivity.
+    */
+    //do
+    while (keyuse->table == table && keyuse->key == key)
     {
       if (!(keyuse->used_tables & (rem_tables | table->map)))
       {
         if (are_tables_local(s, keyuse->val->used_tables()))
-	{
+	{                            /// ^^ why val->used_tables here but just
+                                     ///       used_tables above?
           if (is_hash_join_key_no(key))
 	  {
             if (keyparts == keyuse->keypart)
@@ -7312,22 +7391,40 @@ double table_cond_selectivity(JOIN *join
           else
 	  {
             if (keyparts == keyuse->keypart &&
-                !(~(keyuse->val->used_tables()) & pos->ref_depend_map) &&
+                !((keyuse->val->used_tables()) & ~pos->ref_depend_map) &&
                 !(found_part_ref_or_null & keyuse->optimize))
 	    {
+              /* Found a KEYUSE object that will be used by ref access */
               keyparts++;
               found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ;
             }
           }
+
+
           if (keyparts > keyuse->keypart)
 	  {
+            /* Ok this is the keyuse that will be used for ref access */
             uint fldno;
             if (is_hash_join_key_no(key))
 	      fldno= keyuse->keypart;
             else
               fldno= table->key_info[key].key_part[keyparts-1].fieldnr - 1;
             if (keyuse->val->const_item())
-              sel*= table->field[fldno]->cond_selectivity; 
+            {              
+              // psergey: not multiply, divide instead.
+              // before, we've had:
+              // sel*= table->field[fldno]->cond_selectivity; 
+
+              sel /= table->field[fldno]->cond_selectivity;
+              /* 
+               TODO: we could do better here:
+                 1. cond_selectivity might be =1 (the default) because quick 
+                    select on some index prevented us from analyzing 
+                    histogram for this column.
+                 2. we could get an estimate through this?
+                     rec_per_key[key_part-1] / rec_per_key[key_part]
+              */
+            }
             if (keyparts > 1)
 	    {
               ref_keyuse_steps[keyparts-2]= keyuse - prev_ref_keyuse;
@@ -7337,14 +7434,16 @@ double table_cond_selectivity(JOIN *join
 	}
       }
       keyuse++;
-    } while (keyuse->table == table && keyuse->key == key);
+    }
   }
   else
   {
     /*
       The table is accessed with full table scan, or quick select.
-      Selectivity of COND(table) is already accounted for in 
+      Selectivity of COND(this_table) is already accounted for in 
       matching_candidates_in_table().
+      For COND(this_table, previous_tables) we don't have any meaningful
+      estimates.
     */
     sel= 1;
   }
@@ -7353,7 +7452,30 @@ double table_cond_selectivity(JOIN *join
     If the field f from the table is equal to a field from one the
     earlier joined tables then the selectivity of the range conditions
     over the field f must be discounted.
+
+    psergey: I think this is wrong. Example:
+     
+      ## no keys used
+      select * from t1, t2 where t1.col=t2.col and t2.col<5
+
+      ## a variant with key:
+      select * from t1, t2 where t1.col=t2.col and t2.col<5 and t2.key=t1.col2
+
+    suppose the join order is t1, t2. Attached conditions:
+
+      t1:  t2.col<3
+      t2:  [t2.col=t1.col] AND t2.col<3
+
+    Suppose, we're now looking at selectivity for table t2.
+    - in case t2 uses full table scan (or quick select): all selectivity is
+      already accounted for in matching_candidates_in_table(). [YES. CHECKED]
+    - in case t2 uses ref access
+       = if the equality is used for ref access, we have already 
+         discounted its selectivity above.
+       = if the equality is not used for ref access, we should still count its
+         selectivity.
   */ 
+#if 0
   for (Field **f_ptr=table->field ; (field= *f_ptr) ; f_ptr++)
   {
     if (!bitmap_is_set(read_set, field->field_index) ||
@@ -7371,6 +7493,7 @@ double table_cond_selectivity(JOIN *join
       }
     }
   }
+#endif 
 
   sel*= table_multi_eq_cond_selectivity(join, idx, s, rem_tables,
                                         keyparts, ref_keyuse_steps);



More information about the commits mailing list