[Commits] Rev 4075: MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong in file:///home/psergey/dev2/10.0/

Sergey Petrunya psergey at askmonty.org
Tue Mar 25 18:50:27 EET 2014


At file:///home/psergey/dev2/10.0/

------------------------------------------------------------
revno: 4075
revision-id: psergey at askmonty.org-20140325165025-wz0qpo23pza1cson
parent: psergey at askmonty.org-20140321114237-8neebgq7jfthre82
committer: Sergey Petrunya <psergey at askmonty.org>
branch nick: 10.0
timestamp: Tue 2014-03-25 20:50:25 +0400
message:
  MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
  - Use a new selectivity calculation formula in Histogram::point_selectivity. 
    The formula is different from the old one because it was developed from scratch.
    it doesn't have any possible division-by-zero problems.
  
  - The patch shows a regression for testscase for mdev-4350 (filtered=99.22 vs older one 
    of filtered=49.61).  This seems to be pure chance. It should be addressed separately.
=== modified file 'mysql-test/r/selectivity.result'
--- a/mysql-test/r/selectivity.result	2014-03-20 20:53:41 +0000
+++ b/mysql-test/r/selectivity.result	2014-03-25 16:50:25 +0000
@@ -1,4 +1,4 @@
-drop table if exists t1,t2,t3;
+drop table if exists t0,t1,t2,t3;
 select @@global.use_stat_tables;
 @@global.use_stat_tables
 COMPLEMENTARY
@@ -826,7 +826,7 @@ flush table t1;
 set optimizer_use_condition_selectivity=4;
 explain extended select * from t1 where a=0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	49.61	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	99.22	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
 drop table t1;
@@ -1308,15 +1308,54 @@ test.t2	analyze	status	OK
 # The following two must have the same in 'Extra' column:
 explain extended select * from t2 where col1 IN (20, 180);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.37	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.00	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180))
 explain extended select * from t2 where col1 IN (180, 20);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.37	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.00	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20))
 drop table t1, t2;
+#
+# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select A.a from t0 A, t0 B, t0 C;
+set histogram_size=20;
+set histogram_type='single_prec_hb';
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	OK
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+# Should select about 10%:
+explain extended select * from t1 where a=2;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2)
+# Should select about 10%:
+explain extended select * from t1 where a=1;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1)
+# Must not have filtered=100%:
+explain extended select * from t1 where a=0;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
+# Again, must not have filtered=100%:
+explain extended select * from t1 where a=-1;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = <cache>(-(1)))
+drop table t0, t1;
 set histogram_type=@save_histogram_type;
 set histogram_size=@save_histogram_size;
 set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;

=== modified file 'mysql-test/r/selectivity_innodb.result'
--- a/mysql-test/r/selectivity_innodb.result	2014-03-21 11:42:37 +0000
+++ b/mysql-test/r/selectivity_innodb.result	2014-03-25 16:50:25 +0000
@@ -1,7 +1,7 @@
 SET SESSION STORAGE_ENGINE='InnoDB';
 set @save_optimizer_switch_for_selectivity_test=@@optimizer_switch;
 set optimizer_switch='extended_keys=on';
-drop table if exists t1,t2,t3;
+drop table if exists t0,t1,t2,t3;
 select @@global.use_stat_tables;
 @@global.use_stat_tables
 COMPLEMENTARY
@@ -835,7 +835,7 @@ flush table t1;
 set optimizer_use_condition_selectivity=4;
 explain extended select * from t1 where a=0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	49.61	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	99.22	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
 drop table t1;
@@ -1318,15 +1318,54 @@ test.t2	analyze	status	OK
 # The following two must have the same in 'Extra' column:
 explain extended select * from t2 where col1 IN (20, 180);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.37	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.00	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180))
 explain extended select * from t2 where col1 IN (180, 20);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.37	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.00	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20))
 drop table t1, t2;
+#
+# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select A.a from t0 A, t0 B, t0 C;
+set histogram_size=20;
+set histogram_type='single_prec_hb';
+analyze table t1 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	OK
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+# Should select about 10%:
+explain extended select * from t1 where a=2;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2)
+# Should select about 10%:
+explain extended select * from t1 where a=1;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1)
+# Must not have filtered=100%:
+explain extended select * from t1 where a=0;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
+# Again, must not have filtered=100%:
+explain extended select * from t1 where a=-1;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	9.52	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = <cache>(-(1)))
+drop table t0, t1;
 set histogram_type=@save_histogram_type;
 set histogram_size=@save_histogram_size;
 set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;

=== modified file 'mysql-test/t/selectivity.test'
--- a/mysql-test/t/selectivity.test	2014-03-20 20:53:41 +0000
+++ b/mysql-test/t/selectivity.test	2014-03-25 16:50:25 +0000
@@ -1,7 +1,7 @@
 --source include/have_stat_tables.inc
 
 --disable_warnings
-drop table if exists t1,t2,t3;
+drop table if exists t0,t1,t2,t3;
 --enable_warnings
 
 select @@global.use_stat_tables;
@@ -885,6 +885,29 @@ explain extended select * from t2 where
 
 drop table t1, t2;
 
+--echo #
+--echo # MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
+--echo #
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select A.a from t0 A, t0 B, t0 C;
+set histogram_size=20;
+set histogram_type='single_prec_hb';
+analyze table t1 persistent for all;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+--echo # Should select about 10%:
+explain extended select * from t1 where a=2;
+--echo # Should select about 10%:
+explain extended select * from t1 where a=1;
+--echo # Must not have filtered=100%:
+explain extended select * from t1 where a=0;
+--echo # Again, must not have filtered=100%:
+explain extended select * from t1 where a=-1;
+
+drop table t0, t1;
+
 set histogram_type=@save_histogram_type;
 set histogram_size=@save_histogram_size;
 set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;

=== modified file 'sql/sql_statistics.h'
--- a/sql/sql_statistics.h	2014-02-19 10:05:15 +0000
+++ b/sql/sql_statistics.h	2014-03-25 16:50:25 +0000
@@ -234,24 +234,88 @@ class Histogram
     sel= bucket_sel * (max - min + 1);
     return sel;
   } 
+  
+
+  /*
+    Estimate selectivity of "col=const" using a histogram
+
+    @param pos      Position of "const" between column's min_value and max_value.
+                    This is a number in [0..1] range.
+    @param avg_sel  Average selectivity of condition "col=const" in this table.
+                    It is calcuated as (#non_null_values / #distinct_values).
+
+    @detail
+      The idea is to use histogram data to produce a more precise estimate of
+      how many records will match key=const1 for given value of "const1".
+
+      We assume that value const1 is present in the table (Since the histogram
+      doesn't store the exact values, we don't have any way to check that).
+
+      The value of avg_sel is the starting point.
+  */
 
   double point_selectivity(double pos, double avg_sel)
   {
     double sel;
-    double bucket_sel= 1.0/(get_width() + 1);  
+    //double bucket_sel= 1.0/(get_width() + 1);  
+
+    /*
+      Find the first bucket that contains rows with value of 'pos'. 
+      then, check how many following buckets have the same endpoint value (this
+      is approximation of "have the same constant").
+    */
     uint min= find_bucket(pos, TRUE);
     uint max= min;
     while (max + 1 < get_width() && get_value(max + 1) == get_value(max))
       max++;
+    
+    /*
+      Ok, now we know how many buckets are occupied by the value of 'pos'.
+
+      Note that value of 1 actually means "one bucket, maybe the whole bucket,
+      or maybe a tiny fraction of one bucket".
+    */
+    uint used_buckets= max + 1 - min;
+
+    /* How many buckets a random constant will use, on average?  */
+    double avg_buckets_per_value= (get_width() + 1) * avg_sel;
+
+    /*
+      Compare the number of buckets value of 'pos' is using with the number 
+      of buckets that a random constant would use.  Selectivity avg_sel should
+      be multiplied by:
+
+        used_buckets / avg_buckets_per_value
+      
+      There is problem, though. The value of used_buckets=1 actually means
+      "either 1 or less than that". When avg_buckets_per_value>1, we should
+      still use (used_buckets / avg_buckets_per_value)  - division moves the 
+      selectivity in the right direction.
+
+      However, when both used_buckets=1 and avg_buckets_per_value<1, we dont
+      really know whether the original estimate of avg_sel should be increased
+      or decreased.
+    */
+    if (used_buckets == 1 && avg_buckets_per_value<1)
+    {
+      sel= avg_sel;
+    }
+    else
+    {
+      sel= avg_sel * used_buckets / avg_buckets_per_value;
+    }
+
+/*
     double inv_prec_factor= (double) 1.0 / prec_factor(); 
     double width= (max + 1 == get_width() ?
                    1.0 : get_value(max) * inv_prec_factor) -
 	          (min == 0 ?
                    0.0 : get_value(min-1) * inv_prec_factor); 
     sel= avg_sel * (bucket_sel * (max + 1 - min)) / width;
+*/
     return sel;
   }
-             
+
 };
 
 



More information about the commits mailing list