[Commits] 3ac32917ab6: MDEV-21130: Histograms: use JSON as on-disk format

Sergei Petrunia psergey at askmonty.org
Fri Jul 23 01:26:50 EEST 2021


revision-id: 3ac32917ab6c42a5a0f9ed817dd8d3c7e20ce34d (mariadb-10.6.2-68-g3ac32917ab6)
parent(s): 5ddb8069145b518426be7fd31881d1d3fa5f53b4
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-07-23 01:26:50 +0300
message:

MDEV-21130: Histograms: use JSON as on-disk format

Preparation for handling different kinds of histograms:

- In Column_statistics, change "Histogram histogram" into
  "Histogram *histogram_".  This allows for different kinds
  of Histogram classes with virtual functions.

- [Almost] remove the usage of Histogram->set_values and
  Histogram->set_size. The code outside the histogram should
  not make any assumptions about what/how is stored in the Histogram.

- Introduce drafts of methods to read/save histograms to/from disk.

---
 sql/sql_statistics.cc | 214 ++++++++++++++++++++++++++++++++++++--------------
 sql/sql_statistics.h  |  90 ++++++++++++++-------
 sql/table.h           |  10 ++-
 3 files changed, 226 insertions(+), 88 deletions(-)

diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 56e20ecf48e..2ec9aaa5965 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -311,7 +311,7 @@ class Column_statistics_collected :public Column_statistics
 
   inline void init(THD *thd, Field * table_field);
   inline bool add();
-  inline void finish(ha_rows rows, double sample_fraction);
+  inline void finish(MEM_ROOT *mem_root, ha_rows rows, double sample_fraction);
   inline void cleanup();
 };
 
@@ -1068,21 +1068,22 @@ class Column_stat: public Stat_table
           stat_field->store(stats->get_avg_frequency());
           break; 
         case COLUMN_STAT_HIST_SIZE:
-          stat_field->store(stats->histogram.get_size());
+          // Note: this is dumb. the histogram size is stored with the
+          // histogram!
+          stat_field->store(stats->histogram_? 
+                              stats->histogram_->get_size() : 0);
           break;
         case COLUMN_STAT_HIST_TYPE:
-          stat_field->store(stats->histogram.get_type() + 1);
+          if (stats->histogram_)
+            stat_field->store(stats->histogram_->get_type() + 1);
+          else
+            stat_field->set_null();
           break;
         case COLUMN_STAT_HISTOGRAM:
-          if (stats->histogram.get_type() == JSON)
-          {
-            stat_field->store((char *) stats->histogram.get_values(),
-                              strlen((char *) stats->histogram.get_values()), &my_charset_bin);
-          } else
-          {
-            stat_field->store((char *) stats->histogram.get_values(),
-                              stats->histogram.get_size(), &my_charset_bin);
-          }
+          if (stats->histogram_)
+            stats->histogram_->serialize(stat_field);
+          else
+            stat_field->set_null();
           break;
         }
       }
@@ -1111,6 +1112,7 @@ class Column_stat: public Stat_table
   void get_stat_values()
   {
     table_field->read_stats->set_all_nulls();
+    table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM;
 
     if (table_field->read_stats->min_value)
       table_field->read_stats->min_value->set_null();
@@ -1122,7 +1124,7 @@ class Column_stat: public Stat_table
       char buff[MAX_FIELD_WIDTH];
       String val(buff, sizeof(buff), &my_charset_bin);
 
-      for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_HIST_TYPE; i++)
+      for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_HISTOGRAM; i++)
       {  
         Field *stat_field= stat_table->field[i];
 
@@ -1166,13 +1168,22 @@ class Column_stat: public Stat_table
             table_field->read_stats->set_avg_frequency(stat_field->val_real());
             break;
           case COLUMN_STAT_HIST_SIZE:
-            table_field->read_stats->histogram.set_size(stat_field->val_int());
+            //TODO: ignore this. The size is a part of histogram!
+            //table_field->read_stats->histogram.set_size(stat_field->val_int());
             break;            
           case COLUMN_STAT_HIST_TYPE:
-            Histogram_type hist_type= (Histogram_type) (stat_field->val_int() -
-                                                        1);
-            table_field->read_stats->histogram.set_type(hist_type);
-            break;            
+            // TODO: save this next to histogram.
+            // For some reason, the histogram itself is read in
+            //   read_histograms_for_table
+            {
+              Histogram_type hist_type= (Histogram_type) (stat_field->val_int() -
+                                                          1);
+              table_field->read_stats->histogram_type_on_disk= hist_type;
+              break;
+            }
+          case COLUMN_STAT_HISTOGRAM:
+            //TODO: if stat_field->length() == 0 then histogram_type_on_disk is set to INVALID_HISTOGRAM
+            break;
           }
         }
       }
@@ -1195,7 +1206,7 @@ class Column_stat: public Stat_table
     of read_stats->histogram.
   */    
 
-  void get_histogram_value()
+  Histogram * load_histogram(MEM_ROOT *mem_root)
   {
     if (find_stat())
     {
@@ -1205,13 +1216,54 @@ class Column_stat: public Stat_table
       Field *stat_field= stat_table->field[fldno];
       table_field->read_stats->set_not_null(fldno);
       stat_field->val_str(&val);
-      memcpy(table_field->read_stats->histogram.get_values(),
-             val.ptr(), table_field->read_stats->histogram.get_size());
+      // histogram-todo: here, create the histogram of appropriate type.
+      Histogram *hist= new (mem_root) Histogram();
+      if (!hist->parse(mem_root, table_field->read_stats->histogram_type_on_disk, 
+                       (const uchar*)val.ptr(), val.length()))
+      {
+        table_field->read_stats->histogram_= hist;
+        return hist;
+      }
+      //memcpy(table_field->read_stats->histogram_.get_values(),
+      //       val.ptr(), table_field->read_stats->histogram.get_size());
     }
+    return NULL;
   }
-
 };
 
+bool Histogram::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg)
+{
+  // Copy the on-disk image into mem_root; returns true on failure (OOM).
+  size = (uint8) size_arg; // NOTE(review): uint8 truncates sizes above 255
+  type = type_arg;
+  if (!(values = (uchar*)alloc_root(mem_root, size_arg)))
+    return true;
+  memcpy(values, ptr_arg, size_arg);
+  return false;
+}
+
+/*
+  Save the histogram data into a table field.
+*/
+void Histogram::serialize(Field *field)
+{
+  if (get_type() == JSON)
+  { // JSON text: length comes from strlen(), so values must be NUL-terminated
+    field->store((char*)get_values(), strlen((char*)get_values()), 
+                 &my_charset_bin);
+  }
+  else // any other type: store exactly get_size() bytes of values
+    field->store((char*)get_values(), get_size(), &my_charset_bin);
+}
+
+void Histogram::init_for_collection(MEM_ROOT *mem_root,
+                                    Histogram_type htype_arg, 
+                                    ulonglong size_arg)
+{
+  type= htype_arg;
+  values = (uchar*)alloc_root(mem_root, size_arg); // NOTE(review): result is not checked for NULL
+  size= (uint8) size_arg; // narrowing cast: only the low 8 bits of size_arg are kept
+}
 
 /*
   An object of the class Index_stat is created to read statistical
@@ -1552,7 +1604,7 @@ class Histogram_builder
     Column_statistics *col_stats= col->collected_stats;
     min_value= col_stats->min_value;
     max_value= col_stats->max_value;
-    histogram= &col_stats->histogram;
+    histogram= col_stats->histogram_;
     hist_width= histogram->get_width();
     bucket_capacity= (double) records / (hist_width + 1);
     curr_bucket= 0;
@@ -1605,7 +1657,7 @@ std::vector<std::string> bucket_bounds = {};
     Column_statistics *col_stats= col->collected_stats;
     min_value= col_stats->min_value;
     max_value= col_stats->max_value;
-    histogram= &col_stats->histogram;
+    histogram= col_stats->histogram_;
     hist_width= histogram->get_width();
     bucket_capacity= (double) records / (hist_width + 1);
     curr_bucket= 0;
@@ -1765,9 +1817,9 @@ class Count_distinct_field: public Sql_alloc
     @brief
     Calculate a histogram of the tree
   */
-   void walk_tree_with_histogram(ha_rows rows)
+  void walk_tree_with_histogram(ha_rows rows)
   {
-    if(table_field->collected_stats->histogram.get_type() == JSON)
+    if (table_field->collected_stats->histogram_->get_type() == JSON)
     {
       Histogram_builder_json hist_builder(table_field, tree_key_length, rows);
       tree->walk(table_field->table, json_histogram_build_walk,
@@ -1775,7 +1827,8 @@ class Count_distinct_field: public Sql_alloc
       hist_builder.build();
       distincts= hist_builder.get_count_distinct();
       distincts_single_occurence= hist_builder.get_count_single_occurence();
-    } else
+    }
+    else
     {
       Histogram_builder hist_builder(table_field, tree_key_length, rows);
       tree->walk(table_field->table, histogram_build_walk,
@@ -1799,18 +1852,19 @@ class Count_distinct_field: public Sql_alloc
     @brief
     Get the size of the histogram in bytes built for table_field
   */
+  /*
   uint get_hist_size()
   {
     return table_field->collected_stats->histogram.get_size();
-  }
+  }*/
 
   /*
     @brief
     Get the pointer to the histogram built for table_field
   */
-  uchar *get_histogram()
+  Histogram *get_histogram()
   {
-    return table_field->collected_stats->histogram.get_values();
+    return table_field->collected_stats->histogram_;
   }
 
 };
@@ -2209,7 +2263,7 @@ int alloc_statistics_for_table(THD* thd, TABLE *table)
   uint key_parts= table->s->ext_key_parts;
   ulonglong *idx_avg_frequency= (ulonglong*) alloc_root(&table->mem_root,
                                                sizeof(ulonglong) * key_parts);
-
+/*
   uint hist_size= thd->variables.histogram_size;
   Histogram_type hist_type= (Histogram_type) (thd->variables.histogram_type);
   uchar *histogram= NULL;
@@ -2220,16 +2274,16 @@ int alloc_statistics_for_table(THD* thd, TABLE *table)
       bzero(histogram, hist_size * columns);
 
   }
-
-  if (!table_stats || !column_stats || !index_stats || !idx_avg_frequency ||
-      (hist_size && !histogram))
+*/
+  if (!table_stats || !column_stats || !index_stats || !idx_avg_frequency)
+    //|| (hist_size && !histogram))
     DBUG_RETURN(1);
 
   table->collected_stats= table_stats;
   table_stats->column_stats= column_stats;
   table_stats->index_stats= index_stats;
   table_stats->idx_avg_frequency= idx_avg_frequency;
-  table_stats->histograms= histogram;
+  //table_stats->histograms= histogram;
   
   memset(column_stats, 0, sizeof(Column_statistics) * columns);
 
@@ -2237,10 +2291,12 @@ int alloc_statistics_for_table(THD* thd, TABLE *table)
   {
     if (bitmap_is_set(table->read_set, (*field_ptr)->field_index))
     {
+      column_stats->histogram_ = NULL;
+      /*
       column_stats->histogram.set_size(hist_size);
       column_stats->histogram.set_type(hist_type);
       column_stats->histogram.set_values(histogram);
-      histogram+= hist_size;
+      histogram+= hist_size;*/
       (*field_ptr)->collected_stats= column_stats++;
     }
   }
@@ -2459,6 +2515,25 @@ bool Column_statistics_collected::add()
 }
 
 
+/* 
+  Create an empty Histogram object for histogram type hist_type, allocated on
+  mem_root. Returns NULL (after DBUG_ASSERT) for an unrecognized type.
+  Note: it is not yet clear whether collection-time histogram should be the same 
+  as lookup-time histogram. At the moment, they are.
+*/
+
+Histogram* get_histogram_by_type(MEM_ROOT *mem_root, Histogram_type hist_type) {
+  switch (hist_type) {
+  case SINGLE_PREC_HB:
+  case DOUBLE_PREC_HB:
+  case JSON:
+    return new (mem_root) Histogram();
+  default:
+    DBUG_ASSERT(0);
+  }
+  return NULL;
+}
+
 /**
   @brief
   Get the results of aggregation when collecting the statistics on a column
@@ -2468,7 +2543,7 @@ bool Column_statistics_collected::add()
 */
 
 inline
-void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
+void Column_statistics_collected::finish(MEM_ROOT *mem_root, ha_rows rows, double sample_fraction)
 {
   double val;
 
@@ -2486,10 +2561,19 @@ void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
   }
   if (count_distinct)
   {
-    uint hist_size= count_distinct->get_hist_size();
+    //uint hist_size= count_distinct->get_hist_size();
+    uint hist_size= current_thd->variables.histogram_size;
+    Histogram_type hist_type= (Histogram_type) (current_thd->variables.histogram_type);
+    bool have_histogram= false;
+    if (hist_size != 0 && hist_type != INVALID_HISTOGRAM)
+    {
+      have_histogram= true;
+      histogram_= new Histogram;
+      histogram_->init_for_collection(mem_root, hist_type, hist_size);
+    }
 
     /* Compute cardinality statistics and optionally histogram. */
-    if (hist_size == 0)
+    if (!have_histogram)
       count_distinct->walk_tree();
     else
       count_distinct->walk_tree_with_histogram(rows - nulls);
@@ -2527,13 +2611,14 @@ void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
       set_not_null(COLUMN_STAT_AVG_FREQUENCY);
     }
     else
-      hist_size= 0;
-    histogram.set_size(hist_size);
+      have_histogram= false ; // TODO: need this?
+    //histogram.set_size(hist_size);
     set_not_null(COLUMN_STAT_HIST_SIZE);
-    if (hist_size && distincts)
+    if (have_histogram && distincts)
     {
       set_not_null(COLUMN_STAT_HIST_TYPE);
-      histogram.set_values(count_distinct->get_histogram());
+      //histogram.set_values(count_distinct->get_histogram());
+      histogram_= count_distinct->get_histogram();
       set_not_null(COLUMN_STAT_HISTOGRAM);
     } 
     delete count_distinct;
@@ -2795,7 +2880,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
       continue;
     bitmap_set_bit(table->write_set, table_field->field_index); 
     if (!rc)
-      table_field->collected_stats->finish(rows, sample_fraction);
+      table_field->collected_stats->finish(&table->mem_root, rows, sample_fraction);
     else
       table_field->collected_stats->cleanup();
   }
@@ -3001,16 +3086,19 @@ int read_statistics_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables)
    
   /* Read statistics from the statistical table column_stats */
   stat_table= stat_tables[COLUMN_STAT].table;
-  ulong total_hist_size= 0;
+  //ulong total_hist_size= 0;
+  bool have_histograms= false;
   Column_stat column_stat(stat_table, table);
   for (field_ptr= table_share->field; *field_ptr; field_ptr++)
   {
     table_field= *field_ptr;
     column_stat.set_key_fields(table_field);
     column_stat.get_stat_values();
-    total_hist_size+= table_field->read_stats->histogram.get_size();
+    //total_hist_size+= table_field->read_stats->histogram.get_size();
+    if (table_field->read_stats->histogram_type_on_disk != INVALID_HISTOGRAM)
+      have_histograms= true;
   }
-  table_share->stats_cb.total_hist_size= total_hist_size;
+  table_share->stats_cb.total_hist_size= have_histograms? 1:0; // total_hist_size
 
   /* Read statistics from the statistical table index_stats */
   stat_table= stat_tables[INDEX_STAT].table;
@@ -3147,28 +3235,36 @@ int read_histograms_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables)
 {
   TABLE_STATISTICS_CB *stats_cb= &table->s->stats_cb;
   DBUG_ENTER("read_histograms_for_table");
-
+  
+  // histograms-todo: why do we use synchronization here, when we load 
+  //  histogram for the TABLE object, not TABLE_SHARE?
+  //  is it because of the use of stats_cb->mem_root?
   if (stats_cb->start_histograms_load())
   {
-    uchar *histogram= (uchar *) alloc_root(&stats_cb->mem_root,
-                                           stats_cb->total_hist_size);
+    //uchar *histogram= (uchar *) alloc_root(&stats_cb->mem_root,
+    //                                       stats_cb->total_hist_size);
+    /*
     if (!histogram)
     {
       stats_cb->abort_histograms_load();
       DBUG_RETURN(1);
     }
-    memset(histogram, 0, stats_cb->total_hist_size);
+    */
+    //memset(histogram, 0, stats_cb->total_hist_size);
 
     Column_stat column_stat(stat_tables[COLUMN_STAT].table, table);
     for (Field **field_ptr= table->s->field; *field_ptr; field_ptr++)
     {
       Field *table_field= *field_ptr;
-      if (uint hist_size= table_field->read_stats->histogram.get_size())
+      //if (uint hist_size= table_field->read_stats->histogram.get_size())
+      if (table_field->read_stats->histogram_type_on_disk != INVALID_HISTOGRAM)
       {
         column_stat.set_key_fields(table_field);
-        table_field->read_stats->histogram.set_values(histogram);
-        column_stat.get_histogram_value();
-        histogram+= hist_size;
+        //table_field->read_stats->histogram.set_values(histogram);
+
+        table_field->read_stats->histogram_=
+          column_stat.load_histogram(&stats_cb->mem_root); 
+        //histogram+= hist_size;
       }
     }
     stats_cb->end_histograms_load();
@@ -3860,8 +3956,8 @@ double get_column_range_cardinality(Field *field,
       if (avg_frequency > 1.0 + 0.000001 && 
           col_stats->min_max_values_are_provided())
       {
-        Histogram *hist= &col_stats->histogram;
-        if (hist->is_usable(thd))
+        Histogram *hist= col_stats->histogram_;
+        if (hist && hist->is_usable(thd))
         {
           store_key_image_to_rec(field, (uchar *) min_endp->key,
                                  field->key_length());
@@ -3904,8 +4000,8 @@ double get_column_range_cardinality(Field *field,
       else
         max_mp_pos= 1.0;
 
-      Histogram *hist= &col_stats->histogram;
-      if (hist->is_usable(thd))
+      Histogram *hist= col_stats->histogram_;
+      if (hist && hist->is_usable(thd))
         sel= hist->range_selectivity(min_mp_pos, max_mp_pos);
       else
         sel= (max_mp_pos - min_mp_pos);
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index a554721d50b..178bc11a278 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -43,7 +43,8 @@ enum enum_histogram_type
 {
   SINGLE_PREC_HB,
   DOUBLE_PREC_HB,
-  JSON
+  JSON,
+  INVALID_HISTOGRAM
 } Histogram_type;
 
 enum enum_stat_tables
@@ -141,40 +142,70 @@ double get_column_range_cardinality(Field *field,
 bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table);
 bool is_eits_usable(Field* field);
 
-class Histogram
+/*
+  Common base for all histograms
+*/
+class Histogram_base : public Sql_alloc
 {
+public:
+  virtual bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, 
+                     const uchar *ptr, uint size)= 0;
+  virtual void serialize(Field *to_field)= 0;
 
-private:
-  Histogram_type type;
-  uint8 size; /* Size of values array, in bytes */
-  uchar *values;
+  virtual Histogram_type get_type()=0;
+  
+  // Legacy: return the size of the histogram on disk.
+  // This will be stored in mysql.column_stats.hist_size column.
+  // Newer, JSON-based histograms may return 0.
+  virtual uint get_size()=0;
 
-  uint prec_factor()
+  virtual ~Histogram_base(){}
+};
+
+class Histogram : public Histogram_base
+{
+public:
+  bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, 
+             const uchar *ptr_arg, uint size_arg) override;
+  void serialize(Field *to_field) override;
+  Histogram_type get_type() override { return type; }
+
+  uint get_size() override { return (uint) size; }
+
+  // returns number of buckets in the histogram
+  uint get_width()
   {
     switch (type) {
     case SINGLE_PREC_HB:
     case JSON:
-      return ((uint) (1 << 8) - 1);
+      return size;
     case DOUBLE_PREC_HB:
-      return ((uint) (1 << 16) - 1);
+      return size / 2;
+    default:
+      DBUG_ASSERT(0);
     }
-    return 1;
+    return 0;
   }
 
-public:
-  uint get_width()
+private:
+  Histogram_type type;
+  uint8 size; /* Size of values array, in bytes */
+  uchar *values;
+
+  uint prec_factor()
   {
     switch (type) {
     case SINGLE_PREC_HB:
     case JSON:
-      return size;
+      return ((uint) (1 << 8) - 1);
     case DOUBLE_PREC_HB:
-      return size / 2;
+      return ((uint) (1 << 16) - 1);
+    default:
+      DBUG_ASSERT(0);
     }
-    return 0;
+    return 1;
   }
 
-private:
   uint get_value(uint i)
   {
     DBUG_ASSERT(i < get_width());
@@ -184,6 +215,8 @@ class Histogram
       return (uint) (((uint8 *) values)[i]);
     case DOUBLE_PREC_HB:
       return (uint) uint2korr(values + i * 2);
+    default:
+      DBUG_ASSERT(0);
     }
     return 0;
   }
@@ -227,19 +260,13 @@ class Histogram
     return i;
   }
 
-public:
-
-  uint get_size() { return (uint) size; }
-
-  Histogram_type get_type() { return type; }
-
   uchar *get_values() { return (uchar *) values; }
+public:
+  void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size);
 
-  void set_size (ulonglong sz) { size= (uint8) sz; }
-
-  void set_type (Histogram_type t) { type= t; }
-
+  // Note: these two are used only for saving the JSON text:
   void set_values (uchar *vals) { values= (uchar *) vals; }
+  void set_size (ulonglong sz) { size= (uint8) sz; }
 
   bool is_available() { return get_size() > 0 && get_values(); }
 
@@ -264,6 +291,9 @@ class Histogram
     case DOUBLE_PREC_HB:
       int2store(values + i * 2, val * prec_factor());
       return;
+    default:
+      DBUG_ASSERT(0);
+      return;
     }
   }
 
@@ -277,6 +307,9 @@ class Histogram
     case DOUBLE_PREC_HB:
       int2store(values + i * 2, uint2korr(values + i * 2 - 2));
       return;
+    default:
+      DBUG_ASSERT(0);
+      return;
     }
   }
 
@@ -314,7 +347,7 @@ class Table_statistics
 
   /* Array of records per key for index prefixes */
   ulonglong *idx_avg_frequency;
-  uchar *histograms;                /* Sequence of histograms       */                    
+  //uchar *histograms;                /* Sequence of histograms       */
 };
 
 
@@ -377,7 +410,8 @@ class Column_statistics
 
 public:
 
-  Histogram histogram;
+  Histogram_type histogram_type_on_disk;
+  Histogram *histogram_;
 
   uint32 no_values_provided_bitmap()
   {
diff --git a/sql/table.h b/sql/table.h
index 2e074abcea0..f557f4ca59e 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -679,7 +679,15 @@ class TABLE_STATISTICS_CB
 public:
   MEM_ROOT  mem_root; /* MEM_ROOT to allocate statistical data for the table */
   Table_statistics *table_stats; /* Structure to access the statistical data */
-  ulong total_hist_size;         /* Total size of all histograms */
+
+  /*
+    Total size of all histograms. A value of 0 means histograms are not present,
+    and histograms_are_ready() can finish sooner.
+
+    Currently we just set it to 1 when we expect to load histograms.
+    histogram-todo: rename this or even remove?
+  */
+  ulong total_hist_size;
 
   bool histograms_are_ready() const
   {


More information about the commits mailing list