madlib-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jingyi...@apache.org
Subject [madlib] branch master updated: Minibatch Preprocessor for Deep learning
Date Mon, 28 Jan 2019 19:23:58 GMT
This is an automated email from the ASF dual-hosted git repository.

jingyimei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new 8de32ed  Minibatch Preprocessor for Deep learning
8de32ed is described below

commit 8de32ede33c48d2f4a440f0f639c94a277a359c1
Author: Nandish Jayaram <njayaram@apache.org>
AuthorDate: Mon Dec 17 17:54:42 2018 -0800

    Minibatch Preprocessor for Deep learning
    
    The minibatch preprocessor we currently have in MADlib is bloated for DL
    tasks. This feature adds a simplified version of creating buffers.  It
    can divide each element of the independent array by a normalizing constant
    for standardization (which is 255.0 by default), and optionally shift the
    dependent variables by an offset if they are a numeric type.
    
    Closes #342
    
    Co-authored-by: Arvind Sridhar <asridhar@pivotal.io>
    Co-authored-by: Domino Valdano <dvaldano@pivotal.io>
---
 doc/mainpage.dox.in                                |   6 +
 .../utilities/minibatch_preprocessing.py_in        | 192 +++++++++-
 .../utilities/minibatch_preprocessing_dl.sql_in    | 396 +++++++++++++++++++++
 .../test/minibatch_preprocessing_dl.sql_in         | 125 +++++++
 .../postgres/modules/utilities/utilities.py_in     |   4 +-
 5 files changed, 719 insertions(+), 4 deletions(-)

diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index aab37af..5568da6 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -285,6 +285,12 @@ complete matrix stored as a distributed table.
 Interface and implementation are subject to change.
 @{
     @defgroup grp_cg Conjugate Gradient
+    @defgroup grp_dl Deep Learning
+    @brief A collection of deep learning interfaces.
+    @details A collection of deep learning interfaces.
+    @{
+        @defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for Deep Learning
+    @}
     @defgroup grp_knn k-Nearest Neighbors
     @defgroup grp_bayes Naive Bayes Classification
     @defgroup grp_sample Random Sampling
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 88433c9..be568a9 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -39,7 +39,7 @@ from utilities import py_list_to_sql_string
 from utilities import split_quoted_delimited_str
 from utilities import unique_string
 from utilities import validate_module_input_params
-from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY
+from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY, ONLY_ARRAY
 
 from mean_std_dev_calculator import MeanStdDevCalculator
 from validate_args import get_expr_type
@@ -51,6 +51,111 @@ m4_changequote(`<!', `!>')
 MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
 MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
 
+class MiniBatchPreProcessorDL:
+    def __init__(self, schema_madlib, source_table, output_table,
+                 dependent_varname, independent_varname, buffer_size,
+                 normalizing_const, dependent_offset, **kwargs):
+        self.schema_madlib = schema_madlib
+        self.source_table = source_table
+        self.output_table = output_table
+        self.dependent_varname = dependent_varname
+        self.independent_varname = independent_varname
+        self.buffer_size = buffer_size
+        self.normalizing_const = normalizing_const
+        self.dependent_offset = dependent_offset
+        self.module_name = "minibatch_preprocessor_DL"
+        self.output_summary_table = add_postfix(self.output_table, "_summary")
+        self._validate_args()
+        self.num_of_buffers = self._get_num_buffers()
+
+    def minibatch_preprocessor_dl(self):
+        norm_tbl = unique_string(desp='normalized')
+        # Create a temp table that has independent var normalized.
+
+        dependent_varname_with_offset = self.dependent_varname
+        if self.dependent_offset:
+            dependent_varname_with_offset = '{0} + {1}'.format(self.dependent_varname, self.dependent_offset)
+
+        scalar_mult_sql = """
+            CREATE TEMP TABLE {norm_tbl} AS
+            SELECT {self.schema_madlib}.array_scalar_mult(
+                {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm,
+                {dependent_varname_with_offset} AS y,
+                row_number() over() AS row_id
+            FROM {self.source_table}
+        """.format(**locals())
+        plpy.execute(scalar_mult_sql)
+        # Create the mini-batched output table
+        if is_platform_pg():
+            distributed_by_clause = ''
+        else:
+            distributed_by_clause= ' DISTRIBUTED BY (buffer_id) '
+        sql = """
+            CREATE TABLE {self.output_table} AS
+            SELECT * FROM
+            (
+                SELECT {self.schema_madlib}.agg_array_concat(
+                    ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x},
+                    array_agg({norm_tbl}.y) AS {y},
+                    ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id
+                FROM {norm_tbl}
+                GROUP BY buffer_id
+            ) b
+            {distributed_by_clause}
+        """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME,
+                   y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
+                   **locals())
+        plpy.execute(sql)
+        plpy.execute("DROP TABLE {0}".format(norm_tbl))
+        # Create summary table
+        self._create_output_summary_table()
+
+    def _create_output_summary_table(self):
+        query = """
+            CREATE TABLE {self.output_summary_table} AS
+            SELECT
+                $__madlib__${self.source_table}$__madlib__$::TEXT AS source_table,
+                $__madlib__${self.output_table}$__madlib__$::TEXT AS output_table,
+                $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS dependent_varname,
+                $__madlib__${self.independent_varname}$__madlib__$::TEXT AS independent_varname,
+                $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS dependent_vartype,
+                {self.buffer_size} AS buffer_size
+        """.format(self=self)
+        plpy.execute(query)
+
+    def _validate_args(self):
+        validate_module_input_params(
+            self.source_table, self.output_table, self.independent_varname,
+            self.dependent_varname, self.module_name, None,
+            [self.output_summary_table])
+        self.independent_vartype = get_expr_type(
+            self.independent_varname, self.source_table)
+        _assert(is_valid_psql_type(self.independent_vartype,
+                                   NUMERIC | ONLY_ARRAY),
+                "Invalid independent variable type, should be an array of " \
+                "one of {0}".format(','.join(NUMERIC)))
+        self.dependent_vartype = get_expr_type(
+            self.dependent_varname, self.source_table)
+        dep_valid_types = NUMERIC | TEXT | BOOLEAN
+        _assert(is_valid_psql_type(self.dependent_vartype, dep_valid_types),
+                "Invalid dependent variable type, should be one of {0}".
+                format(','.join(dep_valid_types)))
+        if self.buffer_size is not None:
+            _assert(self.buffer_size > 0,
+                    "minibatch_preprocessor_dl: The buffer size has to be a " \
+                    "positive integer or NULL.")
+
+    def _get_num_buffers(self):
+        num_rows_in_tbl = plpy.execute("""
+                SELECT count(*) AS cnt FROM {0}
+            """.format(self.source_table))[0]['cnt']
+        buffer_size_calculator = MiniBatchBufferSizeCalculator()
+        indepdent_var_dim = _tbl_dimension_rownum(
+            self.schema_madlib, self.source_table, self.independent_varname,
+            skip_row_count=True)
+        self.buffer_size = buffer_size_calculator.calculate_default_buffer_size(
+            self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0])
+        return ceil((1.0*num_rows_in_tbl)/self.buffer_size)
 
 class MiniBatchPreProcessor:
     """
@@ -509,7 +614,7 @@ class MiniBatchDocumentation:
         SELECT {schema_madlib}.{method}(
             source_table,          -- TEXT. Name of the table containing input
                                       data.  Can also be a view
-            output_table ,         -- TEXT. Name of the output table for
+            output_table,          -- TEXT. Name of the output table for
                                       mini-batching
             dependent_varname,     -- TEXT. Name of the dependent variable column
             independent_varname,   -- TEXT. Name of the independent variable
@@ -580,3 +685,86 @@ class MiniBatchDocumentation:
             for help.
         """.format(**locals())
 # ---------------------------------------------------------------------
+    @staticmethod
+    def minibatch_preprocessor_dl_help(schema_madlib, message):
+        method = "minibatch_preprocessor_dl"
+        summary = """
+        ----------------------------------------------------------------
+                            SUMMARY
+        ----------------------------------------------------------------
+        For Deep Learning based techniques such as Convolutional Neural Nets,
+        the input data is mostly images. These images can be represented as an
+        array of numbers where all elements are between 0 and 255 in value.
+        It is standard practice to divide each of these numbers by 255.0 to
+        normalize the image data. minibatch_preprocessor() is for general
+        use-cases, but for deep learning based use-cases we provide
+        minibatch_preprocessor_dl() that is light-weight and is
+        specific to image datasets.
+
+        The normalizing constant is parameterized, and can be specified based
+        on the kind of image data used.
+
+        For more details on function usage:
+        SELECT {schema_madlib}.{method}('usage')
+        """.format(**locals())
+
+        usage = """
+        ---------------------------------------------------------------------------
+                                        USAGE
+        ---------------------------------------------------------------------------
+        SELECT {schema_madlib}.{method}(
+            source_table,          -- TEXT. Name of the table containing input
+                                      data.  Can also be a view
+            output_table,          -- TEXT. Name of the output table for
+                                      mini-batching
+            dependent_varname,     -- TEXT. Name of the dependent variable column
+            independent_varname,   -- TEXT. Name of the independent variable
+                                      column
+            buffer_size            -- INTEGER. Default computed automatically.
+                                      Number of source input rows to pack into a buffer
+            normalizing_const      -- DOUBLE PRECISION. Default 255.0. The
+                                      normalizing constant to use for
+                                      standardizing arrays in independent_varname.
+            dependent_offset       -- INTEGER. If specified, shifts all dependent
+                                      variable values by this number (should
+                                      only be used for numeric types).
+        );
+
+
+        ---------------------------------------------------------------------------
+                                        OUTPUT
+        ---------------------------------------------------------------------------
+        The output table produced by MiniBatch Preprocessor contains the
+        following columns:
+
+        buffer_id               -- INTEGER.  Unique id for packed table.
+        dependent_varname       -- ANYARRAY[]. Packed array of dependent variables.
+        independent_varname     -- REAL[]. Packed array of independent
+                                   variables.
+
+        ---------------------------------------------------------------------------
+        The algorithm also creates a summary table named <output_table>_summary
+        that has the following columns:
+
+        source_table              -- Source table name.
+        output_table              -- Output table name from preprocessor.
+        dependent_varname         -- Dependent variable values from the original table
+                                     (shifted by dependent_offset, if specified).
+        independent_varname       -- Independent variable values from the original
+                                     table.
+        dependent_vartype         -- Type of the dependent variable from the
+                                     original table.
+        buffer_size               -- Buffer size used in preprocessing step.
+
+        ---------------------------------------------------------------------------
+        """.format(**locals())
+
+
+        if not message:
+            return summary
+        elif message.lower() in ('usage', 'help', '?'):
+            return usage
+        return """
+            No such option. Use "SELECT {schema_madlib}.minibatch_preprocessor_dl()"
+            for help.
+        """.format(**locals())
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in
new file mode 100644
index 0000000..994e458
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in
@@ -0,0 +1,396 @@
+/* ----------------------------------------------------------------------- */
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * @file minibatch_preprocessing_dl.sql_in
+ * @brief Mini-batch preprocessor for deep learning
+ * @date December 2018
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+
+/**
+@addtogroup grp_minibatch_preprocessing_dl
+
+<div class="toc"><b>Contents</b><ul>
+<li class="level1"><a href="#minibatch_preprocessor_dl">Mini-Batch Preprocessor for Deep Learning</a></li>
+<li class="level1"><a href="#example">Examples</a></li>
+</ul></div>
+
+For Deep Learning based techniques such as Convolutional Neural Nets, the input
+data is mostly images. These images can be represented as an array of numbers
+where all elements are between 0 and 255 in value. It is standard practice
+to divide each of these numbers by 255.0 to normalize the image data.
+minibatch_preprocessor() is for general use-cases, but for deep learning based
+use-cases we provide minibatch_preprocessor_dl() that is light-weight and is
+specific to image datasets. The normalizing constant is parameterized, and can
+be specified based on the kind of image data used.
+<pre class="syntax">
+minibatch_preprocessor_dl(source_table,
+                        output_table,
+                        dependent_varname,
+                        independent_varname,
+                        buffer_size,
+                        normalizing_const,
+                        dependent_offset
+                        )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+  <dt>source_table</dt>
+  <dd>TEXT. Name of the table containing input data.  Can also be a view.
+  </dd>
+
+  <dt>output_table</dt>
+  <dd>TEXT.  Name of the output table from the preprocessor which
+  will be used as input to algorithms that support mini-batching.
+  Note that the arrays packed into the output table are randomized
+  and normalized (by dividing each element in the independent variable array
+  by the normalizing_const), so they will not match up in an obvious way with
+  the rows in the source table.
+  </dd>
+
+  <dt>dependent_varname</dt>
+  <dd>TEXT. Name of the dependent variable column.
+  </dd>
+
+  <dt>independent_varname</dt>
+  <dd>TEXT. Name of the independent variable column. The column must be of
+  a numeric array type.
+  </dd>
+
+  <dt>buffer_size (optional)</dt>
+  <dd>INTEGER, default: computed.  Buffer size is the
+  number of rows from the
+  source table that are packed into one row of the preprocessor
+  output table.  The default value is computed considering size of
+  the source table, number of independent variables, number of groups,
+  and number of segments in the database cluster.  For larger data sets,
+  the computed buffer size will typically be a value in the millions.
+  </dd>
+
+  <dt>normalizing_const (optional)</dt>
+  <dd>DOUBLE PRECISION, default: 255.0. The normalizing constant to divide
+  each value in the independent_varname array by.
+  </dd>
+
+  <dt>dependent_offset (optional)</dt>
+  <dd>INTEGER, default: NULL. If specified, shifts all dependent
+  variable values by this number (should only be used for numeric types).
+  </dd>
+
+</dl>
+
+<b>Output tables</b>
+<br>
+    The output table produced by the mini-batch preprocessor contains the following columns:
+    <table class="output">
+      <tr>
+        <th>buffer_id</th>
+        <td>INTEGER. Unique id for packed table.
+        </td>
+      </tr>
+      <tr>
+        <th>dependent_varname</th>
+        <td>ANYARRAY[]. Packed array of dependent variables. The type
+        of the array is the same as the type of the dependent variable from
+        the source table.
+        </td>
+      </tr>
+      <tr>
+        <th>independent_varname</th>
+        <td>REAL[]. Packed array of independent variables.
+        </td>
+      </tr>
+    </table>
+
+A summary table named \<output_table\>_summary is also created, which has the following columns:
+    <table class="output">
+    <tr>
+        <th>source_table</th>
+        <td>Name of the source table.</td>
+    </tr>
+    <tr>
+        <th>output_table</th>
+        <td>Name of output table generated by preprocessor.</td>
+    </tr>
+    <tr>
+        <th>dependent_varname</th>
+        <td>Dependent variable from the source table.</td>
+    </tr>
+    <tr>
+        <th>independent_varname</th>
+        <td>Independent variable from the source table.</td>
+    </tr>
+    <tr>
+        <th>dependent_vartype</th>
+        <td>Type of the dependent variable from the source table.</td>
+    </tr>
+    <tr>
+        <th>buffer_size</th>
+        <td>Buffer size used in preprocessing step.</td>
+    </tr>
+   </table>
+
+@anchor example
+@par Examples
+-#  Create an input data set based on the well known iris data set:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data;
+CREATE TABLE iris_data(
+    id serial,
+    attributes numeric[],
+    class_text varchar,
+    class integer,
+    state varchar
+);
+INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
+(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
+(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
+(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
+(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
+(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
+(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
+(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
+(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
+(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
+(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
+(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
+(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
+(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
+(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
+(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
+(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
+(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
+(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
+(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
+(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
+(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
+(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
+(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
+(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
+(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
+(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
+(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
+(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
+(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
+(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
+(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
+(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
+(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
+(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
+(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
+(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
+(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
+(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
+(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
+(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
+(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
+</pre>
+
+-#  Run the preprocessor:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent variables
+                                     NULL,                -- buffer size
+                                     2                    -- normalizing constant
+                                     );
+</pre>
+For small datasets like in this example, buffer size is mainly
+determined by the number of segments in the database.
+This example is run on a Greenplum database with 3 segments,
+so there are 3 rows with a buffer size of 18.
+For PostgreSQL, there would be only one row with a buffer
+size of 52 since it is a single node database.
+For larger data sets, other factors go into
+computing buffer size besides number of segments.
+Also, note that the dependent variable is packed
+as-is (it is not one-hot encoded by this preprocessor).
+Here is a sample of the packed output table:
+<pre class="example">
+\\x on
+SELECT * FROM iris_data_packed;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+-------------------------------------
+independent_varname | {{2.55,1.7,0.75,0.1},{2.55,1.9,0.95,0.2},{2.9,1.35,1.95,0.6},{3.35,1.55,2.2,0.7},{2.85,1.3,1.75,0.5},{3.1,1.1,2.25,0.75},...}}
+dependent_varname   | {Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id           | 0
+-[ RECORD 2 ]-------+-------------------------------------
+independent_varname | {{2.4,1.5,0.7,0.15},{2.2,1.6,0.65,0.1},{2.8,1.45,1.8,0.65},{2.9,1.3,2,0.6},{2.2,1.45,0.7,0.1},{2.85,1.4,2.25,0.65},{2.8,1.25,1.95,0.55},...}}
+dependent_varname   | {Iris_setosa,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_versicolor,...}
+buffer_id           | 1
+-[ RECORD 3 ]-------+-------------------------------------
+independent_varname | {{2.4,1.7,0.8,0.1},{2.9,1.35,2.05,0.5},{3.15,1.65,2.35,0.8},{2.7,1.95,0.65,0.2},{2.75,1.3,2.2,0.6},{3,1.1,2,0.5},{2.55,1.9,0.8,0.1},...}}
+dependent_varname   | {Iris_versicolor,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id           | 2
+</pre>
+Review the output summary table:
+<pre class="example">
+\\x on
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+------------------
+source_table        | iris_data
+output_table        | iris_data_packed
+dependent_varname   | class_text
+independent_varname | attributes
+dependent_vartype   | character varying
+buffer_size         | 18
+</pre>
+
+-# Generally the default buffer size will work well,
+but if you have occasion to change it:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent variables
+                                     10                   -- Buffer size
+                                     );
+</pre>
+Review the number of buffers in the output table:
+<pre class="example">
+SELECT COUNT(*) FROM iris_data_packed;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]
+count | 6
+</pre>
+Review the output summary table:
+<pre class="example">
+\\x on
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+------------------
+source_table        | iris_data
+output_table        | iris_data_packed
+dependent_varname   | class_text
+independent_varname | attributes
+dependent_vartype   | character varying
+buffer_size         | 10
+</pre>
+
+*/
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+    source_table                VARCHAR,
+    output_table                VARCHAR,
+    dependent_varname           VARCHAR,
+    independent_varname         VARCHAR,
+    buffer_size                 INTEGER,
+    normalizing_const           DOUBLE PRECISION,
+    dependent_offset            INTEGER
+) RETURNS VOID AS $$
+    PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
+    from utilities.control import MinWarning
+    with AOControl(False):
+        with MinWarning('error'):
+            minibatch_preprocessor_obj = minibatch_preprocessing.MiniBatchPreProcessorDL(**globals())
+            minibatch_preprocessor_obj.minibatch_preprocessor_dl()
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+    source_table            VARCHAR,
+    output_table            VARCHAR,
+    dependent_varname       VARCHAR,
+    independent_varname     VARCHAR,
+    buffer_size             INTEGER,
+    normalizing_const       DOUBLE PRECISION
+) RETURNS VOID AS $$
+  SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+    source_table            VARCHAR,
+    output_table            VARCHAR,
+    dependent_varname       VARCHAR,
+    independent_varname     VARCHAR,
+    buffer_size             INTEGER
+) RETURNS VOID AS $$
+  SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+    source_table            VARCHAR,
+    output_table            VARCHAR,
+    dependent_varname       VARCHAR,
+    independent_varname     VARCHAR
+) RETURNS VOID AS $$
+  SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+    message VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
+    return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, message)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl()
+RETURNS VARCHAR AS $$
+    PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
+    return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, '')
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_transition(anyarray, anyarray)
+  RETURNS anyarray
+   AS 'select $1 || $2'
+   LANGUAGE SQL
+   IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_merge(anyarray, anyarray)
+  RETURNS anyarray
+   AS 'select $1 || $2'
+   LANGUAGE SQL
+   IMMUTABLE
+   RETURNS NULL ON NULL INPUT;
+
+DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.agg_array_concat(anyarray);
+CREATE AGGREGATE MADLIB_SCHEMA.agg_array_concat(anyarray) (
+   SFUNC = MADLIB_SCHEMA.agg_array_concat_transition,
+   STYPE = anyarray,
+   PREFUNC = MADLIB_SCHEMA.agg_array_concat_merge
+   );
diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in
new file mode 100644
index 0000000..dd9b0d5
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in
@@ -0,0 +1,125 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_input;
+CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], label TEXT);
+INSERT INTO minibatch_preprocessor_dl_input(x, label) VALUES
+(ARRAY[1,2,3,4,5,6], 'a'),
+(ARRAY[11,2,3,4,5,6], 'a'),
+(ARRAY[11,22,33,4,5,6], 'a'),
+(ARRAY[11,22,33,44,5,6], 'a'),
+(ARRAY[11,22,33,44,65,6], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,144,65,56], 'a'),
+(ARRAY[11,22,233,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'b'),
+(ARRAY[11,22,33,44,65,56], 'b'),
+(ARRAY[11,22,33,44,65,56], 'b');
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+  'minibatch_preprocessor_dl_input',
+  'minibatch_preprocessor_dl_batch',
+  'id',
+  'x',
+  5);
+
+SELECT assert(count(*)=4, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch.')
+FROM minibatch_preprocessor_dl_batch;
+
+SELECT assert(array_upper(independent_varname, 1)=5, 'Incorrect buffer size.')
+FROM minibatch_preprocessor_dl_batch WHERE buffer_id=1;
+
+SELECT assert(array_upper(independent_varname, 1)=2, 'Incorrect buffer size.')
+FROM minibatch_preprocessor_dl_batch WHERE buffer_id=4;
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+  'minibatch_preprocessor_dl_input',
+  'minibatch_preprocessor_dl_batch',
+  'label',
+  'x');
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_input;
+CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], y INTEGER);
+INSERT INTO minibatch_preprocessor_dl_input(x, y) VALUES
+(ARRAY[1,2,3,4,5,6], 4),
+(ARRAY[11,2,3,4,5,6], 3),
+(ARRAY[11,22,33,4,5,6], 8),
+(ARRAY[11,22,33,44,5,6], 2),
+(ARRAY[11,22,33,44,65,6], 5),
+(ARRAY[11,22,33,44,65,56], 6),
+(ARRAY[11,22,33,44,65,56], 2),
+(ARRAY[11,22,33,44,65,56], 10),
+(ARRAY[11,22,33,44,65,56], 3),
+(ARRAY[11,22,33,44,65,56], 7),
+(ARRAY[11,22,33,44,65,56], 6),
+(ARRAY[11,22,33,44,65,56], -6),
+(ARRAY[11,22,33,144,65,56], 9),
+(ARRAY[11,22,233,44,65,56], 0),
+(ARRAY[11,22,33,44,65,56], 12),
+(ARRAY[11,22,33,44,65,56], -3),
+(ARRAY[11,22,33,44,65,56], -1);
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+  'minibatch_preprocessor_dl_input',
+  'minibatch_preprocessor_dl_batch',
+  'y',
+  'x',
+  4,
+  5);
+
+-- Test that independent vars get divided by 5, by verifying min value goes from 1 to 0.2,
and max value from 233 to 46.6
+SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not normalized properly!')
FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a;
+SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not normalized properly!')
FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a;
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+  'minibatch_preprocessor_dl_input',
+  'minibatch_preprocessor_dl_batch',
+  'y',
+  'x',
+  4,
+  5,
+  6);
+
+-- Test that dependent vars get shifted by +6, by verifying minimum value goes from -6 to
0
+SELECT assert(abs(MIN(y))<0.00001, 'Dependent var not shifted properly!') FROM (SELECT
UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a;
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+  'minibatch_preprocessor_dl_input',
+  'minibatch_preprocessor_dl_batch',
+  'y',
+  'x',
+  4,
+  5,
+  -6);
+
+-- Test that dependent vars get shifted by -6, by verifying minimum value goes from -6 to
-12
+SELECT assert(relative_error(MIN(y), -12)<0.00001, 'Dependent var not shifted properly!')
FROM (SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a;
\ No newline at end of file
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 50c426b..89907b3 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -875,8 +875,8 @@ def collate_plpy_result(plpy_result_rows):
 
 
 def validate_module_input_params(source_table, output_table, independent_varname,
-                                 dependent_varname, module_name, grouping_cols,
-                                 other_output_tables=None):
+                                 dependent_varname, module_name,
+                                 grouping_cols=None, other_output_tables=None):
     """
     This function is supposed to be used for validating params for
     supervised learning like algos, e.g. linear regression, mlp, etc. since all


Mime
View raw message