
Merge pull request #5 from nabeel-oz/machine-learning-with-sklearn
Machine learning with sklearn
nabeel-oz authored Sep 4, 2018
2 parents 9dabe32 + 51b2ce6 commit 854e919
Showing 15 changed files with 2,420 additions and 52 deletions.
16 changes: 9 additions & 7 deletions README.md
@@ -16,12 +16,13 @@ This repository provides a server side extension (SSE) for Qlik Sense built using

The current implementation includes:

-- Clustering : Implemented using [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html), a high performance algorithm that is great for exploratory data analysis.
-- Time series forecasting : Implemented using [Facebook Prophet](https://research.fb.com/prophet-forecasting-at-scale/), a modern library for easily generating good quality forecasts.
-- Seasonality and holiday analysis: Also using Facebook Prophet.
-- Linear correlations : Implemented using Pandas.
+- **Supervised Machine Learning** : Implemented using [scikit-learn](http://scikit-learn.org/stable/index.html), the go-to machine learning library for Python. This SSE implements the full machine learning flow from data preparation, model training and evaluation, to making predictions in Qlik.
+- **Clustering** : Implemented using [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html), a high performance algorithm that is great for exploratory data analysis.
+- **Time series forecasting** : Implemented using [Facebook Prophet](https://research.fb.com/prophet-forecasting-at-scale/), a modern library for easily generating good quality forecasts.
+- **Seasonality and holiday analysis** : Also using Facebook Prophet.
+- **Linear correlations** : Implemented using Pandas.

-Further information on these features is available through the Usage section below.
+Further information on these features is available through the [Usage](#usage) section below.

For more information on Qlik Server Side Extensions see [qlik-oss](https://github.com/qlik-oss/server-side-extension).

@@ -80,6 +81,7 @@ Sample Qlik Sense apps are provided and each app includes a series of Bookmarks

| Documentation | Sample App | App Dependencies |
| --- | --- | --- |
-| [Clustering](docs/Clustering.md) | [Sample App - Clustering with HDBSCAN](docs/Sample_App_Clustering.qvf) | The [qsVariable](https://github.com/erikwett/qsVariable) extension. <br/><br/>Qlik Sense April 2018 or later to view the multi-layered maps. |
-| [Prophet](docs/Prophet.md) | [Sample App - Facebook Prophet](docs/Sample_App_Prophet.qvf) | The [qsVariable](https://github.com/erikwett/qsVariable) extension. <br/><br/>Use the bookmarks to step through the sheets with relevant selections. |
| [Correlations](docs/Correlation.md) | [Sample App - Correlations](docs/Sample_App_Correlations.qvf) | None. |
+| [Clustering](docs/Clustering.md) | [Sample App - Clustering with HDBSCAN](docs/Sample_App_Clustering.qvf) | The [qsVariable](https://github.com/erikwett/qsVariable) extension. <br/><br/>Qlik Sense April 2018 or later to view the multi-layered maps. |
+| [Forecasting](docs/Prophet.md) | [Sample App - Facebook Prophet](docs/Sample_App_Prophet.qvf) | The [qsVariable](https://github.com/erikwett/qsVariable) extension. <br/><br/>Use the bookmarks to step through the sheets with relevant selections. |
+| [Supervised Machine Learning](docs/scikit-learn.md) | [Sample App - Train_Test](docs/Sample_App_sklearn_Train_Test.qvf)<br><br>[Sample App - Predict](docs/Sample_App_sklearn_Predict.qvf) | Make sure you run the load for the Train_Test app before using the Predict app.<br><br>The [qsVariable](https://github.com/erikwett/qsVariable) extension. |
136 changes: 132 additions & 4 deletions core/__main__.py
@@ -6,6 +6,7 @@
import sys
import time
import locale
+import warnings
from concurrent import futures

# Add Generated folder to module path.
@@ -15,20 +16,25 @@
import ServerSideExtension_pb2 as SSE
import grpc

+# Turn off warnings by default
+if not sys.warnoptions:
+    warnings.simplefilter("ignore")
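A note on the `sys.warnoptions` guard above: it only silences warnings when Python was launched without any `-W` flags, so a user can still opt back in from the command line. A minimal standalone sketch of the behaviour:

```python
import sys
import warnings

# sys.warnoptions is an empty list unless Python was started with -W,
# so this suppression respects any explicit command-line preference.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

warnings.warn("noisy library warning")  # silenced by the filter above
# Running the script as `python -W default script.py` re-enables warnings.
```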

# Import libraries for added functions
import numpy as np
import pandas as pd
import _utils as utils
from _prophet import ProphetForQlik
from _clustering import HDBSCANForQlik
+from _sklearn import SKLearnForQlik

# Set the default port for this SSE Extension
_DEFAULT_PORT = '50055'

_ONE_DAY_IN_SECONDS = 60 * 60 * 24
_MINFLOAT = float('-inf')

-# Set the locale for number formatting based on user settings
+# Set the locale for number formats based on user settings
locale.setlocale(locale.LC_NUMERIC, '')

class ExtensionService(SSE.ConnectorServicer):
@@ -68,7 +74,22 @@ def functions(self):
            5: '_prophet',
            6: '_prophet',
            7: '_prophet',
-           8: '_prophet_seasonality'
+           8: '_prophet_seasonality',
+           9: '_sklearn',
+           10: '_sklearn',
+           11: '_sklearn',
+           12: '_sklearn',
+           13: '_sklearn',
+           14: '_sklearn',
+           15: '_sklearn',
+           16: '_sklearn',
+           17: '_sklearn',
+           18: '_sklearn',
+           19: '_sklearn',
+           20: '_sklearn',
+           21: '_sklearn',
+           22: '_sklearn',
+           23: '_sklearn'
        }
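The keys in this map are the function ids declared in the SSE's functions.json; the id for each call arrives in the gRPC metadata and is recovered by `_get_function_id` (whose definition appears further below). A sketch of that lookup, following the pattern in the qlik-oss server-side-extension examples rather than this repo's exact code:

```python
import ServerSideExtension_pb2 as SSE

def get_function_id(context):
    """Recover the Qlik function id from the gRPC invocation metadata.

    Sketch based on the qlik-oss SSE examples; the binary header key
    'qlik-functionrequestheader-bin' is defined by the SSE protocol.
    """
    metadata = dict(context.invocation_metadata())
    header = SSE.FunctionRequestHeader()
    header.ParseFromString(metadata['qlik-functionrequestheader-bin'])
    return header.functionId
```

The returned integer is then used to pick the handler name ('_sklearn', '_prophet', and so on) from the map above, typically via `getattr` on the service instance.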

"""
@@ -194,8 +215,8 @@ def _correlation(request, context):
        # Check that the lists are of equal length
        if len(x) == len(y) and len(x) > 0:
            # Create a Pandas data frame using the lists
-           df = pd.DataFrame({'x': [utils._string_to_float(d) for d in x], \
-                              'y': [utils._string_to_float(d) for d in y]})
+           df = pd.DataFrame({'x': [utils.atof(d) for d in x], \
+                              'y': [utils.atof(d) for d in y]})

            # Calculate the correlation matrix for the two series in the data frame
            corr_matrix = df.corr(method=corr_type)
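`utils.atof` replaces the earlier `_string_to_float` helper here; `_utils.py` itself is not part of the hunks shown, so the following is only a plausible sketch of locale-aware conversion helpers, relying on the `locale.setlocale(locale.LC_NUMERIC, '')` call made at startup:

```python
import locale
import numpy as np

# Hypothetical sketch of the helpers in core/_utils.py; the real
# module is not shown in this diff.

def atof(s):
    """Locale-aware string-to-float conversion.

    Under a locale with ',' as the decimal separator, '1.234,5'
    parses to 1234.5; empty or invalid input yields NaN.
    """
    try:
        return locale.atof(s)
    except (TypeError, ValueError):
        return np.NaN

def atoi(s):
    """Locale-aware string-to-int conversion (truncates the float)."""
    return int(atof(s))
```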
@@ -352,6 +373,113 @@ def _prophet_seasonality(request, context):
        # Yield Row data as Bundled rows
        yield SSE.BundledRows(rows=response_rows)

+    @staticmethod
+    def _sklearn(request, context):
+        """
+        Setup the meta data for a sklearn machine learning model.
+        :param request: an iterable sequence of RowData
+        :param context:
+        :return: Refer to comments below as the response depends on the function called
+        :Qlik expression examples:
+        :<AAI Connection Name>.
+        """
+        # Get a list from the generator object so that it can be iterated over multiple times
+        request_list = [request_rows for request_rows in request]
+
+        # Get the function id from the header to determine the variant being called
+        function = ExtensionService._get_function_id(context)
+
+        # Create an instance of the SKLearnForQlik class
+        model = SKLearnForQlik(request_list, context)
+
+        # Call the function based on the mapping in functions.json
+        # The IF conditions are grouped based on similar output structure
+        if function in (9, 10, 21):
+            if function == 9:
+                # Set up the model and save to disk
+                response = model.setup()
+            elif function == 21:
+                # Set up a model with specific metric and dimensionality reduction arguments and save to disk
+                response = model.setup(advanced=True)
+            elif function == 10:
+                # Set feature definitions for an existing model
+                response = model.set_features()
+
+            # Get the response as SSE.Rows
+            response_rows = utils.get_response_rows(response.values.tolist(), ["str", "str", "str"])
+
+        elif function == 11:
+            # Return the feature definitions for an existing model
+            response = model.get_features()
+
+            # Get the response as SSE.Rows
+            response_rows = utils.get_response_rows(response.values.tolist(), ["str", "num", "str", "str", "str",\
+                                                                               "str", "num"])
+
+        elif function == 12:
+            # Train and Test an existing model, saving the sklearn pipeline for further predictions
+            response = model.fit()
+
+            # Get the response as SSE.Rows
+            response_rows = utils.get_response_rows(response.values.tolist(), ["str", "str", "str", "str", "num"])
+
+        elif function in (14, 16, 19, 20):
+            if function == 14:
+                # Provide predictions in a chart expression based on an existing model
+                response = model.predict(load_script=False)
+            elif function == 16:
+                # Provide prediction probabilities in a chart expression based on an existing model
+                response = model.predict(load_script=False, variant="predict_proba")
+            elif function == 19:
+                # Get a list of models based on a search string
+                response = model.list_models()
+            elif function == 20:
+                # Get a string that can be evaluated to get the features expression for the predict function
+                response = model.get_features_expression()
+
+            # Get the response as SSE.Rows
+            response_rows = utils.get_response_rows(response.values.tolist(), ["str"])
+
+        elif function in (15, 17):
+            if function == 15:
+                # Provide predictions in the load script based on an existing model
+                response = model.predict(load_script=True)
+            elif function == 17:
+                # Provide prediction probabilities in the load script based on an existing model
+                response = model.predict(load_script=True, variant="predict_proba")
+
+            # Get the response as SSE.Rows
+            response_rows = utils.get_response_rows(response.values.tolist(), ["str", "str", "str"])
+
+        elif function in (18, 22):
+            if function == 18:
+                response = model.get_metrics()
+            elif function == 22:
+                response = model.calculate_metrics()
+
+            # Check whether the metrics are for a classifier
+            if "accuracy" in response.columns:
+                estimator_type = "classifier"
+            # Check whether the metrics are for a regressor
+            elif "r2_score" in response.columns:
+                estimator_type = "regressor"
+
+            # We convert values to type SSE.Dual and group columns into an iterable
+            if estimator_type == "classifier":
+                # Get the response as SSE.Rows
+                response_rows = utils.get_response_rows(response.values.tolist(), ["str", "str", "num", "num", "num",\
+                                                                                   "num", "num"])
+            elif estimator_type == "regressor":
+                # Get the response as SSE.Rows
+                response_rows = utils.get_response_rows(response.values.tolist(), ["str", "num", "num", "num", "num", "num"])
+
+        elif function == 23:
+            response = model.get_confusion_matrix()
+            response_rows = utils.get_response_rows(response.values.tolist(), ["str", "str", "str", "num"])
+
+        # Yield Row data as Bundled rows
+        yield SSE.BundledRows(rows=response_rows)
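Every branch above reduces to a pandas DataFrame passed through `utils.get_response_rows` together with a list of Qlik data types for the output columns. That helper is outside the hunks shown; a simplified sketch of what such a conversion looks like with the SSE protobuf messages:

```python
import ServerSideExtension_pb2 as SSE

def get_response_rows(rows, dtypes):
    """Convert a list of row values into SSE.Row messages.

    Simplified sketch, not the repo's exact implementation: each value
    is wrapped in an SSE.Dual carrying string data, numeric data or
    both, matching the declared return types in functions.json.
    """
    response_rows = []
    for row in rows:
        duals = []
        for value, dtype in zip(row, dtypes):
            if dtype == "str":
                duals.append(SSE.Dual(strData=str(value)))
            elif dtype == "num":
                duals.append(SSE.Dual(numData=float(value)))
            elif dtype == "dual":
                duals.append(SSE.Dual(strData=str(value), numData=float(value)))
        response_rows.append(SSE.Row(duals=duals))
    return response_rows
```

The caller then wraps the result with `yield SSE.BundledRows(rows=response_rows)`, as `_sklearn` does above.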

    @staticmethod
    def _get_function_id(context):
        """
20 changes: 10 additions & 10 deletions core/_clustering.py
@@ -86,7 +86,7 @@ def __init__(self, request, context, variant="standard"):
        self.input_df = pd.DataFrame([s.split(';') for r in self.input_df.values for s in r], index=self.input_df.index)

        # Convert strings to numbers using locale settings
-       self.input_df = self.input_df.applymap(lambda s: locale.atof(s) if s else np.NaN)
+       self.input_df = self.input_df.applymap(lambda s: utils.atof(s) if s else np.NaN)

        # Finally we prepare the data for the clustering algorithm:

@@ -264,19 +264,19 @@ def _set_params(self, kwargs):
        # The minimum size of clusters.
        # The default value is 5.
        if 'min_cluster_size' in self.kwargs:
-           self.min_cluster_size = int(self.kwargs['min_cluster_size'])
+           self.min_cluster_size = utils.atoi(self.kwargs['min_cluster_size'])

        # The number of samples in a neighbourhood for a point to be considered a core point.
        if 'min_samples' in self.kwargs:
-           self.min_samples = int(self.kwargs['min_samples'])
+           self.min_samples = utils.atoi(self.kwargs['min_samples'])

        # p value to use if using the minkowski metric.
        if 'p' in self.kwargs:
-           self.p = int(self.kwargs['p'])
+           self.p = utils.atoi(self.kwargs['p'])

        # A distance scaling parameter as used in robust single linkage.
        if 'alpha' in self.kwargs:
-           self.alpha = float(self.kwargs['alpha'])
+           self.alpha = utils.atof(self.kwargs['alpha'])

        # The method used to select clusters from the condensed tree.
        # Options are: eom, leaf.
@@ -309,7 +309,7 @@ def _set_params(self, kwargs):
        if self.scaler == 'minmax':
            if 'feature_range' in self.kwargs:
                self.feature_range = ''.join(c for c in self.kwargs['feature_range'] if c not in '()').split(';')
-               self.feature_range = (int(self.feature_range[0]),int(self.feature_range[1]))
+               self.feature_range = (utils.atoi(self.feature_range[0]),utils.atoi(self.feature_range[1]))
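Tuple-style arguments such as `feature_range` arrive from Qlik as strings like '(0;1)', which the code strips of parentheses and splits on ';' before converting each part; `quantile_range` below is handled the same way with floats. A small worked example of that parsing (using `int`/`float` in place of the `utils` helpers for clarity):

```python
def parse_range(value, convert):
    """Parse a Qlik-style range string such as '(0;1)' or '(25.0;75.0)'.

    Mirrors the feature_range/quantile_range handling above; convert is
    int or float here (utils.atoi / utils.atof in the actual code).
    """
    parts = ''.join(c for c in value if c not in '()').split(';')
    return (convert(parts[0]), convert(parts[1]))

print(parse_range('(0;1)', int))          # (0, 1)
print(parse_range('(25.0;75.0)', float))  # (25.0, 75.0)
```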

        # Parameters for the Robust scaler
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
@@ -320,21 +320,21 @@
                self.with_scaling = 'true' == self.kwargs['with_scaling'].lower()
            if 'quantile_range' in self.kwargs:
                self.quantile_range = ''.join(c for c in self.kwargs['quantile_range'] if c not in '()').split(';')
-               self.quantile_range = (float(self.quantile_range[0]),float(self.quantile_range[1]))
+               self.quantile_range = (utils.atof(self.quantile_range[0]),utils.atof(self.quantile_range[1]))

        # Parameters for the Quantile Transformer
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
        if self.scaler == 'quantile':
            if 'n_quantiles' in self.kwargs:
-               self.n_quantiles = int(self.kwargs['n_quantiles'])
+               self.n_quantiles = utils.atoi(self.kwargs['n_quantiles'])
            if 'output_distribution' in self.kwargs:
                self.output_distribution = self.kwargs['output_distribution'].lower()
            if 'ignore_implicit_zeros' in self.kwargs:
                self.ignore_implicit_zeros = 'true' == self.kwargs['ignore_implicit_zeros'].lower()
            if 'subsample' in self.kwargs:
-               self.subsample = int(self.kwargs['subsample'])
+               self.subsample = utils.atoi(self.kwargs['subsample'])
            if 'random_state' in self.kwargs:
-               self.random_state = int(self.kwargs['random_state'])
+               self.random_state = utils.atoi(self.kwargs['random_state'])

        # Set up a list of possible key word arguments for the HDBSCAN() function
        hdbscan_params = ['algorithm', 'metric', 'min_cluster_size', 'min_samples', 'p', 'alpha',\
