Commit
Fixes after testing with larger dataset
Fixes: Feature importance failing for datasets > 5000 samples.
Nabeel committed Oct 30, 2018
1 parent b95de73 commit c58318e
Showing 7 changed files with 63 additions and 27 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -103,7 +103,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support

5. Now whenever you want to start this Python service you can run `Qlik-Py-Start.bat`.

6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/February2018/Subsystems/ManagementConsole/Content/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/February2018/Subsystems/Hub/Content/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/September2018/Subsystems/ManagementConsole/Content/Sense_QMC/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/September2018/Subsystems/Hub/Content/Sense_Hub/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
- For Qlik Sense Desktop you need to update the `settings.ini` file:<br/><br/>![QSD Analytics Connection](docs/images/Install-04.png)
- For Qlik Sense Enterprise you need to create an Analytics Connection through QMC:<br/><br/>![QSE Analytics Connection](docs/images/Install-02.png)
- The Analytics Connection can point to a different machine and can be [secured with certificates](https://github.com/qlik-oss/server-side-extension/blob/master/generate_certs_guide/README.md):<br/><br/>![QSE Secure Analytics Connection](docs/images/Install-03.png)
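For the Qlik Sense Desktop bullet above, a typical `Settings.ini` entry looks like the sketch below. The connection name must match the one used in the expressions (`PyTools`); the host and port are assumptions, so match them to whatever `Qlik-Py-Start.bat` reports when the service starts.

```ini
[Settings 7]
SSEPlugin=PyTools,localhost:50055
```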
33 changes: 20 additions & 13 deletions core/_machine_learning.py
@@ -327,6 +327,8 @@ def transform(self, X, y=None):
unique = self.hasher(self.hash_df, c, self.hash_meta["strategy_args"].loc[c])
self.hash_df = self.hash_df.join(unique, on=c)
self.hash_df = self.hash_df.drop(c, axis=1)
# Fill any missing values in the hash dataframe
self.hash_df = self.fillna(self.hash_df, missing="zeros")

if self.cv:
# Get a subset of the data that requires count vectorizing
@@ -386,7 +388,7 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.text_df
else:
self.X_transform = self.X_transform.join(self.text_df)
self.X_transform = pd.concat([self.X_transform, self.text_df], join='outer', axis=1, sort=False)

if self.scale:
# Get a subset of the data that requires scaling
@@ -395,7 +397,7 @@ def transform(self, X, y=None):
# If scale_hashed = True join the hashed columns to the scaling dataframe
if self.hash and self.scale_hashed:
if self.scale:
self.scale_df = self.scale_df.join(self.hash_df)
self.scale_df = pd.concat([self.scale_df, self.hash_df], join='outer', axis=1, sort=False)
else:
self.scale_df = self.hash_df
# If only hashed columns are being scaled, the scaler needs to be instantiated
@@ -405,12 +407,12 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.hash_df
else:
self.X_transform = self.X_transform.join(self.hash_df)
self.X_transform = pd.concat([self.X_transform, self.hash_df], join='outer', axis=1, sort=False)

# If scale_vectors = True join the count vectorized columns to the scaling dataframe
if self.cv and self.scale_vectors:
if self.scale or (self.hash and self.scale_hashed):
self.scale_df = self.scale_df.join(self.cv_df)
self.scale_df = pd.concat([self.scale_df, self.cv_df], join='outer', axis=1, sort=False)
else:
self.scale_df = self.cv_df
# If only count vectorized columns are being scaled, the scaler needs to be instantiated
@@ -420,12 +422,12 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.cv_df
else:
self.X_transform = self.X_transform.join(self.cv_df)
self.X_transform = pd.concat([self.X_transform, self.cv_df], join='outer', axis=1, sort=False)

# If scale_vectors = True join the tfidf vectorized columns to the scaling dataframe
if self.tfidf and self.scale_vectors:
if self.scale or (self.hash and self.scale_hashed) or self.cv:
self.scale_df = self.scale_df.join(self.tfidf_df)
self.scale_df = pd.concat([self.scale_df, self.tfidf_df], join='outer', axis=1, sort=False)
else:
self.scale_df = self.tfidf_df
# If only tfidf vectorized columns are being scaled, the scaler needs to be instantiated
@@ -435,7 +437,7 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.tfidf_df
else:
self.X_transform = self.X_transform.join(self.tfidf_df)
self.X_transform = pd.concat([self.X_transform, self.tfidf_df], join='outer', axis=1, sort=False)

try:
# Perform scaling on the relevant data
@@ -452,19 +454,21 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.scale_df
else:
self.X_transform = self.X_transform.join(self.scale_df)
self.X_transform = pd.concat([self.X_transform, self.scale_df], join='outer', axis=1, sort=False)
except AttributeError:
pass

if self.no_prep:
# Get a subset of the data that doesn't require preprocessing
self.no_prep_df = X[self.none_meta.index.tolist()]
# Fill any missing values in the no prep dataframe
self.no_prep_df = self.fillna(self.no_prep_df, missing="zeros")

# Finally join the columns that do not require preprocessing to the result dataset
if self.X_transform is None:
self.X_transform = self.no_prep_df
else:
self.X_transform = self.X_transform.join(self.no_prep_df)
self.X_transform = pd.concat([self.X_transform, self.no_prep_df], join='outer', axis=1, sort=False)

# Output information to the terminal and log file if required
if self.log is not None:
@@ -559,10 +563,13 @@ def _print_log(self, step):
pass

elif step == 3:
sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))

with open(self.log,'a', encoding='utf-8') as f:
f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))
try:
sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))

with open(self.log,'a', encoding='utf-8') as f:
f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))
except AttributeError:
pass

@staticmethod
def hasher(df, col, n_features):
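Throughout `transform`, this commit swaps `DataFrame.join` for `pd.concat(..., join='outer', axis=1, sort=False)` and fills the missing values that the outer alignment can introduce. A minimal, self-contained sketch of the difference (illustrative data only; a plain `fillna(0)` stands in for the repo's `fillna(..., missing="zeros")` helper):

```python
import pandas as pd

left = pd.DataFrame({"a": [1, 2, 3]}, index=[0, 1, 2])
right = pd.DataFrame({"b": [10, 30]}, index=[0, 3])  # index 3 is missing from `left`

# DataFrame.join performs a left join on the index, silently dropping row 3
joined = left.join(right)

# pd.concat with join='outer' keeps rows present in either frame and marks the
# gaps as NaN, which an explicit fill then cleans up
combined = pd.concat([left, right], join='outer', axis=1, sort=False).fillna(0)

print(joined)
print(combined)
```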
32 changes: 22 additions & 10 deletions core/_sklearn.py
@@ -493,11 +493,13 @@ def fit(self):
# Select the dataset for calculating importances
if self.model.validation == "hold-out":
X = self.X_test
y = self.y_test
else:
X = train_test_df
y = target_df.values.ravel()

# Calculate model agnostic feature importances
self._calc_importances(data = X)
self._calc_importances(X = X, y = y)

# Persist the model to disk
self.model = self.model.save(self.model.name, self.path, self.model.compress)
@@ -730,7 +732,7 @@ def calculate_metrics(self, caller="external"):

if self.model.calc_feature_importances:
# Calculate model agnostic feature importances
self._calc_importances(data = self.X_test)
self._calc_importances(X = self.X_test, y = self.y_test)

self.response = metrics_df

@@ -816,10 +818,14 @@ def predict(self, load_script=False, variant="predict"):
# Set the key column as the index
self.request_df.set_index("key", drop=False, inplace=True)

# Split the features provided as a string into individual columns
self.X = pd.DataFrame([x[feature_col_num].split("|") for x in self.request_df.values.tolist()],\
columns=self.model.features_df.loc[:,"name"].tolist(),\
index=self.request_df.index)
try:
# Split the features provided as a string into individual columns
self.X = pd.DataFrame([x[feature_col_num].split("|") for x in self.request_df.values.tolist()],\
columns=self.model.features_df.loc[:,"name"].tolist(),\
index=self.request_df.index)
except AssertionError as ae:
err = "The number of input columns do not match feature definitions. Ensure you are using the | delimiter and that the target is not included in your input to the prediction function."
raise AssertionError(err) from ae

# Convert the data types based on feature definitions
self.X = utils.convert_types(self.X, self.model.features_df)
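A small, self-contained sketch of the pipe-delimited split wrapped in the try/except above (the feature names and rows are illustrative, not the repo's helpers); note that older pandas versions raise `AssertionError` on a column-count mismatch while newer ones raise `ValueError`:

```python
import pandas as pd

feature_names = ["age", "income", "segment"]    # assumed feature definitions
rows = ["34|72000|B|0", "29|51000|A|1"]         # four fields, but only three features defined

try:
    X = pd.DataFrame([r.split("|") for r in rows], columns=feature_names)
except (AssertionError, ValueError) as e:
    # Mirrors the clearer message raised in predict(): supply one value per
    # defined feature, delimited by "|", with the target excluded
    raise ValueError("The number of input columns does not match the feature definitions.") from e
```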
@@ -1405,29 +1411,35 @@ def _prep_confusion_matrix(self, y_test, y_pred, labels):
self.model.confusion_matrix.loc[:,"model_name"] = self.model.name
self.model.confusion_matrix = self.model.confusion_matrix.loc[:,["model_name", "true_label", "pred_label", "count"]]

def _calc_importances(self, data=None):
def _calc_importances(self, X=None, y=None):
"""
Calculate feature importances.
Importances are calculated using the Skater library to provide this capability for all sklearn algorithms.
For more information: https://www.datascience.com/resources/tools/skater
"""

# Fill null values in the test set according to the model settings
X_test = utils.fillna(data, method=self.model.missing)
X_test = utils.fillna(X, method=self.model.missing)

# Calculate model agnostic feature importances using the skater library
interpreter = Interpretation(X_test, feature_names=self.model.features_df.index.tolist())
interpreter = Interpretation(X_test, training_labels=y, feature_names=self.model.features_df.index.tolist())

if self.model.estimator_type == "classifier":
try:
# We use the predicted probabilities from the estimator if available
predictor = self.model.pipe.predict_proba

# Set up keyword arguments accordingly
imm_kwargs = {"probability": True}
except AttributeError:
# Otherwise we simply use the predict method
predictor = self.model.pipe.predict

# Set up keyword arguments accordingly
imm_kwargs = {"probability": False, "unique_values": self.model.pipe.classes_}

# Set up a skater InMemoryModel to calculate feature importances
imm = InMemoryModel(predictor, examples = X_test[:10], model_type="classifier", unique_values=self.model.pipe.classes_)
imm = InMemoryModel(predictor, examples = X_test[:10], model_type="classifier", **imm_kwargs)

elif self.model.estimator_type == "regressor":
# Set up a skater InMemoryModel to calculate feature importances using the predict method
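The changes above pass the labels through to skater's `Interpretation` and pick `predict_proba` or `predict` for the `InMemoryModel`. A minimal, self-contained sketch of that skater usage (the iris data and `LogisticRegression` estimator are illustrative stand-ins for the repo's `self.model.pipe`):

```python
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

pipe = LogisticRegression(max_iter=1000).fit(X, y)

# Passing training_labels mirrors the fix above: skater receives the labels
# directly rather than inferring them from the data
interpreter = Interpretation(X, training_labels=y, feature_names=list(X.columns))

# predict_proba is available here, so probability=True; estimators without it
# fall back to predict with probability=False and explicit unique_values
imm = InMemoryModel(pipe.predict_proba, examples=X[:10],
                    model_type="classifier", probability=True)

importances = interpreter.feature_importance.feature_importance(imm, ascending=False)
print(importances)
```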
File renamed without changes.
17 changes: 17 additions & 0 deletions docker/Dockerfile v.3.6
@@ -0,0 +1,17 @@
# Use the previous version of qlik-py-tools as a parent image
FROM nabeeloz/qlik-py-tools:3.5

# Set the working directory to /qlik-py-tools/core
WORKDIR /qlik-py-tools/core

# Copy all files from the core subdirectory into the container
COPY ./core/* /qlik-py-tools/core/

# Copy modified file for skater
COPY ./feature_importance.py /usr/local/lib/python3.6/site-packages/skater-1.1.2-py3.6.egg/skater/core/global_interpretation/

# Make port 80 available to the world outside this container
EXPOSE 80

# Run __main__.py when the container launches
CMD ["python", "__main__.py"]
4 changes: 2 additions & 2 deletions docker/feature_importance.py
@@ -139,8 +139,8 @@ def feature_importance(self, model_instance, ascending=True, filter_classes=None
scaled=use_scaling,
scorer=scorer)

self.interpreter.logger.warn("Multiprocessing has known issues with GRPC, using single process")
self.interpreter.logger.warn("More information here: https://github.com/grpc/grpc/blob/master/doc/fork_support.md")
# Multiprocessing results in issues with GRPC when using Docker, using single process
# More information here: https://github.com/grpc/grpc/blob/master/doc/fork_support.md

importances = {}
importance_dicts = []
2 changes: 1 addition & 1 deletion docs/README.md
@@ -103,7 +103,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support

5. Now whenever you want to start this Python service you can run `Qlik-Py-Start.bat`.

6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/February2018/Subsystems/ManagementConsole/Content/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/February2018/Subsystems/Hub/Content/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/September2018/Subsystems/ManagementConsole/Content/Sense_QMC/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/September2018/Subsystems/Hub/Content/Sense_Hub/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
- For Qlik Sense Desktop you need to update the `settings.ini` file:<br/><br/>![QSD Analytics Connection](images/Install-04.png)
- For Qlik Sense Enterprise you need to create an Analytics Connection through QMC:<br/><br/>![QSE Analytics Connection](images/Install-02.png)
- The Analytics Connection can point to a different machine and can be [secured with certificates](https://github.com/qlik-oss/server-side-extension/blob/master/generate_certs_guide/README.md):<br/><br/>![QSE Secure Analytics Connection](images/Install-03.png)
