diff --git a/README.md b/README.md index 26baffd..cc797d0 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support 5. Now whenever you want to start this Python service you can run `Qlik-Py-Start.bat`. -6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/February2018/Subsystems/ManagementConsole/Content/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/February2018/Subsystems/Hub/Content/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name. +6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/September2018/Subsystems/ManagementConsole/Content/Sense_QMC/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/September2018/Subsystems/Hub/Content/Sense_Hub/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name. - For Qlik Sense Desktop you need to update the `settings.ini` file:

![QSD Analytics Connection](docs/images/Install-04.png) - For Qlik Sense Enterprise you need to create an Analytics Connection through QMC:

![QSE Analytics Connection](docs/images/Install-02.png) - The Analytics Connection can point to a different machine and can be [secured with certificates](https://github.com/qlik-oss/server-side-extension/blob/master/generate_certs_guide/README.md):

![QSE Secure Analytics Connection](docs/images/Install-03.png) diff --git a/core/_machine_learning.py b/core/_machine_learning.py index 9a44175..e61b4c7 100644 --- a/core/_machine_learning.py +++ b/core/_machine_learning.py @@ -327,6 +327,8 @@ def transform(self, X, y=None): unique = self.hasher(self.hash_df, c, self.hash_meta["strategy_args"].loc[c]) self.hash_df = self.hash_df.join(unique, on=c) self.hash_df = self.hash_df.drop(c, axis=1) + # Fill any missing values in the hash dataframe + self.hash_df = self.fillna(self.hash_df, missing="zeros") if self.cv: # Get a subset of the data that requires count vectorizing @@ -386,7 +388,7 @@ def transform(self, X, y=None): if self.X_transform is None: self.X_transform = self.text_df else: - self.X_transform = self.X_transform.join(self.text_df) + self.X_transform = pd.concat([self.X_transform, self.text_df], join='outer', axis=1, sort=False) if self.scale: # Get a subset of the data that requires scaling @@ -395,7 +397,7 @@ def transform(self, X, y=None): # If scale_hashed = True join the hashed columns to the scaling dataframe if self.hash and self.scale_hashed: if self.scale: - self.scale_df = self.scale_df.join(self.hash_df) + self.scale_df = pd.concat([self.scale_df, self.hash_df], join='outer', axis=1, sort=False) else: self.scale_df = self.hash_df # If only hashed columns are being scaled, the scaler needs to be instantiated @@ -405,12 +407,12 @@ def transform(self, X, y=None): if self.X_transform is None: self.X_transform = self.hash_df else: - self.X_transform = self.X_transform.join(self.hash_df) + self.X_transform = pd.concat([self.X_transform, self.hash_df], join='outer', axis=1, sort=False) # If scale_vectors = True join the count vectorized columns to the scaling dataframe if self.cv and self.scale_vectors: if self.scale or (self.hash and self.scale_hashed): - self.scale_df = self.scale_df.join(self.cv_df) + self.scale_df = pd.concat([self.scale_df, self.cv_df], join='outer', axis=1, sort=False) else: self.scale_df = self.cv_df # If only count vectorized columns are being scaled, the scaler needs to be instantiated @@ -420,12 +422,12 @@ def transform(self, X, y=None): if self.X_transform is None: self.X_transform = self.cv_df else: - self.X_transform = self.X_transform.join(self.cv_df) + self.X_transform = pd.concat([self.X_transform, self.cv_df], join='outer', axis=1, sort=False) # If scale_vectors = True join the tfidf vectorized columns to the scaling dataframe if self.tfidf and self.scale_vectors: if self.scale or (self.hash and self.scale_hashed) or self.cv: - self.scale_df = self.scale_df.join(self.tfidf_df) + self.scale_df = pd.concat([self.scale_df, self.tfidf_df], join='outer', axis=1, sort=False) else: self.scale_df = self.tfidf_df # If only tfidf vectorized columns are being scaled, the scaler needs to be instantiated @@ -435,7 +437,7 @@ def transform(self, X, y=None): if self.X_transform is None: self.X_transform = self.tfidf_df else: - self.X_transform = self.X_transform.join(self.tfidf_df) + self.X_transform = pd.concat([self.X_transform, self.tfidf_df], join='outer', axis=1, sort=False) try: # Perform scaling on the relevant data @@ -452,19 +454,21 @@ def transform(self, X, y=None): if self.X_transform is None: self.X_transform = self.scale_df else: - self.X_transform = self.X_transform.join(self.scale_df) + self.X_transform = pd.concat([self.X_transform, self.scale_df], join='outer', axis=1, sort=False) except AttributeError: pass if self.no_prep: # Get a subset of the data that doesn't require preprocessing self.no_prep_df = X[self.none_meta.index.tolist()] + # Fill any missing values in the no prep dataframe + self.no_prep_df = self.fillna(self.no_prep_df, missing="zeros") # Finally join the columns that do not require preprocessing to the result dataset if self.X_transform is None: self.X_transform = self.no_prep_df else: - self.X_transform = self.X_transform.join(self.no_prep_df) + self.X_transform = pd.concat([self.X_transform, self.no_prep_df], join='outer', axis=1, sort=False) # Output information to the terminal and log file if required if self.log is not None: @@ -559,10 +563,13 @@ def _print_log(self, step): pass elif step == 3: - sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head())) - - with open(self.log,'a', encoding='utf-8') as f: - f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head())) + try: + sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head())) + + with open(self.log,'a', encoding='utf-8') as f: + f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head())) + except AttributeError: + pass @staticmethod def hasher(df, col, n_features): diff --git a/core/_sklearn.py b/core/_sklearn.py index 7f657dc..0082695 100644 --- a/core/_sklearn.py +++ b/core/_sklearn.py @@ -493,11 +493,13 @@ def fit(self): # Select the dataset for calculating importances if self.model.validation == "hold-out": X = self.X_test + y = self.y_test else: X = train_test_df + y = target_df.values.ravel() # Calculate model agnostic feature importances - self._calc_importances(data = X) + self._calc_importances(X = X, y = y) # Persist the model to disk self.model = self.model.save(self.model.name, self.path, self.model.compress) @@ -730,7 +732,7 @@ def calculate_metrics(self, caller="external"): if self.model.calc_feature_importances: # Calculate model agnostic feature importances - self._calc_importances(data = self.X_test) + self._calc_importances(X = self.X_test, y = self.y_test) self.response = metrics_df @@ -816,10 +818,14 @@ def predict(self, load_script=False, variant="predict"): # Set the key column as the index self.request_df.set_index("key", drop=False, inplace=True) - # Split the features provided as a string into individual columns - self.X = pd.DataFrame([x[feature_col_num].split("|") for x in self.request_df.values.tolist()],\ - columns=self.model.features_df.loc[:,"name"].tolist(),\ - index=self.request_df.index) + try: + # Split the features provided as a string into individual columns + self.X = pd.DataFrame([x[feature_col_num].split("|") for x in self.request_df.values.tolist()],\ + columns=self.model.features_df.loc[:,"name"].tolist(),\ + index=self.request_df.index) + except AssertionError as ae: + err = "The number of input columns do not match feature definitions. Ensure you are using the | delimiter and that the target is not included in your input to the prediction function." + raise AssertionError(err) from ae # Convert the data types based on feature definitions self.X = utils.convert_types(self.X, self.model.features_df) @@ -1405,7 +1411,7 @@ def _prep_confusion_matrix(self, y_test, y_pred, labels): self.model.confusion_matrix.loc[:,"model_name"] = self.model.name self.model.confusion_matrix = self.model.confusion_matrix.loc[:,["model_name", "true_label", "pred_label", "count"]] - def _calc_importances(self, data=None): + def _calc_importances(self, X=None, y=None): """ Calculate feature importances. Importances are calculated using the Skater library to provide this capability for all sklearn algorithms. @@ -1413,21 +1419,27 @@ def _calc_importances(self, data=None): """ # Fill null values in the test set according to the model settings - X_test = utils.fillna(data, method=self.model.missing) + X_test = utils.fillna(X, method=self.model.missing) # Calculate model agnostic feature importances using the skater library - interpreter = Interpretation(X_test, feature_names=self.model.features_df.index.tolist()) + interpreter = Interpretation(X_test, training_labels=y, feature_names=self.model.features_df.index.tolist()) if self.model.estimator_type == "classifier": try: # We use the predicted probabilities from the estimator if available predictor = self.model.pipe.predict_proba + + # Set up keyword arguments accordingly + imm_kwargs = {"probability": True} except AttributeError: # Otherwise we simply use the predict method predictor = self.model.pipe.predict + + # Set up keyword arguments accordingly + imm_kwargs = {"probability": False, "unique_values": self.model.pipe.classes_} # Set up a skater InMemoryModel to calculate feature importances - imm = InMemoryModel(predictor, examples = X_test[:10], model_type="classifier", unique_values=self.model.pipe.classes_) + imm = InMemoryModel(predictor, examples = X_test[:10], model_type="classifier", **imm_kwargs) elif self.model.estimator_type == "regressor": # Set up a skater InMemoryModel to calculate feature importances using the predict method diff --git a/docker/Dockerfile b/docker/Dockerfile v.3.5 similarity index 100% rename from docker/Dockerfile rename to docker/Dockerfile v.3.5 diff --git a/docker/Dockerfile v.3.6 b/docker/Dockerfile v.3.6 new file mode 100644 index 0000000..93e877e --- /dev/null +++ b/docker/Dockerfile v.3.6 @@ -0,0 +1,17 @@ +# Use the previous version of qlik-py-tools as a parent image +FROM nabeeloz/qlik-py-tools:3.5 + +# Set the working directory to /qlik-py-tools/core +WORKDIR /qlik-py-tools/core + +# Copy all files from the core subdirectory into the container +COPY ./core/* /qlik-py-tools/core/ + +# Copy modified file for skater +COPY ./feature_importance.py /usr/local/lib/python3.6/site-packages/skater-1.1.2-py3.6.egg/skater/core/global_interpretation/ + +# Make port 80 available to the world outside this container +EXPOSE 80 + +# Run __main__.py when the container launches +CMD ["python", "__main__.py"] \ No newline at end of file diff --git a/docker/feature_importance.py b/docker/feature_importance.py index 7d27c0e..f143a81 100644 --- a/docker/feature_importance.py +++ b/docker/feature_importance.py @@ -139,8 +139,8 @@ def feature_importance(self, model_instance, ascending=True, filter_classes=None scaled=use_scaling, scorer=scorer) - self.interpreter.logger.warn("Multiprocessing has known issues with GRPC, using single process") - self.interpreter.logger.warn("More information here: https://github.com/grpc/grpc/blob/master/doc/fork_support.md") + # Multiprocessing results in issues with GRPC when using Docker, using single process + # More information here: https://github.com/grpc/grpc/blob/master/doc/fork_support.md importances = {} importance_dicts = [] diff --git a/docs/README.md b/docs/README.md index 5996b3d..daacd91 100644 --- a/docs/README.md +++ b/docs/README.md @@ -103,7 +103,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support 5. Now whenever you want to start this Python service you can run `Qlik-Py-Start.bat`. -6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/February2018/Subsystems/ManagementConsole/Content/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/February2018/Subsystems/Hub/Content/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name. +6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/September2018/Subsystems/ManagementConsole/Content/Sense_QMC/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/September2018/Subsystems/Hub/Content/Sense_Hub/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name. - For Qlik Sense Desktop you need to update the `settings.ini` file:

![QSD Analytics Connection](images/Install-04.png) - For Qlik Sense Enterprise you need to create an Analytics Connection through QMC:

![QSE Analytics Connection](images/Install-02.png) - The Analytics Connection can point to a different machine and can be [secured with certificates](https://github.com/qlik-oss/server-side-extension/blob/master/generate_certs_guide/README.md):

![QSE Secure Analytics Connection](images/Install-03.png)