diff --git a/README.md b/README.md
index 26baffd..cc797d0 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support
5. Now whenever you want to start this Python service you can run `Qlik-Py-Start.bat`.
-6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/February2018/Subsystems/ManagementConsole/Content/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/February2018/Subsystems/Hub/Content/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
+6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/September2018/Subsystems/ManagementConsole/Content/Sense_QMC/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/September2018/Subsystems/Hub/Content/Sense_Hub/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
- For Qlik Sense Desktop you need to update the `settings.ini` file:
![QSD Analytics Connection](docs/images/Install-04.png)
- For Qlik Sense Enterprise you need to create an Analytics Connection through QMC:
![QSE Analytics Connection](docs/images/Install-02.png)
- The Analytics Connection can point to a different machine and can be [secured with certificates](https://github.com/qlik-oss/server-side-extension/blob/master/generate_certs_guide/README.md):
![QSE Secure Analytics Connection](docs/images/Install-03.png)
diff --git a/core/_machine_learning.py b/core/_machine_learning.py
index 9a44175..e61b4c7 100644
--- a/core/_machine_learning.py
+++ b/core/_machine_learning.py
@@ -327,6 +327,8 @@ def transform(self, X, y=None):
unique = self.hasher(self.hash_df, c, self.hash_meta["strategy_args"].loc[c])
self.hash_df = self.hash_df.join(unique, on=c)
self.hash_df = self.hash_df.drop(c, axis=1)
+ # Fill any missing values in the hash dataframe
+ self.hash_df = self.fillna(self.hash_df, missing="zeros")
if self.cv:
# Get a subset of the data that requires count vectorizing
@@ -386,7 +388,7 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.text_df
else:
- self.X_transform = self.X_transform.join(self.text_df)
+ self.X_transform = pd.concat([self.X_transform, self.text_df], join='outer', axis=1, sort=False)
if self.scale:
# Get a subset of the data that requires scaling
@@ -395,7 +397,7 @@ def transform(self, X, y=None):
# If scale_hashed = True join the hashed columns to the scaling dataframe
if self.hash and self.scale_hashed:
if self.scale:
- self.scale_df = self.scale_df.join(self.hash_df)
+ self.scale_df = pd.concat([self.scale_df, self.hash_df], join='outer', axis=1, sort=False)
else:
self.scale_df = self.hash_df
# If only hashed columns are being scaled, the scaler needs to be instantiated
@@ -405,12 +407,12 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.hash_df
else:
- self.X_transform = self.X_transform.join(self.hash_df)
+ self.X_transform = pd.concat([self.X_transform, self.hash_df], join='outer', axis=1, sort=False)
# If scale_vectors = True join the count vectorized columns to the scaling dataframe
if self.cv and self.scale_vectors:
if self.scale or (self.hash and self.scale_hashed):
- self.scale_df = self.scale_df.join(self.cv_df)
+ self.scale_df = pd.concat([self.scale_df, self.cv_df], join='outer', axis=1, sort=False)
else:
self.scale_df = self.cv_df
# If only count vectorized columns are being scaled, the scaler needs to be instantiated
@@ -420,12 +422,12 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.cv_df
else:
- self.X_transform = self.X_transform.join(self.cv_df)
+ self.X_transform = pd.concat([self.X_transform, self.cv_df], join='outer', axis=1, sort=False)
# If scale_vectors = True join the tfidf vectorized columns to the scaling dataframe
if self.tfidf and self.scale_vectors:
if self.scale or (self.hash and self.scale_hashed) or self.cv:
- self.scale_df = self.scale_df.join(self.tfidf_df)
+ self.scale_df = pd.concat([self.scale_df, self.tfidf_df], join='outer', axis=1, sort=False)
else:
self.scale_df = self.tfidf_df
# If only tfidf vectorized columns are being scaled, the scaler needs to be instantiated
@@ -435,7 +437,7 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.tfidf_df
else:
- self.X_transform = self.X_transform.join(self.tfidf_df)
+ self.X_transform = pd.concat([self.X_transform, self.tfidf_df], join='outer', axis=1, sort=False)
try:
# Perform scaling on the relevant data
@@ -452,19 +454,21 @@ def transform(self, X, y=None):
if self.X_transform is None:
self.X_transform = self.scale_df
else:
- self.X_transform = self.X_transform.join(self.scale_df)
+ self.X_transform = pd.concat([self.X_transform, self.scale_df], join='outer', axis=1, sort=False)
except AttributeError:
pass
if self.no_prep:
# Get a subset of the data that doesn't require preprocessing
self.no_prep_df = X[self.none_meta.index.tolist()]
+ # Fill any missing values in the no prep dataframe
+ self.no_prep_df = self.fillna(self.no_prep_df, missing="zeros")
# Finally join the columns that do not require preprocessing to the result dataset
if self.X_transform is None:
self.X_transform = self.no_prep_df
else:
- self.X_transform = self.X_transform.join(self.no_prep_df)
+ self.X_transform = pd.concat([self.X_transform, self.no_prep_df], join='outer', axis=1, sort=False)
# Output information to the terminal and log file if required
if self.log is not None:
@@ -559,10 +563,13 @@ def _print_log(self, step):
pass
elif step == 3:
- sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))
-
- with open(self.log,'a', encoding='utf-8') as f:
- f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))
+ try:
+ sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))
+
+ with open(self.log,'a', encoding='utf-8') as f:
+ f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head()))
+ except AttributeError:
+ pass
@staticmethod
def hasher(df, col, n_features):
diff --git a/core/_sklearn.py b/core/_sklearn.py
index 7f657dc..0082695 100644
--- a/core/_sklearn.py
+++ b/core/_sklearn.py
@@ -493,11 +493,13 @@ def fit(self):
# Select the dataset for calculating importances
if self.model.validation == "hold-out":
X = self.X_test
+ y = self.y_test
else:
X = train_test_df
+ y = target_df.values.ravel()
# Calculate model agnostic feature importances
- self._calc_importances(data = X)
+ self._calc_importances(X = X, y = y)
# Persist the model to disk
self.model = self.model.save(self.model.name, self.path, self.model.compress)
@@ -730,7 +732,7 @@ def calculate_metrics(self, caller="external"):
if self.model.calc_feature_importances:
# Calculate model agnostic feature importances
- self._calc_importances(data = self.X_test)
+ self._calc_importances(X = self.X_test, y = self.y_test)
self.response = metrics_df
@@ -816,10 +818,14 @@ def predict(self, load_script=False, variant="predict"):
# Set the key column as the index
self.request_df.set_index("key", drop=False, inplace=True)
- # Split the features provided as a string into individual columns
- self.X = pd.DataFrame([x[feature_col_num].split("|") for x in self.request_df.values.tolist()],\
- columns=self.model.features_df.loc[:,"name"].tolist(),\
- index=self.request_df.index)
+ try:
+ # Split the features provided as a string into individual columns
+ self.X = pd.DataFrame([x[feature_col_num].split("|") for x in self.request_df.values.tolist()],\
+ columns=self.model.features_df.loc[:,"name"].tolist(),\
+ index=self.request_df.index)
+ except AssertionError as ae:
+ err = "The number of input columns do not match feature definitions. Ensure you are using the | delimiter and that the target is not included in your input to the prediction function."
+ raise AssertionError(err) from ae
# Convert the data types based on feature definitions
self.X = utils.convert_types(self.X, self.model.features_df)
@@ -1405,7 +1411,7 @@ def _prep_confusion_matrix(self, y_test, y_pred, labels):
self.model.confusion_matrix.loc[:,"model_name"] = self.model.name
self.model.confusion_matrix = self.model.confusion_matrix.loc[:,["model_name", "true_label", "pred_label", "count"]]
- def _calc_importances(self, data=None):
+ def _calc_importances(self, X=None, y=None):
"""
Calculate feature importances.
Importances are calculated using the Skater library to provide this capability for all sklearn algorithms.
@@ -1413,21 +1419,27 @@ def _calc_importances(self, data=None):
"""
# Fill null values in the test set according to the model settings
- X_test = utils.fillna(data, method=self.model.missing)
+ X_test = utils.fillna(X, method=self.model.missing)
# Calculate model agnostic feature importances using the skater library
- interpreter = Interpretation(X_test, feature_names=self.model.features_df.index.tolist())
+ interpreter = Interpretation(X_test, training_labels=y, feature_names=self.model.features_df.index.tolist())
if self.model.estimator_type == "classifier":
try:
# We use the predicted probabilities from the estimator if available
predictor = self.model.pipe.predict_proba
+
+ # Set up keyword arguments accordingly
+ imm_kwargs = {"probability": True}
except AttributeError:
# Otherwise we simply use the predict method
predictor = self.model.pipe.predict
+
+ # Set up keyword arguments accordingly
+ imm_kwargs = {"probability": False, "unique_values": self.model.pipe.classes_}
# Set up a skater InMemoryModel to calculate feature importances
- imm = InMemoryModel(predictor, examples = X_test[:10], model_type="classifier", unique_values=self.model.pipe.classes_)
+ imm = InMemoryModel(predictor, examples = X_test[:10], model_type="classifier", **imm_kwargs)
elif self.model.estimator_type == "regressor":
# Set up a skater InMemoryModel to calculate feature importances using the predict method
diff --git a/docker/Dockerfile b/docker/Dockerfile v.3.5
similarity index 100%
rename from docker/Dockerfile
rename to docker/Dockerfile v.3.5
diff --git a/docker/Dockerfile v.3.6 b/docker/Dockerfile v.3.6
new file mode 100644
index 0000000..93e877e
--- /dev/null
+++ b/docker/Dockerfile v.3.6
@@ -0,0 +1,17 @@
+# Use the previous version of qlik-py-tools as a parent image
+FROM nabeeloz/qlik-py-tools:3.5
+
+# Set the working directory to /qlik-py-tools/core
+WORKDIR /qlik-py-tools/core
+
+# Copy all files from the core subdirectory into the container
+COPY ./core/* /qlik-py-tools/core/
+
+# Copy the modified feature_importance.py into the installed skater package
+COPY ./feature_importance.py /usr/local/lib/python3.6/site-packages/skater-1.1.2-py3.6.egg/skater/core/global_interpretation/
+
+# Make port 80 available to the world outside this container
+EXPOSE 80
+
+# Run __main__.py when the container launches
+CMD ["python", "__main__.py"]
\ No newline at end of file
diff --git a/docker/feature_importance.py b/docker/feature_importance.py
index 7d27c0e..f143a81 100644
--- a/docker/feature_importance.py
+++ b/docker/feature_importance.py
@@ -139,8 +139,8 @@ def feature_importance(self, model_instance, ascending=True, filter_classes=None
scaled=use_scaling,
scorer=scorer)
- self.interpreter.logger.warn("Multiprocessing has known issues with GRPC, using single process")
- self.interpreter.logger.warn("More information here: https://github.com/grpc/grpc/blob/master/doc/fork_support.md")
+ # Multiprocessing causes issues with gRPC when running in Docker, so a single process is used
+ # More information here: https://github.com/grpc/grpc/blob/master/doc/fork_support.md
importances = {}
importance_dicts = []
diff --git a/docs/README.md b/docs/README.md
index 5996b3d..daacd91 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -103,7 +103,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support
5. Now whenever you want to start this Python service you can run `Qlik-Py-Start.bat`.
-6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/February2018/Subsystems/ManagementConsole/Content/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/February2018/Subsystems/Hub/Content/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
+6. Now you need to [set up an Analytics Connection in Qlik Sense Enterprise](https://help.qlik.com/en-US/sense/September2018/Subsystems/ManagementConsole/Content/Sense_QMC/create-analytic-connection.htm) or [update the Settings.ini file in Qlik Sense Desktop](https://help.qlik.com/en-US/sense/September2018/Subsystems/Hub/Content/Sense_Hub/Introduction/configure-analytic-connection-desktop.htm). If you are using the sample apps make sure you use `PyTools` as the name for the analytics connection, or alternatively, update all of the expressions to use the new name.
- For Qlik Sense Desktop you need to update the `settings.ini` file:
![QSD Analytics Connection](images/Install-04.png)
- For Qlik Sense Enterprise you need to create an Analytics Connection through QMC:
![QSE Analytics Connection](images/Install-02.png)
- The Analytics Connection can point to a different machine and can be [secured with certificates](https://github.com/qlik-oss/server-side-extension/blob/master/generate_certs_guide/README.md):
![QSE Secure Analytics Connection](images/Install-03.png)