From 1cdadbd90c44ed8928c225d923d55ec398e09a93 Mon Sep 17 00:00:00 2001
From: Nabeel
Date: Mon, 19 Nov 2018 16:28:19 +1100
Subject: [PATCH] Code improvements

Fix for preprocessing under a rare case where parallel calls are made by Qlik
---
 README.md                 |   1 +
 core/_machine_learning.py | 274 +++++++++++++++++++++-----------------
 core/_sklearn.py          |   6 +-
 core/functions.json       |   4 +-
 docs/README.md            |   1 +
 5 files changed, 161 insertions(+), 125 deletions(-)

diff --git a/README.md b/README.md
index cc797d0..e13b8d5 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support
 3. Download this git repository or get the [latest release](https://github.com/nabeel-qlik/qlik-py-tools/releases) and extract it to a location of your choice. The machine where you are placing this repository should have access to a local or remote Qlik Sense instance.
 4. Right click `Qlik-Py-Init.bat` and chose 'Run as Administrator'. You can open this file in a text editor to review the commands that will be executed. If everything goes smoothly you will see a Python virtual environment being set up, project files being copied, some packages being installed and TCP Port `50055` being opened for inbound communication.
+   - Note that the script always ends with an "All done" message and does not check for errors.
    - If you need to change the port you can do so in the file `core\__main__.py` by opening the file with a text editor, changing the value of the `_DEFAULT_PORT` variable, and then saving the file. You will also need to update `Qlik-Py-Init.bat` to use the same port in the `netsh` command. This command will only work if you run the batch file through an elevated command prompt (i.e. with administrator privileges).
    - Once the execution completes, do a quick scan of the log to see everything installed correctly. The libraries imported are: `grpcio`, `grpcio-tools`, `numpy`, `scipy`, `pandas`, `cython`, `pystan`, `fbprophet`, `scikit-learn`, `hdbscan`, `skater` and their dependencies. Also, check that the `core` and `generated` directories have been copied successfully to the newly created `qlik-py-env` directory.
    - If the initialization fails for any reason, you can simply delete the `qlik-py-env` directory and re-run `Qlik-Py-Init.bat`.
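For readers following the port note above: the SSE listens on a single gRPC endpoint, and `_DEFAULT_PORT` is simply the value the server binds to. The sketch below is illustrative only; the `serve` function, the commented-out servicer registration and the names used are placeholders rather than the actual contents of `core\__main__.py`. It is included because the threaded gRPC server is also what makes parallel calls from Qlik possible, which is the situation the preprocessing fix in this patch guards against.

```python
# Illustrative sketch only: a threaded gRPC server bound to a configurable port.
# Names below are placeholders, not the repository's actual code.
from concurrent import futures
import grpc

_DEFAULT_PORT = '50055'  # analogous to the variable referenced in the README note above

def serve(port=_DEFAULT_PORT):
    # A thread pool means several Qlik requests can be handled concurrently, so any
    # state shared between request handlers must be safe to touch from multiple threads.
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    # add_ConnectorServicer_to_server(ExtensionService(), server)  # generated-stub registration would go here
    server.add_insecure_port('[::]:{}'.format(port))
    server.start()
    server.wait_for_termination()

if __name__ == '__main__':
    serve()
```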
diff --git a/core/_machine_learning.py b/core/_machine_learning.py index e61b4c7..32e8c87 100644 --- a/core/_machine_learning.py +++ b/core/_machine_learning.py @@ -187,106 +187,106 @@ def fit(self, X, y=None, features=None, retrain=False): if self.ohe: # Get a subset of the data that requires one hot encoding - self.ohe_df = X[self.ohe_meta.index.tolist()] + ohe_df = X[self.ohe_meta.index.tolist()] # Apply one hot encoding to relevant columns - self.ohe_df = pd.get_dummies(self.ohe_df, columns=self.ohe_df.columns) + ohe_df = pd.get_dummies(ohe_df, columns=ohe_df.columns) # Keep a copy of the OHE dataframe structure so we can align the transform dataset - self.ohe_df_structure = pd.DataFrame().reindex_like(self.ohe_df) + self.ohe_df_structure = pd.DataFrame().reindex_like(ohe_df) # Scaling needs to be fit exclusively on the training data so as not to influence the results if self.scale: # Get a subset of the data that requires scaling - self.scale_df = X[self.scale_meta.index.tolist()] + scale_df = X[self.scale_meta.index.tolist()] if self.hash: # Get a subset of the data that requires feature hashing - self.hash_df = X[self.hash_meta.index.tolist()] - hash_cols = self.hash_df.columns + hash_df = X[self.hash_meta.index.tolist()] + hash_cols = hash_df.columns # Hash unique values for each relevant column and then join to a dataframe for hashed data for c in hash_cols: - unique = self.hasher(self.hash_df, c, self.hash_meta["strategy_args"].loc[c]) - self.hash_df = self.hash_df.join(unique, on=c) - self.hash_df = self.hash_df.drop(c, axis=1) + unique = self.hasher(hash_df, c, self.hash_meta["strategy_args"].loc[c]) + hash_df = hash_df.join(unique, on=c) + hash_df = hash_df.drop(c, axis=1) # If hashed columns need to be scaled, these need to be considered when setting up the scaler as well if self.scale_hashed: if self.scale: - self.scale_df = self.scale_df.join(self.hash_df) + scale_df = scale_df.join(hash_df) else: - self.scale_df = self.hash_df + scale_df = hash_df if self.cv: # Get a subset of the data that requires count vectorizing - self.cv_df = X[self.cv_meta.index.tolist()] - cv_cols = self.cv_df.columns + cv_df = X[self.cv_meta.index.tolist()] + cv_cols = cv_df.columns # Get count vectors for each relevant column and then join to a dataframe for count vectorized data for c in cv_cols: - unique = self.text_vectorizer(self.cv_df, c, type="count", **self.cv_meta["strategy_args"].loc[c]) - self.cv_df = self.cv_df.join(unique, on=c) - self.cv_df = self.cv_df.drop(c, axis=1) + unique = self.text_vectorizer(cv_df, c, type="count", **self.cv_meta["strategy_args"].loc[c]) + cv_df = cv_df.join(unique, on=c) + cv_df = cv_df.drop(c, axis=1) # Keep a copy of the count vectorized dataframe structure so we can align the transform dataset - self.cv_df_structure = pd.DataFrame().reindex_like(self.cv_df) + self.cv_df_structure = pd.DataFrame().reindex_like(cv_df) # If text vector columns need to be scaled, these need to be considered when setting up the scaler as well if self.scale_vectors: if self.scale or (self.scale_hashed and self.hash): - self.scale_df = self.scale_df.join(self.cv_df) + scale_df = scale_df.join(cv_df) else: - self.scale_df = self.cv_df + scale_df = cv_df if self.tfidf: # Get a subset of the data that requires tfidf vectorizing - self.tfidf_df = X[self.tfidf_meta.index.tolist()] - tfidf_cols = self.tfidf_df.columns + tfidf_df = X[self.tfidf_meta.index.tolist()] + tfidf_cols = tfidf_df.columns # Get tfidf vectors for each relevant column and then join to a dataframe for tfidf 
vectorized data for c in tfidf_cols: - unique = self.text_vectorizer(self.tfidf_df, c, type="tfidf", **self.tfidf_meta["strategy_args"].loc[c]) - self.tfidf_df = self.tfidf_df.join(unique, on=c) - self.tfidf_df = self.tfidf_df.drop(c, axis=1) + unique = self.text_vectorizer(tfidf_df, c, type="tfidf", **self.tfidf_meta["strategy_args"].loc[c]) + tfidf_df = tfidf_df.join(unique, on=c) + tfidf_df = tfidf_df.drop(c, axis=1) # Keep a copy of the tfidf vectorized dataframe structure so we can align the transform dataset - self.tfidf_df_structure = pd.DataFrame().reindex_like(self.tfidf_df) + self.tfidf_df_structure = pd.DataFrame().reindex_like(tfidf_df) # If text vector columns need to be scaled, these need to be considered when setting up the scaler as well if self.scale_vectors: if self.scale or (self.scale_hashed and self.hash) or self.cv: - self.scale_df = self.scale_df.join(self.tfidf_df) + scale_df = scale_df.join(tfidf_df) else: - self.scale_df = self.tfidf_df + scale_df = tfidf_df if self.text: # Get a subset of the data that requires text similarity OHE - self.text_df = X[self.text_meta.index.tolist()] - text_cols = self.text_df.columns + text_df = X[self.text_meta.index.tolist()] + text_cols = text_df.columns # Get text similarity OHE for each relevant column and then join to a dataframe for text similarity OHE data for c in text_cols: - unique = self.text_similarity(self.text_df, c) - self.text_df = self.text_df.join(unique, on=c) - self.text_df = self.text_df.drop(c, axis=1) + unique = self.text_similarity(text_df, c) + text_df = text_df.join(unique, on=c) + text_df = text_df.drop(c, axis=1) # Keep a copy of the text similarity OHE dataframe structure so we can align the transform dataset - self.text_df_structure = pd.DataFrame().reindex_like(self.text_df) + self.text_df_structure = pd.DataFrame().reindex_like(text_df) try: - if len(self.scale_df) > 0: + if len(scale_df) > 0: # Get an instance of the sklearn scaler fit to X - self.scaler_instance = self.get_scaler(self.scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) + self.scaler_instance = self.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) # Keep a copy of the scaling dataframe structure so we can align the transform dataset - self.scale_df_structure = pd.DataFrame().reindex_like(self.scale_df) + self.scale_df_structure = pd.DataFrame().reindex_like(scale_df) except AttributeError: pass # Output information to the terminal and log file if required if self.log is not None: - self._print_log(2) + self._print_log(2, ohe_df=ohe_df, scale_df=scale_df, hash_df=hash_df, cv_df=cv_df, tfidf_df=tfidf_df, text_df=text_df) return self @@ -298,186 +298,186 @@ def transform(self, X, y=None): Returns X_transform as a numpy array or a pandas dataframe based on return_type set in constructor. """ - self.X_transform = None + X_transform = None if self.ohe: # Get a subset of the data that requires one hot encoding - self.ohe_df = X[self.ohe_meta.index.tolist()] + ohe_df = X[self.ohe_meta.index.tolist()] # Apply one hot encoding to relevant columns - self.ohe_df = pd.get_dummies(self.ohe_df, columns=self.ohe_df.columns) + ohe_df = pd.get_dummies(ohe_df, columns=ohe_df.columns) # Align the columns with the original dataset. # This is to prevent different number or order of features between training and test datasets. 
- self.ohe_df = self.ohe_df.align(self.ohe_df_structure, join='right', axis=1)[0] + ohe_df = ohe_df.align(self.ohe_df_structure, join='right', axis=1)[0] # Fill missing values in the OHE dataframe, that may appear after alignment, with zeros. - self.ohe_df = self.fillna(self.ohe_df, missing="zeros") + ohe_df = self.fillna(ohe_df, missing="zeros") # Add the encoded columns to the result dataset - self.X_transform = self.ohe_df + X_transform = ohe_df if self.hash: # Get a subset of the data that requires feature hashing - self.hash_df = X[self.hash_meta.index.tolist()] - hash_cols = self.hash_df.columns + hash_df = X[self.hash_meta.index.tolist()] + hash_cols = hash_df.columns # Hash unique values for each relevant column and then join to a dataframe for hashed data for c in hash_cols: - unique = self.hasher(self.hash_df, c, self.hash_meta["strategy_args"].loc[c]) - self.hash_df = self.hash_df.join(unique, on=c) - self.hash_df = self.hash_df.drop(c, axis=1) + unique = self.hasher(hash_df, c, self.hash_meta["strategy_args"].loc[c]) + hash_df = hash_df.join(unique, on=c) + hash_df = hash_df.drop(c, axis=1) # Fill any missing values in the hash dataframe - self.hash_df = self.fillna(self.hash_df, missing="zeros") + hash_df = self.fillna(hash_df, missing="zeros") if self.cv: # Get a subset of the data that requires count vectorizing - self.cv_df = X[self.cv_meta.index.tolist()] - cv_cols = self.cv_df.columns + cv_df = X[self.cv_meta.index.tolist()] + cv_cols = cv_df.columns # Get count vectors for each relevant column and then join to a dataframe for count vectorized data for c in cv_cols: - unique = self.text_vectorizer(self.cv_df, c, type="count", **self.cv_meta["strategy_args"].loc[c]) - self.cv_df = self.cv_df.join(unique, on=c) - self.cv_df = self.cv_df.drop(c, axis=1) + unique = self.text_vectorizer(cv_df, c, type="count", **self.cv_meta["strategy_args"].loc[c]) + cv_df = cv_df.join(unique, on=c) + cv_df = cv_df.drop(c, axis=1) # Align the columns with the original dataset. # This is to prevent different number or order of features between training and test datasets. - self.cv_df = self.cv_df.align(self.cv_df_structure, join='right', axis=1)[0] + cv_df = cv_df.align(self.cv_df_structure, join='right', axis=1)[0] # Fill missing values in the dataframe that may appear after alignment with zeros. - self.cv_df = self.fillna(self.cv_df, missing="zeros") + cv_df = self.fillna(cv_df, missing="zeros") if self.tfidf: # Get a subset of the data that requires tfidf vectorizing - self.tfidf_df = X[self.tfidf_meta.index.tolist()] - tfidf_cols = self.tfidf_df.columns + tfidf_df = X[self.tfidf_meta.index.tolist()] + tfidf_cols = tfidf_df.columns # Get tfidf vectors for each relevant column and then join to a dataframe for tfidf vectorized data for c in tfidf_cols: - unique = self.text_vectorizer(self.tfidf_df, c, type="tfidf", **self.tfidf_meta["strategy_args"].loc[c]) - self.tfidf_df = self.tfidf_df.join(unique, on=c) - self.tfidf_df = self.tfidf_df.drop(c, axis=1) + unique = self.text_vectorizer(tfidf_df, c, type="tfidf", **self.tfidf_meta["strategy_args"].loc[c]) + tfidf_df = tfidf_df.join(unique, on=c) + tfidf_df = tfidf_df.drop(c, axis=1) # Align the columns with the original dataset. # This is to prevent different number or order of features between training and test datasets. 
- self.tfidf_df = self.tfidf_df.align(self.tfidf_df_structure, join='right', axis=1)[0] + tfidf_df = tfidf_df.align(self.tfidf_df_structure, join='right', axis=1)[0] # Fill missing values in the dataframe that may appear after alignment with zeros. - self.tfidf_df = self.fillna(self.tfidf_df, missing="zeros") + tfidf_df = self.fillna(tfidf_df, missing="zeros") if self.text: # Get a subset of the data that requires text similarity OHE - self.text_df = X[self.text_meta.index.tolist()] - text_cols = self.text_df.columns + text_df = X[self.text_meta.index.tolist()] + text_cols = text_df.columns # Get text similarity OHE for each relevant column and then join to a dataframe for text similarity OHE data for c in text_cols: - unique = self.text_similarity(self.text_df, c) - self.text_df = self.text_df.join(unique, on=c) - self.text_df = self.text_df.drop(c, axis=1) + unique = self.text_similarity(text_df, c) + text_df = text_df.join(unique, on=c) + text_df = text_df.drop(c, axis=1) # Align the columns with the original dataset. # This is to prevent different number or order of features between training and test datasets. - self.text_df = self.text_df.align(self.text_df_structure, join='right', axis=1)[0] + text_df = text_df.align(self.text_df_structure, join='right', axis=1)[0] # Fill missing values in the dataframe that may appear after alignment with zeros. - self.text_df = self.fillna(self.text_df, missing="zeros") + text_df = self.fillna(text_df, missing="zeros") # Add the text similary OHE data to the result dataset - if self.X_transform is None: - self.X_transform = self.text_df + if X_transform is None: + X_transform = text_df else: - self.X_transform = pd.concat([self.X_transform, self.text_df], join='outer', axis=1, sort=False) + X_transform = pd.concat([X_transform, text_df], join='outer', axis=1, sort=False) if self.scale: # Get a subset of the data that requires scaling - self.scale_df = X[self.scale_meta.index.tolist()] + scale_df = X[self.scale_meta.index.tolist()] # If scale_hashed = True join the hashed columns to the scaling dataframe if self.hash and self.scale_hashed: if self.scale: - self.scale_df = pd.concat([self.scale_df, self.hash_df], join='outer', axis=1, sort=False) + scale_df = pd.concat([scale_df, hash_df], join='outer', axis=1, sort=False) else: - self.scale_df = self.hash_df + scale_df = hash_df # If only hashed columns are being scaled, the scaler needs to be instantiated - self.scaler_instance = self.get_scaler(self.scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) + self.scaler_instance = self.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) elif self.hash: # Add the hashed columns to the result dataset - if self.X_transform is None: - self.X_transform = self.hash_df + if X_transform is None: + X_transform = hash_df else: - self.X_transform = pd.concat([self.X_transform, self.hash_df], join='outer', axis=1, sort=False) + X_transform = pd.concat([X_transform, hash_df], join='outer', axis=1, sort=False) # If scale_vectors = True join the count vectorized columns to the scaling dataframe if self.cv and self.scale_vectors: if self.scale or (self.hash and self.scale_hashed): - self.scale_df = pd.concat([self.scale_df, self.cv_df], join='outer', axis=1, sort=False) + scale_df = pd.concat([scale_df, cv_df], join='outer', axis=1, sort=False) else: - self.scale_df = self.cv_df + scale_df = cv_df # If only count vectorized columns are being scaled, the scaler needs to be instantiated - self.scaler_instance = 
self.get_scaler(self.scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) + self.scaler_instance = self.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) elif self.cv: # Add the count vectorized columns to the result dataset - if self.X_transform is None: - self.X_transform = self.cv_df + if X_transform is None: + X_transform = cv_df else: - self.X_transform = pd.concat([self.X_transform, self.cv_df], join='outer', axis=1, sort=False) + X_transform = pd.concat([X_transform, cv_df], join='outer', axis=1, sort=False) # If scale_vectors = True join the tfidf vectorized columns to the scaling dataframe if self.tfidf and self.scale_vectors: if self.scale or (self.hash and self.scale_hashed) or self.cv: - self.scale_df = pd.concat([self.scale_df, self.tfidf_df], join='outer', axis=1, sort=False) + scale_df = pd.concat([scale_df, tfidf_df], join='outer', axis=1, sort=False) else: - self.scale_df = self.tfidf_df + scale_df = tfidf_df # If only tfidf vectorized columns are being scaled, the scaler needs to be instantiated - self.scaler_instance = self.get_scaler(self.scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) + self.scaler_instance = self.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs) elif self.tfidf: # Add the count vectorized columns to the result dataset - if self.X_transform is None: - self.X_transform = self.tfidf_df + if X_transform is None: + X_transform = tfidf_df else: - self.X_transform = pd.concat([self.X_transform, self.tfidf_df], join='outer', axis=1, sort=False) + X_transform = pd.concat([X_transform, tfidf_df], join='outer', axis=1, sort=False) try: # Perform scaling on the relevant data - if len(self.scale_df) > 0: + if len(scale_df) > 0: # Align the columns with the original dataset. # This is to prevent different number or order of features between training and test datasets. 
- self.scale_df = self.scale_df.align(self.scale_df_structure, join='right', axis=1)[0] + scale_df = scale_df.align(self.scale_df_structure, join='right', axis=1)[0] - self.scale_df = self.fillna(self.scale_df, missing=self.missing) + scale_df = self.fillna(scale_df, missing=self.missing) - self.scale_df = pd.DataFrame(self.scaler_instance.transform(self.scale_df), index=self.scale_df.index, columns=self.scale_df.columns) + scale_df = pd.DataFrame(self.scaler_instance.transform(scale_df), index=scale_df.index, columns=scale_df.columns) # Add the scaled columns to the result dataset - if self.X_transform is None: - self.X_transform = self.scale_df + if X_transform is None: + X_transform = scale_df else: - self.X_transform = pd.concat([self.X_transform, self.scale_df], join='outer', axis=1, sort=False) + X_transform = pd.concat([X_transform, scale_df], join='outer', axis=1, sort=False) except AttributeError: pass if self.no_prep: # Get a subset of the data that doesn't require preprocessing - self.no_prep_df = X[self.none_meta.index.tolist()] + no_prep_df = X[self.none_meta.index.tolist()] # Fill any missing values in the no prep dataframe - self.no_prep_df = self.fillna(self.no_prep_df, missing="zeros") + no_prep_df = self.fillna(no_prep_df, missing="zeros") # Finally join the columns that do not require preprocessing to the result dataset - if self.X_transform is None: - self.X_transform = self.no_prep_df + if X_transform is None: + X_transform = no_prep_df else: - self.X_transform = pd.concat([self.X_transform, self.no_prep_df], join='outer', axis=1, sort=False) + X_transform = pd.concat([X_transform, no_prep_df], join='outer', axis=1, sort=False) # Output information to the terminal and log file if required if self.log is not None: - self._print_log(3) + self._print_log(3, ohe_df=ohe_df, scale_df=scale_df, hash_df=hash_df, cv_df=cv_df, tfidf_df=tfidf_df, text_df=text_df, X_transform=X_transform) if self.return_type == 'np': - return self.X_transform.values + return X_transform.values - return self.X_transform + return X_transform def fit_transform(self, X, y=None, features=None, retrain=False): @@ -491,10 +491,11 @@ def fit_transform(self, X, y=None, features=None, retrain=False): return self.fit(X, y, features, retrain).transform(X, y) - def _print_log(self, step): + def _print_log(self, step, **kwargs): """ Output useful information to stdout and the log file if debugging is required. 
step: Print the corresponding step in the log + kwargs: dictionary of dataframes to be used in the log """ if step == 1: @@ -530,44 +531,77 @@ def _print_log(self): elif step == 2: if self.ohe: - sys.stdout.write("ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(self.ohe_df.shape, self.ohe_df.head())) + sys.stdout.write("Fit ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head())) with open(self.log,'a', encoding='utf-8') as f: - f.write("ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(self.ohe_df.shape, self.ohe_df.head())) + f.write("Fit ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head())) if self.hash: - sys.stdout.write("hash_df shape:{0}\nSample Data:\n{1}\n\n".format(self.hash_df.shape, self.hash_df.head())) + sys.stdout.write("Fit hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head())) with open(self.log,'a', encoding='utf-8') as f: - f.write("hash_df shape:{0}\nSample Data:\n{1}\n\n".format(self.hash_df.shape, self.hash_df.head())) + f.write("Fit hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head())) if self.cv: - sys.stdout.write("cv_df shape:{0}\nSample Data:\n{1}\n\n".format(self.cv_df.shape, self.cv_df.head())) + sys.stdout.write("Fit cv_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head())) with open(self.log,'a', encoding='utf-8') as f: - f.write("cv_df shape:{0}\nSample Data:\n{1}\n\n".format(self.cv_df.shape, self.cv_df.head())) + f.write("Fit cv_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head())) if self.tfidf: - sys.stdout.write("tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(self.tfidf_df.shape, self.tfidf_df.head())) + sys.stdout.write("Fit tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head())) with open(self.log,'a', encoding='utf-8') as f: - f.write("tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(self.tfidf_df.shape, self.tfidf_df.head())) + f.write("Fit tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head())) try: if len(self.scale_df) > 0: - sys.stdout.write("scale_df shape:{0}\nSample Data:\n{1}\n\n".format(self.scale_df.shape, self.scale_df.head())) + sys.stdout.write("Fit scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head())) with open(self.log,'a', encoding='utf-8') as f: - f.write("scale_df shape:{0}\nSample Data:\n{1}\n\n".format(self.scale_df.shape, self.scale_df.head())) + f.write("Fit scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head())) except AttributeError: pass elif step == 3: + if self.ohe: + sys.stdout.write("Transform ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head())) + + with open(self.log,'a', encoding='utf-8') as f: + f.write("Transform ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head())) + + if self.hash: + sys.stdout.write("Transform hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head())) + + with open(self.log,'a', encoding='utf-8') as f: + f.write("Transform hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head())) + + if self.cv: + sys.stdout.write("Transform cv_df shape:{0}\nSample
Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head())) + + with open(self.log,'a', encoding='utf-8') as f: + f.write("Transform cv_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head())) + + if self.tfidf: + sys.stdout.write("Transform tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head())) + + with open(self.log,'a', encoding='utf-8') as f: + f.write("Transform tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head())) + + try: + if len(self.scale_df) > 0: + sys.stdout.write("Transform scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head())) + + with open(self.log,'a', encoding='utf-8') as f: + f.write("Transform scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head())) + except AttributeError: + pass + try: - sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head())) + sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['X_transform'].shape, kwargs['X_transform'].head())) with open(self.log,'a', encoding='utf-8') as f: - f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(self.X_transform.shape, self.X_transform.head())) + f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['X_transform'].shape, kwargs['X_transform'].head())) except AttributeError: pass diff --git a/core/_sklearn.py b/core/_sklearn.py index 0082695..e719605 100644 --- a/core/_sklearn.py +++ b/core/_sklearn.py @@ -855,7 +855,7 @@ def predict(self, load_script=False, variant="predict"): else: # Predict y for X using the previously fit pipeline self.y = self.model.pipe.predict(self.X) - + # Prepare the response self.response = pd.DataFrame(self.y, columns=["result"], index=self.X.index) @@ -1556,14 +1556,14 @@ def _send_table_description(self, variant): table_header = (('qlik-tabledescription-bin', self.table.SerializeToString()),) self.context.send_initial_metadata(table_header) - def _get_model(self): + def _get_model(self, use_cache=True): """ Get the model from the class model cache or disk. Update the cache if loading from disk. Return the model. """ - if self.model.name in self.__class__.model_cache: + if use_cache and self.model.name in self.__class__.model_cache: # Load the model from cache self.model = self.__class__.model_cache[self.model.name] diff --git a/core/functions.json b/core/functions.json index 034dff9..c5a716a 100644 --- a/core/functions.json +++ b/core/functions.json @@ -121,10 +121,10 @@ "Params": { "a_model_name": 0, "b_feature_name": 0, - "c_var_type": 0, + "c_var_type": 0, "d_data_type": 0, "e_strategy": 0, - "f_hash_length": 0 + "f_strategy_args": 0 } }, { diff --git a/docs/README.md b/docs/README.md index daacd91..6049af0 100644 --- a/docs/README.md +++ b/docs/README.md @@ -97,6 +97,7 @@ _Note that this SSE and Docker do not handle file locking, and so do not support 3. Download this git repository or get the [latest release](https://github.com/nabeel-qlik/qlik-py-tools/releases) and extract it to a location of your choice. The machine where you are placing this repository should have access to a local or remote Qlik Sense instance. 4. Right click `Qlik-Py-Init.bat` and chose 'Run as Administrator'. You can open this file in a text editor to review the commands that will be executed.
If everything goes smoothly you will see a Python virtual environment being set up, project files being copied, some packages being installed and TCP Port `50055` being opened for inbound communication. + - Note that the script always ends with an "All done" message and does not check for errors. - If you need to change the port you can do so in the file `core\__main__.py` by opening the file with a text editor, changing the value of the `_DEFAULT_PORT` variable, and then saving the file. You will also need to update `Qlik-Py-Init.bat` to use the same port in the `netsh` command. This command will only work if you run the batch file through an elevated command prompt (i.e. with administrator privileges). - Once the execution completes, do a quick scan of the log to see everything installed correctly. The libraries imported are: `grpcio`, `grpcio-tools`, `numpy`, `scipy`, `pandas`, `cython`, `pystan`, `fbprophet`, `scikit-learn`, `hdbscan`, `skater` and their dependencies. Also, check that the `core` and `generated` directories have been copied successfully to the newly created `qlik-py-env` directory. - If the initialization fails for any reason, you can simply delete the `qlik-py-env` directory and re-run `Qlik-Py-Init.bat`.
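The substance of the `_machine_learning.py` hunks above is a thread-safety change: intermediate frames such as `ohe_df`, `hash_df`, `cv_df`, `tfidf_df` and `scale_df` become local variables of `fit()` and `transform()` instead of instance attributes, and are passed to `_print_log` explicitly as keyword arguments, so two requests served in parallel by the SSE can no longer overwrite each other's intermediate state. The standalone sketch below illustrates that pattern under simplified assumptions (a hypothetical `SafePrep` class, one-hot encoding only); it is not code from this repository.

```python
# Minimal sketch of the pattern applied in this patch: per-call frames are locals,
# and only the fitted column structure is kept on the instance.
import pandas as pd

class SafePrep:
    def __init__(self, log=None):
        self.log = log
        self.ohe_df_structure = None  # written once in fit(), only read afterwards

    def fit(self, X):
        ohe_df = pd.get_dummies(X)  # local variable, not self.ohe_df
        # Remember the training-time column layout so transform() can align to it
        self.ohe_df_structure = pd.DataFrame().reindex_like(ohe_df)
        if self.log is not None:
            self._print_log(2, ohe_df=ohe_df)  # hand the frame to the logger explicitly
        return self

    def transform(self, X):
        ohe_df = pd.get_dummies(X)
        # Align to the training columns, then zero-fill anything missing after alignment
        ohe_df = ohe_df.align(self.ohe_df_structure, join='right', axis=1)[0].fillna(0)
        if self.log is not None:
            self._print_log(3, ohe_df=ohe_df)
        return ohe_df

    def _print_log(self, step, **kwargs):
        # Log only the frames that were actually passed in for this call, so a stage
        # that did not run cannot cause a NameError or KeyError here.
        for name, df in kwargs.items():
            print("step {0}: {1} shape {2}".format(step, name, df.shape))
```

With this shape, concurrent `transform()` calls each work on their own `ohe_df`; the only shared attribute, `ohe_df_structure`, is written once during `fit()` and read afterwards, which matches the direction of the changes in the patch.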