update arrow dict by adding alpha, and coloring arrows based on stren…

…gth.
erdogant · Apr 28, 2023 · ae9be30 · ae9be30
1 parent fb8c58a
commit ae9be30
Show file tree

Hide file tree

Showing 2 changed files with 148 additions and 46 deletions.
diff --git a/pca/examples.py b/pca/examples.py
@@ -2,6 +2,7 @@
 import pandas as pd
 import numpy as np
 import matplotlib as mpl
+from scatterd import scatterd
 
 import numpy as np
 from sklearn.datasets import load_iris
@@ -15,13 +16,42 @@
 model = pca()
 model.fit_transform(X)
 
+# Loading are automatically set based on weak/strong
+model.biplot(s=0)
+# Change strong and weak colors
+model.biplot(s=0, arrowdict={'color_strong': 'r', 'color_weak': 'g'})
+# Set alpha to constant value
+model.biplot(s=0, arrowdict={'alpha': 0.8})
+# Change arrow text color
+model.biplot(s=0, arrowdict={'color_text': 'k'})
+# Change arrow color, which automatically changes the label color too
+model.biplot(s=0, color_arrow='k')
+# Almost Remove arrows but keep the text
+model.biplot(s=0, color_arrow='k', arrowdict={'alpha': 0.9})
+# Set color text
+model.biplot(s=0, arrowdict={'color_text': 'k'})
+# Set color arrow and color text
+model.biplot(s=0, color_arrow='k', arrowdict={'color_text': 'g'})
+# Set color arrow and color text and alpha
+model.biplot(s=0, color_arrow='k', arrowdict={'color_text': 'g', 'alpha': 0.8})
+
+# Change the scale factor of the arrow
+model.biplot(arrowdict={'scale_factor': 2})
+model.biplot3d(arrowdict={'scale_factor': 3})
+
+model.biplot(s=0, arrowdict={'weight':'bold', 'fontsize': 24, 'color_text': 'r'}, color_arrow='k')
+model.biplot3d(density=True, fontsize=0, arrowdict={'weight':'bold', 'fontsize': 14})
+model.biplot3d(density=True, fontsize=0, s=0, arrowdict={'fontsize': 24})
+
+model.biplot(density=True, fontsize=0, arrowdict={'weight':'bold', 'fontsize': 14})
+
 # model.scatter(density=True)
-# model.biplot(c=[0,0,0], density=True)
 
 # model.scatter3d(fontsize=16, PC=[0,1,3])
 # model.scatter3d(density=True)
 
 model.biplot3d(arrowdict={'weight':'bold'},  c=None, n_feat=5)
+# model.biplot3d(arrowdict={'weight':'bold'},   n_feat=5)
 
 
 # %%
@@ -40,9 +70,9 @@
 model = pca(normalize=True, n_components=None)
 # Fit transform with dataframe
 results = model.fit_transform(df)
-
-model.biplot(labels=df['flavanoids'].values, legend=False, cmap='seismic', n_feat=3, fontsize=20, arrowdict={'fontsize':28, 'c':'g'}, density=True)
-model.biplot3d(legend=False, n_feat=3, fontcolor='r', arrowdict={'fontsize':22, 'c':'k'}, density=True)
+# Plot
+model.biplot(fontsize=0, labels=df['flavanoids'].values, legend=False, cmap='seismic', n_feat=3, arrowdict={'fontsize':28, 'c':'g'}, density=True)
+# model.biplot3d(legend=None, n_feat=3, fontcolor='r', arrowdict={'fontsize':22, 'c':'k'}, density=True, figsize=(35, 30))
 
 
 # %% Demonstration of specifying colors, markers, alpha, and size per sample
@@ -136,23 +166,18 @@
               visible=True,
               )
 
-
-
-
-
-
 # %%
 # Load pca
 from pca import pca
 
 # Initialize pca
-model = pca()
+model = pca(n_components=3)
 
 # Load example data set
 df = model.import_example(data='iris')
 
 # Fit transform
-results = model.fit_transform(X)
+results = model.fit_transform(df)
 
 # Make plot
 model.biplot(HT2=True,
@@ -392,11 +417,11 @@
 # Color on classlabel (Unchanged)
 model.biplot()
 # Use cmap colors for classlabels (unchanged)
-model.biplot(labels=y, cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']))
+model.biplot(labels=y, cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']), density=True)
 # Do not show points when cmap=None (unchanged)
-model.biplot(labels=load_iris().target, cmap=None)
+model.biplot(labels=load_iris().target, cmap=None, density=True)
 # Plot all points as unique entity (unchanged)
-model.biplot(labels=y, gradient='#ffffff', cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']))
+model.biplot(labels=y, gradient='#ffffff', cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']), density=True)
 
 
 # %%
@@ -459,6 +484,7 @@
 
 # Plot image
 model.biplot(SPE=True, HT2=True, density=True)
+model.biplot3d(SPE=True, HT2=True, density=True, arrowdict={'scale_factor': 1})
 
 # %% Detect unseen outliers
 # Import libraries
@@ -474,7 +500,7 @@
 # model = pca(alpha=0.05, detect_outliers=None)
 
 # Fit transform
-model.fit_transform(X)
+model.fit_transform(X, row_labels=np.zeros(X.shape[0]))
 
 model.scatter(SPE=True, HT2=True)
 
@@ -486,9 +512,9 @@
     PCnew = model.transform(X_unseen, row_labels=np.repeat('mapped_' + str(i), X_unseen.shape[0]), update_outlier_params=True)
 
     # Scatterplot
-    model.scatter(SPE=True, HT2=True)
+    # model.scatter(SPE=True, HT2=True)
     # Biplot
-    # Model.biplot(SPE=True, HT2=True)
+    model.biplot(SPE=True, HT2=True, density=True)
 
 
 # %%

diff --git a/pca/pca.py b/pca/pca.py
@@ -17,6 +17,7 @@
 import os
 from adjustText import adjust_text
 import statsmodels.stats.multitest as multitest
+from typing import List, Union, Tuple
 
 
 # %% Association learning across all variables
@@ -697,8 +698,8 @@ def biplot(self,
                fontcolor=[0,0,0],
                fontsize=18,
                fontweight='normal',
-               color_arrow='#000000',
-               arrowdict={'fontsize': None, 'c': None, 'weight': None, 'ha': 'center', 'va': 'center'},
+               color_arrow=None,
+               arrowdict={'fontsize': None, 'color_text': None, 'weight': None, 'alpha': None, 'color_strong': '#880808', 'color_weak': '#002a77', 'scale_factor': None},
                cmap='tab20c',
                title=None,
                legend=None,
@@ -766,19 +767,30 @@ def biplot(self,
         density_on_top : bool, (default: False)
             True : The density is the highest layer.
             False : The density is the lowest layer.
-        color_arrow : String, (default: '#000000')
-            color for the arrow.
-            * '#000000'
-            * 'r'
-        arrowdict : dict.
-            dictionary containing properties for the arrow font-text.
-            Note that the [c]olor: None inherits the color used in color_arrow.
-            {'fontsize': None, 'c': None, 'weight': None, 'ha': 'center', 'va': 'center'}
-        fontsize : String, optional
-            The fontsize of the y labels that are plotted in the graph. The default is 16.
+        fontsize : String (default: 16)
+            The fontsize of the labels.
         fontcolor: list/array of RGB colors with same size as X (default : None)
             None : Use same colorscheme as for c
             '#000000' : If the input is a single color, all fonts will get this color.
+        fontweight : String, (default: 'normal')
+            The fontweight of the labels.
+                * 'normal'
+                * 'bold'
+        color_arrow : String, (default: None)
+            color for the arrow.
+                * None: Color arrows based on strength using 'color_strong' and 'color_weak'.
+                * '#000000'
+                * 'r'
+        arrowdict : dict.
+            Dictionary containing properties for the arrow font-text.
+            {'fontsize': None, 'color_text': None, 'weight': None, 'alpha': None, 'color_strong': '#880808', 'color_weak': '#002a77', 'ha': 'center', 'va': 'center'}
+                * fontsize: None automatically adjust based on the fontsize. Specify otherwise.
+                * 'color_text': None automatically adjust color based color_strong and color_weak. Specify hex color otherwise.
+                * 'weight': None automatically adjust based on fontweight. Specify otherwise: 'normal', 'bold'
+                * 'alpha': None automatically adjust based on loading value.
+                * 'color_strong': Hex color for strong loadings (color_arrow needs to be set to None).
+                * 'color_weak': Hex color for weak loadings (color_arrow needs to be set to None).
+                * 'scale_factor': The scale factor for the arrow length. None automatically sets changes between 2d and 3d plots.
         cmap : String, optional, default: 'Set1'
             Colormap. If set to None, no points are shown.
         title : str, default: None
@@ -829,7 +841,7 @@ def biplot(self,
         # Scatterplot
         fig, ax = self.scatter(labels=labels, legend=legend, PC=PC, SPE=SPE, HT2=HT2, cmap=cmap, visible=visible, figsize=figsize, alpha=alpha, title=title, gradient=gradient, fig=fig, ax=ax, c=c, s=s, jitter=jitter, marker=marker, fontcolor=fontcolor, fontweight=fontweight, fontsize=fontsize, edgecolor=edgecolor, density=density, density_on_top=density_on_top, dpi=dpi, verbose=verbose)
         # Add the loadings with arrow to the plot
-        fig, ax = _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, ax, verbose)
+        fig, ax = _plot_loadings(self, topfeat, n_feat, PC, d3, arrowdict, fig, ax, verbose)
         # Plot
         if visible: plt.show()
         # Return
@@ -1467,10 +1479,10 @@ def _biplot_input_checks(results, PC, cmap, arrowdict, color_arrow, fontsize, fo
 
 
 def _set_arrowdict(arrowdict, color_arrow=None, fontsize=18, fontweight='normal'):
-    color_arrow = 'black' if (color_arrow is None) else color_arrow
-    arrowdict = {**{'fontsize': 18 if fontsize==0 else fontsize, 'c': color_arrow, 'weight': fontweight, 'ha': 'center', 'va': 'center'}, **arrowdict}
-    if arrowdict.get('c') is None and (color_arrow is not None):
-        arrowdict['c'] = color_arrow
+    # color_arrow = None if (color_arrow is None) else color_arrow
+    arrowdict = {**{'color_arrow': color_arrow, 'color_text': None, 'fontsize': 18 if fontsize==0 else fontsize, 'weight': fontweight, 'alpha': None, 'ha': 'center', 'va': 'center', 'color_strong': '#880808', 'color_weak': '#002a77', 'scale_factor': None}, **arrowdict}
+    # if arrowdict.get('color_text') is None and (color_arrow is not None):
+        # arrowdict['color_text'] = color_arrow
     if arrowdict.get('fontsize') is None:
         arrowdict['fontsize'] = 18 if fontsize==0 else fontsize
     if arrowdict.get('weight') is None:
@@ -1528,7 +1540,7 @@ def _setup_figure(fig, ax, d3, visible, figsize, dpi):
     return fig, ax
 
 
-def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, ax, verbose):
+def _plot_loadings(self, topfeat, n_feat, PC, d3, arrowdict, fig, ax, verbose):
     topfeat = pd.concat([topfeat.iloc[PC, :], topfeat.loc[~topfeat.index.isin(PC), :]])
     topfeat.reset_index(inplace=True)
     # Collect coefficients
@@ -1542,7 +1554,12 @@ def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, a
     # Plot and scale values for arrows and text by taking the absolute minimum range of the x-axis and y-axis.
     max_axis = np.max(np.abs(self.results['PC'].iloc[:, PC]).min(axis=1))
     max_arrow = np.abs(coeff).max().max()
-    scale = (np.max([1, np.round(max_axis / max_arrow, 2)])) * 0.93
+    if arrowdict['scale_factor'] is None: 
+        scale_factor = 1.8 if d3 else 1.1
+    else:
+        scale_factor = arrowdict['scale_factor']
+    # Scale the arrows using the scale factor
+    scale = (np.max([1, np.round(max_axis / max_arrow, 2)])) * scale_factor
 
     # For vizualization purposes we will keep only the unique feature-names
     topfeat = topfeat.drop_duplicates(subset=['feature'])
@@ -1551,6 +1568,10 @@ def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, a
         if verbose>=2: print('[pca] >[WARNING]: n_feat can not be reached because of the limitation of n_components (=%d). n_feat is reduced to %d.' %(self.n_components, n_feat))
 
     # Plot arrows and text
+    arrow_line_color = arrowdict['color_arrow']
+    arrow_text_color = arrowdict['color_text']
+    arrow_alpha = arrowdict['alpha']
+    alpha_scaled = normalize_size(topfeat['loading'].abs().values.reshape(-1,1), minscale=0.35, maxscale=0.95, scaler='minmax')
     texts = []
     for i in range(0, n_feat):
         getfeat = topfeat['feature'].iloc[i]
@@ -1559,17 +1580,24 @@ def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, a
         # Set first PC vs second PC direction. Note that these are not neccarily the best loading.
         xarrow = getcoef[0] * scale  # First PC in the x-axis direction
         yarrow = getcoef[1] * scale  # Second PC in the y-axis direction
-        txtcolor = 'y' if topfeat['type'].iloc[i] == 'weak' else 'g'
+        # Set the arrow and arrow-text colors
+        # Update arrow color if None
+        loading_color = arrowdict['color_weak'] if topfeat['type'].iloc[i] == 'weak' else arrowdict['color_strong']
+        # Update colors if None
+        if arrowdict['color_arrow'] is None: arrow_line_color = loading_color
+        if arrowdict['color_text'] is None: arrow_text_color = loading_color
+        if arrowdict['alpha'] is None: arrow_alpha =  alpha_scaled[i]
 
         if d3:
             zarrow = getcoef[2] * scale
-            ax.quiver(mean_x, mean_y, mean_z, xarrow - mean_x, yarrow - mean_y, zarrow - mean_z, color=color_arrow, alpha=0.8, lw=2)
-            texts.append(ax.text(xarrow, yarrow, zarrow, label, color=arrowdict['c'], ha='center', va='center', fontsize=arrowdict['fontsize'], zorder=25))
+            ax.quiver(mean_x, mean_y, mean_z, xarrow - mean_x, yarrow - mean_y, zarrow - mean_z, color=arrow_line_color, alpha=arrow_alpha, lw=2)
+            texts.append(ax.text(xarrow, yarrow, zarrow, label, fontsize=arrowdict['fontsize'], c=arrow_text_color, weight=arrowdict['weight'], ha=arrowdict['ha'], va=arrowdict['va'], zorder=25))
         else:
-            ax.arrow(mean_x, mean_y, xarrow - mean_x, yarrow - mean_y, color=color_arrow, alpha=0.8, width=0.002, head_width=0.1, head_length=0.1 * 1.1, length_includes_head=True, zorder=10)
-            texts.append(ax.text(xarrow, yarrow, label, fontdict=arrowdict, zorder=10))
+            head_width = 0.1
+            ax.arrow(mean_x, mean_y, xarrow - mean_x, yarrow - mean_y, color=arrow_line_color, alpha=arrow_alpha, width=0.002, head_width=head_width, head_length=head_width * 1.1, length_includes_head=True, zorder=10)
+            texts.append(ax.text(xarrow, yarrow, label, fontsize=arrowdict['fontsize'], c=arrow_text_color, weight=arrowdict['weight'], ha=arrowdict['ha'], va=arrowdict['va'], zorder=10))
 
-    # Plot the adjusted text labels to prevent overlap
+    # Plot the adjusted text labels to prevent overlap. Do not adjust text in 3d plots as it will mess up the locations.
     if len(texts)>0 and not d3: adjust_text(texts)
 
     # Return
@@ -1595,9 +1623,9 @@ def _add_plot_SPE(self, xs, ys, zs, SPE, d3, alpha, s, fig, ax):
     if SPE and ('y_bool_spe' in self.results['outliers'].columns):
         label_spe = str(sum(Ioutlier2)) + ' outlier(s) (SPE/DmodX)'
         if d3:
-            ax.scatter(xs[Ioutlier2], ys[Ioutlier2], zs[Ioutlier2], marker='x', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha)
+            ax.scatter(xs[Ioutlier2], ys[Ioutlier2], zs[Ioutlier2], marker='x', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha, zorder=15)
         else:
-            ax.scatter(xs[Ioutlier2], ys[Ioutlier2], marker='d', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha)
+            ax.scatter(xs[Ioutlier2], ys[Ioutlier2], marker='d', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha, zorder=15)
 
     return fig, ax
 
@@ -1613,9 +1641,9 @@ def _add_plot_HT2(self, xs, ys, zs, HT2, d3, alpha, s, fig, ax):
     if HT2 and ('y_bool' in self.results['outliers'].columns):
         label_t2 = str(sum(Ioutlier1)) + ' outlier(s) (hotelling t2)'
         if d3:
-            ax.scatter(xs[Ioutlier1], ys[Ioutlier1], zs[Ioutlier1], marker='d', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha)
+            ax.scatter(xs[Ioutlier1], ys[Ioutlier1], zs[Ioutlier1], marker='d', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha, zorder=15)
         else:
-            ax.scatter(xs[Ioutlier1], ys[Ioutlier1], marker='x', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha)
+            ax.scatter(xs[Ioutlier1], ys[Ioutlier1], marker='x', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha, zorder=15)
 
     return fig, ax
 
@@ -1645,6 +1673,54 @@ def _add_plot_properties(self, PC, d3, title, legend, labels, fig, ax, fontsize,
     return fig, ax
 
 
+def normalize_size(getsizes, minscale: Union[int, float] = 0.5, maxscale: Union[int, float] = 4, scaler: str = 'zscore'):
+    """Normalize values between minimum and maximum value.
+
+    Parameters
+    ----------
+    getsizes : input array
+        Array of values that needs to be scaled.
+    minscale : Union[int, float], optional
+        Minimum value. The default is 0.5.
+    maxscale : Union[int, float], optional
+        Maximum value. The default is 4.
+    scaler : str, optional
+        Type of scaler. The default is 'zscore'.
+            * 'zscore'
+            * 'minmax'
+
+    Returns
+    -------
+    getsizes : array-like
+        scaled values between min-max.
+
+    """
+    # Instead of Min-Max scaling, that shrinks any distribution in the [0, 1] interval, scaling the variables to
+    # Z-scores is better. Min-Max Scaling is too sensitive to outlier observations and generates unseen problems,
+
+    # Set sizes to 0 if not available
+    getsizes[np.isinf(getsizes)]=0
+    getsizes[np.isnan(getsizes)]=0
+
+    # out-of-scale datapoints.
+    if scaler == 'zscore' and len(np.unique(getsizes)) > 3:
+        getsizes = (getsizes.flatten() - np.mean(getsizes)) / np.std(getsizes)
+        getsizes = getsizes + (minscale - np.min(getsizes))
+    elif scaler == 'minmax':
+        try:
+            from sklearn.preprocessing import MinMaxScaler
+        except:
+            raise Exception('sklearn needs to be pip installed first. Try: pip install scikit-learn')
+        # scaling
+        getsizes = MinMaxScaler(feature_range=(minscale, maxscale)).fit_transform(getsizes).flatten()
+    else:
+        getsizes = getsizes.ravel()
+    # Max digits is 4
+    getsizes = np.array(list(map(lambda x: round(x, 4), getsizes)))
+
+    return getsizes
+
+
 def _show_deprecated_warning(label, y, verbose):
     if label is not None:
         if verbose>=2: print('[pca]> [WARNING]: De parameter <label> is deprecated and will not be supported in future version.')