Skip to content

Commit

Permalink
update arrow dict by adding alpha, and coloring arrows based on stren…
Browse files Browse the repository at this point in the history
…gth.
  • Loading branch information
erdogant committed Apr 28, 2023
1 parent fb8c58a commit ae9be30
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 46 deletions.
60 changes: 43 additions & 17 deletions pca/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
import matplotlib as mpl
from scatterd import scatterd

import numpy as np
from sklearn.datasets import load_iris
Expand All @@ -15,13 +16,42 @@
model = pca()
model.fit_transform(X)

# Loading are automatically set based on weak/strong
model.biplot(s=0)
# Change strong and weak colors
model.biplot(s=0, arrowdict={'color_strong': 'r', 'color_weak': 'g'})
# Set alpha to constant value
model.biplot(s=0, arrowdict={'alpha': 0.8})
# Change arrow text color
model.biplot(s=0, arrowdict={'color_text': 'k'})
# Change arrow color, which automatically changes the label color too
model.biplot(s=0, color_arrow='k')
# Almost Remove arrows but keep the text
model.biplot(s=0, color_arrow='k', arrowdict={'alpha': 0.9})
# Set color text
model.biplot(s=0, arrowdict={'color_text': 'k'})
# Set color arrow and color text
model.biplot(s=0, color_arrow='k', arrowdict={'color_text': 'g'})
# Set color arrow and color text and alpha
model.biplot(s=0, color_arrow='k', arrowdict={'color_text': 'g', 'alpha': 0.8})

# Change the scale factor of the arrow
model.biplot(arrowdict={'scale_factor': 2})
model.biplot3d(arrowdict={'scale_factor': 3})

model.biplot(s=0, arrowdict={'weight':'bold', 'fontsize': 24, 'color_text': 'r'}, color_arrow='k')
model.biplot3d(density=True, fontsize=0, arrowdict={'weight':'bold', 'fontsize': 14})
model.biplot3d(density=True, fontsize=0, s=0, arrowdict={'fontsize': 24})

model.biplot(density=True, fontsize=0, arrowdict={'weight':'bold', 'fontsize': 14})

# model.scatter(density=True)
# model.biplot(c=[0,0,0], density=True)

# model.scatter3d(fontsize=16, PC=[0,1,3])
# model.scatter3d(density=True)

model.biplot3d(arrowdict={'weight':'bold'}, c=None, n_feat=5)
# model.biplot3d(arrowdict={'weight':'bold'}, n_feat=5)


# %%
Expand All @@ -40,9 +70,9 @@
model = pca(normalize=True, n_components=None)
# Fit transform with dataframe
results = model.fit_transform(df)

model.biplot(labels=df['flavanoids'].values, legend=False, cmap='seismic', n_feat=3, fontsize=20, arrowdict={'fontsize':28, 'c':'g'}, density=True)
model.biplot3d(legend=False, n_feat=3, fontcolor='r', arrowdict={'fontsize':22, 'c':'k'}, density=True)
# Plot
model.biplot(fontsize=0, labels=df['flavanoids'].values, legend=False, cmap='seismic', n_feat=3, arrowdict={'fontsize':28, 'c':'g'}, density=True)
# model.biplot3d(legend=None, n_feat=3, fontcolor='r', arrowdict={'fontsize':22, 'c':'k'}, density=True, figsize=(35, 30))


# %% Demonstration of specifying colors, markers, alpha, and size per sample
Expand Down Expand Up @@ -136,23 +166,18 @@
visible=True,
)






# %%
# Load pca
from pca import pca

# Initialize pca
model = pca()
model = pca(n_components=3)

# Load example data set
df = model.import_example(data='iris')

# Fit transform
results = model.fit_transform(X)
results = model.fit_transform(df)

# Make plot
model.biplot(HT2=True,
Expand Down Expand Up @@ -392,11 +417,11 @@
# Color on classlabel (Unchanged)
model.biplot()
# Use cmap colors for classlabels (unchanged)
model.biplot(labels=y, cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']))
model.biplot(labels=y, cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']), density=True)
# Do not show points when cmap=None (unchanged)
model.biplot(labels=load_iris().target, cmap=None)
model.biplot(labels=load_iris().target, cmap=None, density=True)
# Plot all points as unique entity (unchanged)
model.biplot(labels=y, gradient='#ffffff', cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']))
model.biplot(labels=y, gradient='#ffffff', cmap=mpl.colors.ListedColormap(['green', 'red', 'blue']), density=True)


# %%
Expand Down Expand Up @@ -459,6 +484,7 @@

# Plot image
model.biplot(SPE=True, HT2=True, density=True)
model.biplot3d(SPE=True, HT2=True, density=True, arrowdict={'scale_factor': 1})

# %% Detect unseen outliers
# Import libraries
Expand All @@ -474,7 +500,7 @@
# model = pca(alpha=0.05, detect_outliers=None)

# Fit transform
model.fit_transform(X)
model.fit_transform(X, row_labels=np.zeros(X.shape[0]))

model.scatter(SPE=True, HT2=True)

Expand All @@ -486,9 +512,9 @@
PCnew = model.transform(X_unseen, row_labels=np.repeat('mapped_' + str(i), X_unseen.shape[0]), update_outlier_params=True)

# Scatterplot
model.scatter(SPE=True, HT2=True)
# model.scatter(SPE=True, HT2=True)
# Biplot
# Model.biplot(SPE=True, HT2=True)
model.biplot(SPE=True, HT2=True, density=True)


# %%
Expand Down
134 changes: 105 additions & 29 deletions pca/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import os
from adjustText import adjust_text
import statsmodels.stats.multitest as multitest
from typing import List, Union, Tuple


# %% Association learning across all variables
Expand Down Expand Up @@ -697,8 +698,8 @@ def biplot(self,
fontcolor=[0,0,0],
fontsize=18,
fontweight='normal',
color_arrow='#000000',
arrowdict={'fontsize': None, 'c': None, 'weight': None, 'ha': 'center', 'va': 'center'},
color_arrow=None,
arrowdict={'fontsize': None, 'color_text': None, 'weight': None, 'alpha': None, 'color_strong': '#880808', 'color_weak': '#002a77', 'scale_factor': None},
cmap='tab20c',
title=None,
legend=None,
Expand Down Expand Up @@ -766,19 +767,30 @@ def biplot(self,
density_on_top : bool, (default: False)
True : The density is the highest layer.
False : The density is the lowest layer.
color_arrow : String, (default: '#000000')
color for the arrow.
* '#000000'
* 'r'
arrowdict : dict.
dictionary containing properties for the arrow font-text.
Note that the [c]olor: None inherits the color used in color_arrow.
{'fontsize': None, 'c': None, 'weight': None, 'ha': 'center', 'va': 'center'}
fontsize : String, optional
The fontsize of the y labels that are plotted in the graph. The default is 16.
fontsize : String (default: 16)
The fontsize of the labels.
fontcolor: list/array of RGB colors with same size as X (default : None)
None : Use same colorscheme as for c
'#000000' : If the input is a single color, all fonts will get this color.
fontweight : String, (default: 'normal')
The fontweight of the labels.
* 'normal'
* 'bold'
color_arrow : String, (default: None)
color for the arrow.
* None: Color arrows based on strength using 'color_strong' and 'color_weak'.
* '#000000'
* 'r'
arrowdict : dict.
Dictionary containing properties for the arrow font-text.
{'fontsize': None, 'color_text': None, 'weight': None, 'alpha': None, 'color_strong': '#880808', 'color_weak': '#002a77', 'ha': 'center', 'va': 'center'}
* fontsize: None automatically adjust based on the fontsize. Specify otherwise.
* 'color_text': None automatically adjust color based color_strong and color_weak. Specify hex color otherwise.
* 'weight': None automatically adjust based on fontweight. Specify otherwise: 'normal', 'bold'
* 'alpha': None automatically adjust based on loading value.
* 'color_strong': Hex color for strong loadings (color_arrow needs to be set to None).
* 'color_weak': Hex color for weak loadings (color_arrow needs to be set to None).
* 'scale_factor': The scale factor for the arrow length. None automatically sets changes between 2d and 3d plots.
cmap : String, optional, default: 'Set1'
Colormap. If set to None, no points are shown.
title : str, default: None
Expand Down Expand Up @@ -829,7 +841,7 @@ def biplot(self,
# Scatterplot
fig, ax = self.scatter(labels=labels, legend=legend, PC=PC, SPE=SPE, HT2=HT2, cmap=cmap, visible=visible, figsize=figsize, alpha=alpha, title=title, gradient=gradient, fig=fig, ax=ax, c=c, s=s, jitter=jitter, marker=marker, fontcolor=fontcolor, fontweight=fontweight, fontsize=fontsize, edgecolor=edgecolor, density=density, density_on_top=density_on_top, dpi=dpi, verbose=verbose)
# Add the loadings with arrow to the plot
fig, ax = _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, ax, verbose)
fig, ax = _plot_loadings(self, topfeat, n_feat, PC, d3, arrowdict, fig, ax, verbose)
# Plot
if visible: plt.show()
# Return
Expand Down Expand Up @@ -1467,10 +1479,10 @@ def _biplot_input_checks(results, PC, cmap, arrowdict, color_arrow, fontsize, fo


def _set_arrowdict(arrowdict, color_arrow=None, fontsize=18, fontweight='normal'):
color_arrow = 'black' if (color_arrow is None) else color_arrow
arrowdict = {**{'fontsize': 18 if fontsize==0 else fontsize, 'c': color_arrow, 'weight': fontweight, 'ha': 'center', 'va': 'center'}, **arrowdict}
if arrowdict.get('c') is None and (color_arrow is not None):
arrowdict['c'] = color_arrow
# color_arrow = None if (color_arrow is None) else color_arrow
arrowdict = {**{'color_arrow': color_arrow, 'color_text': None, 'fontsize': 18 if fontsize==0 else fontsize, 'weight': fontweight, 'alpha': None, 'ha': 'center', 'va': 'center', 'color_strong': '#880808', 'color_weak': '#002a77', 'scale_factor': None}, **arrowdict}
# if arrowdict.get('color_text') is None and (color_arrow is not None):
# arrowdict['color_text'] = color_arrow
if arrowdict.get('fontsize') is None:
arrowdict['fontsize'] = 18 if fontsize==0 else fontsize
if arrowdict.get('weight') is None:
Expand Down Expand Up @@ -1528,7 +1540,7 @@ def _setup_figure(fig, ax, d3, visible, figsize, dpi):
return fig, ax


def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, ax, verbose):
def _plot_loadings(self, topfeat, n_feat, PC, d3, arrowdict, fig, ax, verbose):
topfeat = pd.concat([topfeat.iloc[PC, :], topfeat.loc[~topfeat.index.isin(PC), :]])
topfeat.reset_index(inplace=True)
# Collect coefficients
Expand All @@ -1542,7 +1554,12 @@ def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, a
# Plot and scale values for arrows and text by taking the absolute minimum range of the x-axis and y-axis.
max_axis = np.max(np.abs(self.results['PC'].iloc[:, PC]).min(axis=1))
max_arrow = np.abs(coeff).max().max()
scale = (np.max([1, np.round(max_axis / max_arrow, 2)])) * 0.93
if arrowdict['scale_factor'] is None:
scale_factor = 1.8 if d3 else 1.1
else:
scale_factor = arrowdict['scale_factor']
# Scale the arrows using the scale factor
scale = (np.max([1, np.round(max_axis / max_arrow, 2)])) * scale_factor

# For vizualization purposes we will keep only the unique feature-names
topfeat = topfeat.drop_duplicates(subset=['feature'])
Expand All @@ -1551,6 +1568,10 @@ def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, a
if verbose>=2: print('[pca] >[WARNING]: n_feat can not be reached because of the limitation of n_components (=%d). n_feat is reduced to %d.' %(self.n_components, n_feat))

# Plot arrows and text
arrow_line_color = arrowdict['color_arrow']
arrow_text_color = arrowdict['color_text']
arrow_alpha = arrowdict['alpha']
alpha_scaled = normalize_size(topfeat['loading'].abs().values.reshape(-1,1), minscale=0.35, maxscale=0.95, scaler='minmax')
texts = []
for i in range(0, n_feat):
getfeat = topfeat['feature'].iloc[i]
Expand All @@ -1559,17 +1580,24 @@ def _plot_loadings(self, topfeat, n_feat, PC, d3, color_arrow, arrowdict, fig, a
# Set first PC vs second PC direction. Note that these are not neccarily the best loading.
xarrow = getcoef[0] * scale # First PC in the x-axis direction
yarrow = getcoef[1] * scale # Second PC in the y-axis direction
txtcolor = 'y' if topfeat['type'].iloc[i] == 'weak' else 'g'
# Set the arrow and arrow-text colors
# Update arrow color if None
loading_color = arrowdict['color_weak'] if topfeat['type'].iloc[i] == 'weak' else arrowdict['color_strong']
# Update colors if None
if arrowdict['color_arrow'] is None: arrow_line_color = loading_color
if arrowdict['color_text'] is None: arrow_text_color = loading_color
if arrowdict['alpha'] is None: arrow_alpha = alpha_scaled[i]

if d3:
zarrow = getcoef[2] * scale
ax.quiver(mean_x, mean_y, mean_z, xarrow - mean_x, yarrow - mean_y, zarrow - mean_z, color=color_arrow, alpha=0.8, lw=2)
texts.append(ax.text(xarrow, yarrow, zarrow, label, color=arrowdict['c'], ha='center', va='center', fontsize=arrowdict['fontsize'], zorder=25))
ax.quiver(mean_x, mean_y, mean_z, xarrow - mean_x, yarrow - mean_y, zarrow - mean_z, color=arrow_line_color, alpha=arrow_alpha, lw=2)
texts.append(ax.text(xarrow, yarrow, zarrow, label, fontsize=arrowdict['fontsize'], c=arrow_text_color, weight=arrowdict['weight'], ha=arrowdict['ha'], va=arrowdict['va'], zorder=25))
else:
ax.arrow(mean_x, mean_y, xarrow - mean_x, yarrow - mean_y, color=color_arrow, alpha=0.8, width=0.002, head_width=0.1, head_length=0.1 * 1.1, length_includes_head=True, zorder=10)
texts.append(ax.text(xarrow, yarrow, label, fontdict=arrowdict, zorder=10))
head_width = 0.1
ax.arrow(mean_x, mean_y, xarrow - mean_x, yarrow - mean_y, color=arrow_line_color, alpha=arrow_alpha, width=0.002, head_width=head_width, head_length=head_width * 1.1, length_includes_head=True, zorder=10)
texts.append(ax.text(xarrow, yarrow, label, fontsize=arrowdict['fontsize'], c=arrow_text_color, weight=arrowdict['weight'], ha=arrowdict['ha'], va=arrowdict['va'], zorder=10))

# Plot the adjusted text labels to prevent overlap
# Plot the adjusted text labels to prevent overlap. Do not adjust text in 3d plots as it will mess up the locations.
if len(texts)>0 and not d3: adjust_text(texts)

# Return
Expand All @@ -1595,9 +1623,9 @@ def _add_plot_SPE(self, xs, ys, zs, SPE, d3, alpha, s, fig, ax):
if SPE and ('y_bool_spe' in self.results['outliers'].columns):
label_spe = str(sum(Ioutlier2)) + ' outlier(s) (SPE/DmodX)'
if d3:
ax.scatter(xs[Ioutlier2], ys[Ioutlier2], zs[Ioutlier2], marker='x', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha)
ax.scatter(xs[Ioutlier2], ys[Ioutlier2], zs[Ioutlier2], marker='x', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha, zorder=15)
else:
ax.scatter(xs[Ioutlier2], ys[Ioutlier2], marker='d', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha)
ax.scatter(xs[Ioutlier2], ys[Ioutlier2], marker='d', color=[0.5, 0.5, 0.5], s=s, label=label_spe, alpha=alpha, zorder=15)

return fig, ax

Expand All @@ -1613,9 +1641,9 @@ def _add_plot_HT2(self, xs, ys, zs, HT2, d3, alpha, s, fig, ax):
if HT2 and ('y_bool' in self.results['outliers'].columns):
label_t2 = str(sum(Ioutlier1)) + ' outlier(s) (hotelling t2)'
if d3:
ax.scatter(xs[Ioutlier1], ys[Ioutlier1], zs[Ioutlier1], marker='d', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha)
ax.scatter(xs[Ioutlier1], ys[Ioutlier1], zs[Ioutlier1], marker='d', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha, zorder=15)
else:
ax.scatter(xs[Ioutlier1], ys[Ioutlier1], marker='x', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha)
ax.scatter(xs[Ioutlier1], ys[Ioutlier1], marker='x', color=[0, 0, 0], s=s, label=label_t2, alpha=alpha, zorder=15)

return fig, ax

Expand Down Expand Up @@ -1645,6 +1673,54 @@ def _add_plot_properties(self, PC, d3, title, legend, labels, fig, ax, fontsize,
return fig, ax


def normalize_size(getsizes, minscale: Union[int, float] = 0.5, maxscale: Union[int, float] = 4, scaler: str = 'zscore'):
"""Normalize values between minimum and maximum value.
Parameters
----------
getsizes : input array
Array of values that needs to be scaled.
minscale : Union[int, float], optional
Minimum value. The default is 0.5.
maxscale : Union[int, float], optional
Maximum value. The default is 4.
scaler : str, optional
Type of scaler. The default is 'zscore'.
* 'zscore'
* 'minmax'
Returns
-------
getsizes : array-like
scaled values between min-max.
"""
# Instead of Min-Max scaling, that shrinks any distribution in the [0, 1] interval, scaling the variables to
# Z-scores is better. Min-Max Scaling is too sensitive to outlier observations and generates unseen problems,

# Set sizes to 0 if not available
getsizes[np.isinf(getsizes)]=0
getsizes[np.isnan(getsizes)]=0

# out-of-scale datapoints.
if scaler == 'zscore' and len(np.unique(getsizes)) > 3:
getsizes = (getsizes.flatten() - np.mean(getsizes)) / np.std(getsizes)
getsizes = getsizes + (minscale - np.min(getsizes))
elif scaler == 'minmax':
try:
from sklearn.preprocessing import MinMaxScaler
except:
raise Exception('sklearn needs to be pip installed first. Try: pip install scikit-learn')
# scaling
getsizes = MinMaxScaler(feature_range=(minscale, maxscale)).fit_transform(getsizes).flatten()
else:
getsizes = getsizes.ravel()
# Max digits is 4
getsizes = np.array(list(map(lambda x: round(x, 4), getsizes)))

return getsizes


def _show_deprecated_warning(label, y, verbose):
if label is not None:
if verbose>=2: print('[pca]> [WARNING]: De parameter <label> is deprecated and will not be supported in future version.')
Expand Down

0 comments on commit ae9be30

Please sign in to comment.