Skip to content

Commit

Permalink
update sparsity figure
Browse files Browse the repository at this point in the history
  • Loading branch information
jjc2718 committed Jun 14, 2023
1 parent aa0d6b2 commit 9cdaed5
Show file tree
Hide file tree
Showing 4 changed files with 1,545 additions and 1,923 deletions.
173 changes: 100 additions & 73 deletions 01_stratified_classification/lasso_range_gene_optimizers.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@

output_plots = True
output_plots_dir = os.path.join(
cfg.repo_root, '01_stratified_classification', 'optimizers_plots', 'figshare'
cfg.repo_root, '01_stratified_classification', 'optimizers_plots'
# cfg.repo_root, '01_stratified_classification', 'optimizers_plots', 'figshare'
)


Expand Down Expand Up @@ -109,30 +110,47 @@
# In[5]:


sns.set({'figure.figsize': (12, 10)})
sns.set_style('whitegrid')
ll_nz_coefs_df['optimizer'] = 'liblinear'
sgd_nz_coefs_df['optimizer'] = 'SGD'
all_nz_coefs_df = pd.concat((ll_nz_coefs_df, sgd_nz_coefs_df))

fig, axarr = plt.subplots(2, 1)
def precision_round(number, digits=2):
    """Round ``number`` relative to its own order of magnitude.

    The decimal exponent is read from the scientific-notation rendering of
    ``number`` (``"{:e}"``), and the value is rounded so that ``digits``
    decimals of the mantissa are kept, i.e. ``digits + 1`` significant
    figures. For example, ``precision_round(3.16227766e-08)`` gives
    ``3.16e-08`` and ``precision_round(1234.5678)`` gives ``1230.0``.

    Args:
        number: value to round.
        digits: number of mantissa decimals to keep (default 2).

    Returns:
        The rounded value. NaN and +/-infinity are returned unchanged, since
        they have no exponent to round against.
    """
    import math

    # "{:e}" renders non-finite values as 'nan' / 'inf' with no exponent
    # part, so indexing the result of split('e') would raise IndexError.
    if not math.isfinite(number):
        return number

    # e.g. '3.162278e-08' -> exponent -8
    power = int("{:e}".format(number).split('e')[1])
    # A negative exponent needs more decimal places, a positive one fewer;
    # round() accepts a negative ndigits to round left of the decimal point.
    return round(number, digits - power)

sns.boxplot(
data=ll_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', ax=axarr[0]
)
axarr[0].set_title('liblinear optimizer', size=16)
axarr[0].set_xlabel('')
axarr[0].set_ylabel('Number of nonzero coefficients', size=13)
axarr[0].tick_params(axis='both', labelsize=12)
axarr[0].tick_params(axis='x', rotation=45)
# invert liblinear lasso parameters
# For the rows whose optimizer is 'liblinear', take the reciprocal of
# lasso_param, pass each value through precision_round, and write the result
# back into the same rows. SGD rows are left untouched.
# NOTE(review): presumably this puts both optimizers' parameters on a common
# scale (the plot legend below labels liblinear as "1 / param") -- confirm
# against how each optimizer interprets lasso_param.
ll_inv_params = (
1 / all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear', 'lasso_param']
).apply(precision_round)
all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear', 'lasso_param'] = ll_inv_params
# Drop every row, for either optimizer, whose lasso_param is exactly 3.16e-08.
# NOTE(review): this is an exact float comparison; it only matches values that
# are bit-identical to the literal 3.16e-08 (as precision_round's output is
# expected to be). The reason this particular value is excluded is not shown
# here -- TODO confirm and document.
all_nz_coefs_df = all_nz_coefs_df[all_nz_coefs_df.lasso_param != 3.16e-08]

# Sanity check: print the sorted unique parameter values remaining for each
# optimizer so the two ranges can be compared by eye.
print(np.sort(all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear'].lasso_param.unique()))
print(np.sort(all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'SGD'].lasso_param.unique()))


# In[6]:


sns.set({'figure.figsize': (12, 5)})
sns.set_style('whitegrid')

sns.boxplot(
data=sgd_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', ax=axarr[1]
data=all_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', hue='optimizer'
)
axarr[1].set_title('SGD optimizer', size=16)
axarr[1].set_xlabel('LASSO parameter', size=13)
axarr[1].set_ylabel('Number of nonzero coefficients', size=13)
axarr[1].tick_params(axis='both', labelsize=12)
axarr[1].tick_params(axis='x', rotation=45)
plt.xlabel('LASSO parameter (lower = less regularization)', size=13)
plt.ylabel('Number of nonzero coefficients', size=13)
plt.title(
f'LASSO parameter vs. number of nonzero coefficients, {plot_gene}, {lr_schedule}',
size=16, y=1.03
)
handles, labels = plt.gca().get_legend_handles_labels()
new_labels = ['liblinear (1 / param)', r'SGD (unchanged param)']
plt.legend(title='Optimizer', handles=handles, labels=new_labels,
fontsize=14, title_fontsize=14)
plt.gca().tick_params(axis='both', labelsize=12)
plt.gca().tick_params(axis='x', rotation=45)

# color the boxplot lines/edges rather than the box fill
# this makes it easier to discern colors at the extremes; i.e. very many or few nonzero coefs
Expand All @@ -153,16 +171,8 @@ def color_boxes(ax):
line.set_color(col)
line.set_mfc(col) # facecolor of fliers
line.set_mec(col) # edgecolor of fliers

color_boxes(axarr[0])
color_boxes(axarr[1])

plt.suptitle(
f'LASSO parameter vs. number of nonzero coefficients, {plot_gene}, {lr_schedule}',
size=18, y=0.995
)

plt.tight_layout()
color_boxes(plt.gca())

if output_plots:
os.makedirs(output_plots_dir, exist_ok=True)
Expand All @@ -171,7 +181,7 @@ def color_boxes(ax):

# ### Get coefficient magnitude information for each lasso penalty

# In[6]:
# In[7]:


ll_sum_coefs_df = []
Expand Down Expand Up @@ -218,7 +228,7 @@ def color_boxes(ax):
all_coefs_df.head()


# In[7]:
# In[8]:


sns.set({'figure.figsize': (10, 6)})
Expand All @@ -239,7 +249,7 @@ def color_boxes(ax):
plt.tight_layout()


# In[8]:
# In[9]:


# plot coef magnitudes on same axis
Expand All @@ -254,7 +264,7 @@ def color_boxes(ax):
print(all_coefs_df.param_same_axis.sort_values().unique())


# In[9]:
# In[10]:


sns.set({'figure.figsize': (10, 5)})
Expand Down Expand Up @@ -282,7 +292,7 @@ def color_boxes(ax):

# ### Get performance information for each lasso penalty

# In[10]:
# In[11]:


ll_perf_df = au.load_prediction_results_lasso_range(ll_results_dir,
Expand All @@ -296,7 +306,7 @@ def color_boxes(ax):
ll_perf_df.head()


# In[11]:
# In[12]:


# get mean performance for each lasso parameter
Expand All @@ -310,7 +320,7 @@ def color_boxes(ax):
ll_mean_perf_df.head()


# In[12]:
# In[13]:


sgd_perf_df = au.load_prediction_results_lasso_range(sgd_results_dir,
Expand All @@ -324,7 +334,7 @@ def color_boxes(ax):
sgd_perf_df.head()


# In[13]:
# In[14]:


# get mean performance for each lasso parameter
Expand All @@ -338,7 +348,7 @@ def color_boxes(ax):
sgd_mean_perf_df.head()


# In[14]:
# In[15]:


print('liblinear:', ll_mean_perf_df['mean'].max(),
Expand All @@ -347,7 +357,7 @@ def color_boxes(ax):
'( param =', sgd_mean_perf_df['mean'].idxmax(), ')')


# In[15]:
# In[16]:


sns.set_style('ticks')
Expand Down Expand Up @@ -412,15 +422,15 @@ def color_boxes(ax):
#
# Even though SGD seems to have lots of nonzero coefficients, it's possible that lots of them are close to 0, or effectively 0. We'll plot the coefficient magnitudes on the same axis as the liblinear coefficients, to get a sense of this.

# In[16]:
# In[17]:


# plot coefficient distributions for this seed/fold
plot_seed = 42
plot_fold = 0


# In[17]:
# In[18]:


ll_nz_coefs_df['optimizer'] = 'liblinear'
Expand All @@ -430,7 +440,7 @@ def color_boxes(ax):
nz_coefs_df.head()


# In[18]:
# In[19]:


perf_coefs_df = (plot_df
Expand All @@ -443,7 +453,7 @@ def color_boxes(ax):
perf_coefs_df.head()


# In[19]:
# In[20]:


# get top-performing lasso param for each gene,
Expand All @@ -461,7 +471,7 @@ def color_boxes(ax):
ll_mean_perf_df.head()


# In[20]:
# In[21]:


# get top-performing lasso param for each gene,
Expand All @@ -479,15 +489,15 @@ def color_boxes(ax):
sgd_mean_perf_df.head()


# In[21]:
# In[22]:


ll_top_lasso_param = ll_mean_perf_df.iloc[0, :].lasso_param
sgd_top_lasso_param = sgd_mean_perf_df.iloc[0, :].lasso_param
print(ll_top_lasso_param, sgd_top_lasso_param)


# In[22]:
# In[23]:


# get coefficient info for liblinear
Expand Down Expand Up @@ -515,7 +525,7 @@ def color_boxes(ax):
ll_coefs_df.sort_values(by='abs+1', ascending=False).head(10)


# In[23]:
# In[24]:


# get coefficient info for sgd
Expand Down Expand Up @@ -544,7 +554,7 @@ def color_boxes(ax):
sgd_coefs_df.sort_values(by='abs+1', ascending=False).head(10)


# In[24]:
# In[25]:


sns.set({'figure.figsize': (8, 3)})
Expand All @@ -559,7 +569,7 @@ def color_boxes(ax):
plt.title(f'Log-log coefficient magnitude distribution, {plot_gene}', y=1.03)


# In[25]:
# In[26]:


sns.set({'figure.figsize': (10, 4)})
Expand All @@ -575,7 +585,7 @@ def color_boxes(ax):
#
# We want to separate the log-likelihood loss (data loss) from the weight penalty (regularization term) in the logistic regression loss function, to see if that breakdown is any different between optimizers.

# In[26]:
# In[27]:


# get loss function values from file
Expand All @@ -598,7 +608,7 @@ def get_loss_values(results_dir, optimizer):
return loss_df.reset_index(drop=True)


# In[27]:
# In[28]:


ll_loss_df = get_loss_values(ll_results_dir, 'liblinear')
Expand Down Expand Up @@ -637,7 +647,7 @@ def get_loss_values(results_dir, optimizer):
loss_df.head()


# In[28]:
# In[29]:


sns.set_style('ticks')
Expand Down
4 changes: 2 additions & 2 deletions 01_stratified_classification/nbconverted/optimizer_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,11 @@


supp_f1 = Figure(
"675", "555",
"675", "340",
etree.Element("rect", {"width": "100%", "height": "100%", "fill": "white"}),
SVG(
os.path.join(cfg.repo_root, '01_stratified_classification', 'optimizers_plots', 'KRAS_constant_search_coefs_count.svg')
).scale(0.75).move(20, 10),
).scale(0.85).move(20, 10),
)

display(supp_f1)
Expand Down
Loading

0 comments on commit 9cdaed5

Please sign in to comment.