Skip to content

Commit

Permalink
Merge pull request #84 from jjc2718/sparsity_fig
Browse files Browse the repository at this point in the history
Update sparsity figure + regenerate figshare data
  • Loading branch information
jjc2718 authored Jun 26, 2023
2 parents aa0d6b2 + af22c04 commit ca5aabb
Show file tree
Hide file tree
Showing 5 changed files with 1,595 additions and 1,942 deletions.
217 changes: 135 additions & 82 deletions 01_stratified_classification/lasso_range_gene_optimizers.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,29 @@
metric = 'aupr'

output_plots = True
output_plots_dir = os.path.join(
cfg.repo_root, '01_stratified_classification', 'optimizers_plots', 'figshare'
)

# toggle this in papermill script to generate all results
figshare = False

# ### Get nonzero coefficient information for each lasso penalty

# In[3]:


# Choose where plots are written: the figshare toggle selects a
# 'figshare' subdirectory under the standard optimizers_plots location.
_plots_base = os.path.join(
    cfg.repo_root, '01_stratified_classification', 'optimizers_plots'
)
output_plots_dir = (
    os.path.join(_plots_base, 'figshare') if figshare else _plots_base
)


# ### Get nonzero coefficient information for each lasso penalty

# In[4]:


ll_nz_coefs_df = []

# get coefficient info for training dataset specified above
Expand All @@ -80,7 +93,7 @@
ll_nz_coefs_df.head()


# In[4]:
# In[5]:


sgd_nz_coefs_df = []
Expand All @@ -106,33 +119,52 @@
sgd_nz_coefs_df.head()


# In[5]:
# In[6]:


sns.set({'figure.figsize': (12, 10)})
sns.set_style('whitegrid')
ll_nz_coefs_df['optimizer'] = 'liblinear'
sgd_nz_coefs_df['optimizer'] = 'SGD'
all_nz_coefs_df = pd.concat((ll_nz_coefs_df, sgd_nz_coefs_df))

fig, axarr = plt.subplots(2, 1)
def precision_round(number, digits=2):
    """Round *number* relative to its decimal exponent.

    Extracts the base-10 exponent from the scientific-notation form of
    the value, then rounds so that roughly (digits + 1) significant
    figures are kept. Used to normalize inverted lasso parameters to a
    small, comparable set of values.
    """
    # e.g. 0.0003456 -> '3.456000e-04' -> exponent -4
    exponent = int(f"{number:e}".split('e')[1])
    return round(number, digits - exponent)

sns.boxplot(
data=ll_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', ax=axarr[0]
)
axarr[0].set_title('liblinear optimizer', size=16)
axarr[0].set_xlabel('')
axarr[0].set_ylabel('Number of nonzero coefficients', size=13)
axarr[0].tick_params(axis='both', labelsize=12)
axarr[0].tick_params(axis='x', rotation=45)
# invert liblinear lasso parameters
ll_inv_params = (
1 / all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear', 'lasso_param']
).apply(precision_round)
all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear', 'lasso_param'] = ll_inv_params

# accidentally fit model for this parameter for liblinear but not SGD, so just drop it
all_nz_coefs_df = all_nz_coefs_df[all_nz_coefs_df.lasso_param != 3.16e-08]

print(np.sort(all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear'].lasso_param.unique()))
print(np.sort(all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'SGD'].lasso_param.unique()))


# In[7]:


sns.set({'figure.figsize': (12, 5)})
sns.set_style('whitegrid')

sns.boxplot(
data=sgd_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', ax=axarr[1]
data=all_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', hue='optimizer'
)
plt.xlabel('LASSO parameter (lower = less regularization)', size=13)
plt.ylabel('Number of nonzero coefficients', size=13)
plt.title(
f'LASSO parameter vs. number of nonzero coefficients, {plot_gene}, {lr_schedule}',
size=16, y=1.03
)
axarr[1].set_title('SGD optimizer', size=16)
axarr[1].set_xlabel('LASSO parameter', size=13)
axarr[1].set_ylabel('Number of nonzero coefficients', size=13)
axarr[1].tick_params(axis='both', labelsize=12)
axarr[1].tick_params(axis='x', rotation=45)
handles, labels = plt.gca().get_legend_handles_labels()
new_labels = ['liblinear (1 / param)', r'SGD (unchanged param)']
plt.legend(title='Optimizer', handles=handles, labels=new_labels,
fontsize=14, title_fontsize=14)
plt.gca().tick_params(axis='both', labelsize=12)
plt.gca().tick_params(axis='x', rotation=45)

# color the boxplot lines/edges rather than the box fill
# this makes it easier to discern colors at the extremes; i.e. very many or few nonzero coefs
Expand All @@ -153,16 +185,8 @@ def color_boxes(ax):
line.set_color(col)
line.set_mfc(col) # facecolor of fliers
line.set_mec(col) # edgecolor of fliers

color_boxes(axarr[0])
color_boxes(axarr[1])

plt.suptitle(
f'LASSO parameter vs. number of nonzero coefficients, {plot_gene}, {lr_schedule}',
size=18, y=0.995
)

plt.tight_layout()
color_boxes(plt.gca())

if output_plots:
os.makedirs(output_plots_dir, exist_ok=True)
Expand All @@ -171,7 +195,7 @@ def color_boxes(ax):

# ### Get coefficient magnitude information for each lasso penalty

# In[6]:
# In[8]:


ll_sum_coefs_df = []
Expand Down Expand Up @@ -218,7 +242,7 @@ def color_boxes(ax):
all_coefs_df.head()


# In[7]:
# In[9]:


sns.set({'figure.figsize': (10, 6)})
Expand All @@ -239,7 +263,7 @@ def color_boxes(ax):
plt.tight_layout()


# In[8]:
# In[10]:


# plot coef magnitudes on same axis
Expand All @@ -254,7 +278,7 @@ def color_boxes(ax):
print(all_coefs_df.param_same_axis.sort_values().unique())


# In[9]:
# In[11]:


sns.set({'figure.figsize': (10, 5)})
Expand Down Expand Up @@ -282,7 +306,7 @@ def color_boxes(ax):

# ### Get performance information for each lasso penalty

# In[10]:
# In[12]:


ll_perf_df = au.load_prediction_results_lasso_range(ll_results_dir,
Expand All @@ -296,7 +320,7 @@ def color_boxes(ax):
ll_perf_df.head()


# In[11]:
# In[13]:


# get mean performance for each lasso parameter
Expand All @@ -310,7 +334,7 @@ def color_boxes(ax):
ll_mean_perf_df.head()


# In[12]:
# In[14]:


sgd_perf_df = au.load_prediction_results_lasso_range(sgd_results_dir,
Expand All @@ -324,7 +348,7 @@ def color_boxes(ax):
sgd_perf_df.head()


# In[13]:
# In[15]:


# get mean performance for each lasso parameter
Expand All @@ -338,7 +362,7 @@ def color_boxes(ax):
sgd_mean_perf_df.head()


# In[14]:
# In[16]:


print('liblinear:', ll_mean_perf_df['mean'].max(),
Expand All @@ -347,7 +371,7 @@ def color_boxes(ax):
'( param =', sgd_mean_perf_df['mean'].idxmax(), ')')


# In[15]:
# In[17]:


sns.set_style('ticks')
Expand Down Expand Up @@ -412,15 +436,15 @@ def color_boxes(ax):
#
# Even though SGD seems to have lots of nonzero coefficients, it's possible that lots of them are close to 0, or effectively 0. We'll plot the coefficient magnitudes on the same axis as the liblinear coefficients, to get a sense of this.

# In[16]:
# In[18]:


# plot coefficient distributions for this seed/fold
plot_seed = 42
plot_fold = 0


# In[17]:
# In[19]:


ll_nz_coefs_df['optimizer'] = 'liblinear'
Expand All @@ -430,7 +454,7 @@ def color_boxes(ax):
nz_coefs_df.head()


# In[18]:
# In[20]:


perf_coefs_df = (plot_df
Expand All @@ -443,7 +467,7 @@ def color_boxes(ax):
perf_coefs_df.head()


# In[19]:
# In[21]:


# get top-performing lasso param for each gene,
Expand All @@ -461,7 +485,7 @@ def color_boxes(ax):
ll_mean_perf_df.head()


# In[20]:
# In[22]:


# get top-performing lasso param for each gene,
Expand All @@ -479,15 +503,15 @@ def color_boxes(ax):
sgd_mean_perf_df.head()


# In[21]:
# In[23]:


ll_top_lasso_param = ll_mean_perf_df.iloc[0, :].lasso_param
sgd_top_lasso_param = sgd_mean_perf_df.iloc[0, :].lasso_param
print(ll_top_lasso_param, sgd_top_lasso_param)


# In[22]:
# In[24]:


# get coefficient info for liblinear
Expand Down Expand Up @@ -515,7 +539,7 @@ def color_boxes(ax):
ll_coefs_df.sort_values(by='abs+1', ascending=False).head(10)


# In[23]:
# In[25]:


# get coefficient info for sgd
Expand Down Expand Up @@ -544,7 +568,7 @@ def color_boxes(ax):
sgd_coefs_df.sort_values(by='abs+1', ascending=False).head(10)


# In[24]:
# In[26]:


sns.set({'figure.figsize': (8, 3)})
Expand All @@ -559,7 +583,7 @@ def color_boxes(ax):
plt.title(f'Log-log coefficient magnitude distribution, {plot_gene}', y=1.03)


# In[25]:
# In[27]:


sns.set({'figure.figsize': (10, 4)})
Expand All @@ -575,7 +599,7 @@ def color_boxes(ax):
#
# We want to separate the log-likelihood loss (data loss) from the weight penalty (regularization term) in the logistic regression loss function, to see if that breakdown is any different between optimizers.

# In[26]:
# In[28]:


# get loss function values from file
Expand All @@ -598,7 +622,7 @@ def get_loss_values(results_dir, optimizer):
return loss_df.reset_index(drop=True)


# In[27]:
# In[29]:


ll_loss_df = get_loss_values(ll_results_dir, 'liblinear')
Expand Down Expand Up @@ -637,7 +661,7 @@ def get_loss_values(results_dir, optimizer):
loss_df.head()


# In[28]:
# In[30]:


sns.set_style('ticks')
Expand All @@ -647,8 +671,10 @@ def get_loss_values(results_dir, optimizer):
data=loss_df,
x='lasso_param', y='loss_value', hue='loss_component',
hue_order=['log_loss', 'l1_penalty', 'total_loss'],
style='loss_component',
style_order=['log_loss', 'l1_penalty', 'total_loss'],
marker='o', kind='line', col='optimizer',
col_wrap=2, height=5, aspect=1.6,
col_wrap=2, height=5, aspect=1.6, markersize=8.5, linewidth=3.5,
facet_kws={'sharex': False}
)
g.set(xscale='log', yscale='log')
Expand All @@ -662,6 +688,8 @@ def get_loss_values(results_dir, optimizer):
g.set_titles('Optimizer: {col_name}')
sns.move_legend(g, "center", bbox_to_anchor=[1.05, 0.5], frameon=True)
g._legend.set_title('Loss component')
for legobj in g._legend.legendHandles:
legobj.set_linewidth(3.5)

plt.suptitle(f'LASSO parameter vs. training loss, {plot_gene}', y=1.05)

Expand Down
4 changes: 2 additions & 2 deletions 01_stratified_classification/nbconverted/optimizer_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,11 @@


supp_f1 = Figure(
"675", "555",
"675", "340",
etree.Element("rect", {"width": "100%", "height": "100%", "fill": "white"}),
SVG(
os.path.join(cfg.repo_root, '01_stratified_classification', 'optimizers_plots', 'KRAS_constant_search_coefs_count.svg')
).scale(0.75).move(20, 10),
).scale(0.85).move(20, 10),
)

display(supp_f1)
Expand Down
Loading

0 comments on commit ca5aabb

Please sign in to comment.