Skip to content

Commit

Permalink
Merge pull request #84 from jjc2718/sparsity_fig
Browse files Browse the repository at this point in the history
Update sparsity figure + regenerate figshare data
  • Loading branch information
jjc2718 authored Jun 26, 2023
2 parents aa0d6b2 + af22c04 commit ca5aabb
Show file tree
Hide file tree
Showing 5 changed files with 1,595 additions and 1,942 deletions.
217 changes: 135 additions & 82 deletions 01_stratified_classification/lasso_range_gene_optimizers.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,29 @@
metric = 'aupr'

output_plots = True
output_plots_dir = os.path.join(
cfg.repo_root, '01_stratified_classification', 'optimizers_plots', 'figshare'
)

# toggle this in papermill script to generate all results
figshare = False

# ### Get nonzero coefficient information for each lasso penalty

# In[3]:


# Choose where plots are written: the figshare toggle selects a
# 'figshare' subdirectory under the standard optimizers_plots location.
_plots_base = os.path.join(
    cfg.repo_root, '01_stratified_classification', 'optimizers_plots'
)
output_plots_dir = (
    os.path.join(_plots_base, 'figshare') if figshare else _plots_base
)


# ### Get nonzero coefficient information for each lasso penalty

# In[4]:


ll_nz_coefs_df = []

# get coefficient info for training dataset specified above
Expand All @@ -80,7 +93,7 @@
ll_nz_coefs_df.head()


# In[4]:
# In[5]:


sgd_nz_coefs_df = []
Expand All @@ -106,33 +119,52 @@
sgd_nz_coefs_df.head()


# In[5]:
# In[6]:


sns.set({'figure.figsize': (12, 10)})
sns.set_style('whitegrid')
ll_nz_coefs_df['optimizer'] = 'liblinear'
sgd_nz_coefs_df['optimizer'] = 'SGD'
all_nz_coefs_df = pd.concat((ll_nz_coefs_df, sgd_nz_coefs_df))

fig, axarr = plt.subplots(2, 1)
def precision_round(number, digits=2):
    """Round *number* relative to its decimal exponent.

    Extracts the base-10 exponent from the scientific-notation form of
    the value, then rounds so that roughly (digits + 1) significant
    figures are kept. Used to normalize inverted lasso parameters to a
    small, comparable set of values.
    """
    # e.g. 0.0003456 -> '3.456000e-04' -> exponent -4
    exponent = int(f"{number:e}".split('e')[1])
    return round(number, digits - exponent)

sns.boxplot(
data=ll_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', ax=axarr[0]
)
axarr[0].set_title('liblinear optimizer', size=16)
axarr[0].set_xlabel('')
axarr[0].set_ylabel('Number of nonzero coefficients', size=13)
axarr[0].tick_params(axis='both', labelsize=12)
axarr[0].tick_params(axis='x', rotation=45)
# invert liblinear lasso parameters
ll_inv_params = (
1 / all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear', 'lasso_param']
).apply(precision_round)
all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear', 'lasso_param'] = ll_inv_params

# accidentally fit model for this parameter for liblinear but not SGD, so just drop it
all_nz_coefs_df = all_nz_coefs_df[all_nz_coefs_df.lasso_param != 3.16e-08]

print(np.sort(all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'liblinear'].lasso_param.unique()))
print(np.sort(all_nz_coefs_df.loc[all_nz_coefs_df.optimizer == 'SGD'].lasso_param.unique()))


# In[7]:


sns.set({'figure.figsize': (12, 5)})
sns.set_style('whitegrid')

sns.boxplot(
data=sgd_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', ax=axarr[1]
data=all_nz_coefs_df.sort_values(by=['lasso_param']),
x='lasso_param', y='nz_coefs', hue='optimizer'
)
plt.xlabel('LASSO parameter (lower = less regularization)', size=13)
plt.ylabel('Number of nonzero coefficients', size=13)
plt.title(
f'LASSO parameter vs. number of nonzero coefficients, {plot_gene}, {lr_schedule}',
size=16, y=1.03
)
axarr[1].set_title('SGD optimizer', size=16)
axarr[1].set_xlabel('LASSO parameter', size=13)
axarr[1].set_ylabel('Number of nonzero coefficients', size=13)
axarr[1].tick_params(axis='both', labelsize=12)
axarr[1].tick_params(axis='x', rotation=45)
handles, labels = plt.gca().get_legend_handles_labels()
new_labels = ['liblinear (1 / param)', r'SGD (unchanged param)']
plt.legend(title='Optimizer', handles=handles, labels=new_labels,
fontsize=14, title_fontsize=14)
plt.gca().tick_params(axis='both', labelsize=12)
plt.gca().tick_params(axis='x', rotation=45)

# color the boxplot lines/edges rather than the box fill
# this makes it easier to discern colors at the extremes; i.e. very many or few nonzero coefs
Expand All @@ -153,16 +185,8 @@ def color_boxes(ax):
line.set_color(col)
line.set_mfc(col) # facecolor of fliers
line.set_mec(col) # edgecolor of fliers

color_boxes(axarr[0])
color_boxes(axarr[1])

plt.suptitle(
f'LASSO parameter vs. number of nonzero coefficients, {plot_gene}, {lr_schedule}',
size=18, y=0.995
)

plt.tight_layout()
color_boxes(plt.gca())

if output_plots:
os.makedirs(output_plots_dir, exist_ok=True)
Expand All @@ -171,7 +195,7 @@ def color_boxes(ax):

# ### Get coefficient magnitude information for each lasso penalty

# In[6]:
# In[8]:


ll_sum_coefs_df = []
Expand Down Expand Up @@ -218,7 +242,7 @@ def color_boxes(ax):
all_coefs_df.head()


# In[7]:
# In[9]:


sns.set({'figure.figsize': (10, 6)})
Expand All @@ -239,7 +263,7 @@ def color_boxes(ax):
plt.tight_layout()


# In[8]:
# In[10]:


# plot coef magnitudes on same axis
Expand All @@ -254,7 +278,7 @@ def color_boxes(ax):
print(all_coefs_df.param_same_axis.sort_values().unique())


# In[9]:
# In[11]:


sns.set({'figure.figsize': (10, 5)})
Expand Down Expand Up @@ -282,7 +306,7 @@ def color_boxes(ax):

# ### Get performance information for each lasso penalty

# In[10]:
# In[12]:


ll_perf_df = au.load_prediction_results_lasso_range(ll_results_dir,
Expand All @@ -296,7 +320,7 @@ def color_boxes(ax):
ll_perf_df.head()


# In[11]:
# In[13]:


# get mean performance for each lasso parameter
Expand All @@ -310,7 +334,7 @@ def color_boxes(ax):
ll_mean_perf_df.head()


# In[12]:
# In[14]:


sgd_perf_df = au.load_prediction_results_lasso_range(sgd_results_dir,
Expand All @@ -324,7 +348,7 @@ def color_boxes(ax):
sgd_perf_df.head()


# In[13]:
# In[15]:


# get mean performance for each lasso parameter
Expand All @@ -338,7 +362,7 @@ def color_boxes(ax):
sgd_mean_perf_df.head()


# In[14]:
# In[16]:


print('liblinear:', ll_mean_perf_df['mean'].max(),
Expand All @@ -347,7 +371,7 @@ def color_boxes(ax):
'( param =', sgd_mean_perf_df['mean'].idxmax(), ')')


# In[15]:
# In[17]:


sns.set_style('ticks')
Expand Down Expand Up @@ -412,15 +436,15 @@ def color_boxes(ax):
#
# Even though SGD seems to have lots of nonzero coefficients, it's possible that lots of them are close to 0, or effectively 0. We'll plot the coefficient magnitudes on the same axis as the liblinear coefficients, to get a sense of this.

# In[16]:
# In[18]:


# plot coefficient distributions for this seed/fold
plot_seed = 42
plot_fold = 0


# In[17]:
# In[19]:


ll_nz_coefs_df['optimizer'] = 'liblinear'
Expand All @@ -430,7 +454,7 @@ def color_boxes(ax):
nz_coefs_df.head()


# In[18]:
# In[20]:


perf_coefs_df = (plot_df
Expand All @@ -443,7 +467,7 @@ def color_boxes(ax):
perf_coefs_df.head()


# In[19]:
# In[21]:


# get top-performing lasso param for each gene,
Expand All @@ -461,7 +485,7 @@ def color_boxes(ax):
ll_mean_perf_df.head()


# In[20]:
# In[22]:


# get top-performing lasso param for each gene,
Expand All @@ -479,15 +503,15 @@ def color_boxes(ax):
sgd_mean_perf_df.head()


# In[21]:
# In[23]:


ll_top_lasso_param = ll_mean_perf_df.iloc[0, :].lasso_param
sgd_top_lasso_param = sgd_mean_perf_df.iloc[0, :].lasso_param
print(ll_top_lasso_param, sgd_top_lasso_param)


# In[22]:
# In[24]:


# get coefficient info for liblinear
Expand Down Expand Up @@ -515,7 +539,7 @@ def color_boxes(ax):
ll_coefs_df.sort_values(by='abs+1', ascending=False).head(10)


# In[23]:
# In[25]:


# get coefficient info for sgd
Expand Down Expand Up @@ -544,7 +568,7 @@ def color_boxes(ax):
sgd_coefs_df.sort_values(by='abs+1', ascending=False).head(10)


# In[24]:
# In[26]:


sns.set({'figure.figsize': (8, 3)})
Expand All @@ -559,7 +583,7 @@ def color_boxes(ax):
plt.title(f'Log-log coefficient magnitude distribution, {plot_gene}', y=1.03)


# In[25]:
# In[27]:


sns.set({'figure.figsize': (10, 4)})
Expand All @@ -575,7 +599,7 @@ def color_boxes(ax):
#
# We want to separate the log-likelihood loss (data loss) from the weight penalty (regularization term) in the logistic regression loss function, to see if that breakdown is any different between optimizers.

# In[26]:
# In[28]:


# get loss function values from file
Expand All @@ -598,7 +622,7 @@ def get_loss_values(results_dir, optimizer):
return loss_df.reset_index(drop=True)


# In[27]:
# In[29]:


ll_loss_df = get_loss_values(ll_results_dir, 'liblinear')
Expand Down Expand Up @@ -637,7 +661,7 @@ def get_loss_values(results_dir, optimizer):
loss_df.head()


# In[28]:
# In[30]:


sns.set_style('ticks')
Expand All @@ -647,8 +671,10 @@ def get_loss_values(results_dir, optimizer):
data=loss_df,
x='lasso_param', y='loss_value', hue='loss_component',
hue_order=['log_loss', 'l1_penalty', 'total_loss'],
style='loss_component',
style_order=['log_loss', 'l1_penalty', 'total_loss'],
marker='o', kind='line', col='optimizer',
col_wrap=2, height=5, aspect=1.6,
col_wrap=2, height=5, aspect=1.6, markersize=8.5, linewidth=3.5,
facet_kws={'sharex': False}
)
g.set(xscale='log', yscale='log')
Expand All @@ -662,6 +688,8 @@ def get_loss_values(results_dir, optimizer):
g.set_titles('Optimizer: {col_name}')
sns.move_legend(g, "center", bbox_to_anchor=[1.05, 0.5], frameon=True)
g._legend.set_title('Loss component')
for legobj in g._legend.legendHandles:
legobj.set_linewidth(3.5)

plt.suptitle(f'LASSO parameter vs. training loss, {plot_gene}', y=1.05)

Expand Down
4 changes: 2 additions & 2 deletions 01_stratified_classification/nbconverted/optimizer_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,11 @@


supp_f1 = Figure(
"675", "555",
"675", "340",
etree.Element("rect", {"width": "100%", "height": "100%", "fill": "white"}),
SVG(
os.path.join(cfg.repo_root, '01_stratified_classification', 'optimizers_plots', 'KRAS_constant_search_coefs_count.svg')
).scale(0.75).move(20, 10),
).scale(0.85).move(20, 10),
)

display(supp_f1)
Expand Down
Loading

0 comments on commit ca5aabb

Please sign in to comment.