From e4ce454fab85126c2babffbaacddb1318b692094 Mon Sep 17 00:00:00 2001
From: DoronHav
Date: Fri, 26 Apr 2024 14:36:13 -0400
Subject: [PATCH] Format with black and ruff wassersteinwormhole package

---
 wassersteinwormhole/DefaultConfig.py        |   2 +-
 wassersteinwormhole/Wormhole.py             | 615 +++++++++++-------
 wassersteinwormhole/__init__.py             |   2 +-
 wassersteinwormhole/_utils_Transformer.py   |  33 +-
 .../_utils_WeightedAttention.py             |  31 +-
 wassersteinwormhole/utils_OT.py             |  14 +-
 6 files changed, 409 insertions(+), 288 deletions(-)

diff --git a/wassersteinwormhole/DefaultConfig.py b/wassersteinwormhole/DefaultConfig.py
index 1a9a3fe..519cd32 100644
--- a/wassersteinwormhole/DefaultConfig.py
+++ b/wassersteinwormhole/DefaultConfig.py
@@ -2,7 +2,7 @@
 from flax import linen as nn
 import jax.numpy as jnp
 
-from typing import Callable, Any, Optional
+from typing import Callable, Any
 
 @struct.dataclass
 class DefaultConfig:
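For readers skimming the patch, DefaultConfig is a flax struct.dataclass whose fields (eps_enc, coeff_dec, scale, factor, and the rest) are read by Wormhole.__init__ in the next file. A minimal sketch of overriding a couple of those fields, not part of the patch itself; the keyword names are taken from the attribute reads visible below, and the values are illustrative only:

    # Illustrative only: override two DefaultConfig fields used by Wormhole.
    from wassersteinwormhole.DefaultConfig import DefaultConfig

    config = DefaultConfig(eps_enc=0.1, coeff_dec=1.0)  # hypothetical values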
diff --git a/wassersteinwormhole/Wormhole.py b/wassersteinwormhole/Wormhole.py
index b94b8a6..54c6a82 100644
--- a/wassersteinwormhole/Wormhole.py
+++ b/wassersteinwormhole/Wormhole.py
@@ -1,102 +1,136 @@
-
-import optax
-from flax import linen as nn
-from flax import struct
-from flax.training import train_state
+from functools import partial
 
 import jax
 import jax.numpy as jnp
 import jax.scipy as jsp
-from jax import random, grad, jit, vmap
-from functools import partial
-import scipy.stats
 import numpy as np
+import optax
+import scipy.stats
+from flax import linen as nn
+from jax import jit, random
 from tqdm import trange
 
-from wassersteinwormhole._utils_Transformer import *
 import wassersteinwormhole.utils_OT as utils_OT
-
+from wassersteinwormhole._utils_Transformer import Metrics, TrainState, Transformer
 from wassersteinwormhole.DefaultConfig import DefaultConfig
-
-
+
+
 def MaxMinScale(arr):
-
     """
     :meta private:
     """
-
-    min_arr = arr.min(axis = 0)
-    max_arr = arr.max(axis = 0)
-
-    arr = 2*(arr - arr.min(axis = 0, keepdims = True))/(arr.max(axis = 0, keepdims = True) - arr.min(axis = 0, keepdims = True))-1
-    return(arr)
-
-def pad_pointclouds(point_clouds, weights, max_shape = -1):
+
+    arr = (
+        2
+        * (arr - arr.min(axis=0, keepdims=True))
+        / (arr.max(axis=0, keepdims=True) - arr.min(axis=0, keepdims=True))
+        - 1
+    )
+    return arr
+
+
+def pad_pointclouds(point_clouds, weights, max_shape=-1):
     """
     :meta private:
     """
-
-    if(max_shape == -1):
-        max_shape = np.max([pc.shape[0] for pc in point_clouds])+1
+
+    if max_shape == -1:
+        max_shape = np.max([pc.shape[0] for pc in point_clouds]) + 1
     else:
         max_shape = max_shape + 1
-    weights_pad = np.asarray([np.concatenate((weight, np.zeros(max_shape - pc.shape[0])), axis = 0) for pc, weight in zip(point_clouds, weights)])
-    point_clouds_pad = np.asarray([np.concatenate([pc, np.zeros([max_shape - pc.shape[0], pc.shape[-1]])], axis = 0) for pc in point_clouds])
-
-    weights_pad = weights_pad/weights_pad.sum(axis = 1, keepdims = True)
+    weights_pad = np.asarray(
+        [
+            np.concatenate((weight, np.zeros(max_shape - pc.shape[0])), axis=0)
+            for pc, weight in zip(point_clouds, weights)
+        ]
+    )
+    point_clouds_pad = np.asarray(
+        [
+            np.concatenate(
+                [pc, np.zeros([max_shape - pc.shape[0], pc.shape[-1]])], axis=0
+            )
+            for pc in point_clouds
+        ]
+    )
+
+    weights_pad = weights_pad / weights_pad.sum(axis=1, keepdims=True)
+
+    return (
+        point_clouds_pad[:, :-1].astype("float32"),
+        weights_pad[:, :-1].astype("float32"),
+    )
+
+
+class Wormhole:
+    """
+    Initializes Wormhole model and processes input point clouds
 
-    return(point_clouds_pad[:, :-1].astype('float32'), weights_pad[:, :-1].astype('float32'))
-
-class Wormhole():
-
-    """
-    Initializes Wormhole model and processes input point clouds
-
-
     :param point_clouds: (list of np.array) list of train-set point clouds to train Wormhole on
     :param weights: (list of np.array) list of per point weight for each train-set point cloud (default None, indicating uniform weights)
     :param point_clouds_test: (list of np.array) list of test-set point clouds (default None)
     :param weights_test: (list of np.array) list of per point weight for each test-set point cloud (default None, indicating uniform weights)
-    :param config: (flax struct.dataclass) object with parameters for Wormhole such as OT metric choice, emedding dimention, etc. See docs for 'DefaultConfig.py' and tutorial details.
-
+    :param config: (flax struct.dataclass) object with parameters for Wormhole such as OT metric choice, embedding dimension, etc. See docs for 'DefaultConfig.py' and tutorial details.
+
     :return: initialized Wormhole model
-    """
-
-    def __init__(self, point_clouds, weights = None, point_clouds_test = None, weights_test = None, config = DefaultConfig):
-
+    """
+
+    def __init__(
+        self,
+        point_clouds,
+        weights=None,
+        point_clouds_test=None,
+        weights_test=None,
+        config=DefaultConfig,
+    ):
+
         self.config = config
         self.point_clouds = point_clouds
-
-        if(weights is None):
-            self.weights = [np.ones(pc.shape[0])/pc.shape[0] for pc in self.point_clouds]
+
+        if weights is None:
+            self.weights = [
+                np.ones(pc.shape[0]) / pc.shape[0] for pc in self.point_clouds
+            ]
         else:
             self.weights = weights
-
-        if(point_clouds_test is None):
-            self.point_clouds, self.weights = pad_pointclouds(self.point_clouds, self.weights)
+
+        if point_clouds_test is None:
+            self.point_clouds, self.weights = pad_pointclouds(
+                self.point_clouds, self.weights
+            )
         else:
             self.point_clouds_test = point_clouds_test
-
-            if(weights_test is None):
-                self.weights_test = [np.ones(pc.shape[0])/pc.shape[0] for pc in self.point_clouds_test]
+
+            if weights_test is None:
+                self.weights_test = [
+                    np.ones(pc.shape[0]) / pc.shape[0] for pc in self.point_clouds_test
+                ]
             else:
                 self.weights_test = weights_test
-
-
-            total_point_clouds, total_weights = pad_pointclouds(list(self.point_clouds) + list(self.point_clouds_test), list(self.weights) + list(self.weights_test))
-            self.point_clouds, self.weights = total_point_clouds[:len(list(self.point_clouds))], total_weights[:len(list(self.point_clouds))]
-            self.point_clouds_test, self.weights_test = total_point_clouds[len(list(self.point_clouds)):], total_weights[len(list(self.point_clouds)):]
-
-        self.scale_weights = np.exp(-jsp.special.xlogy(self.weights, self.weights).sum(axis = 1).mean())
-        self.out_seq_len = int(jnp.exp(-jsp.special.xlogy(self.weights, self.weights).sum(axis = 1).mean()))
+            total_point_clouds, total_weights = pad_pointclouds(
+                list(self.point_clouds) + list(self.point_clouds_test),
+                list(self.weights) + list(self.weights_test),
+            )
+            self.point_clouds, self.weights = (
+                total_point_clouds[: len(list(self.point_clouds))],
+                total_weights[: len(list(self.point_clouds))],
+            )
+            self.point_clouds_test, self.weights_test = (
+                total_point_clouds[len(list(self.point_clouds)) :],
+                total_weights[len(list(self.point_clouds)) :],
+            )
+
+        self.scale_weights = np.exp(
+            -jsp.special.xlogy(self.weights, self.weights).sum(axis=1)
+        ).mean()
+        self.out_seq_len = int(
+            jnp.exp(-jsp.special.xlogy(self.weights, self.weights).sum(axis=1)).mean()
+        )
         self.inp_dim = self.point_clouds.shape[-1]
-
-
-
 
         self.eps_enc = config.eps_enc
         self.eps_dec = config.eps_dec
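The constructor in the hunk above takes raw, variably-sized point clouds, fills in uniform per-point weights when none are given, and pads everything to a common length via pad_pointclouds. A usage sketch with synthetic data, assuming Wormhole is importable from the package root (its __init__.py is also touched by this patch); shapes and counts are illustrative:

    import numpy as np
    from wassersteinwormhole import Wormhole  # assumed export path

    rng = np.random.default_rng(0)
    point_clouds = [rng.normal(size=(rng.integers(50, 100), 2)) for _ in range(32)]
    model = Wormhole(point_clouds)  # weights default to uniform per point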
@@ -104,80 +138,109 @@ def __init__(self, point_clouds, weights = None, point_clouds_test = None, weigh
         self.lse_dec = config.lse_dec
         self.coeff_dec = config.coeff_dec
-
+
         self.dist_func_enc = config.dist_func_enc
         self.dist_func_dec = config.dist_func_dec
-
-        self.jit_dist_enc = jax.jit(jax.vmap(getattr(utils_OT, self.dist_func_enc), (0, 0, None, None), 0), static_argnums=[2,3])
-        self.jit_dist_dec = jax.jit(jax.vmap(getattr(utils_OT, self.dist_func_dec), (0, 0, None, None), 0), static_argnums=[2,3])
-
-        if(self.coeff_dec < 0):
-            self.jit_dist_dec = jax.jit(jax.vmap(utils_OT.Zeros, (0, 0, None, None), 0), static_argnums=[2,3])
-            self.coeff_dec = 0.0
+
+        self.jit_dist_enc = jax.jit(
+            jax.vmap(getattr(utils_OT, self.dist_func_enc), (0, 0, None, None), 0),
+            static_argnums=[2, 3],
+        )
+        self.jit_dist_dec = jax.jit(
+            jax.vmap(getattr(utils_OT, self.dist_func_dec), (0, 0, None, None), 0),
+            static_argnums=[2, 3],
+        )
+
+        if self.coeff_dec < 0:
+            self.jit_dist_dec = jax.jit(
+                jax.vmap(utils_OT.Zeros, (0, 0, None, None), 0), static_argnums=[2, 3]
+            )
+            self.coeff_dec = 0.0
 
         self.scale = config.scale
         self.factor = config.factor
 
         self.point_clouds = self.scale_func(self.point_clouds) * self.factor
-        if(point_clouds_test is not None):
-            self.point_clouds_test = self.scale_func(self.point_clouds_test)*self.factor
-
-
-        self.pc_max_val = np.max(self.point_clouds[self.weights > 0]) #* (1 + 1 * np.isin(self.dist_func_dec, ['GS', 'GW']))
-        self.pc_min_val = np.min(self.point_clouds[self.weights > 0]) #* (1 + 1 * np.isin(self.dist_func_dec, ['GS', 'GW']))
-        self.scale_out = True #not np.isin(self.dist_func_dec, ['GS', 'GW'])
-
-        self.model = Transformer(self.config, out_seq_len = self.out_seq_len, inp_dim = self.inp_dim,
-                                 scale_weights = self.scale_weights, scale_out = self.scale_out, min_val = self.pc_min_val, max_val = self.pc_max_val)
-
+        if point_clouds_test is not None:
+            self.point_clouds_test = (
+                self.scale_func(self.point_clouds_test) * self.factor
+            )
+
+        self.pc_max_val = np.max(
+            self.point_clouds[self.weights > 0]
+        )  # * (1 + 1 * np.isin(self.dist_func_dec, ['GS', 'GW']))
+        self.pc_min_val = np.min(
+            self.point_clouds[self.weights > 0]
+        )  # * (1 + 1 * np.isin(self.dist_func_dec, ['GS', 'GW']))
+        self.scale_out = True  # not np.isin(self.dist_func_dec, ['GS', 'GW'])
+
+        self.model = Transformer(
+            self.config,
+            out_seq_len=self.out_seq_len,
+            inp_dim=self.inp_dim,
+            scale_weights=self.scale_weights,
+            scale_out=self.scale_out,
+            min_val=self.pc_min_val,
+            max_val=self.pc_max_val,
+        )
 
     def scale_func(self, point_clouds):
-
         """
         :meta private:
         """
-
-        if(self.scale == 'max_dist_total'):
-            if(not hasattr(self, 'max_scale_num')):
+
+        if self.scale == "max_dist_total":
+            if not hasattr(self, "max_scale_num"):
                 max_dist = 0
                 for _ in range(10):
-                    i,j = np.random.choice(np.arange(len(self.point_clouds)), 2,replace = False)
-                    if(self.dist_func_enc == 'GW' or self.dist_func_enc == 'GS'):
-                        max_ij = np.max(scipy.spatial.distance.cdist(self.point_clouds[i], self.point_clouds[i]))
+                    i, j = np.random.choice(
+                        np.arange(len(self.point_clouds)), 2, replace=False
+                    )
+                    if self.dist_func_enc == "GW" or self.dist_func_enc == "GS":
+                        max_ij = np.max(
+                            scipy.spatial.distance.cdist(
+                                self.point_clouds[i], self.point_clouds[i]
+                            )
+                        )
                     else:
-                        max_ij = np.max(scipy.spatial.distance.cdist(self.point_clouds[i], self.point_clouds[j]))
+                        max_ij = np.max(
+                            scipy.spatial.distance.cdist(
+                                self.point_clouds[i], self.point_clouds[j]
+                            )
+                        )
                     max_dist = np.maximum(max_ij, max_dist)
                 self.max_scale_num = max_dist
             else:
-                print("Using Calculated Max Dist Scaling Values")
-            return(point_clouds/self.max_scale_num)
-        if(self.scale == 'max_dist_each'):
-            print("Using Per Sample Max Dist")
-            pc_scale = np.asarray([pc/np.max(scipy.spatial.distance.pdist(pc)) for pc in point_clouds])
-            return(pc_scale)
-        if(self.scale == 'min_max_each'):
-            print("Scaling Per Sample")
-            max_val = point_clouds.max(axis = 1, keepdims = True)
-            min_val = point_clouds.min(axis = 1, keepdims = True)
-            return(2 * (point_clouds - min_val)/(max_val - min_val) - 1)
-        elif(self.scale == 'min_max_total'):
-            if(not hasattr(self, 'max_val')):
-                self.max_val = self.point_clouds.max(axis = ((0,1)), keepdims = True)
-                self.min_val = self.point_clouds.min(axis = ((0,1)), keepdims = True)
+                print("Using Calculated Max Dist Scaling Values")
+            return point_clouds / self.max_scale_num
+        if self.scale == "max_dist_each":
+            print("Using Per Sample Max Dist")
+            pc_scale = np.asarray(
+                [pc / np.max(scipy.spatial.distance.pdist(pc)) for pc in point_clouds]
+            )
+            return pc_scale
+        if self.scale == "min_max_each":
+            print("Scaling Per Sample")
+            max_val = point_clouds.max(axis=1, keepdims=True)
+            min_val = point_clouds.min(axis=1, keepdims=True)
+            return 2 * (point_clouds - min_val) / (max_val - min_val) - 1
+        elif self.scale == "min_max_total":
+            if not hasattr(self, "max_val"):
+                self.max_val = self.point_clouds.max(axis=((0, 1)), keepdims=True)
+                self.min_val = self.point_clouds.min(axis=((0, 1)), keepdims=True)
             else:
-                print("Using Calculated Min Max Scaling Values")
-            return(2 * (point_clouds - self.min_val)/(self.max_val - self.min_val) - 1)
-        elif(self.scale == 'min_max_total_all_axis'):
-            if(not hasattr(self, 'max_val')):
-                self.max_val = self.point_clouds.max(keepdims = True)
-                self.min_val = self.point_clouds.min(keepdims = True)
+                print("Using Calculated Min Max Scaling Values")
+            return 2 * (point_clouds - self.min_val) / (self.max_val - self.min_val) - 1
+        elif self.scale == "min_max_total_all_axis":
+            if not hasattr(self, "max_val"):
+                self.max_val = self.point_clouds.max(keepdims=True)
+                self.min_val = self.point_clouds.min(keepdims=True)
             else:
-                print("Using Calculated Min Max Scaling Values")
-            return(2 * (point_clouds - self.min_val)/(self.max_val - self.min_val) - 1)
+                print("Using Calculated Min Max Scaling Values")
+            return 2 * (point_clouds - self.min_val) / (self.max_val - self.min_val) - 1
         else:
-            return(point_clouds)
-
-    def encode(self, pc, weights, max_batch = 256):
-
+            return point_clouds
+
+    def encode(self, pc, weights, max_batch=256):
         """
         Encode point clouds with trained Wormhole model
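The 'min_max_total' branch of scale_func above maps every coordinate into [-1, 1] using one global minimum and maximum shared by all clouds. The same transform in isolation, as a standalone numpy sketch (not part of the patch; shapes are illustrative):

    import numpy as np

    pcs = np.random.rand(16, 40, 2)  # (clouds, points, dims)
    min_val = pcs.min(axis=(0, 1), keepdims=True)
    max_val = pcs.max(axis=(0, 1), keepdims=True)
    scaled = 2 * (pcs - min_val) / (max_val - min_val) - 1  # now in [-1, 1]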
@@ -187,21 +250,29 @@ def encode(self, pc, weights, max_batch = 256):
         :param max_batch: (int) maximum size of batch during inference calls to Wormhole (default 256)
 
         :return enc: per point cloud embeddings
-        """
-
-        if(pc.shape[0] < max_batch):
-            enc = self.model.bind({'params': self.params}).Encoder(pc, weights, deterministic = True)
-        else: # For when the GPU can't pass all point-clouds at once
-            num_split = int(pc.shape[0]/max_batch)+1
+        """
+
+        if pc.shape[0] < max_batch:
+            enc = self.model.bind({"params": self.params}).Encoder(
+                pc, weights, deterministic=True
+            )
+        else:  # For when the GPU can't pass all point-clouds at once
+            num_split = int(pc.shape[0] / max_batch) + 1
             pc_split = np.array_split(pc, num_split)
             mask_split = np.array_split(weights, num_split)
-
-            enc = np.concatenate([self.model.bind({'params': self.params}).Encoder(pc_split[split_ind], mask_split[split_ind], deterministic = True) for
-                                  split_ind in range(num_split)], axis = 0)
+
+            enc = np.concatenate(
+                [
+                    self.model.bind({"params": self.params}).Encoder(
+                        pc_split[split_ind], mask_split[split_ind], deterministic=True
+                    )
+                    for split_ind in range(num_split)
+                ],
+                axis=0,
+            )
         return enc
-
-    def decode(self, enc, max_batch = 256):
-
+
+    def decode(self, enc, max_batch=256):
         """
         Decode embedding back into point clouds using Wormhole decoder
 
@@ -210,111 +281,169 @@ def decode(self, enc, max_batch = 256):
         :param max_batch: (int) maximum size of batch during inference calls to Wormhole (default 256)
 
         :return dec: decoded point clouds from embeddings
-        """
-
-        if(enc.shape[0]
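The excerpt ends here, mid-way through decode. For orientation, encode and decode are the inference entry points reformatted above; both chunk their inputs into batches of at most max_batch to keep GPU memory bounded. A round-trip sketch, assuming a Wormhole instance whose parameters have already been fit (the training loop sits later in the file, beyond this excerpt):

    # Assumes `model` is a trained Wormhole (model.params populated).
    embeddings = model.encode(model.point_clouds, model.weights, max_batch=256)
    reconstructions = model.decode(embeddings, max_batch=256)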