From 048a39fb4055e2b958e8b95c4ceadeaab7ad869b Mon Sep 17 00:00:00 2001
From: OverLordGoldDragon <OverLordGoldDragon@yahoo.com>
Date: Sun, 2 Feb 2020 00:50:01 +0400
Subject: [PATCH] Massive performance improvements

Vectorized for-loops in `synsq_squeeze`, used list-append instead of array-concat for `buffer`

Also changed import logic
---
 README.md                        |  1 +
 ssqueezepy/stft_transforms.py    |  4 +-
 ssqueezepy/synsq_cwt.py          |  6 +--
 ssqueezepy/synsq_stft.py         |  6 +--
 ssqueezepy/utils.py              | 42 +++++++++----------
 ssqueezepy/wavelet_transforms.py | 72 ++++++++++++++++++++++++++------
 6 files changed, 87 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index 0279e66e..e903aa1b 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,7 @@ There are more unlisted (see original repo), but not all will be implemented, in
  - **Moved functions**; each no longer has its own file, but is grouped with other relevant functions
  - **Code style**; grouped parts of code as sub-functions for improved readability; indentation for vertical alignment; other
  - **Performance**; this repo may work faster or slower, as Numpy arrays are faster than C arrays, but some of original funcs use MEX-optimized code with no Numpy equivalent. Also using dense instead of sparse matrices (see below).
+ - **Performance**; this repo _will_ work **10x+ faster** for some of the methods which were vectorized out of for-loops
  
  **Other**:
   - Dense instead of sparse matrices for `stft_fwd` in [stft_transforms.py](https://github.com/OverLordGoldDragon/ssqueezepy/blob/master/synchrosqueezing/stft_transforms.py), as Numpy doesn't handle latter in ops involved
diff --git a/ssqueezepy/stft_transforms.py b/ssqueezepy/stft_transforms.py
index 400b6847..f127b01b 100644
--- a/ssqueezepy/stft_transforms.py
+++ b/ssqueezepy/stft_transforms.py
@@ -4,8 +4,8 @@
 #    (https://github.com/ebrevdo/synchrosqueezing/)
 
 import numpy as np
+from .utils import wfiltfn, padsignal, buffer
 from quadpy import quad as quadgk
-from utils import wfiltfn, padsignal, buffer
 
 PI = np.pi
 EPS = np.finfo(np.float64).eps  # machine epsilon for float64
@@ -85,7 +85,7 @@ def _process_opts(opts, x):
         Sfs = np.linspace(0, 1, opts['winlen'] + 1)
         Sfs = Sfs[:np.floor(opts['winlen'] / 2).astype('int64') + 1] / dt
         
-        # compute STFt and keep only the positive frequencies
+        # compute STFT and keep only the positive frequencies
         xbuf = buffer(x, opts['winlen'], opts['winlen'] - 1, 'nodelay')
         xbuf = np.diag(window) @ xbuf
         Sx = np.fft.fft(xbuf, None, axis=0)
diff --git a/ssqueezepy/synsq_cwt.py b/ssqueezepy/synsq_cwt.py
index e54337e2..b2dd9b6a 100644
--- a/ssqueezepy/synsq_cwt.py
+++ b/ssqueezepy/synsq_cwt.py
@@ -4,9 +4,9 @@
 #    (https://github.com/ebrevdo/synchrosqueezing/)
 
 import numpy as np
-from utils import est_riskshrink_thresh, p2up, synsq_adm
-from wavelet_transforms import phase_cwt, phase_cwt_num
-from wavelet_transforms import cwt_fwd, synsq_squeeze
+from .utils import est_riskshrink_thresh, p2up, synsq_adm
+from .wavelet_transforms import phase_cwt, phase_cwt_num
+from .wavelet_transforms import cwt_fwd, synsq_squeeze
 
 
 def synsq_cwt_fwd(x, t=None, fs=None, nv=32, opts=None):
diff --git a/ssqueezepy/synsq_stft.py b/ssqueezepy/synsq_stft.py
index e2e2f1ff..0e670ea7 100644
--- a/ssqueezepy/synsq_stft.py
+++ b/ssqueezepy/synsq_stft.py
@@ -4,9 +4,9 @@
 #    (https://github.com/ebrevdo/synchrosqueezing/)
 
 import numpy as np
-from utils import wfiltfn
-from stft_transforms import stft_fwd, phase_stft
-from wavelet_transforms import synsq_squeeze
+from .utils import wfiltfn
+from .stft_transforms import stft_fwd, phase_stft
+from .wavelet_transforms import synsq_squeeze
 from quadpy import quad as quadgk
 
 PI = np.pi
diff --git a/ssqueezepy/utils.py b/ssqueezepy/utils.py
index 9b480f8c..2d8ba83f 100644
--- a/ssqueezepy/utils.py
+++ b/ssqueezepy/utils.py
@@ -248,39 +248,35 @@ def buffer(x, n, p=0, opt=None):
         ryanjdillon: https://stackoverflow.com/questions/38453249/
         is-there-a-matlabs-buffer-equivalent-in-numpy#answer-40105995
     """
-    import numpy as np
-
     if opt not in ('nodelay', None):
         raise ValueError('{} not implemented'.format(opt))
 
     i = 0
-    first_iter = True
-    while i < len(x):
-        if first_iter:
-            if opt == 'nodelay':
-                # No zeros at array start
-                result = x[:n]
-                i = n
-            else:
-                # Start with `p` zeros
-                result = np.hstack([np.zeros(p), x[:n-p]])
-                i = n-p
-            # Make 2D array and pivot
-            result = np.expand_dims(result, axis=0).T
-            first_iter = False
-            continue
+    if opt == 'nodelay':
+        # No zeros at array start
+        result = x[:n]
+        i = n
+    else:
+        # Start with `p` zeros
+        result = np.hstack([np.zeros(p), x[:n-p]])
+        i = n-p
+        
+    # Make 2D array
+    result = np.expand_dims(result, axis=0)
+    result = list(result)
 
+    while i < len(x):
         # Create next column, add `p` results from last col if given
         col = x[i:i+(n-p)]
         if p != 0:
-            col = np.hstack([result[:,-1][-p:], col])
-        i += n-p
+            col = np.hstack([result[-1][-p:], col])
 
         # Append zeros if last row and not length `n`
-        if len(col) < n:
-            col = np.hstack([col, np.zeros(n-len(col))])
+        if len(col):
+            col = np.hstack([col, np.zeros(n - len(col))])
 
         # Combine result with next row
-        result = np.hstack([result, np.expand_dims(col, axis=0).T])
+        result.append(np.array(col))
+        i += (n - p)
 
-    return result
+    return np.vstack(result).T
diff --git a/ssqueezepy/wavelet_transforms.py b/ssqueezepy/wavelet_transforms.py
index 9d50e62d..851899f2 100644
--- a/ssqueezepy/wavelet_transforms.py
+++ b/ssqueezepy/wavelet_transforms.py
@@ -4,7 +4,7 @@
 #    (https://github.com/ebrevdo/synchrosqueezing/)
 
 import numpy as np
-from utils import padsignal, wfiltfn
+from .utils import padsignal, wfiltfn
 
 EPS = np.finfo(np.float64).eps  # machine epsilon for float64
 PI = np.pi
@@ -56,6 +56,63 @@ def synsq_squeeze(Wx, w, t, nv=None, opts={}):
         SIAM Journal on Mathematical Analysis, 43(5):2078-2095, 2011.
     """
     def _squeeze(w, Wx, Wx_scales, fs, dfs, scaleterm, na, N, lfm, lfM, opts):
+        def _vectorized(scaleterm):  # possible to optimize further
+            if len(scaleterm.shape) == 1:
+                scaleterm = np.expand_dims(scaleterm, 1)
+            if v1:
+                _k = np.round(w * dfs)
+                _k[np.where(np.isnan(_k))] = len(fs)
+                _k_ones = np.ones(_k.size)
+                k = np.min((np.max((_k.flatten(), 1*_k_ones), axis=0),
+                            len(fs)*_k_ones), axis=0).reshape(*_k.shape)
+            elif v2:  # TESTED
+                _k = 1 + np.round(na / (lfM - lfm) * (np.log2(w) - lfm))
+                _k[np.where(np.isnan(_k))] = len(fs)
+                _k_ones = np.ones(_k.size)
+                k = np.min((np.max((_k.flatten(), 1*_k_ones), axis=0),
+                            na*_k_ones), axis=0).reshape(*_k.shape)
+            elif v3:
+                w_rep = np.matlib.repmat(w[None].flatten(), len(fs), 1).reshape(
+                    len(fs), *w.shape)
+                _k = w_rep - fs.reshape(len(fs), 1, 1)
+                _k[np.where(np.isnan(_k))] = len(fs)
+                k = np.min(np.abs(_k), axis=0)   
+                
+            k = np.floor(k).astype('int32') - 1  # MAT to py idx
+            Ws_prod = Wx * scaleterm
+
+            for b in range(N):
+                for ai in range(len(Wx_scales)):
+                    Tx[k[ai, b], b] += Ws_prod[ai, b]
+            return Tx
+        
+        def _for_loop():  # much slower; tested 11x slowdown
+            if v1:
+                for b in range(N):
+                  for ai in range(len(Wx_scales)):
+                    _k = np.round(w[ai, b] * dfs)
+                    k = min(max(_k, 1), len(fs)) if not np.isnan(_k) else len(fs)
+    
+                    k = int(k - 1)  # MAT to py idx
+                    Tx[k, b] += Wx[ai, b] * scaleterm[ai]
+            elif v2:
+                for b in range(N):
+                  for ai in range(len(Wx_scales)):
+                    _k = 1 + np.round(na / (lfM - lfm) * (
+                        np.log2(w[ai, b]) - lfm))
+                    k = min(max(_k, 1), na) if not np.isnan(_k) else len(fs)
+    
+                    k = int(k - 1)  # MAT to py idx
+                    Tx[k, b] += Wx[ai, b] * scaleterm[ai]
+            elif v3:
+                for b in range(N):
+                  for ai in range(len(Wx_scales)):
+                    k = np.min(np.abs(w[ai, b] - fs))
+                
+                    k = int(k - 1)  # MAT to py idx
+                    Tx[k, b] += Wx[ai, b] * scaleterm[ai]   
+            return Tx
+
         # must cast to complex else value assignment discards imaginary component
         Tx = np.zeros((len(fs), Wx.shape[1])).astype('complex128')
 
@@ -66,18 +123,7 @@ def _squeeze(w, Wx, Wx_scales, fs, dfs, scaleterm, na, N, lfm, lfM, opts):
         v2 = (opts['findbins'] == 'direct') and (opts['freqscale'] == 'log')
         v3 = (opts['findbins'] == 'min')
 
-        for b in range(N):
-          for ai in range(len(Wx_scales)):
-            if v1:
-                _k = np.round(w[ai, b] * dfs)
-                k = min(max(_k, 1), len(fs)) if not np.isnan(_k) else len(fs)      
-            elif v2:
-                _k = 1 + np.round(na / (lfM - lfm) * (np.log2(w[ai, b]) - lfm))
-                k = min(max(_k, 1), na) if not np.isnan(_k) else len(fs)
-            elif v3:
-                k = np.min(np.abs(w[ai, b] - fs))
-            k = int(k - 1)  # MAT to py idx
-            Tx[k, b] += Wx[ai, b] * scaleterm[ai]
+        Tx = _vectorized(scaleterm)
 
         if opts['transform'] == 'CWT':
             Tx *= (1 / nv)