From 998100804131baaec3adc567639e9023df541b55 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 8 Nov 2023 11:07:44 +0100 Subject: [PATCH 01/16] Add denoising to example --- examples/example_experimental_data.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/example_experimental_data.py b/examples/example_experimental_data.py index 89feb1b..9dc9585 100644 --- a/examples/example_experimental_data.py +++ b/examples/example_experimental_data.py @@ -9,9 +9,8 @@ The available denoising methods are "nordic", "mp-pca", "hybrid-pca", "opt-fro", "opt-nuc" and "opt-op". """ -from patch_denoise.simulation.phantom import mr_shepp_logan_t2_star, g_factor_map -from patch_denoise.simulation.activations import add_frames -from patch_denoise.simulation.noise import add_temporal_gaussian_noise +import nibabel as nib +from patch_denoise.space_time.lowrank import OptimalSVDDenoiser # %% # Setup the parameters for the simulation and noise @@ -20,3 +19,18 @@ N_FRAMES = 200 NOISE_LEVEL = 2 + +input_path = "/data/parietal/store2/data/ibc/3mm/sub-01/ses-00/func/wrdcsub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" +output_path = "/scratch/ymzayek/retreat_data/output.nii" + +img = nib.load(input_path) + +# data shape is (53, 63, 52, 262) with 3mm resolution +patch_shape = (11, 11, 11) +patch_overlap = (5) + +# initialize denoiser +optimal_llr = OptimalSVDDenoiser(patch_shape, patch_overlap) + +# denoise image +denoised = optimal_llr.denoise(img.get_fdata()) \ No newline at end of file From a16c2e616576cbfa60007762097b50b5f175472b Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 8 Nov 2023 15:35:36 +0100 Subject: [PATCH 02/16] Add cupy to svd --- examples/example_experimental_data.py | 7 ++- src/patch_denoise/space_time/lowrank.py | 67 ++++++++++++++++++------- src/patch_denoise/space_time/utils.py | 14 +++++- 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/examples/example_experimental_data.py b/examples/example_experimental_data.py index 9dc9585..3d2cf91 100644 --- a/examples/example_experimental_data.py +++ b/examples/example_experimental_data.py @@ -11,6 +11,7 @@ import nibabel as nib from patch_denoise.space_time.lowrank import OptimalSVDDenoiser +import timeit # %% # Setup the parameters for the simulation and noise @@ -27,10 +28,12 @@ # data shape is (53, 63, 52, 262) with 3mm resolution patch_shape = (11, 11, 11) -patch_overlap = (5) +patch_overlap = (10) # initialize denoiser optimal_llr = OptimalSVDDenoiser(patch_shape, patch_overlap) # denoise image -denoised = optimal_llr.denoise(img.get_fdata()) \ No newline at end of file +time_start = timeit.default_timer() +denoised = optimal_llr.denoise(img.get_fdata()) +print(timeit.default_timer() - time_start) \ No newline at end of file diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index 69d6b26..b6d0da6 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -4,6 +4,7 @@ import numpy as np from scipy.linalg import svd from scipy.optimize import minimize +import cupy from .base import BaseSpaceTimeDenoiser from .utils import ( @@ -373,26 +374,56 @@ def _patch_processing( shrink_func=None, mp_median=None, var_apriori=None, + engine="gpu", ): - u_vec, s_values, v_vec, p_tmean = svd_analysis(patch) - if var_apriori is not None: - sigma = np.mean(np.sqrt(var_apriori[patch_slice])) - else: - sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) - - scale_factor = np.sqrt(patch.shape[1]) * sigma 
- thresh_s_values = scale_factor * shrink_func( - s_values / scale_factor, - beta=patch.shape[1] / patch.shape[0], - ) - thresh_s_values[np.isnan(thresh_s_values)] = 0 + if engine == "cpu": + u_vec, s_values, v_vec, p_tmean = svd_analysis(patch) + if var_apriori is not None: + sigma = np.mean(np.sqrt(var_apriori[patch_slice])) + else: + sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) + + scale_factor = np.sqrt(patch.shape[1]) * sigma + thresh_s_values = scale_factor * shrink_func( + s_values / scale_factor, + beta=patch.shape[1] / patch.shape[0], + ) + thresh_s_values[np.isnan(thresh_s_values)] = 0 + + if np.any(thresh_s_values): + maxidx = np.max(np.nonzero(thresh_s_values)) + 1 + p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx) + else: + maxidx = 0 + p_new = np.zeros_like(patch) + p_tmean + + if engine == "gpu": + u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) + if var_apriori is not None: + sigma = cupy.mean(cupy.sqrt(var_apriori[patch_slice])) + else: + sigma = cupy.median(s_values) / cupy.sqrt( + patch.shape[1] * mp_median + ) - if np.any(thresh_s_values): - maxidx = np.max(np.nonzero(thresh_s_values)) + 1 - p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx) - else: - maxidx = 0 - p_new = np.zeros_like(patch) + p_tmean + scale_factor = cupy.sqrt(patch.shape[1]) * sigma + thresh_s_values = cupy.array(scale_factor * shrink_func( + s_values / scale_factor, + beta=patch.shape[1] / patch.shape[0], + )) + thresh_s_values[cupy.isnan(thresh_s_values)] = 0 + + if cupy.any(thresh_s_values): + maxidx = cupy.amax(cupy.array( + cupy.nonzero(thresh_s_values) + ) + 1) + print(maxidx) + p_new = svd_synthesis( + u_vec, thresh_s_values, v_vec, p_tmean, maxidx + ) + else: + maxidx = 0 + p_new = cupy.zeros_like(patch) + p_tmean return p_new, maxidx, np.NaN diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index 0ad13aa..1c5de40 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -2,9 +2,10 @@ import numpy as np from scipy.integrate import quad from scipy.linalg import eigh, svd +import cupy -def svd_analysis(input_data): +def svd_analysis(input_data, engine="cpu"): """Return the centered SVD decomposition. U, S, Vt and M are compute such that: @@ -22,7 +23,16 @@ def svd_analysis(input_data): mean = np.mean(input_data, axis=0) data_centered = input_data - mean # TODO benchmark svd vs svds and order of data. 
- u_vec, s_vals, v_vec = svd(data_centered, full_matrices=False) + if engine == "cpu": + u_vec, s_vals, v_vec = svd(data_centered, full_matrices=False) + elif engine == "gpu": + data_centered = cupy.array(data_centered) + u_vec, s_vals, v_vec = cupy.linalg.svd( + data_centered, full_matrices=False + ) +# u_vec = cupy.asnumpy(u_vec) +# s_vals = cupy.asnumpy(s_vals) +# v_vec = cupy.asnumpy(v_vec) return u_vec, s_vals, v_vec, mean From c425a4974d2048f689f913f6b4eeced9431d95bc Mon Sep 17 00:00:00 2001 From: ymzayek Date: Thu, 9 Nov 2023 15:44:27 +0100 Subject: [PATCH 03/16] try cp sliding_window_view --- examples/example_experimental_data.py | 2 +- src/patch_denoise/space_time/base.py | 6 ++ src/patch_denoise/space_time/lowrank.py | 96 ++++++++++++------------- src/patch_denoise/space_time/utils.py | 12 ++-- 4 files changed, 61 insertions(+), 55 deletions(-) diff --git a/examples/example_experimental_data.py b/examples/example_experimental_data.py index 3d2cf91..99182bd 100644 --- a/examples/example_experimental_data.py +++ b/examples/example_experimental_data.py @@ -28,7 +28,7 @@ # data shape is (53, 63, 52, 262) with 3mm resolution patch_shape = (11, 11, 11) -patch_overlap = (10) +patch_overlap = (5) # initialize denoiser optimal_llr = OptimalSVDDenoiser(patch_shape, patch_overlap) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 3b0a660..0d2b7a8 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -3,6 +3,7 @@ import logging import numpy as np from tqdm.auto import tqdm +import cupy as cp from .._docs import fill_doc @@ -87,6 +88,11 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): progbar = tqdm(total=len(patch_locs)) elif progbar is not False: progbar.reset(total=len(patch_locs)) + print(input_data.shape) + step = patch_shape[0] - patch_overlap[0] + patches = cp.lib.stride_tricks.sliding_window_view(input_data, patch_shape, axis=(0, 1, 2))[::step, :] + print(patches.shape) + exit(0) for patch_tl in patch_locs: patch_slice = tuple( diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index b6d0da6..be8cded 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -4,7 +4,7 @@ import numpy as np from scipy.linalg import svd from scipy.optimize import minimize -import cupy +import cupy as cp from .base import BaseSpaceTimeDenoiser from .utils import ( @@ -376,54 +376,54 @@ def _patch_processing( var_apriori=None, engine="gpu", ): - if engine == "cpu": - u_vec, s_values, v_vec, p_tmean = svd_analysis(patch) - if var_apriori is not None: - sigma = np.mean(np.sqrt(var_apriori[patch_slice])) - else: - sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) - - scale_factor = np.sqrt(patch.shape[1]) * sigma - thresh_s_values = scale_factor * shrink_func( - s_values / scale_factor, - beta=patch.shape[1] / patch.shape[0], - ) - thresh_s_values[np.isnan(thresh_s_values)] = 0 - - if np.any(thresh_s_values): - maxidx = np.max(np.nonzero(thresh_s_values)) + 1 - p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx) - else: - maxidx = 0 - p_new = np.zeros_like(patch) + p_tmean - - if engine == "gpu": - u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) - if var_apriori is not None: - sigma = cupy.mean(cupy.sqrt(var_apriori[patch_slice])) - else: - sigma = cupy.median(s_values) / cupy.sqrt( - patch.shape[1] * mp_median - ) - scale_factor = 
cupy.sqrt(patch.shape[1]) * sigma - thresh_s_values = cupy.array(scale_factor * shrink_func( - s_values / scale_factor, - beta=patch.shape[1] / patch.shape[0], - )) - thresh_s_values[cupy.isnan(thresh_s_values)] = 0 - - if cupy.any(thresh_s_values): - maxidx = cupy.amax(cupy.array( - cupy.nonzero(thresh_s_values) - ) + 1) - print(maxidx) - p_new = svd_synthesis( - u_vec, thresh_s_values, v_vec, p_tmean, maxidx - ) - else: - maxidx = 0 - p_new = cupy.zeros_like(patch) + p_tmean + u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) + if var_apriori is not None: + sigma = np.mean(np.sqrt(var_apriori[patch_slice])) + else: + sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) + + scale_factor = np.sqrt(patch.shape[1]) * sigma + thresh_s_values = scale_factor * shrink_func( + s_values / scale_factor, + beta=patch.shape[1] / patch.shape[0], + ) + thresh_s_values[np.isnan(thresh_s_values)] = 0 + + if np.any(thresh_s_values): + maxidx = np.max(np.nonzero(thresh_s_values)) + 1 + p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx) + else: + maxidx = 0 + p_new = np.zeros_like(patch) + p_tmean + +# if engine == "gpu": +# u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) +# if var_apriori is not None: +# sigma = cp.mean(cp.sqrt(var_apriori[patch_slice])) +# else: +# sigma = cp.median(s_values) / cp.sqrt( +# patch.shape[1] * mp_median +# ) +# +# scale_factor = cp.sqrt(patch.shape[1]) * sigma +# thresh_s_values = cp.array(scale_factor * shrink_func( +# s_values / scale_factor, +# beta=patch.shape[1] / patch.shape[0], +# )) +# thresh_s_values[cp.isnan(thresh_s_values)] = 0 +# +# if cp.any(thresh_s_values): +# maxidx = cp.amax(cp.array( +# cp.nonzero(thresh_s_values) +# ) + 1) +# print(maxidx) +# p_new = svd_synthesis( +# u_vec, thresh_s_values, v_vec, p_tmean, maxidx +# ) +# else: +# maxidx = 0 +# p_new = cp.zeros_like(patch) + p_tmean return p_new, maxidx, np.NaN diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index 1c5de40..8696b98 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -2,7 +2,7 @@ import numpy as np from scipy.integrate import quad from scipy.linalg import eigh, svd -import cupy +import cupy as cp def svd_analysis(input_data, engine="cpu"): @@ -26,13 +26,13 @@ def svd_analysis(input_data, engine="cpu"): if engine == "cpu": u_vec, s_vals, v_vec = svd(data_centered, full_matrices=False) elif engine == "gpu": - data_centered = cupy.array(data_centered) - u_vec, s_vals, v_vec = cupy.linalg.svd( + data_centered = cp.array(data_centered) + u_vec, s_vals, v_vec = cp.linalg.svd( data_centered, full_matrices=False ) -# u_vec = cupy.asnumpy(u_vec) -# s_vals = cupy.asnumpy(s_vals) -# v_vec = cupy.asnumpy(v_vec) + u_vec = cp.asnumpy(u_vec) + s_vals = cp.asnumpy(s_vals) + v_vec = cp.asnumpy(v_vec) return u_vec, s_vals, v_vec, mean From 521c2c28d65c954d8a9db1b02484756dd9340f3b Mon Sep 17 00:00:00 2001 From: ymzayek Date: Thu, 9 Nov 2023 17:12:27 +0100 Subject: [PATCH 04/16] Fix stepping --- src/patch_denoise/space_time/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 0d2b7a8..dade998 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -90,9 +90,10 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): progbar.reset(total=len(patch_locs)) print(input_data.shape) step = 
patch_shape[0] - patch_overlap[0] - patches = cp.lib.stride_tricks.sliding_window_view(input_data, patch_shape, axis=(0, 1, 2))[::step, :] + patches = cp.lib.stride_tricks.sliding_window_view( + input_data, patch_shape, axis=(0, 1, 2) + )[::step, ::step, ::step] print(patches.shape) - exit(0) for patch_tl in patch_locs: patch_slice = tuple( From edd5b57c3c295696a74f89bde8d205d8578d0550 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 15 Nov 2023 12:22:24 +0100 Subject: [PATCH 05/16] Added padding --- examples/example_experimental_data.py | 2 +- src/patch_denoise/space_time/base.py | 25 ++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/examples/example_experimental_data.py b/examples/example_experimental_data.py index 99182bd..f9b9f0a 100644 --- a/examples/example_experimental_data.py +++ b/examples/example_experimental_data.py @@ -28,7 +28,7 @@ # data shape is (53, 63, 52, 262) with 3mm resolution patch_shape = (11, 11, 11) -patch_overlap = (5) +patch_overlap = (1) # initialize denoiser optimal_llr = OptimalSVDDenoiser(patch_shape, patch_overlap) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index dade998..3f24741 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -88,13 +88,32 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): progbar = tqdm(total=len(patch_locs)) elif progbar is not False: progbar.reset(total=len(patch_locs)) - print(input_data.shape) + + # Pad the data + + output_data = cp.asarray(output_data) + + input_data = cp.asarray(input_data) + + c, h, w, t_s = input_data.shape + kc, kh, kw = patch_shape # kernel size + sc, sh, sw = np.repeat( + patch_shape[0] - patch_overlap[0], len(patch_shape) + ) + needed_c = int((cp.ceil((c - kc) / sc + 1) - ((c - kc) / sc + 1)) * kc) + needed_h = int((cp.ceil((h - kh) / sh + 1) - ((h - kh) / sh + 1)) * kh) + needed_w = int((cp.ceil((w - kw) / sw + 1) - ((w - kw) / sw + 1)) * kw) + + input_data_padded = cp.pad( + input_data, ((0, needed_c), (0, needed_h), (0, needed_w), (0, 0) + ), mode='edge') + step = patch_shape[0] - patch_overlap[0] patches = cp.lib.stride_tricks.sliding_window_view( - input_data, patch_shape, axis=(0, 1, 2) + input_data_padded, patch_shape, axis=(0, 1, 2) )[::step, ::step, ::step] print(patches.shape) - + exit(0) for patch_tl in patch_locs: patch_slice = tuple( slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) From 1a1c8e30c9cd612119f722719551c8ef26f69c9c Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 15 Nov 2023 16:43:34 +0100 Subject: [PATCH 06/16] Prepare patches for cp svd --- src/patch_denoise/space_time/base.py | 48 ++++++++++++-------- src/patch_denoise/space_time/lowrank.py | 60 +++++++++++++------------ src/patch_denoise/space_time/utils.py | 5 ++- 3 files changed, 64 insertions(+), 49 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 3f24741..7d1e24b 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -70,25 +70,6 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): patchs_weight = np.zeros(data_shape[:-1], np.float32) noise_std_estimate = np.zeros(data_shape[:-1], dtype=np.float32) - # discard useless patches - patch_locs = get_patch_locs(patch_shape, patch_overlap, data_shape[:-1]) - get_it = np.zeros(len(patch_locs), dtype=bool) - - for i, patch_tl in enumerate(patch_locs): - patch_slice = tuple( - slice(tl, tl + ps) 
for tl, ps in zip(patch_tl, patch_shape) - ) - if 100 * np.sum(process_mask[patch_slice]) / patch_size > mask_threshold: - get_it[i] = True - - logging.info(f"Denoise {100 * np.sum(get_it) / len(patch_locs):.2f}% patches") - patch_locs = np.ascontiguousarray(patch_locs[get_it]) - - if progbar is None: - progbar = tqdm(total=len(patch_locs)) - elif progbar is not False: - progbar.reset(total=len(patch_locs)) - # Pad the data output_data = cp.asarray(output_data) @@ -113,7 +94,36 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): input_data_padded, patch_shape, axis=(0, 1, 2) )[::step, ::step, ::step] print(patches.shape) + # TODO patch reshape and prod step + patch[np.isnan(patch)] = np.mean(patch) + p_denoise, maxidx, noise_var = self._patch_processing( + patch, + patch_slice=patch_slice, + engine="gpu", + **self.input_denoising_kwargs, + ) + exit(0) + + # discard useless patches + patch_locs = get_patch_locs(patch_shape, patch_overlap, data_shape[:-1]) + get_it = np.zeros(len(patch_locs), dtype=bool) + + for i, patch_tl in enumerate(patch_locs): + patch_slice = tuple( + slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) + ) + if 100 * np.sum(process_mask[patch_slice]) / patch_size > mask_threshold: + get_it[i] = True + + logging.info(f"Denoise {100 * np.sum(get_it) / len(patch_locs):.2f}% patches") + patch_locs = np.ascontiguousarray(patch_locs[get_it]) + + if progbar is None: + progbar = tqdm(total=len(patch_locs)) + elif progbar is not False: + progbar.reset(total=len(patch_locs)) + for patch_tl in patch_locs: patch_slice = tuple( slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index be8cded..11629a4 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -376,8 +376,10 @@ def _patch_processing( var_apriori=None, engine="gpu", ): - - u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) + if engine == "gpu": + u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) + elif engine == "cpu": + u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) if var_apriori is not None: sigma = np.mean(np.sqrt(var_apriori[patch_slice])) else: @@ -397,33 +399,33 @@ def _patch_processing( maxidx = 0 p_new = np.zeros_like(patch) + p_tmean -# if engine == "gpu": -# u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) -# if var_apriori is not None: -# sigma = cp.mean(cp.sqrt(var_apriori[patch_slice])) -# else: -# sigma = cp.median(s_values) / cp.sqrt( -# patch.shape[1] * mp_median -# ) -# -# scale_factor = cp.sqrt(patch.shape[1]) * sigma -# thresh_s_values = cp.array(scale_factor * shrink_func( -# s_values / scale_factor, -# beta=patch.shape[1] / patch.shape[0], -# )) -# thresh_s_values[cp.isnan(thresh_s_values)] = 0 -# -# if cp.any(thresh_s_values): -# maxidx = cp.amax(cp.array( -# cp.nonzero(thresh_s_values) -# ) + 1) -# print(maxidx) -# p_new = svd_synthesis( -# u_vec, thresh_s_values, v_vec, p_tmean, maxidx -# ) -# else: -# maxidx = 0 -# p_new = cp.zeros_like(patch) + p_tmean + # if engine == "gpu": + # u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) + # if var_apriori is not None: + # sigma = cp.mean(cp.sqrt(var_apriori[patch_slice])) + # else: + # sigma = cp.median(s_values) / cp.sqrt( + # patch.shape[1] * mp_median + # ) + + # scale_factor = cp.sqrt(patch.shape[1]) * sigma + # thresh_s_values = cp.array(scale_factor * shrink_func( + # 
s_values / scale_factor, + # beta=patch.shape[1] / patch.shape[0], + # )) + # thresh_s_values[cp.isnan(thresh_s_values)] = 0 + + # if cp.any(thresh_s_values): + # maxidx = cp.amax(cp.array( + # cp.nonzero(thresh_s_values) + # ) + 1) + # print(maxidx) + # p_new = svd_synthesis( + # u_vec, thresh_s_values, v_vec, p_tmean, maxidx + # ) + # else: + # maxidx = 0 + # p_new = cp.zeros_like(patch) + p_tmean return p_new, maxidx, np.NaN diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index 8696b98..0482a01 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -24,9 +24,12 @@ def svd_analysis(input_data, engine="cpu"): data_centered = input_data - mean # TODO benchmark svd vs svds and order of data. if engine == "cpu": + mean = np.mean(input_data, axis=0) + data_centered = input_data - mean u_vec, s_vals, v_vec = svd(data_centered, full_matrices=False) elif engine == "gpu": - data_centered = cp.array(data_centered) + mean = cp.mean(input_data, axis=0) + data_centered = input_data - mean u_vec, s_vals, v_vec = cp.linalg.svd( data_centered, full_matrices=False ) From dc56dd60f6305519a36fdae874baf47eaa428aec Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 15 Nov 2023 16:48:02 +0100 Subject: [PATCH 07/16] Set up _patch_processing call --- src/patch_denoise/space_time/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 7d1e24b..09fc693 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -95,10 +95,10 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): )[::step, ::step, ::step] print(patches.shape) # TODO patch reshape and prod step - patch[np.isnan(patch)] = np.mean(patch) + patches[cp.isnan(patches)] = np.mean(patches) p_denoise, maxidx, noise_var = self._patch_processing( - patch, - patch_slice=patch_slice, + patches, + patch_slice=None, engine="gpu", **self.input_denoising_kwargs, ) From 337b671bc41434b0048fe4a0ab62326e4df5f8d6 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Thu, 16 Nov 2023 17:03:12 +0100 Subject: [PATCH 08/16] Correct shapes --- src/patch_denoise/space_time/base.py | 8 +++++--- src/patch_denoise/space_time/utils.py | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 09fc693..65647cf 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -4,6 +4,7 @@ import numpy as np from tqdm.auto import tqdm import cupy as cp +import pdb from .._docs import fill_doc @@ -93,9 +94,10 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): patches = cp.lib.stride_tricks.sliding_window_view( input_data_padded, patch_shape, axis=(0, 1, 2) )[::step, ::step, ::step] - print(patches.shape) - # TODO patch reshape and prod step - patches[cp.isnan(patches)] = np.mean(patches) + + patches = patches.transpose((0, 1, 2, 4, 5, 6, 3)) + patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s)) + patches[cp.isnan(patches)] = cp.mean(patches) p_denoise, maxidx, noise_var = self._patch_processing( patches, patch_slice=None, diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index 0482a01..d9cdd98 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -3,6 +3,7 @@ from scipy.integrate import 
quad from scipy.linalg import eigh, svd import cupy as cp +import pdb def svd_analysis(input_data, engine="cpu"): @@ -20,8 +21,6 @@ def svd_analysis(input_data, engine="cpu"): ------- u_vec, s_vals, v_vec, mean """ - mean = np.mean(input_data, axis=0) - data_centered = input_data - mean # TODO benchmark svd vs svds and order of data. if engine == "cpu": mean = np.mean(input_data, axis=0) @@ -36,6 +35,7 @@ def svd_analysis(input_data, engine="cpu"): u_vec = cp.asnumpy(u_vec) s_vals = cp.asnumpy(s_vals) v_vec = cp.asnumpy(v_vec) + mean = cp.asnumpy(mean) return u_vec, s_vals, v_vec, mean @@ -58,7 +58,9 @@ def svd_synthesis(u_vec, s_vals, v_vec, mean, idx): ------- np.ndarray: The reconstructed matrix. """ - return (u_vec[:, :idx] @ (s_vals[:idx, None] * v_vec[:idx, :])) + mean + return ( + u_vec[..., :idx] @ (s_vals[:idx, ..., None] * v_vec[:idx, ...]) + ) + mean def eig_analysis(input_data, max_eig_val=10): From 861aeea7731efafb4e220e79c5849c7e8becc36d Mon Sep 17 00:00:00 2001 From: ymzayek Date: Thu, 16 Nov 2023 17:32:19 +0100 Subject: [PATCH 09/16] Update --- src/patch_denoise/space_time/base.py | 13 +++++++++ src/patch_denoise/space_time/lowrank.py | 37 +++---------------------- src/patch_denoise/space_time/utils.py | 11 +++++--- 3 files changed, 24 insertions(+), 37 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 65647cf..5cd5fe5 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -105,6 +105,19 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): **self.input_denoising_kwargs, ) + # # Define the shape of the array, the patch, and the step + # array_shape = p_denoise.shape + + # # Calculate the top-left corner of each patch + # patch_tl = np.array(np.meshgrid( + # *[range(0, dim - ps + 1, step+1) for dim, ps in zip(array_shape, patch_shape)] + # )).T.reshape(-1, 3) + + # # Calculate the center of each patch + # patch_centers = patch_tl + np.array(patch_shape) // 2 + + # print(len(patch_centers)) + exit(0) # discard useless patches diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index 11629a4..cd667e4 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -376,10 +376,7 @@ def _patch_processing( var_apriori=None, engine="gpu", ): - if engine == "gpu": - u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) - elif engine == "cpu": - u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) + u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) if var_apriori is not None: sigma = np.mean(np.sqrt(var_apriori[patch_slice])) else: @@ -394,39 +391,13 @@ def _patch_processing( if np.any(thresh_s_values): maxidx = np.max(np.nonzero(thresh_s_values)) + 1 - p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx) + p_new = svd_synthesis( + u_vec, thresh_s_values, v_vec, p_tmean, maxidx, engine=engine + ) else: maxidx = 0 p_new = np.zeros_like(patch) + p_tmean - # if engine == "gpu": - # u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) - # if var_apriori is not None: - # sigma = cp.mean(cp.sqrt(var_apriori[patch_slice])) - # else: - # sigma = cp.median(s_values) / cp.sqrt( - # patch.shape[1] * mp_median - # ) - - # scale_factor = cp.sqrt(patch.shape[1]) * sigma - # thresh_s_values = cp.array(scale_factor * shrink_func( - # s_values / scale_factor, - # beta=patch.shape[1] / patch.shape[0], - # )) - # 
thresh_s_values[cp.isnan(thresh_s_values)] = 0 - - # if cp.any(thresh_s_values): - # maxidx = cp.amax(cp.array( - # cp.nonzero(thresh_s_values) - # ) + 1) - # print(maxidx) - # p_new = svd_synthesis( - # u_vec, thresh_s_values, v_vec, p_tmean, maxidx - # ) - # else: - # maxidx = 0 - # p_new = cp.zeros_like(patch) + p_tmean - return p_new, maxidx, np.NaN diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index d9cdd98..e8cba9d 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -40,7 +40,7 @@ def svd_analysis(input_data, engine="cpu"): return u_vec, s_vals, v_vec, mean -def svd_synthesis(u_vec, s_vals, v_vec, mean, idx): +def svd_synthesis(u_vec, s_vals, v_vec, mean, idx, engine="cpu"): """ Reconstruct ``X = (U @ (S * V)) + M`` with only the max_idx greatest component. @@ -58,9 +58,12 @@ def svd_synthesis(u_vec, s_vals, v_vec, mean, idx): ------- np.ndarray: The reconstructed matrix. """ - return ( - u_vec[..., :idx] @ (s_vals[:idx, ..., None] * v_vec[:idx, ...]) - ) + mean + if engine == "cpu": + return (u_vec[:, :idx] @ (s_vals[:idx, None] * v_vec[:idx, :])) + mean + if engine == "gpu": + return ( + u_vec[..., :idx] @ (s_vals[:idx, ..., None] * v_vec[:idx, ...]) + ) + mean def eig_analysis(input_data, max_eig_val=10): From 25280eb138bf9bb18bf409d564a22022806ec3c8 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Thu, 16 Nov 2023 17:55:14 +0100 Subject: [PATCH 10/16] TODO --- src/patch_denoise/space_time/base.py | 20 +++++--------------- src/patch_denoise/space_time/lowrank.py | 3 ++- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 5cd5fe5..23bd245 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -5,6 +5,7 @@ from tqdm.auto import tqdm import cupy as cp import pdb +import timeit from .._docs import fill_doc @@ -51,6 +52,7 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): ------- $denoise_return """ + time_start = timeit.default_timer() data_shape = input_data.shape output_data = np.zeros_like(input_data) rank_map = np.zeros(data_shape[:-1], dtype=np.int32) @@ -105,21 +107,6 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): **self.input_denoising_kwargs, ) - # # Define the shape of the array, the patch, and the step - # array_shape = p_denoise.shape - - # # Calculate the top-left corner of each patch - # patch_tl = np.array(np.meshgrid( - # *[range(0, dim - ps + 1, step+1) for dim, ps in zip(array_shape, patch_shape)] - # )).T.reshape(-1, 3) - - # # Calculate the center of each patch - # patch_centers = patch_tl + np.array(patch_shape) // 2 - - # print(len(patch_centers)) - - exit(0) - # discard useless patches patch_locs = get_patch_locs(patch_shape, patch_overlap, data_shape[:-1]) get_it = np.zeros(len(patch_locs), dtype=bool) @@ -182,6 +169,9 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): rank_map[patch_center_img] = maxidx if progbar: progbar.update() + print(timeit.default_timer() - time_start) + + exit(0) # Averaging the overlapping pixels. # this is only required for averaging recombinations. 
if self.recombination in ["average", "weighted"]: diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index cd667e4..7b8e507 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -374,8 +374,9 @@ def _patch_processing( shrink_func=None, mp_median=None, var_apriori=None, - engine="gpu", + engine="cpu", ): + # TODO check matching of shapes and reshape to work with same indexes u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) if var_apriori is not None: sigma = np.mean(np.sqrt(var_apriori[patch_slice])) From 1b417dae3258dd3bf63d101bf4c6d3b2624ad728 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Thu, 16 Nov 2023 18:05:02 +0100 Subject: [PATCH 11/16] Check cupy array --- examples/example_experimental_data.py | 2 +- src/patch_denoise/space_time/base.py | 20 +++++++++++--------- src/patch_denoise/space_time/lowrank.py | 5 ++++- src/patch_denoise/space_time/utils.py | 1 + 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/example_experimental_data.py b/examples/example_experimental_data.py index f9b9f0a..1f79863 100644 --- a/examples/example_experimental_data.py +++ b/examples/example_experimental_data.py @@ -35,5 +35,5 @@ # denoise image time_start = timeit.default_timer() -denoised = optimal_llr.denoise(img.get_fdata()) +denoised = optimal_llr.denoise(img.get_fdata(), engine="gpu") print(timeit.default_timer() - time_start) \ No newline at end of file diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 23bd245..27edc2b 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -36,7 +36,9 @@ def __init__(self, patch_shape, patch_overlap, recombination="weighted"): self.input_denoising_kwargs = dict() @fill_doc - def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): + def denoise( + self, input_data, mask=None, mask_threshold=50, progbar=None, engine="cpu" + ): """Denoise the input_data, according to mask. Patches are extracted sequentially and process by the implemented @@ -100,10 +102,10 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): patches = patches.transpose((0, 1, 2, 4, 5, 6, 3)) patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s)) patches[cp.isnan(patches)] = cp.mean(patches) - p_denoise, maxidx, noise_var = self._patch_processing( + p_denoise_, maxidx, noise_var = self._patch_processing( patches, patch_slice=None, - engine="gpu", + engine=engine, **self.input_denoising_kwargs, ) @@ -126,7 +128,7 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): elif progbar is not False: progbar.reset(total=len(patch_locs)) - for patch_tl in patch_locs: + for patch_tl, p_denoise, in zip(patch_locs, p_denoise_): patch_slice = tuple( slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) ) @@ -139,11 +141,11 @@ def denoise(self, input_data, mask=None, mask_threshold=50, progbar=None): # And ideally choosen by the user. 
patch[np.isnan(patch)] = np.mean(patch) - p_denoise, maxidx, noise_var = self._patch_processing( - patch, - patch_slice=patch_slice, - **self.input_denoising_kwargs, - ) + # p_denoise, maxidx, noise_var = self._patch_processing( + # patch, + # patch_slice=patch_slice, + # **self.input_denoising_kwargs, + # ) p_denoise = np.reshape(p_denoise, (*patch_shape, -1)) patch_center_img = tuple( diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index 7b8e507..97757b1 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -321,6 +321,7 @@ def denoise( noise_std=None, eps_marshenko_pastur=1e-7, progbar=None, + engine="cpu", ): """ Optimal thresholing denoising method. @@ -365,7 +366,9 @@ def denoise( else: self.input_denoising_kwargs["var_apriori"] = noise_std**2 - return super().denoise(input_data, mask, mask_threshold, progbar=progbar) + return super().denoise( + input_data, mask, mask_threshold, progbar=progbar, engine=engine + ) def _patch_processing( self, diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index e8cba9d..e8fbc18 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -58,6 +58,7 @@ def svd_synthesis(u_vec, s_vals, v_vec, mean, idx, engine="cpu"): ------- np.ndarray: The reconstructed matrix. """ + # TODO check shapes if engine == "cpu": return (u_vec[:, :idx] @ (s_vals[:idx, None] * v_vec[:idx, :])) + mean if engine == "gpu": From 45938e02d99524ef5077b37a191345cd92a7455b Mon Sep 17 00:00:00 2001 From: ymzayek Date: Fri, 17 Nov 2023 11:36:30 +0100 Subject: [PATCH 12/16] Restructure --- src/patch_denoise/space_time/base.py | 188 +++++++++++++++----------- src/patch_denoise/space_time/utils.py | 1 - 2 files changed, 111 insertions(+), 78 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 27edc2b..1aa7808 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -4,7 +4,6 @@ import numpy as np from tqdm.auto import tqdm import cupy as cp -import pdb import timeit from .._docs import fill_doc @@ -37,7 +36,12 @@ def __init__(self, patch_shape, patch_overlap, recombination="weighted"): @fill_doc def denoise( - self, input_data, mask=None, mask_threshold=50, progbar=None, engine="cpu" + self, + input_data, + mask=None, + mask_threshold=50, + progbar=None, + engine="cpu" ): """Denoise the input_data, according to mask. 
@@ -75,40 +79,6 @@ def denoise( patchs_weight = np.zeros(data_shape[:-1], np.float32) noise_std_estimate = np.zeros(data_shape[:-1], dtype=np.float32) - # Pad the data - - output_data = cp.asarray(output_data) - - input_data = cp.asarray(input_data) - - c, h, w, t_s = input_data.shape - kc, kh, kw = patch_shape # kernel size - sc, sh, sw = np.repeat( - patch_shape[0] - patch_overlap[0], len(patch_shape) - ) - needed_c = int((cp.ceil((c - kc) / sc + 1) - ((c - kc) / sc + 1)) * kc) - needed_h = int((cp.ceil((h - kh) / sh + 1) - ((h - kh) / sh + 1)) * kh) - needed_w = int((cp.ceil((w - kw) / sw + 1) - ((w - kw) / sw + 1)) * kw) - - input_data_padded = cp.pad( - input_data, ((0, needed_c), (0, needed_h), (0, needed_w), (0, 0) - ), mode='edge') - - step = patch_shape[0] - patch_overlap[0] - patches = cp.lib.stride_tricks.sliding_window_view( - input_data_padded, patch_shape, axis=(0, 1, 2) - )[::step, ::step, ::step] - - patches = patches.transpose((0, 1, 2, 4, 5, 6, 3)) - patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s)) - patches[cp.isnan(patches)] = cp.mean(patches) - p_denoise_, maxidx, noise_var = self._patch_processing( - patches, - patch_slice=None, - engine=engine, - **self.input_denoising_kwargs, - ) - # discard useless patches patch_locs = get_patch_locs(patch_shape, patch_overlap, data_shape[:-1]) get_it = np.zeros(len(patch_locs), dtype=bool) @@ -128,50 +98,114 @@ def denoise( elif progbar is not False: progbar.reset(total=len(patch_locs)) - for patch_tl, p_denoise, in zip(patch_locs, p_denoise_): - patch_slice = tuple( - slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) + if engine == "cpu": + for patch_tl in zip(patch_locs): + patch_slice = tuple( + slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) + ) + process_mask[patch_slice] = 1 + # building the casoratti matrix + patch = np.reshape(input_data[patch_slice], (-1, input_data.shape[-1])) + + # Replace all nan by mean value of patch. + # FIXME this behaviour should be documented + # And ideally choosen by the user. + + patch[np.isnan(patch)] = np.mean(patch) + p_denoise, maxidx, noise_var = self._patch_processing( + patch, + patch_slice=patch_slice, + **self.input_denoising_kwargs, + ) + + p_denoise = np.reshape(p_denoise, (*patch_shape, -1)) + patch_center_img = tuple( + ptl + ps // 2 for ptl, ps in zip(patch_tl, patch_shape) + ) + if self.recombination == "center": + output_data[patch_center_img] = p_denoise[patch_center] + noise_std_estimate[patch_center_img] += noise_var + elif self.recombination == "weighted": + theta = 1 / (2 + maxidx) + output_data[patch_slice] += p_denoise * theta + patchs_weight[patch_slice] += theta + elif self.recombination == "average": + output_data[patch_slice] += p_denoise + patchs_weight[patch_slice] += 1 + else: + raise ValueError( + "recombination must be one of 'weighted', 'average', 'center'" + ) + if not np.isnan(noise_var): + noise_std_estimate[patch_slice] += noise_var + # the top left corner of the patch is used as id for the patch. 
+ rank_map[patch_center_img] = maxidx + if progbar: + progbar.update() + print(timeit.default_timer() - time_start) + + if engine == "gpu": + # Pad the data + output_data = cp.asarray(output_data) + input_data = cp.asarray(input_data) + + c, h, w, t_s = input_data.shape + kc, kh, kw = patch_shape # kernel size + sc, sh, sw = np.repeat( + patch_shape[0] - patch_overlap[0], len(patch_shape) ) - process_mask[patch_slice] = 1 - # building the casoratti matrix - patch = np.reshape(input_data[patch_slice], (-1, input_data.shape[-1])) - - # Replace all nan by mean value of patch. - # FIXME this behaviour should be documented - # And ideally choosen by the user. - - patch[np.isnan(patch)] = np.mean(patch) - # p_denoise, maxidx, noise_var = self._patch_processing( - # patch, - # patch_slice=patch_slice, - # **self.input_denoising_kwargs, - # ) - - p_denoise = np.reshape(p_denoise, (*patch_shape, -1)) - patch_center_img = tuple( - ptl + ps // 2 for ptl, ps in zip(patch_tl, patch_shape) + needed_c = int((cp.ceil((c - kc) / sc + 1) - ((c - kc) / sc + 1)) * kc) + needed_h = int((cp.ceil((h - kh) / sh + 1) - ((h - kh) / sh + 1)) * kh) + needed_w = int((cp.ceil((w - kw) / sw + 1) - ((w - kw) / sw + 1)) * kw) + + input_data_padded = cp.pad( + input_data, ((0, needed_c), (0, needed_h), (0, needed_w), (0, 0) + ), mode='edge') + + step = patch_shape[0] - patch_overlap[0] + patches = cp.lib.stride_tricks.sliding_window_view( + input_data_padded, patch_shape, axis=(0, 1, 2) + )[::step, ::step, ::step] + + patches = patches.transpose((0, 1, 2, 4, 5, 6, 3)) + patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s)) + patches[cp.isnan(patches)] = cp.mean(patches) + patches_denoise, maxidx, noise_var = self._patch_processing( + patches, + patch_slice=None, + engine=engine, + **self.input_denoising_kwargs, ) - if self.recombination == "center": - output_data[patch_center_img] = p_denoise[patch_center] - noise_std_estimate[patch_center_img] += noise_var - elif self.recombination == "weighted": - theta = 1 / (2 + maxidx) - output_data[patch_slice] += p_denoise * theta - patchs_weight[patch_slice] += theta - elif self.recombination == "average": - output_data[patch_slice] += p_denoise - patchs_weight[patch_slice] += 1 - else: - raise ValueError( - "recombination must be one of 'weighted', 'average', 'center'" + for patch_tl, p_denoise, in zip(patch_locs, patches_denoise): + patch_slice = tuple( + slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) + ) + process_mask[patch_slice] = 1 + p_denoise = np.reshape(p_denoise, (*patch_shape, -1)) + patch_center_img = tuple( + ptl + ps // 2 for ptl, ps in zip(patch_tl, patch_shape) ) - if not np.isnan(noise_var): - noise_std_estimate[patch_slice] += noise_var - # the top left corner of the patch is used as id for the patch. 
- rank_map[patch_center_img] = maxidx - if progbar: - progbar.update() - print(timeit.default_timer() - time_start) + if self.recombination == "center": + output_data[patch_center_img] = p_denoise[patch_center] + noise_std_estimate[patch_center_img] += noise_var + elif self.recombination == "weighted": + theta = 1 / (2 + maxidx) + output_data[patch_slice] += p_denoise * theta + patchs_weight[patch_slice] += theta + elif self.recombination == "average": + output_data[patch_slice] += p_denoise + patchs_weight[patch_slice] += 1 + else: + raise ValueError( + "recombination must be one of 'weighted', 'average', 'center'" + ) + if not np.isnan(noise_var): + noise_std_estimate[patch_slice] += noise_var + # the top left corner of the patch is used as id for the patch. + rank_map[patch_center_img] = maxidx + if progbar: + progbar.update() + print(timeit.default_timer() - time_start) exit(0) # Averaging the overlapping pixels. diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index e8fbc18..88fd065 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -3,7 +3,6 @@ from scipy.integrate import quad from scipy.linalg import eigh, svd import cupy as cp -import pdb def svd_analysis(input_data, engine="cpu"): From d9844f312bd877f84b01c6acd8ed254fbe9bf914 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Fri, 17 Nov 2023 15:31:32 +0100 Subject: [PATCH 13/16] WIP initial implementation --- src/patch_denoise/space_time/base.py | 15 ++--- src/patch_denoise/space_time/lowrank.py | 75 ++++++++++++++++++------- src/patch_denoise/space_time/utils.py | 18 ++---- 3 files changed, 68 insertions(+), 40 deletions(-) diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index 1aa7808..a3bbf59 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -4,7 +4,6 @@ import numpy as np from tqdm.auto import tqdm import cupy as cp -import timeit from .._docs import fill_doc @@ -58,7 +57,6 @@ def denoise( ------- $denoise_return """ - time_start = timeit.default_timer() data_shape = input_data.shape output_data = np.zeros_like(input_data) rank_map = np.zeros(data_shape[:-1], dtype=np.int32) @@ -99,7 +97,7 @@ def denoise( progbar.reset(total=len(patch_locs)) if engine == "cpu": - for patch_tl in zip(patch_locs): + for patch_tl in patch_locs: patch_slice = tuple( slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) ) @@ -142,11 +140,9 @@ def denoise( rank_map[patch_center_img] = maxidx if progbar: progbar.update() - print(timeit.default_timer() - time_start) if engine == "gpu": # Pad the data - output_data = cp.asarray(output_data) input_data = cp.asarray(input_data) c, h, w, t_s = input_data.shape @@ -170,13 +166,16 @@ def denoise( patches = patches.transpose((0, 1, 2, 4, 5, 6, 3)) patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s)) patches[cp.isnan(patches)] = cp.mean(patches) - patches_denoise, maxidx, noise_var = self._patch_processing( + patches_denoise, patches_maxidx, noise_var = self._patch_processing( patches, patch_slice=None, engine=engine, **self.input_denoising_kwargs, ) - for patch_tl, p_denoise, in zip(patch_locs, patches_denoise): + patches_denoise = cp.asnumpy(patches_denoise) + patches_maxidx = cp.asnumpy(patches_maxidx) + for patch_tl, p_denoise, maxidx in zip(patch_locs, patches_denoise, patches_maxidx): + #breakpoint() patch_slice = tuple( slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) ) @@ -205,9 +204,7 @@ def 
denoise( rank_map[patch_center_img] = maxidx if progbar: progbar.update() - print(timeit.default_timer() - time_start) - exit(0) # Averaging the overlapping pixels. # this is only required for averaging recombinations. if self.recombination in ["average", "weighted"]: diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index 97757b1..900d14b 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -381,26 +381,63 @@ def _patch_processing( ): # TODO check matching of shapes and reshape to work with same indexes u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) - if var_apriori is not None: - sigma = np.mean(np.sqrt(var_apriori[patch_slice])) - else: - sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) - - scale_factor = np.sqrt(patch.shape[1]) * sigma - thresh_s_values = scale_factor * shrink_func( - s_values / scale_factor, - beta=patch.shape[1] / patch.shape[0], - ) - thresh_s_values[np.isnan(thresh_s_values)] = 0 - - if np.any(thresh_s_values): - maxidx = np.max(np.nonzero(thresh_s_values)) + 1 - p_new = svd_synthesis( - u_vec, thresh_s_values, v_vec, p_tmean, maxidx, engine=engine + if engine == "cpu": + if var_apriori is not None: + sigma = np.mean(np.sqrt(var_apriori[patch_slice])) + else: + sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) + + scale_factor = np.sqrt(patch.shape[1]) * sigma + thresh_s_values = scale_factor * shrink_func( + s_values / scale_factor, + beta=patch.shape[1] / patch.shape[0], ) - else: - maxidx = 0 - p_new = np.zeros_like(patch) + p_tmean + thresh_s_values[np.isnan(thresh_s_values)] = 0 + + if np.any(thresh_s_values): + maxidx = np.max(np.nonzero(thresh_s_values)) + 1 + p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx) + else: + maxidx = 0 + p_new = np.zeros_like(patch) + p_tmean + + if engine == "gpu": + if var_apriori is not None: + #sigma = np.mean(np.sqrt(var_apriori[patch_slice])) + pass + else: + sigma = cp.median( + s_values, axis=1 + ) / cp.sqrt(patch.shape[-1] * mp_median) + + scale_factor = (cp.sqrt(patch.shape[-1]) * sigma)[..., None] + thresh_s_values = scale_factor * shrink_func( + s_values / scale_factor, + beta=patch.shape[-1] / patch.shape[-2], + ) + thresh_s_values[cp.isnan(thresh_s_values)] = 0 + + # Check all batches to see if they have any values above 0 + check_any = cp.any(thresh_s_values, axis=1) + indices_true = cp.nonzero(check_any)[0] + indices_false = cp.nonzero(~check_any)[0] + maxidx = cp.zeros(thresh_s_values.shape[0]) + p_new = cp.zeros(patch.shape) + + if len(indices_true) > 0: + # Get values at nonzero indices and get the max index for each + thresh_s_values_t = thresh_s_values[indices_true, :] + for i in indices_true: + maxidx[i] = cp.max(cp.array(cp.nonzero(thresh_s_values_t[i]))) + 1 + p_new[i] = ( + u_vec[i, :, :maxidx[i]] @ ( + thresh_s_values_t[i, :maxidx[i], None] * v_vec[i, :maxidx[i], :] + ) + ) + p_tmean[i, :] + if len(indices_false) > 0: + for i in indices_false: + maxidx[i] = 0 + p_new[i] = cp.zeros_like(patch[i]) + p_tmean[i, :] return p_new, maxidx, np.NaN diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index 88fd065..7ca587b 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -31,15 +31,15 @@ def svd_analysis(input_data, engine="cpu"): u_vec, s_vals, v_vec = cp.linalg.svd( data_centered, full_matrices=False ) - u_vec = cp.asnumpy(u_vec) - s_vals = cp.asnumpy(s_vals) - 
v_vec = cp.asnumpy(v_vec) - mean = cp.asnumpy(mean) + # u_vec = cp.asnumpy(u_vec) + # s_vals = cp.asnumpy(s_vals) + # v_vec = cp.asnumpy(v_vec) + # mean = cp.asnumpy(mean) return u_vec, s_vals, v_vec, mean -def svd_synthesis(u_vec, s_vals, v_vec, mean, idx, engine="cpu"): +def svd_synthesis(u_vec, s_vals, v_vec, mean, idx): """ Reconstruct ``X = (U @ (S * V)) + M`` with only the max_idx greatest component. @@ -57,13 +57,7 @@ def svd_synthesis(u_vec, s_vals, v_vec, mean, idx, engine="cpu"): ------- np.ndarray: The reconstructed matrix. """ - # TODO check shapes - if engine == "cpu": - return (u_vec[:, :idx] @ (s_vals[:idx, None] * v_vec[:idx, :])) + mean - if engine == "gpu": - return ( - u_vec[..., :idx] @ (s_vals[:idx, ..., None] * v_vec[:idx, ...]) - ) + mean + return (u_vec[:, :idx] @ (s_vals[:idx, None] * v_vec[:idx, :])) + mean def eig_analysis(input_data, max_eig_val=10): From 7f3fb9f8414913b41b02815c80df4642b3a47624 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 22 Nov 2023 10:35:45 +0100 Subject: [PATCH 14/16] clean notes --- src/patch_denoise/space_time/lowrank.py | 1 - src/patch_denoise/space_time/utils.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index 900d14b..4b9160a 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -379,7 +379,6 @@ def _patch_processing( var_apriori=None, engine="cpu", ): - # TODO check matching of shapes and reshape to work with same indexes u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) if engine == "cpu": if var_apriori is not None: diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py index 7ca587b..5aa9e3d 100644 --- a/src/patch_denoise/space_time/utils.py +++ b/src/patch_denoise/space_time/utils.py @@ -31,10 +31,6 @@ def svd_analysis(input_data, engine="cpu"): u_vec, s_vals, v_vec = cp.linalg.svd( data_centered, full_matrices=False ) - # u_vec = cp.asnumpy(u_vec) - # s_vals = cp.asnumpy(s_vals) - # v_vec = cp.asnumpy(v_vec) - # mean = cp.asnumpy(mean) return u_vec, s_vals, v_vec, mean From 3697ce6f1632ba1e0ba8af88d3ea0384f66cc1d0 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 29 Nov 2023 15:06:02 +0100 Subject: [PATCH 15/16] Batch svd and refactor gpu --- examples/example_experimental_data.py | 11 +- src/patch_denoise/space_time/base.py | 266 +++++++++++++++--------- src/patch_denoise/space_time/lowrank.py | 140 +++++++------ src/patch_denoise/space_time/utils.py | 40 +++- 4 files changed, 279 insertions(+), 178 deletions(-) diff --git a/examples/example_experimental_data.py b/examples/example_experimental_data.py index 1f79863..1da98eb 100644 --- a/examples/example_experimental_data.py +++ b/examples/example_experimental_data.py @@ -21,19 +21,22 @@ NOISE_LEVEL = 2 -input_path = "/data/parietal/store2/data/ibc/3mm/sub-01/ses-00/func/wrdcsub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" +#input_path = "/data/parietal/store2/data/ibc/3mm/sub-01/ses-00/func/wrdcsub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" +#"/data/parietal/store2/data/ibc/derivatives/sub-01/ses-00/func/wrdcsub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" +input_path = "/data/parietal/store2/data/ibc/sourcedata/sub-01/ses-00/func/sub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" output_path = "/scratch/ymzayek/retreat_data/output.nii" img = nib.load(input_path) -# data shape is (53, 63, 52, 262) with 3mm resolution +print(f"Data shape is {img.shape} with affine 
{img.affine}") + patch_shape = (11, 11, 11) -patch_overlap = (1) +patch_overlap = (5) # initialize denoiser optimal_llr = OptimalSVDDenoiser(patch_shape, patch_overlap) # denoise image time_start = timeit.default_timer() -denoised = optimal_llr.denoise(img.get_fdata(), engine="gpu") +denoised = optimal_llr.denoise(img.get_fdata(), engine="cpu") print(timeit.default_timer() - time_start) \ No newline at end of file diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py index a3bbf59..45d2f92 100644 --- a/src/patch_denoise/space_time/base.py +++ b/src/patch_denoise/space_time/base.py @@ -40,7 +40,6 @@ def denoise( mask=None, mask_threshold=50, progbar=None, - engine="cpu" ): """Denoise the input_data, according to mask. @@ -96,114 +95,172 @@ def denoise( elif progbar is not False: progbar.reset(total=len(patch_locs)) - if engine == "cpu": - for patch_tl in patch_locs: - patch_slice = tuple( - slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) - ) - process_mask[patch_slice] = 1 - # building the casoratti matrix - patch = np.reshape(input_data[patch_slice], (-1, input_data.shape[-1])) - - # Replace all nan by mean value of patch. - # FIXME this behaviour should be documented - # And ideally choosen by the user. - - patch[np.isnan(patch)] = np.mean(patch) - p_denoise, maxidx, noise_var = self._patch_processing( - patch, - patch_slice=patch_slice, - **self.input_denoising_kwargs, - ) - - p_denoise = np.reshape(p_denoise, (*patch_shape, -1)) - patch_center_img = tuple( - ptl + ps // 2 for ptl, ps in zip(patch_tl, patch_shape) - ) - if self.recombination == "center": - output_data[patch_center_img] = p_denoise[patch_center] - noise_std_estimate[patch_center_img] += noise_var - elif self.recombination == "weighted": - theta = 1 / (2 + maxidx) - output_data[patch_slice] += p_denoise * theta - patchs_weight[patch_slice] += theta - elif self.recombination == "average": - output_data[patch_slice] += p_denoise - patchs_weight[patch_slice] += 1 - else: - raise ValueError( - "recombination must be one of 'weighted', 'average', 'center'" - ) - if not np.isnan(noise_var): - noise_std_estimate[patch_slice] += noise_var - # the top left corner of the patch is used as id for the patch. 
- rank_map[patch_center_img] = maxidx - if progbar: - progbar.update() - - if engine == "gpu": - # Pad the data - input_data = cp.asarray(input_data) - - c, h, w, t_s = input_data.shape - kc, kh, kw = patch_shape # kernel size - sc, sh, sw = np.repeat( - patch_shape[0] - patch_overlap[0], len(patch_shape) + for patch_tl in patch_locs: + patch_slice = tuple( + slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) ) - needed_c = int((cp.ceil((c - kc) / sc + 1) - ((c - kc) / sc + 1)) * kc) - needed_h = int((cp.ceil((h - kh) / sh + 1) - ((h - kh) / sh + 1)) * kh) - needed_w = int((cp.ceil((w - kw) / sw + 1) - ((w - kw) / sw + 1)) * kw) - - input_data_padded = cp.pad( - input_data, ((0, needed_c), (0, needed_h), (0, needed_w), (0, 0) - ), mode='edge') - - step = patch_shape[0] - patch_overlap[0] - patches = cp.lib.stride_tricks.sliding_window_view( - input_data_padded, patch_shape, axis=(0, 1, 2) - )[::step, ::step, ::step] - - patches = patches.transpose((0, 1, 2, 4, 5, 6, 3)) - patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s)) - patches[cp.isnan(patches)] = cp.mean(patches) - patches_denoise, patches_maxidx, noise_var = self._patch_processing( - patches, - patch_slice=None, - engine=engine, + process_mask[patch_slice] = 1 + # building the casoratti matrix + patch = np.reshape(input_data[patch_slice], (-1, input_data.shape[-1])) + + # Replace all nan by mean value of patch. + # FIXME this behaviour should be documented + # And ideally choosen by the user. + + patch[np.isnan(patch)] = np.mean(patch) + p_denoise, maxidx, noise_var = self._patch_processing( + patch, + patch_slice=patch_slice, **self.input_denoising_kwargs, ) - patches_denoise = cp.asnumpy(patches_denoise) - patches_maxidx = cp.asnumpy(patches_maxidx) - for patch_tl, p_denoise, maxidx in zip(patch_locs, patches_denoise, patches_maxidx): - #breakpoint() - patch_slice = tuple( - slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape) + + p_denoise = np.reshape(p_denoise, (*patch_shape, -1)) + patch_center_img = tuple( + ptl + ps // 2 for ptl, ps in zip(patch_tl, patch_shape) + ) + if self.recombination == "center": + output_data[patch_center_img] = p_denoise[patch_center] + noise_std_estimate[patch_center_img] += noise_var + elif self.recombination == "weighted": + theta = 1 / (2 + maxidx) + output_data[patch_slice] += p_denoise * theta + patchs_weight[patch_slice] += theta + elif self.recombination == "average": + output_data[patch_slice] += p_denoise + patchs_weight[patch_slice] += 1 + else: + raise ValueError( + "recombination must be one of 'weighted', 'average', 'center'" ) - process_mask[patch_slice] = 1 - p_denoise = np.reshape(p_denoise, (*patch_shape, -1)) - patch_center_img = tuple( - ptl + ps // 2 for ptl, ps in zip(patch_tl, patch_shape) + if not np.isnan(noise_var): + noise_std_estimate[patch_slice] += noise_var + # the top left corner of the patch is used as id for the patch. + rank_map[patch_center_img] = maxidx + if progbar: + progbar.update() + + # Averaging the overlapping pixels. + # this is only required for averaging recombinations. 
+        if self.recombination in ["average", "weighted"]:
+            output_data /= patchs_weight[..., None]
+            noise_std_estimate /= patchs_weight
+
+        output_data[~process_mask] = 0
+
+        return output_data, patchs_weight, noise_std_estimate, rank_map
+
+    def denoise_gpu(
+        self,
+        input_data,
+        mask=None,
+        mask_threshold=50,
+        progbar=None,
+        batch_size=None,
+    ):
+        """GPU counterpart of ``denoise``.
+
+        All patches are extracted at once and processed on the GPU in
+        batches of ``batch_size``; the return values match ``denoise``.
+        """
+        data_shape = input_data.shape
+        output_data = np.zeros_like(input_data)
+        rank_map = np.zeros(data_shape[:-1], dtype=np.int32)
+        # Create Default mask
+        if mask is None:
+            process_mask = np.full(data_shape[:-1], True)
+        else:
+            process_mask = np.copy(mask)
+
+        patch_shape, patch_overlap = self.__get_patch_param(data_shape)
+        patch_size = np.prod(patch_shape)
+
+        if self.recombination == "center":
+            patch_center = (
+                *(slice(ps // 2, ps // 2 + 1) for ps in patch_shape),
+                slice(None, None, None),
+            )
+        patchs_weight = np.zeros(data_shape[:-1], np.float32)
+        noise_std_estimate = np.zeros(data_shape[:-1], dtype=np.float32)
+
+        # discard useless patches
+        patch_locs = get_patch_locs(patch_shape, patch_overlap, data_shape[:-1])
+        get_it = np.zeros(len(patch_locs), dtype=bool)
+
+        patch_slices = []
+        for i, patch_tl in enumerate(patch_locs):
+            patch_slice = tuple(
+                slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape)
+            )
+            if 100 * np.sum(process_mask[patch_slice]) / patch_size > mask_threshold:
+                get_it[i] = True
+            patch_slices.append(patch_slice)
+
+        logging.info(f"Denoise {100 * np.sum(get_it) / len(patch_locs):.2f}% patches")
+        patch_locs = np.ascontiguousarray(patch_locs[get_it])
+
+        if progbar is None:
+            progbar = tqdm(total=len(patch_locs))
+        elif progbar is not False:
+            progbar.reset(total=len(patch_locs))
+
+        # Pad the data
+        input_data = cp.asarray(input_data)
+
+        c, h, w, t_s = input_data.shape
+        kc, kh, kw = patch_shape  # kernel size
+        sc, sh, sw = np.repeat(
+            patch_shape[0] - patch_overlap[0], len(patch_shape)
+        )
+        needed_c = int((cp.ceil((c - kc) / sc + 1) - ((c - kc) / sc + 1)) * kc)
+        needed_h = int((cp.ceil((h - kh) / sh + 1) - ((h - kh) / sh + 1)) * kh)
+        needed_w = int((cp.ceil((w - kw) / sw + 1) - ((w - kw) / sw + 1)) * kw)
+
+        input_data_padded = cp.pad(
+            input_data, ((0, needed_c), (0, needed_h), (0, needed_w), (0, 0)
+        ), mode='edge')
+
+        step = patch_shape[0] - patch_overlap[0]
+        patches = cp.lib.stride_tricks.sliding_window_view(
+            input_data_padded, patch_shape, axis=(0, 1, 2)
+        )[::step, ::step, ::step]
+
+        patches = patches.transpose((0, 1, 2, 4, 5, 6, 3))
+        patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s))
+        patches[cp.isnan(patches)] = cp.mean(patches)
+        patches_denoise, patches_maxidx, noise_var = self._patch_processing_gpu(
+            patches,
+            patch_slices=patch_slices,
+            batch_size=batch_size,
+            **self.input_denoising_kwargs,
+        )
+        patches_denoise = cp.asnumpy(patches_denoise)
+        patches_maxidx = cp.asnumpy(patches_maxidx)
+        for patch_tl, p_denoise, maxidx in zip(
+            patch_locs, patches_denoise, patches_maxidx
+        ):
+            patch_slice = tuple(
+                slice(tl, tl + ps) for tl, ps in zip(patch_tl, patch_shape)
+            )
+            process_mask[patch_slice] = 1
+            p_denoise = np.reshape(p_denoise, (*patch_shape, -1))
+            patch_center_img = tuple(
+                ptl + ps // 2 for ptl, ps in zip(patch_tl, patch_shape)
+            )
+            if self.recombination == "center":
+                output_data[patch_center_img] = p_denoise[patch_center]
+                noise_std_estimate[patch_center_img] += noise_var
+            elif self.recombination == "weighted":
+                theta = 1 / (2 + maxidx)
+                output_data[patch_slice] += p_denoise * theta
+                patchs_weight[patch_slice] += theta
+            elif 
self.recombination == "average": + output_data[patch_slice] += p_denoise + patchs_weight[patch_slice] += 1 + else: + raise ValueError( + "recombination must be one of 'weighted', 'average', " + "'center'." ) - if self.recombination == "center": - output_data[patch_center_img] = p_denoise[patch_center] - noise_std_estimate[patch_center_img] += noise_var - elif self.recombination == "weighted": - theta = 1 / (2 + maxidx) - output_data[patch_slice] += p_denoise * theta - patchs_weight[patch_slice] += theta - elif self.recombination == "average": - output_data[patch_slice] += p_denoise - patchs_weight[patch_slice] += 1 - else: - raise ValueError( - "recombination must be one of 'weighted', 'average', 'center'" - ) - if not np.isnan(noise_var): - noise_std_estimate[patch_slice] += noise_var - # the top left corner of the patch is used as id for the patch. - rank_map[patch_center_img] = maxidx - if progbar: - progbar.update() + if not np.isnan(noise_var): + noise_std_estimate[patch_slice] += noise_var + # the top left corner of the patch is used as id for the patch. + rank_map[patch_center_img] = maxidx + if progbar: + progbar.update() # Averaging the overlapping pixels. # this is only required for averaging recombinations. @@ -214,6 +271,7 @@ def denoise( output_data[~process_mask] = 0 return output_data, patchs_weight, noise_std_estimate, rank_map + @abc.abstractmethod def _patch_processing(self, patch, patch_slice=None, **kwargs): diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py index 4b9160a..c51bcbd 100644 --- a/src/patch_denoise/space_time/lowrank.py +++ b/src/patch_denoise/space_time/lowrank.py @@ -12,6 +12,7 @@ eig_synthesis, marshenko_pastur_median, svd_analysis, + svd_analysis_gpu, svd_synthesis, ) from .._docs import fill_doc @@ -366,9 +367,14 @@ def denoise( else: self.input_denoising_kwargs["var_apriori"] = noise_std**2 - return super().denoise( - input_data, mask, mask_threshold, progbar=progbar, engine=engine - ) + if engine == "cpu": + return super().denoise( + input_data, mask, mask_threshold, progbar=progbar, + ) + else: + return super().denoise_gpu( + input_data, mask, mask_threshold, progbar=progbar, + ) def _patch_processing( self, @@ -377,69 +383,81 @@ def _patch_processing( shrink_func=None, mp_median=None, var_apriori=None, - engine="cpu", ): - u_vec, s_values, v_vec, p_tmean = svd_analysis(patch, engine=engine) - if engine == "cpu": - if var_apriori is not None: - sigma = np.mean(np.sqrt(var_apriori[patch_slice])) - else: - sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) + u_vec, s_values, v_vec, p_tmean = svd_analysis(patch) + if var_apriori is not None: + sigma = np.mean(np.sqrt(var_apriori[patch_slice])) + else: + sigma = np.median(s_values) / np.sqrt(patch.shape[1] * mp_median) - scale_factor = np.sqrt(patch.shape[1]) * sigma - thresh_s_values = scale_factor * shrink_func( - s_values / scale_factor, - beta=patch.shape[1] / patch.shape[0], - ) - thresh_s_values[np.isnan(thresh_s_values)] = 0 - - if np.any(thresh_s_values): - maxidx = np.max(np.nonzero(thresh_s_values)) + 1 - p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx) - else: - maxidx = 0 - p_new = np.zeros_like(patch) + p_tmean - - if engine == "gpu": - if var_apriori is not None: - #sigma = np.mean(np.sqrt(var_apriori[patch_slice])) - pass - else: - sigma = cp.median( - s_values, axis=1 - ) / cp.sqrt(patch.shape[-1] * mp_median) - - scale_factor = (cp.sqrt(patch.shape[-1]) * sigma)[..., None] - thresh_s_values = scale_factor * 
shrink_func(
-                s_values / scale_factor,
-                beta=patch.shape[-1] / patch.shape[-2],
-            )
-            thresh_s_values[cp.isnan(thresh_s_values)] = 0
-
-            # Check all batches to see if they have any values above 0
-            check_any = cp.any(thresh_s_values, axis=1)
-            indices_true = cp.nonzero(check_any)[0]
-            indices_false = cp.nonzero(~check_any)[0]
-            maxidx = cp.zeros(thresh_s_values.shape[0])
-            p_new = cp.zeros(patch.shape)
-
-            if len(indices_true) > 0:
-                # Get values at nonzero indices and get the max index for each
-                thresh_s_values_t = thresh_s_values[indices_true, :]
-                for i in indices_true:
-                    maxidx[i] = cp.max(cp.array(cp.nonzero(thresh_s_values_t[i]))) + 1
-                    p_new[i] = (
-                        u_vec[i, :, :maxidx[i]] @ (
-                            thresh_s_values_t[i, :maxidx[i], None] * v_vec[i, :maxidx[i], :]
-                        )
-                    ) + p_tmean[i, :]
-            if len(indices_false) > 0:
-                for i in indices_false:
-                    maxidx[i] = 0
-                    p_new[i] = cp.zeros_like(patch[i]) + p_tmean[i, :]
+        scale_factor = np.sqrt(patch.shape[1]) * sigma
+        thresh_s_values = scale_factor * shrink_func(
+            s_values / scale_factor,
+            beta=patch.shape[1] / patch.shape[0],
+        )
+        thresh_s_values[np.isnan(thresh_s_values)] = 0
+
+        if np.any(thresh_s_values):
+            maxidx = np.max(np.nonzero(thresh_s_values)) + 1
+            p_new = svd_synthesis(u_vec, thresh_s_values, v_vec, p_tmean, maxidx)
+        else:
+            maxidx = 0
+            p_new = np.zeros_like(patch) + p_tmean
 
         return p_new, maxidx, np.NaN
 
+    def _patch_processing_gpu(
+        self,
+        patches,
+        patch_slices=None,
+        shrink_func=None,
+        mp_median=None,
+        var_apriori=None,
+        batch_size=None,
+    ):
+        if batch_size is None:
+            batch_size = patches.shape[0]
+        u_vec, s_values, v_vec, p_tmean = svd_analysis_gpu(
+            patches, batch_size=batch_size
+        )
+        if var_apriori is not None:
+            # One a-priori noise level per patch; assumes patch_slices is
+            # aligned with the batch axis of `patches`.
+            sigma = cp.asarray(
+                [np.mean(np.sqrt(var_apriori[ps])) for ps in patch_slices]
+            )
+        else:
+            sigma = cp.median(
+                s_values, axis=1
+            ) / cp.sqrt(patches.shape[-1] * mp_median)
+
+        scale_factor = (cp.sqrt(patches.shape[-1]) * sigma)[..., None]
+        thresh_s_values = scale_factor * shrink_func(
+            s_values / scale_factor,
+            beta=patches.shape[-1] / patches.shape[-2],
+        )
+        thresh_s_values[cp.isnan(thresh_s_values)] = 0
+
+        # Check all batches to see if they have any values above 0
+        check_any = cp.any(thresh_s_values, axis=1)
+        indices_true = cp.nonzero(check_any)[0]
+        indices_false = cp.nonzero(~check_any)[0]
+        maxidx = cp.zeros(thresh_s_values.shape[0], dtype=cp.int64)
+        p_new = cp.zeros(patches.shape)
+
+        if len(indices_true) > 0:
+            # Rebuild each patch from its retained singular components only.
+            for i in indices_true.tolist():
+                k = int(cp.max(cp.nonzero(thresh_s_values[i])[0])) + 1
+                maxidx[i] = k
+                p_new[i] = (
+                    u_vec[i, :, :k] @ (
+                        thresh_s_values[i, :k, None] * v_vec[i, :k, :]
+                    )
+                ) + p_tmean[i, :]
+        if len(indices_false) > 0:
+            for i in indices_false.tolist():
+                maxidx[i] = 0
+                p_new[i] = cp.zeros_like(patches[i]) + p_tmean[i, :]
+
+        return p_new, maxidx, np.NaN
 
 
 def _sure_atn_cost(X, method, sing_vals, gamma, sigma=None, tau=None):
     """
diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py
index 5aa9e3d..e69d429 100644
--- a/src/patch_denoise/space_time/utils.py
+++ b/src/patch_denoise/space_time/utils.py
@@ -5,7 +5,7 @@
 import cupy as cp
 
 
-def svd_analysis(input_data, engine="cpu"):
+def svd_analysis(input_data):
     """Return the centered SVD decomposition.
U, S, Vt and M are compute such that: @@ -21,18 +21,40 @@ def svd_analysis(input_data, engine="cpu"): u_vec, s_vals, v_vec, mean """ # TODO benchmark svd vs svds and order of data. - if engine == "cpu": - mean = np.mean(input_data, axis=0) - data_centered = input_data - mean - u_vec, s_vals, v_vec = svd(data_centered, full_matrices=False) - elif engine == "gpu": - mean = cp.mean(input_data, axis=0) - data_centered = input_data - mean + mean = np.mean(input_data, axis=0) + data_centered = input_data - mean + u_vec, s_vals, v_vec = svd(data_centered, full_matrices=False) + + return u_vec, s_vals, v_vec, mean + + +def svd_analysis_gpu(input_data, batch_size): + # Initialize arrays to store the results + # input_data shape is (total patches, patch size, time) + batch_size = batch_size + # total_patches = input_data.shape[0] + # if total_patches % batch_size != 0: + # raise ValueError("Batch size must be a divisor of total patches") + m = input_data.shape[1] + n = input_data.shape[2] + U_batched = cp.empty((batch_size, m, m), dtype=cp.float64) + S_batched = cp.empty((batch_size, min(m, n)), dtype=cp.float64) + V_batched = cp.empty((batch_size, n, n), dtype=cp.float64) + mean_batched = cp.empty((batch_size, n), dtype=cp.float64) + + # Compute SVD for each matrix in the batch + for i in range(batch_size): + mean = cp.mean(input_data[i], axis=1) + data_centered = input_data[i] - mean u_vec, s_vals, v_vec = cp.linalg.svd( data_centered, full_matrices=False ) + U_batched[i] = u_vec + S_batched[i] = cp.diag(s_vals) + V_batched[i] = v_vec + mean_batched[i] = mean - return u_vec, s_vals, v_vec, mean + return U_batched, S_batched, V_batched, mean_batched def svd_synthesis(u_vec, s_vals, v_vec, mean, idx): From d89955c69ceb6907d19a498418e96f544a77d5d5 Mon Sep 17 00:00:00 2001 From: ymzayek Date: Wed, 29 Nov 2023 16:47:07 +0100 Subject: [PATCH 16/16] Fix batching --- examples/example_experimental_data.py | 16 +++--- src/patch_denoise/space_time/base.py | 27 +-------- src/patch_denoise/space_time/lowrank.py | 11 +++- src/patch_denoise/space_time/utils.py | 73 +++++++++++++++++++------ 4 files changed, 77 insertions(+), 50 deletions(-) diff --git a/examples/example_experimental_data.py b/examples/example_experimental_data.py index 1da98eb..d60749f 100644 --- a/examples/example_experimental_data.py +++ b/examples/example_experimental_data.py @@ -16,19 +16,19 @@ # %% # Setup the parameters for the simulation and noise -SHAPE = (64, 64, 64) -N_FRAMES = 200 +# SHAPE = (64, 64, 64) +# N_FRAMES = 200 -NOISE_LEVEL = 2 +# NOISE_LEVEL = 2 -#input_path = "/data/parietal/store2/data/ibc/3mm/sub-01/ses-00/func/wrdcsub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" -#"/data/parietal/store2/data/ibc/derivatives/sub-01/ses-00/func/wrdcsub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" -input_path = "/data/parietal/store2/data/ibc/sourcedata/sub-01/ses-00/func/sub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" +base_path = "/data/parietal/store2/data/ibc/" +#input_path = base_path + "3mm/sub-01/ses-00/func/wrdcsub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" +input_path = base_path + "sourcedata/sub-01/ses-00/func/sub-01_ses-00_task-ArchiSocial_dir-ap_bold.nii.gz" output_path = "/scratch/ymzayek/retreat_data/output.nii" img = nib.load(input_path) -print(f"Data shape is {img.shape} with affine {img.affine}") +print(f"Data shape is {img.shape} with affine \n{img.affine}") patch_shape = (11, 11, 11) patch_overlap = (5) @@ -38,5 +38,5 @@ # denoise image time_start = timeit.default_timer() -denoised = 
optimal_llr.denoise(img.get_fdata(), engine="cpu")
+denoised, *_ = optimal_llr.denoise(img.get_fdata(), engine="gpu", batch_size=100)
 print(timeit.default_timer() - time_start)
\ No newline at end of file
diff --git a/src/patch_denoise/space_time/base.py b/src/patch_denoise/space_time/base.py
index 45d2f92..9e11d76 100644
--- a/src/patch_denoise/space_time/base.py
+++ b/src/patch_denoise/space_time/base.py
@@ -7,7 +7,7 @@
 
 from .._docs import fill_doc
 
-from .utils import get_patch_locs
+from .utils import get_patch_locs, get_patches_gpu
 
 
 @fill_doc
@@ -198,30 +198,9 @@ def denoise_gpu(
         elif progbar is not False:
             progbar.reset(total=len(patch_locs))
 
-        # Pad the data
-        input_data = cp.asarray(input_data)
+        patches = get_patches_gpu(input_data, patch_shape, patch_overlap)
+        # nanmean so that NaN voxels are imputed with a finite value.
+        patches[np.isnan(patches)] = np.nanmean(patches)
 
-        c, h, w, t_s = input_data.shape
-        kc, kh, kw = patch_shape  # kernel size
-        sc, sh, sw = np.repeat(
-            patch_shape[0] - patch_overlap[0], len(patch_shape)
-        )
-        needed_c = int((cp.ceil((c - kc) / sc + 1) - ((c - kc) / sc + 1)) * kc)
-        needed_h = int((cp.ceil((h - kh) / sh + 1) - ((h - kh) / sh + 1)) * kh)
-        needed_w = int((cp.ceil((w - kw) / sw + 1) - ((w - kw) / sw + 1)) * kw)
-
-        input_data_padded = cp.pad(
-            input_data, ((0, needed_c), (0, needed_h), (0, needed_w), (0, 0)
-        ), mode='edge')
-
-        step = patch_shape[0] - patch_overlap[0]
-        patches = cp.lib.stride_tricks.sliding_window_view(
-            input_data_padded, patch_shape, axis=(0, 1, 2)
-        )[::step, ::step, ::step]
-
-        patches = patches.transpose((0, 1, 2, 4, 5, 6, 3))
-        patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s))
-        patches[cp.isnan(patches)] = cp.mean(patches)
         patches_denoise, patches_maxidx, noise_var = self._patch_processing_gpu(
             patches,
             patch_slices=patch_slices,
diff --git a/src/patch_denoise/space_time/lowrank.py b/src/patch_denoise/space_time/lowrank.py
index c51bcbd..b8ebb15 100644
--- a/src/patch_denoise/space_time/lowrank.py
+++ b/src/patch_denoise/space_time/lowrank.py
@@ -323,6 +323,7 @@ def denoise(
         eps_marshenko_pastur=1e-7,
         progbar=None,
         engine="cpu",
+        batch_size=None,
     ):
         """
         Optimal thresholing denoising method.
@@ -371,10 +372,16 @@
             return super().denoise(
                 input_data, mask, mask_threshold, progbar=progbar,
             )
-        else:
+        elif engine == "gpu":
             return super().denoise_gpu(
-                input_data, mask, mask_threshold, progbar=progbar,
+                input_data,
+                mask,
+                mask_threshold,
+                progbar=progbar,
+                batch_size=batch_size,
             )
+        else:
+            raise ValueError(f"Unknown engine: {engine}. Use 'cpu' or 'gpu'.")
 
     def _patch_processing(
         self,
diff --git a/src/patch_denoise/space_time/utils.py b/src/patch_denoise/space_time/utils.py
index e69d429..9d9ad72 100644
--- a/src/patch_denoise/space_time/utils.py
+++ b/src/patch_denoise/space_time/utils.py
@@ -29,31 +29,35 @@ def svd_analysis(input_data):
 
 
 def svd_analysis_gpu(input_data, batch_size):
+    """Batched centered SVD of a stack of patches, computed on GPU."""
+    total_samples = input_data.shape[0]
+    num_batches = int(np.ceil(total_samples / batch_size))
+    adjusted_batch_size = total_samples // num_batches
+
     # Initialize arrays to store the results
     # input_data shape is (total patches, patch size, time)
-    batch_size = batch_size
-    # total_patches = input_data.shape[0]
-    # if total_patches % batch_size != 0:
-    #     raise ValueError("Batch size must be a divisor of total patches")
     m = input_data.shape[1]
     n = input_data.shape[2]
-    U_batched = cp.empty((batch_size, m, m), dtype=cp.float64)
-    S_batched = cp.empty((batch_size, min(m, n)), dtype=cp.float64)
-    V_batched = cp.empty((batch_size, n, n), dtype=cp.float64)
-    mean_batched = cp.empty((batch_size, n), dtype=cp.float64)
+    U_batched = cp.empty((total_samples, m, min(m, n)), dtype=cp.float64)
+    S_batched = cp.empty((total_samples, min(m, n)), dtype=cp.float64)
+    V_batched = cp.empty((total_samples, min(m, n), n), dtype=cp.float64)
+    mean_batched = cp.empty((total_samples, n), dtype=cp.float64)
 
     # Compute SVD for each matrix in the batch
-    for i in range(batch_size):
-        mean = cp.mean(input_data[i], axis=1)
-        data_centered = input_data[i] - mean
+    for i in range(num_batches):
+        start_idx = i * adjusted_batch_size
+        # The last batch absorbs the remainder of the integer division.
+        end_idx = (
+            start_idx + adjusted_batch_size
+            if i < num_batches - 1
+            else total_samples
+        )
+        idx = slice(start_idx, end_idx)
+        batch = cp.asarray(input_data[idx])
+        mean = cp.mean(batch, axis=1, keepdims=True)
+        data_centered = batch - mean
         u_vec, s_vals, v_vec = cp.linalg.svd(
            data_centered, full_matrices=False
        )
-        U_batched[i] = u_vec
-        S_batched[i] = cp.diag(s_vals)
-        V_batched[i] = v_vec
-        mean_batched[i] = mean
-
+        U_batched[idx] = u_vec
+        S_batched[idx] = s_vals
+        V_batched[idx] = v_vec
+        mean_batched[idx] = mean[:, 0, :]
     return U_batched, S_batched, V_batched, mean_batched
 
 
@@ -227,6 +231,43 @@ def get_patch_locs(p_shape, p_ovl, v_shape):
     return patch_locs.reshape(-1, len(p_shape))
 
 
+def get_patches_gpu(input_data, patch_shape, patch_overlap):
+    """Extract all the patches from a volume.
+
+    Returns
+    -------
+    numpy.ndarray
+        All the patches in shape (patches, patch size, time).
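+
+    Notes
+    -----
+    The volume is edge-padded so that the sliding window tiles it exactly.
+    Isotropic patch size and overlap are assumed: only the first entry of
+    ``patch_shape`` and ``patch_overlap`` is used to derive the step.
+    The patches are returned as a host (numpy) array.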
+    """
+    patch_size = np.prod(patch_shape)
+
+    # Pad the data
+    input_data = cp.asarray(input_data)
+
+    c, h, w, t_s = input_data.shape
+    kc, kh, kw = patch_shape  # kernel size
+    sc, sh, sw = np.repeat(
+        patch_shape[0] - patch_overlap[0], len(patch_shape)
+    )
+    # Pad each spatial axis so that (dim - kernel) is a multiple of the
+    # step, i.e. the sliding window tiles the padded volume exactly.
+    needed_c = int((cp.ceil((c - kc) / sc + 1) - ((c - kc) / sc + 1)) * sc)
+    needed_h = int((cp.ceil((h - kh) / sh + 1) - ((h - kh) / sh + 1)) * sh)
+    needed_w = int((cp.ceil((w - kw) / sw + 1) - ((w - kw) / sw + 1)) * sw)
+
+    input_data_padded = cp.pad(
+        input_data,
+        ((0, needed_c), (0, needed_h), (0, needed_w), (0, 0)),
+        mode="edge",
+    )
+
+    step = patch_shape[0] - patch_overlap[0]
+    patches = cp.lib.stride_tricks.sliding_window_view(
+        input_data_padded, patch_shape, axis=(0, 1, 2)
+    )[::step, ::step, ::step]
+
+    patches = patches.transpose((0, 1, 2, 4, 5, 6, 3))
+    patches = patches.reshape((np.prod(patches.shape[:3]), patch_size, t_s))
+
+    return cp.asnumpy(patches)
+
+
 def estimate_noise(noise_sequence, block_size=1):
     """Estimate the temporal noise standard deviation of a noise only sequence."""
     volume_shape = noise_sequence.shape[:-1]
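
A minimal usage sketch of the GPU path added by this series, assuming a CUDA-capable machine with CuPy installed; the input path is a placeholder. As implemented in `BaseSpaceTimeDenoiser.denoise_gpu`, `denoise` returns the denoised volume together with the patch weights, the noise standard-deviation estimate and the rank map:

    import nibabel as nib
    from patch_denoise.space_time.lowrank import OptimalSVDDenoiser

    img = nib.load("/path/to/bold.nii.gz")  # placeholder input
    denoiser = OptimalSVDDenoiser((11, 11, 11), 5)
    denoised, weights, noise_std, rank_map = denoiser.denoise(
        img.get_fdata(), engine="gpu", batch_size=100
    )
    nib.save(nib.Nifti1Image(denoised, img.affine), "denoised.nii.gz")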