From 3556223c7ce590afd9cc6daa9376acfcbd5a0755 Mon Sep 17 00:00:00 2001 From: semjon00 Date: Wed, 5 Jul 2023 15:03:01 +0300 Subject: [PATCH 01/16] Rework run_generate, run and run_depthmap input system Large refactor part, may be broken Also indent everything with spaces, add/remove line breaks and add some TODOs --- ...bundled_sources.txt => bundled_sources.txt | 0 install.py | 2 + javascript/depthmap.js | 4 - scripts/depthmap.py | 3216 +++++++++-------- scripts/gradio_args_transport.py | 52 + scripts/stereoimage_generation.py | 3 +- 6 files changed, 1705 insertions(+), 1572 deletions(-) rename scripts/bundled_sources.txt => bundled_sources.txt (100%) create mode 100644 scripts/gradio_args_transport.py diff --git a/scripts/bundled_sources.txt b/bundled_sources.txt similarity index 100% rename from scripts/bundled_sources.txt rename to bundled_sources.txt diff --git a/install.py b/install.py index 1644534..c699427 100644 --- a/install.py +++ b/install.py @@ -2,6 +2,8 @@ import platform import sys +# TODO: some dependencies apparently being reinstalled on every run. Investigate and fix. + if sys.version_info < (3, 8): launch.run_pip("install importlib-metadata", "importlib-metadata for depthmap script") import importlib_metadata diff --git a/javascript/depthmap.js b/javascript/depthmap.js index 09ef717..e69de29 100644 --- a/javascript/depthmap.js +++ b/javascript/depthmap.js @@ -1,4 +0,0 @@ -function get_depthmap_tab_index(){ - const [,...args] = [...arguments] - return [get_tab_index('mode_depthmap'), ...args] -} diff --git a/scripts/depthmap.py b/scripts/depthmap.py index 34832b7..8be91bd 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -43,25 +43,26 @@ # Not sure if this is needed try: - script_dir = os.path.dirname(os.path.realpath(__file__)) - extension_dir = pathlib.Path(script_dir).parent - sys.path.append(extension_dir) + script_dir = os.path.dirname(os.path.realpath(__file__)) + extension_dir = pathlib.Path(script_dir).parent + sys.path.append(extension_dir) except: - sys.path.append('extensions/stable-diffusion-webui-depthmap-script') + sys.path.append('extensions/stable-diffusion-webui-depthmap-script') # Ugly workaround to fix gradio tempfile issue def ensure_gradio_temp_directory(): - try: - import tempfile - path = os.path.join(tempfile.gettempdir(), 'gradio') - if not (os.path.exists(path)): - os.mkdir(path) - except Exception as e: - traceback.print_exc() + try: + import tempfile + path = os.path.join(tempfile.gettempdir(), 'gradio') + if not (os.path.exists(path)): + os.mkdir(path) + except Exception as e: + traceback.print_exc() ensure_gradio_temp_directory() - +# Our code from scripts.stereoimage_generation import create_stereoimages +from scripts.gradio_args_transport import GradioComponentBundle # midas imports from dmidas.dpt_depth import DPTDepthModel @@ -108,1202 +109,1289 @@ def ensure_gradio_temp_directory(): depthmap_deviceidx = None commit_hash = None # TODO: understand why it would spam to stderr if changed to ... 
= get_commit_hash() + + def get_commit_hash(): - global commit_hash - if commit_hash is None: - try: - commit_hash = subprocess.check_output( - [os.environ.get('GIT', "git"), "rev-parse", "HEAD"], - cwd=pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/'), - shell=False, - stderr=subprocess.DEVNULL, - encoding='utf8').strip()[0:8] - except Exception: - commit_hash = "" - return commit_hash + global commit_hash + if commit_hash is None: + try: + commit_hash = subprocess.check_output( + [os.environ.get('GIT', "git"), "rev-parse", "HEAD"], + cwd=pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/'), + shell=False, + stderr=subprocess.DEVNULL, + encoding='utf8').strip()[0:8] + except Exception: + commit_hash = "" + return commit_hash def main_ui_panel(is_depth_tab): - with gr.Blocks(): - with gr.Row(): - compute_device = gr.Radio(label="Compute on", choices=['GPU', 'CPU'], value='GPU', type="index") - model_type = gr.Dropdown(label="Model", choices=['res101', 'dpt_beit_large_512 (midas 3.1)', - 'dpt_beit_large_384 (midas 3.1)', - 'dpt_large_384 (midas 3.0)', 'dpt_hybrid_384 (midas 3.0)', - 'midas_v21', 'midas_v21_small', - 'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk'], value='res101', - type="index", elem_id="tabmodel_type") - with gr.Group(): - with gr.Row(): - boost = gr.Checkbox(label="BOOST (multi-resolution merging)", value=True) - invert_depth = gr.Checkbox(label="Invert DepthMap (black=near, white=far)", value=False) - with gr.Group(visible=False) as options_depend_on_boost: - match_size = gr.Checkbox(label="Match input size", value=False) - with gr.Row() as options_depend_on_match_size: - net_width = gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=512) - net_height = gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=512) - - with gr.Group(): - with gr.Row(): - clipdepth = gr.Checkbox(label="Clip and renormalize", value=False) - with gr.Row(visible=False) as clip_options_row_1: - clipthreshold_far = gr.Slider(minimum=0, maximum=1, step=0.001, label='Far clip', value=0) - clipthreshold_near = gr.Slider(minimum=0, maximum=1, step=0.001, label='Near clip', value=1) - - with gr.Group(): - with gr.Row(): - combine_output = gr.Checkbox(label="Combine into one image", value=False) - combine_output_axis = gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], - value='Horizontal', type="index") - with gr.Row(): - save_depth = gr.Checkbox(label="Save DepthMap", value=True) - show_depth = gr.Checkbox(label="Show DepthMap", value=True) - show_heat = gr.Checkbox(label="Show HeatMap", value=False) - - with gr.Group(): - with gr.Row(): - gen_stereo = gr.Checkbox(label="Generate stereoscopic image(s)", value=False) - with gr.Group(visible=False) as stereo_options_row_0: - with gr.Row(): - stereo_modes = gr.CheckboxGroup(["left-right", "right-left", "top-bottom", "bottom-top", "red-cyan-anaglyph"], label="Output", value=["left-right","red-cyan-anaglyph"]) - - with gr.Row(visible=False) as stereo_options_row_1: - stereo_divergence = gr.Slider(minimum=0.05, maximum=10.005, step=0.01, label='Divergence (3D effect)', - value=2.5) - stereo_separation = gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, label='Separation (moves images apart)', - value=0.0) - with gr.Row(visible=False) as stereo_options_row_2: - stereo_fill = gr.Dropdown(label="Gap fill technique", - choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', - 'polylines_sharp'], value='polylines_sharp', 
type="value", - elem_id="stereo_fill_type") - stereo_balance = gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, label='Balance between eyes', - value=0.0) - - with gr.Group(): - with gr.Row(): - gen_mesh = gr.Checkbox(label="Generate simple 3D mesh. (Fast, accurate only with ZoeDepth models and no boost, no custom maps)", value=False, visible=True) - with gr.Row(visible=False) as mesh_options_row_0: - mesh_occlude = gr.Checkbox(label="Remove occluded edges", value=True, visible=True) - mesh_spherical = gr.Checkbox(label="Equirectangular projection", value=False, visible=True) - - with gr.Group(visible=is_depth_tab): - with gr.Row(): - inpaint = gr.Checkbox(label="Generate 3D inpainted mesh. (Sloooow, required for generating videos)", value=False, visible=is_depth_tab) - with gr.Row(visible=False) as inpaint_options_row_0: - inpaint_vids = gr.Checkbox(label="Generate 4 demo videos with 3D inpainted mesh.", value=False, visible=is_depth_tab) - - with gr.Group(): - with gr.Row(): - background_removal = gr.Checkbox(label="Remove background", value=False) - with gr.Row(visible=False) as bgrem_options_row_1: - save_background_removal_masks = gr.Checkbox(label="Save the foreground masks", value=False) - pre_depth_background_removal = gr.Checkbox(label="Pre-depth background removal", value=False) - with gr.Row(visible=False) as bgrem_options_row_2: - background_removal_model = gr.Dropdown(label="Rembg Model", - choices=['u2net', 'u2netp', 'u2net_human_seg', 'silueta'], - value='u2net', type="value", elem_id="backgroundmodel_type") - - with gr.Box(): - gr.HTML("Information, comment and share @ " - "https://github.com/thygate/stable-diffusion-webui-depthmap-script") - - gen_normal = gr.Checkbox(label="Generate Normalmap (hidden! api only)", value=False, visible=False) - - - # Invert_depthmap must not be used with gen_stereo - otherwise stereo images look super-wrong - gen_stereo.change( - fn=lambda a, b: False if b else a, - inputs=[invert_depth, gen_stereo], - outputs=[invert_depth] - ) - gen_stereo.change( - fn=lambda a, b: invert_depth.update(interactive = not b), - inputs=[invert_depth, gen_stereo], - outputs=[invert_depth] - ) - - clipthreshold_far.change( - fn=lambda a, b: a if b < a else b, - inputs=[clipthreshold_far, clipthreshold_near], - outputs=[clipthreshold_near] - ) - - clipthreshold_near.change( - fn=lambda a, b: a if b > a else b, - inputs=[clipthreshold_near, clipthreshold_far], - outputs=[clipthreshold_far] - ) - - boost.change( - fn=lambda a: options_depend_on_boost.update(visible = not a), - inputs=[boost], - outputs=[options_depend_on_boost] - ) - - match_size.change( - fn=lambda a: options_depend_on_match_size.update(visible = not a), - inputs=[match_size], - outputs=[options_depend_on_match_size] - ) - - def clipdepth_options_visibility(v): - return clip_options_row_1.update(visible=v) - clipdepth.change( - fn=clipdepth_options_visibility, - inputs=[clipdepth], - outputs=[clip_options_row_1] - ) - - def stereo_options_visibility(v): - return stereo_options_row_0.update(visible=v),\ - stereo_options_row_1.update(visible=v),\ - stereo_options_row_2.update(visible=v) - gen_stereo.change( - fn=stereo_options_visibility, - inputs=[gen_stereo], - outputs=[stereo_options_row_0, stereo_options_row_1, stereo_options_row_2] - ) - - def mesh_options_visibility(v): - return mesh_options_row_0.update(visible=v) - gen_mesh.change( - fn=mesh_options_visibility, - inputs=[gen_mesh], - outputs=[mesh_options_row_0] - ) - - def inpaint_options_visibility(v): - return 
inpaint_options_row_0.update(visible=v) - inpaint.change( - fn=inpaint_options_visibility, - inputs=[inpaint], - outputs=[inpaint_options_row_0] - ) - - def background_removal_options_visibility(v): - return bgrem_options_row_1.update(visible=v), \ - bgrem_options_row_2.update(visible=v) - background_removal.change( - fn=background_removal_options_visibility, - inputs=[background_removal], - outputs=[bgrem_options_row_1, bgrem_options_row_2] - ) - - return [compute_device, model_type, net_width, net_height, match_size, boost, invert_depth, clipdepth, clipthreshold_far, clipthreshold_near, combine_output, combine_output_axis, save_depth, show_depth, show_heat, gen_stereo, stereo_modes, stereo_divergence, stereo_separation, stereo_fill, stereo_balance, inpaint, inpaint_vids, background_removal, save_background_removal_masks, gen_normal, pre_depth_background_removal, background_removal_model, gen_mesh, mesh_occlude, mesh_spherical] + inp = GradioComponentBundle() + with gr.Blocks(): + with gr.Row(): + inp += 'compute_device', gr.Radio(label="Compute on", choices=['GPU', 'CPU'], value='GPU') + # TODO: Should return value instead of index. Maybe Enum should be used? + inp += 'model_type', gr.Dropdown(label="Model", + choices=['res101', 'dpt_beit_large_512 (midas 3.1)', + 'dpt_beit_large_384 (midas 3.1)', 'dpt_large_384 (midas 3.0)', + 'dpt_hybrid_384 (midas 3.0)', + 'midas_v21', 'midas_v21_small', + 'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk'], + value='res101', + type="index") + with gr.Group(): + with gr.Row(): + inp += 'boost', gr.Checkbox(label="BOOST (multi-resolution merging)", value=True) + inp += 'invert_depth', gr.Checkbox(label="Invert DepthMap (black=near, white=far)", value=False) + with gr.Group(visible=False) as options_depend_on_boost: + inp += 'match_size', gr.Checkbox(label="Match input size", value=False) + with gr.Row() as options_depend_on_match_size: + inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=512) + inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=512) + + with gr.Group(): + with gr.Row(): + inp += 'clipdepth', gr.Checkbox(label="Clip and renormalize", value=False) + with gr.Row(visible=False) as clip_options_row_1: + inp += "clipthreshold_far", gr.Slider(minimum=0, maximum=1, step=0.001, label='Far clip', value=0) + inp += "clipthreshold_near", gr.Slider(minimum=0, maximum=1, step=0.001, label='Near clip', value=1) + + with gr.Group(): + with gr.Row(): + inp += "combine_output", gr.Checkbox(label="Combine into one image", value=False) + inp += "combine_output_axis", gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], + value='Horizontal', type="index") + with gr.Row(): + inp += "save_depth", gr.Checkbox(label="Save DepthMap", value=True) + inp += "show_depth", gr.Checkbox(label="Show DepthMap", value=True) + inp += "show_heat", gr.Checkbox(label="Show HeatMap", value=False) + + with gr.Group(): + with gr.Row(): + inp += "gen_stereo", gr.Checkbox(label="Generate stereoscopic image(s)", value=False) + with gr.Group(visible=False) as stereo_options_row_0: + with gr.Row(): + inp += "stereo_modes", gr.CheckboxGroup( + ["left-right", "right-left", "top-bottom", "bottom-top", "red-cyan-anaglyph"], + label="Output", value=["left-right", "red-cyan-anaglyph"]) + + with gr.Row(visible=False) as stereo_options_row_1: + inp += "stereo_divergence", gr.Slider(minimum=0.05, maximum=10.005, step=0.01, + label='Divergence (3D effect)', + value=2.5) + inp += 
"stereo_separation", gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, + label='Separation (moves images apart)', + value=0.0) + with gr.Row(visible=False) as stereo_options_row_2: + inp += "stereo_fill", gr.Dropdown(label="Gap fill technique", + choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', + 'polylines_sharp'], value='polylines_sharp', type="value") + inp += "stereo_balance", gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, label='Balance between eyes', + value=0.0) + + with gr.Group(): + with gr.Row(): + inp += "gen_mesh", gr.Checkbox( + label="Generate simple 3D mesh. " + "(Fast, accurate only with ZoeDepth models and no boost, no custom maps)", + value=False, visible=True) + with gr.Row(visible=False) as mesh_options_row_0: + inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) + inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) + + with gr.Group(visible=is_depth_tab): + with gr.Row(): + inp += "inpaint", gr.Checkbox( + label="Generate 3D inpainted mesh. (Sloooow, required for generating videos)", value=False, + visible=is_depth_tab) + with gr.Row(visible=False) as inpaint_options_row_0: + inp += "inpaint_vids", gr.Checkbox(label="Generate 4 demo videos with 3D inpainted mesh.", value=False, + visible=is_depth_tab) + + with gr.Group(): + with gr.Row(): + inp += "background_removal", gr.Checkbox(label="Remove background", value=False) + with gr.Row(visible=False) as bgrem_options_row_1: + inp += "save_background_removal_masks", gr.Checkbox(label="Save the foreground masks", value=False) + inp += "pre_depth_background_removal", gr.Checkbox(label="Pre-depth background removal", value=False) + with gr.Row(visible=False) as bgrem_options_row_2: + inp += "background_removal_model", gr.Dropdown(label="Rembg Model", + choices=['u2net', 'u2netp', 'u2net_human_seg', + 'silueta'], + value='u2net', type="value") + + with gr.Box(): + gr.HTML("Information, comment and share @ " + "https://github.com/thygate/stable-diffusion-webui-depthmap-script") + + inp += "gen_normal", gr.Checkbox(label="Generate Normalmap (hidden! 
api only)", value=False, visible=False) + + inp['boost'].change( + fn=lambda a: options_depend_on_boost.update(visible=not a), + inputs=[inp['boost']], + outputs=[options_depend_on_boost] + ) + inp['match_size'].change( + fn=lambda a: options_depend_on_match_size.update(visible=not a), + inputs=[inp['match_size']], + outputs=[options_depend_on_match_size] + ) + + inp['clipdepth'].change( + fn=lambda v: clip_options_row_1.update(visible=v), + inputs=[inp['clipdepth']], + outputs=[clip_options_row_1] + ) + inp['clipthreshold_far'].change( + fn=lambda a, b: a if b < a else b, + inputs=[inp['clipthreshold_far'], inp['clipthreshold_near']], + outputs=[inp['clipthreshold_near']] + ) + inp['clipthreshold_near'].change( + fn=lambda a, b: a if b > a else b, + inputs=[inp['clipthreshold_near'], inp['clipthreshold_far']], + outputs=[inp['clipthreshold_far']] + ) + + # Invert_depthmap must not be used with gen_stereo - otherwise stereo images look super-wrong + inp['gen_stereo'].change( + fn=lambda a, b: False if b else a, + inputs=[inp['invert_depth'], inp['gen_stereo']], + outputs=[inp['invert_depth']] + ) + inp['gen_stereo'].change( + fn=lambda a, b: inp['invert_depth'].update(interactive=not b), + inputs=[inp['invert_depth'], inp['gen_stereo']], + outputs=[inp['invert_depth']] + ) + + def stereo_options_visibility(v): + return stereo_options_row_0.update(visible=v), \ + stereo_options_row_1.update(visible=v), \ + stereo_options_row_2.update(visible=v) + + inp['gen_stereo'].change( + fn=stereo_options_visibility, + inputs=[inp['gen_stereo']], + outputs=[stereo_options_row_0, stereo_options_row_1, stereo_options_row_2] + ) + + def mesh_options_visibility(v): + return mesh_options_row_0.update(visible=v) + + inp['gen_mesh'].change( + fn=mesh_options_visibility, + inputs=[inp['gen_mesh']], + outputs=[mesh_options_row_0] + ) + + def inpaint_options_visibility(v): + return inpaint_options_row_0.update(visible=v) + + inp['inpaint'].change( + fn=inpaint_options_visibility, + inputs=[inp['inpaint']], + outputs=[inpaint_options_row_0] + ) + + def background_removal_options_visibility(v): + return bgrem_options_row_1.update(visible=v), \ + bgrem_options_row_2.update(visible=v) + + inp['background_removal'].change( + fn=background_removal_options_visibility, + inputs=[inp['background_removal']], + outputs=[bgrem_options_row_1, bgrem_options_row_2] + ) + + return inp class Script(scripts.Script): - def title(self): - return scriptname - - def show(self, is_img2img): - return True - - def ui(self, is_img2img): - with gr.Column(variant='panel'): - ret = main_ui_panel(False) - return ret - - # run from script in txt2img or img2img - def run(self, p, - compute_device, model_type, net_width, net_height, match_size, boost, invert_depth, clipdepth, clipthreshold_far, clipthreshold_near, combine_output, combine_output_axis, save_depth, show_depth, show_heat, gen_stereo, stereo_modes, stereo_divergence, stereo_separation, stereo_fill, stereo_balance, inpaint, inpaint_vids, background_removal, save_background_removal_masks, gen_normal, pre_depth_background_removal, background_removal_model, gen_mesh, mesh_occlude, mesh_spherical - ): - - # sd process - processed = processing.process_images(p) - - processed.sampler = p.sampler # for create_infotext - - inputimages = [] - for count in range(0, len(processed.images)): - # skip first grid image - if count == 0 and len(processed.images) > 1 and opts.return_grid: - continue - inputimages.append(processed.images[count]) - - #remove on base image before depth calculation - 
background_removed_images = [] - if background_removal: - if pre_depth_background_removal: - inputimages = batched_background_removal(inputimages, background_removal_model) - background_removed_images = inputimages - else: - background_removed_images = batched_background_removal(inputimages, background_removal_model) - - newmaps, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, - compute_device, model_type, - net_width, net_height, match_size, boost, invert_depth, clipdepth, clipthreshold_far, clipthreshold_near, combine_output, combine_output_axis, save_depth, show_depth, show_heat, gen_stereo, stereo_modes, stereo_divergence, stereo_separation, stereo_fill, stereo_balance, inpaint, inpaint_vids, background_removal, save_background_removal_masks, gen_normal, - background_removed_images, "mp4", 0, False, None, False, gen_mesh, mesh_occlude, mesh_spherical ) - - for img in newmaps: - processed.images.append(img) - - return processed + def title(self): + return scriptname + + def show(self, is_img2img): + return True + + def ui(self, is_img2img): + gr.HTML() # Work around a Gradio bug + with gr.Column(variant='panel'): + gr.HTML() # Work around a Gradio bug + ret = main_ui_panel(False) + ret += ret.enkey_tail() + return ret.enkey_body() + + # run from script in txt2img or img2img + def run(self, p, *inputs): + inp = GradioComponentBundle.enkey_to_dict(inputs) + + # sd process + processed = processing.process_images(p) + processed.sampler = p.sampler # for create_infotext + + inputimages = [] + for count in range(0, len(processed.images)): + # skip first grid image + if count == 0 and len(processed.images) > 1 and opts.return_grid: + continue + inputimages.append(processed.images[count]) + + # TODO: this should not be here + # remove on base image before depth calculation + background_removed_images = [] + if inp['background_removal']: + if inp['pre_depth_background_removal']: + inputimages = batched_background_removal(inputimages, inp['background_removal_model']) + background_removed_images = inputimages + else: + background_removed_images = batched_background_removal(inputimages, inp['background_removal_model']) + + newmaps, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inp, + background_removed_images) + for img in newmaps: + processed.images.append(img) + + return processed + def unload_sd_model(): - if shared.sd_model is not None: - shared.sd_model.cond_stage_model.to(devices.cpu) - shared.sd_model.first_stage_model.to(devices.cpu) + if shared.sd_model is not None: + shared.sd_model.cond_stage_model.to(devices.cpu) + shared.sd_model.first_stage_model.to(devices.cpu) + def reload_sd_model(): - if shared.sd_model is not None: - shared.sd_model.cond_stage_model.to(devices.device) - shared.sd_model.first_stage_model.to(devices.device) - -def run_depthmap(processed, outpath, inputimages, inputnames, - compute_device, model_type, net_width, net_height, match_size, boost, invert_depth, clipdepth, clipthreshold_far, clipthreshold_near, combine_output, combine_output_axis, save_depth, show_depth, show_heat, gen_stereo, stereo_modes, stereo_divergence, stereo_separation, stereo_fill, stereo_balance, inpaint, inpaint_vids, background_removal, save_background_removal_masks, gen_normal, - background_removed_images, fnExt, vid_ssaa, custom_depthmap, custom_depthmap_img, depthmap_batch_reuse, gen_mesh, mesh_occlude, mesh_spherical): - - if len(inputimages) == 0 or inputimages[0] == None: - return [], [] - - 
print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") - - unload_sd_model() - - meshsimple_fi = None - mesh_fi = None - - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - # init torch device - global device - if compute_device == 0: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - else: - device = torch.device("cpu") - print("device: %s" % device) - - # model path and name - model_dir = "./models/midas" - if model_type == 0: - model_dir = "./models/leres" - # create paths to model if not present - os.makedirs(model_dir, exist_ok=True) - os.makedirs('./models/pix2pix', exist_ok=True) - - global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type, depthmap_device_idx - loadmodels = True - if hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels: - loadmodels = False - if depthmap_model_type != model_type or depthmap_model_depth == None or depthmap_device_idx != compute_device: - del depthmap_model_depth - depthmap_model_depth = None - loadmodels = True - - outimages = [] - try: - if loadmodels and not (custom_depthmap and custom_depthmap_img != None): - print("Loading model weights from ", end=" ") - - #"res101" - if model_type == 0: - model_path = f"{model_dir}/res101.pth" - print(model_path) - ensure_file_downloaded( - model_path, - ["https://cloudstor.aarnet.edu.au/plus/s/lTIJF4vrvHCAI31/download", - "https://huggingface.co/lllyasviel/Annotators/resolve/5bc80eec2b4fddbb/res101.pth", - ], - "1d696b2ef3e8336b057d0c15bc82d2fecef821bfebe5ef9d7671a5ec5dde520b") - ensure_file_downloaded(model_path, "https://cloudstor.aarnet.edu.au/plus/s/lTIJF4vrvHCAI31/download") - if compute_device == 0: - checkpoint = torch.load(model_path) - else: - checkpoint = torch.load(model_path,map_location=torch.device('cpu')) - model = RelDepthModel(backbone='resnext101') - model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True) - del checkpoint - devices.torch_gc() - - #"dpt_beit_large_512" midas 3.1 - if model_type == 1: - model_path = f"{model_dir}/dpt_beit_large_512.pt" - print(model_path) - ensure_file_downloaded(model_path, "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt") - model = DPTDepthModel( - path=model_path, - backbone="beitl16_512", - non_negative=True, - ) - net_w, net_h = 512, 512 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - #"dpt_beit_large_384" midas 3.1 - if model_type == 2: - model_path = f"{model_dir}/dpt_beit_large_384.pt" - print(model_path) - ensure_file_downloaded(model_path, "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt") - model = DPTDepthModel( - path=model_path, - backbone="beitl16_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - #"dpt_large_384" midas 3.0 - if model_type == 3: - model_path = f"{model_dir}/dpt_large-midas-2f21e586.pt" - print(model_path) - ensure_file_downloaded(model_path, "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt") - model = DPTDepthModel( - path=model_path, - backbone="vitl16_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - #"dpt_hybrid_384" midas 3.0 - elif model_type == 4: - model_path = 
f"{model_dir}/dpt_hybrid-midas-501f0c75.pt" - print(model_path) - ensure_file_downloaded(model_path,"https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid-midas-501f0c75.pt") - model = DPTDepthModel( - path=model_path, - backbone="vitb_rn50_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode="minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - #"midas_v21" - elif model_type == 5: - model_path = f"{model_dir}/midas_v21-f6b98070.pt" - print(model_path) - ensure_file_downloaded(model_path,"https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21-f6b98070.pt") - model = MidasNet(model_path, non_negative=True) - net_w, net_h = 384, 384 - resize_mode="upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - #"midas_v21_small" - elif model_type == 6: - model_path = f"{model_dir}/midas_v21_small-70d6b9c8.pt" - print(model_path) - ensure_file_downloaded(model_path,"https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21_small-70d6b9c8.pt") - model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True}) - net_w, net_h = 256, 256 - resize_mode="upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - # zoedepth_n - elif model_type == 7: - print("zoedepth_n\n") - conf = get_config("zoedepth", "infer") - conf.img_size = [net_width, net_height] - model = build_model(conf) - - # zoedepth_k - elif model_type == 8: - print("zoedepth_k\n") - conf = get_config("zoedepth", "infer", config_version="kitti") - conf.img_size = [net_width, net_height] - model = build_model(conf) - - # zoedepth_nk - elif model_type == 9: - print("zoedepth_nk\n") - conf = get_config("zoedepth_nk", "infer") - conf.img_size = [net_width, net_height] - model = build_model(conf) - - pix2pixmodel = None - # load merge network if boost enabled or keepmodels enabled - if boost or (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): - # sfu.ca unfortunately is not very reliable, we use a mirror just in case - ensure_file_downloaded( - './models/pix2pix/latest_net_G.pth', - ["https://huggingface.co/lllyasviel/Annotators/blob/9a7d84251d487d11/latest_net_G.pth", - "https://sfu.ca/~yagiz/CVPR21/latest_net_G.pth"], - '50ec735d74ed6499562d898f41b49343e521808b8dae589aa3c2f5c9ac9f7462') - opt = TestOptions().parse() - if compute_device == 1: - opt.gpu_ids = [] # cpu mode - pix2pixmodel = Pix2Pix4DepthModel(opt) - pix2pixmodel.save_dir = './models/pix2pix' - pix2pixmodel.load_networks('latest') - pix2pixmodel.eval() - - devices.torch_gc() - - # prepare for evaluation - model.eval() - - # optimize - if device == torch.device("cuda") and model_type < 7: - model = model.to(memory_format=torch.channels_last) - if not cmd_opts.no_half and model_type != 0 and not boost: - model = model.half() - - model.to(device) - - depthmap_model_depth = model - depthmap_model_pix2pix = pix2pixmodel - depthmap_model_type = model_type - depthmap_device_idx = compute_device - - if not loadmodels: - model = depthmap_model_depth - pix2pixmodel = depthmap_model_pix2pix - if device == torch.device("cuda"): - model = model.to(device) - - - print("Computing depthmap(s) ..") - inpaint_imgs = [] - inpaint_depths = [] - # iterate over input (generated) images - numimages = len(inputimages) - for count in trange(0, numimages): - - print('\n') - - # filename - basename = 
'depthmap' - - # figuring out the name of custom DepthMap - custom_depthmap_fn = None # None means that DepthMap should be computed - # find filename if in the single image mode - if custom_depthmap and custom_depthmap_img is not None: - custom_depthmap_fn = custom_depthmap_img.name - # find filename if in batch mode - if inputnames is not None and depthmap_batch_reuse: - save_depth = True - if inputnames[count] is not None: - p = Path(inputnames[count]) - basename = p.stem - if outpath != opts.outdir_extras_samples: - custom_depthmap_fn = os.path.join(outpath, basename + '-0000.' + opts.samples_format) - if not os.path.isfile(custom_depthmap_fn): - custom_depthmap_fn = None - - # override net size - if (match_size): - net_width, net_height = inputimages[count].width, inputimages[count].height - - # Convert single channel input (PIL) images to rgb - if inputimages[count].mode == 'I': - inputimages[count].point(lambda p: p*0.0039063096, mode='RGB') - inputimages[count] = inputimages[count].convert('RGB') - - # input image - img = cv2.cvtColor(np.asarray(inputimages[count]), cv2.COLOR_BGR2RGB) / 255.0 - - skipInvertAndSave = False - if custom_depthmap_fn is not None: - # use custom depthmap - dimg = Image.open(os.path.abspath(custom_depthmap_fn)) - # resize if not same size as input - if dimg.width != inputimages[count].width or dimg.height != inputimages[count].height: - dimg = dimg.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS) - if dimg.mode == 'I' or dimg.mode == 'P' or dimg.mode == 'L': - prediction = np.asarray(dimg, dtype="float") - else: - prediction = np.asarray(dimg, dtype="float")[:,:,0] - skipInvertAndSave = True #skip invert for leres model (0) - else: - # compute depthmap - if not boost: - if model_type == 0: - prediction = estimateleres(img, model, net_width, net_height) - elif model_type >= 7: - prediction = estimatezoedepth(inputimages[count], model, net_width, net_height) - else: - prediction = estimatemidas(img, model, net_width, net_height, resize_mode, normalization) - else: - prediction = estimateboost(img, model, model_type, pix2pixmodel) - - # output - depth = prediction - numbytes=2 - depth_min = depth.min() - depth_max = depth.max() - max_val = (2**(8*numbytes))-1 - - # check output before normalizing and mapping to 16 bit - if depth_max - depth_min > np.finfo("float").eps: - out = max_val * (depth - depth_min) / (depth_max - depth_min) - else: - out = np.zeros(depth.shape) - - # single channel, 16 bit image - img_output = out.astype("uint16") - - # invert depth map - if invert_depth ^ (((model_type == 0) or (model_type >= 7)) and not skipInvertAndSave): - img_output = cv2.bitwise_not(img_output) - - # apply depth clip and renormalize if enabled - if clipdepth: - img_output = clipdepthmap(img_output, clipthreshold_far, clipthreshold_near) - #img_output = cv2.blur(img_output, (3, 3)) - - # three channel, 8 bits per channel image - img_output2 = np.zeros_like(inputimages[count]) - img_output2[:,:,0] = img_output / 256.0 - img_output2[:,:,1] = img_output / 256.0 - img_output2[:,:,2] = img_output / 256.0 - - # if 3dinpainting, store maps for processing in second pass - if inpaint: - inpaint_imgs.append(inputimages[count]) - inpaint_depths.append(img_output) - - # get generation parameters - if processed is not None and hasattr(processed, 'all_prompts') and opts.enable_pnginfo: - info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, "", 0, count) - else: - info = None - - rgb_image = 
inputimages[count] - - #applying background masks after depth - if background_removal: - print('applying background masks') - background_removed_image = background_removed_images[count] - #maybe a threshold cut would be better on the line below. - background_removed_array = np.array(background_removed_image) - bg_mask = (background_removed_array[:,:,0]==0)&(background_removed_array[:,:,1]==0)&(background_removed_array[:,:,2]==0)&(background_removed_array[:,:,3]<=0.2) - far_value = 255 if invert_depth else 0 - - img_output[bg_mask] = far_value * far_value #255*255 or 0*0 - - #should this be optional - if (processed is not None): - images.save_image(background_removed_image, outpath, "", processed.all_seeds[count], processed.all_prompts[count], opts.samples_format, info=info, p=processed, suffix="_background_removed") - else: - images.save_image(background_removed_image, path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True,no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, forced_filename=None, suffix="_background_removed") - outimages.append(background_removed_image ) - if save_background_removal_masks: - bg_array = (1 - bg_mask.astype('int8'))*255 - mask_array = np.stack( (bg_array, bg_array, bg_array, bg_array), axis=2) - mask_image = Image.fromarray( mask_array.astype(np.uint8)) - if (processed is not None): - images.save_image(mask_image, outpath, "", processed.all_seeds[count], processed.all_prompts[count], opts.samples_format, info=info, p=processed, suffix="_foreground_mask") - else: - images.save_image(mask_image, path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True,no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, forced_filename=None, suffix="_foreground_mask") - outimages.append(mask_image) - - img_concat = np.concatenate((rgb_image, img_output2), axis=combine_output_axis) - if show_depth: - if not combine_output: - outimages.append(Image.fromarray(img_output)) - else: - outimages.append(Image.fromarray(img_concat)) - - if not skipInvertAndSave: - if not combine_output: - if save_depth and processed is not None: - # only save 16 bit single channel image when PNG format is selected - if opts.samples_format == "png": - try: - images.save_image(Image.fromarray(img_output), outpath, "", processed.all_seeds[count], processed.all_prompts[count], opts.samples_format, info=info, p=processed, suffix="_depth") - except Exception as ve: - if not ('image has wrong mode' in str(ve) or 'I;16' in str(ve)): raise ve - print('Catched exception: image has wrong mode!') - traceback.print_exc() - else: - images.save_image(Image.fromarray(img_output2), outpath, "", processed.all_seeds[count], processed.all_prompts[count], opts.samples_format, info=info, p=processed, suffix="_depth") - elif save_depth: - # from depth tab - # only save 16 bit single channel image when PNG format is selected - if opts.samples_format == "png": - try: - images.save_image(Image.fromarray(img_output), path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True,no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, forced_filename=None) - except Exception as ve: - if not ('image has wrong mode' in str(ve) or 'I;16' in str(ve)): raise ve - print('Catched exception: image has wrong mode!') - traceback.print_exc() - else: - 
images.save_image(Image.fromarray(img_output2), path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True,no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, forced_filename=None) - else: - if save_depth and processed is not None: - images.save_image(Image.fromarray(img_concat), outpath, "", processed.all_seeds[count], processed.all_prompts[count], opts.samples_format, info=info, p=processed, suffix="_depth") - elif save_depth: - # from tab - images.save_image(Image.fromarray(img_concat), path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True,no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, forced_filename=None) - if show_heat: - heatmap = colorize(img_output, cmap='inferno') - outimages.append(heatmap) - - if gen_stereo: - print("Generating stereoscopic images..") - - stereomodes = stereo_modes - stereoimages = create_stereoimages(inputimages[count], img_output, stereo_divergence, stereo_separation, stereomodes, stereo_balance, stereo_fill) - - for c in range(0, len(stereoimages)): - outimages.append(stereoimages[c]) - if processed is not None: - images.save_image(stereoimages[c], outpath, "", processed.all_seeds[count], - processed.all_prompts[count], opts.samples_format, info=info, p=processed, - suffix=f"_{stereomodes[c]}") - else: - # from tab - images.save_image(stereoimages[c], path=outpath, basename=basename, seed=None, - prompt=None, extension=opts.samples_format, info=info, short_filename=True, - no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, - forced_filename=None, suffix=f"_{stereomodes[c]}") - - if gen_normal: - # taken from @graemeniedermayer, hidden, for api use only, will remove in future. - # take gradients - zx = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 1, 0, ksize=3) - zy = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 0, 1, ksize=3) - - # combine and normalize gradients. 
- normal = np.dstack((zx, -zy, np.ones_like(img_output))) - n = np.linalg.norm(normal, axis=2) - normal[:, :, 0] /= n - normal[:, :, 1] /= n - normal[:, :, 2] /= n - - # offset and rescale values to be in 0-255 - normal += 1 - normal /= 2 - normal *= 255 - normal = normal.astype(np.uint8) - - outimages.append(Image.fromarray(normal)) - - # gen mesh - if gen_mesh: - print(f"\nGenerating (occluded) mesh ..") - - meshsimple_fi = get_uniquefn(outpath, basename, 'obj') - meshsimple_fi = os.path.join(outpath, meshsimple_fi + '_simple.obj') - - depthi = prediction - # try to map output to sensible values for non zoedepth models, boost, or custom maps - if model_type < 7 or boost or (custom_depthmap and custom_depthmap_img != None): - # invert if midas - if model_type > 0 or ((custom_depthmap and custom_depthmap_img != None) and not invert_depth): - depthi = depth_max - depthi + depth_min - depth_max = depthi.max() - depth_min = depthi.min() - # make positive - if depth_min < 0: - depthi = depthi - depth_min - depth_max = depthi.max() - depth_min = depthi.min() - # scale down - if depthi.max() > 10: - depthi = 4 * (depthi - depth_min) / (depth_max - depth_min) - # offset - depthi = depthi + 1 - - mesh = create_mesh(inputimages[count], depthi, keep_edges=not mesh_occlude, spherical=mesh_spherical) - save_mesh_obj(meshsimple_fi, mesh) - - print("Done.") - - except RuntimeError as e: - if 'out of memory' in str(e): - print("ERROR: out of memory, could not generate depthmap !") - else: - print(e) - - finally: - if not (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): - if 'model' in locals(): - del model - if boost and 'pix2pixmodel' in locals(): - del pix2pixmodel - else: - if 'model' in locals(): - model.to(devices.cpu) - - gc.collect() - devices.torch_gc() - reload_sd_model() - try: - if inpaint: - unload_sd_model() - mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, fnExt, vid_ssaa, inpaint_vids) - - finally: - reload_sd_model() - print("All done.") - - return outimages, mesh_fi, meshsimple_fi + if shared.sd_model is not None: + shared.sd_model.cond_stage_model.to(devices.device) + shared.sd_model.first_stage_model.to(devices.device) + + +def run_depthmap(processed, outpath, inputimages, inputnames, inp, background_removed_images): + if len(inputimages) == 0 or inputimages[0] is None: + return [], [] + + background_removal = inp["background_removal"] + background_removal_model = inp["background_removal_model"] + boost = inp["boost"] + clipdepth = inp["clipdepth"] + clipthreshold_far = inp["clipthreshold_far"] + clipthreshold_near = inp["clipthreshold_near"] + combine_output = inp["combine_output"] + combine_output_axis = inp["combine_output_axis"] + depthmap_compute_device = inp["compute_device"] + gen_mesh = inp["gen_mesh"] + gen_normal = inp["gen_normal"] + gen_stereo = inp["gen_stereo"] + inpaint = inp["inpaint"] + inpaint_vids = inp["inpaint_vids"] + invert_depth = inp["invert_depth"] + match_size = inp["match_size"] + mesh_occlude = inp["mesh_occlude"] + mesh_spherical = inp["mesh_spherical"] + model_type = inp["model_type"] + net_height = inp["net_height"] + net_width = inp["net_width"] + pre_depth_background_removal = inp["pre_depth_background_removal"] + save_background_removal_masks = inp["save_background_removal_masks"] + save_depth = inp["save_depth"] + show_depth = inp["show_depth"] + show_heat = inp["show_heat"] + stereo_balance = inp["stereo_balance"] + stereo_divergence = inp["stereo_divergence"] + stereo_fill = 
inp["stereo_fill"] + stereo_modes = inp["stereo_modes"] + stereo_separation = inp["stereo_separation"] + + custom_depthmap = inp["custom_depthmap"] if "custom_depthmap" in inp else "False" + custom_depthmap_img = inp["custom_depthmap_img"] if "custom_depthmap_img" in inp else None + depthmap_batch_reuse = inp["depthmap_batch_reuse"] if "depthmap_batch_reuse" in inp else True + fnExt = inp["fnExt"] if "fnExt" in inp else "mp4" + vid_ssaa = inp["vid_ssaa"] if "vid_ssaa" in inp else 0 + + print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") + + unload_sd_model() + + meshsimple_fi = None + mesh_fi = None + + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # init torch device + global device + if depthmap_compute_device == 'GPU': + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + else: + device = torch.device("cpu") + print("device: %s" % device) + + # model path and name + model_dir = "./models/midas" + if model_type == 0: + model_dir = "./models/leres" + # create paths to model if not present + os.makedirs(model_dir, exist_ok=True) + os.makedirs('./models/pix2pix', exist_ok=True) + + global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type, depthmap_device_idx + loadmodels = True + if hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels: + loadmodels = False + if depthmap_model_type != model_type or depthmap_model_depth == None or depthmap_device_idx != depthmap_compute_device: + del depthmap_model_depth + depthmap_model_depth = None + loadmodels = True + + outimages = [] + try: + if loadmodels and not (custom_depthmap and custom_depthmap_img != None): + # TODO: loading model should be separated into a function that would return the model + # and the parameters needed for this. The rest of the run_depthmap should depend on what specific model + # is actually used for the generation. 
+ print("Loading model weights from ", end=" ") + + # "res101" + if model_type == 0: + model_path = f"{model_dir}/res101.pth" + print(model_path) + ensure_file_downloaded( + model_path, + ["https://cloudstor.aarnet.edu.au/plus/s/lTIJF4vrvHCAI31/download", + "https://huggingface.co/lllyasviel/Annotators/resolve/5bc80eec2b4fddbb/res101.pth", + ], + "1d696b2ef3e8336b057d0c15bc82d2fecef821bfebe5ef9d7671a5ec5dde520b") + ensure_file_downloaded(model_path, "https://cloudstor.aarnet.edu.au/plus/s/lTIJF4vrvHCAI31/download") + if depthmap_compute_device == 'GPU': + checkpoint = torch.load(model_path) + else: + checkpoint = torch.load(model_path, map_location=torch.device('cpu')) + model = RelDepthModel(backbone='resnext101') + model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True) + del checkpoint + devices.torch_gc() + + # "dpt_beit_large_512" midas 3.1 + if model_type == 1: + model_path = f"{model_dir}/dpt_beit_large_512.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt") + model = DPTDepthModel( + path=model_path, + backbone="beitl16_512", + non_negative=True, + ) + net_w, net_h = 512, 512 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # "dpt_beit_large_384" midas 3.1 + if model_type == 2: + model_path = f"{model_dir}/dpt_beit_large_384.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt") + model = DPTDepthModel( + path=model_path, + backbone="beitl16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # "dpt_large_384" midas 3.0 + if model_type == 3: + model_path = f"{model_dir}/dpt_large-midas-2f21e586.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt") + model = DPTDepthModel( + path=model_path, + backbone="vitl16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # "dpt_hybrid_384" midas 3.0 + elif model_type == 4: + model_path = f"{model_dir}/dpt_hybrid-midas-501f0c75.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid-midas-501f0c75.pt") + model = DPTDepthModel( + path=model_path, + backbone="vitb_rn50_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # "midas_v21" + elif model_type == 5: + model_path = f"{model_dir}/midas_v21-f6b98070.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21-f6b98070.pt") + model = MidasNet(model_path, non_negative=True) + net_w, net_h = 384, 384 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + # "midas_v21_small" + elif model_type == 6: + model_path = f"{model_dir}/midas_v21_small-70d6b9c8.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21_small-70d6b9c8.pt") + model = MidasNet_small(model_path, features=64, 
backbone="efficientnet_lite3", exportable=True, + non_negative=True, blocks={'expand': True}) + net_w, net_h = 256, 256 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + # zoedepth_n + elif model_type == 7: + print("zoedepth_n\n") + conf = get_config("zoedepth", "infer") + conf.img_size = [net_width, net_height] + model = build_model(conf) + + # zoedepth_k + elif model_type == 8: + print("zoedepth_k\n") + conf = get_config("zoedepth", "infer", config_version="kitti") + conf.img_size = [net_width, net_height] + model = build_model(conf) + + # zoedepth_nk + elif model_type == 9: + print("zoedepth_nk\n") + conf = get_config("zoedepth_nk", "infer") + conf.img_size = [net_width, net_height] + model = build_model(conf) + + pix2pixmodel = None + # load merge network if boost enabled or keepmodels enabled + if boost or (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): + # sfu.ca unfortunately is not very reliable, we use a mirror just in case + ensure_file_downloaded( + './models/pix2pix/latest_net_G.pth', + ["https://huggingface.co/lllyasviel/Annotators/blob/9a7d84251d487d11/latest_net_G.pth", + "https://sfu.ca/~yagiz/CVPR21/latest_net_G.pth"], + '50ec735d74ed6499562d898f41b49343e521808b8dae589aa3c2f5c9ac9f7462') + opt = TestOptions().parse() + if depthmap_compute_device == "CPU": + opt.gpu_ids = [] + pix2pixmodel = Pix2Pix4DepthModel(opt) + pix2pixmodel.save_dir = './models/pix2pix' + pix2pixmodel.load_networks('latest') + pix2pixmodel.eval() + + devices.torch_gc() + + # prepare for evaluation + model.eval() + + # optimize + if device == torch.device("cuda") and model_type < 7: + model = model.to(memory_format=torch.channels_last) + if not cmd_opts.no_half and model_type != 0 and not boost: + model = model.half() + + model.to(device) + + depthmap_model_depth = model + depthmap_model_pix2pix = pix2pixmodel + depthmap_model_type = model_type + depthmap_device_idx = depthmap_compute_device + + if not loadmodels: + model = depthmap_model_depth + pix2pixmodel = depthmap_model_pix2pix + if device == torch.device("cuda"): + model = model.to(device) + + print("Computing depthmap(s) ..") + inpaint_imgs = [] + inpaint_depths = [] + # iterate over input (generated) images + numimages = len(inputimages) + for count in trange(0, numimages): + + print('\n') + + # filename + basename = 'depthmap' + + # TODO: this should not use heuristics to figure out the mode, mode should ideally be abstracted away + # figuring out the name of custom DepthMap + custom_depthmap_fn = None # None means that DepthMap should be computed + # find filename if in the single image mode + if custom_depthmap and custom_depthmap_img is not None: + custom_depthmap_fn = custom_depthmap_img.name + # find filename if in batch mode + if inputnames is not None and depthmap_batch_reuse: + save_depth = True + if inputnames[count] is not None: + p = Path(inputnames[count]) + basename = p.stem + if outpath != opts.outdir_extras_samples: + custom_depthmap_fn = os.path.join(outpath, basename + '-0000.' 
+ opts.samples_format) + if not os.path.isfile(custom_depthmap_fn): + custom_depthmap_fn = None + + # override net size + if (match_size): + net_width, net_height = inputimages[count].width, inputimages[count].height + + # Convert single channel input (PIL) images to rgb + if inputimages[count].mode == 'I': + inputimages[count].point(lambda p: p * 0.0039063096, mode='RGB') + inputimages[count] = inputimages[count].convert('RGB') + + # input image + img = cv2.cvtColor(np.asarray(inputimages[count]), cv2.COLOR_BGR2RGB) / 255.0 + + skipInvertAndSave = False + if custom_depthmap_fn is not None: + # use custom depthmap + dimg = Image.open(os.path.abspath(custom_depthmap_fn)) + # resize if not same size as input + if dimg.width != inputimages[count].width or dimg.height != inputimages[count].height: + dimg = dimg.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS) + if dimg.mode == 'I' or dimg.mode == 'P' or dimg.mode == 'L': + prediction = np.asarray(dimg, dtype="float") + else: + prediction = np.asarray(dimg, dtype="float")[:, :, 0] + skipInvertAndSave = True # skip invert for leres model (0) + else: + # compute depthmap + if not boost: + if model_type == 0: + prediction = estimateleres(img, model, net_width, net_height) + elif model_type >= 7: + prediction = estimatezoedepth(inputimages[count], model, net_width, net_height) + else: + prediction = estimatemidas(img, model, net_width, net_height, resize_mode, normalization) + else: + prediction = estimateboost(img, model, model_type, pix2pixmodel) + + # output + depth = prediction + numbytes = 2 + depth_min = depth.min() + depth_max = depth.max() + max_val = (2 ** (8 * numbytes)) - 1 + + # check output before normalizing and mapping to 16 bit + if depth_max - depth_min > np.finfo("float").eps: + out = max_val * (depth - depth_min) / (depth_max - depth_min) + else: + out = np.zeros(depth.shape) + + # single channel, 16 bit image + img_output = out.astype("uint16") + + # invert depth map + if invert_depth ^ (((model_type == 0) or (model_type >= 7)) and not skipInvertAndSave): + img_output = cv2.bitwise_not(img_output) + + # apply depth clip and renormalize if enabled + if clipdepth: + img_output = clipdepthmap(img_output, clipthreshold_far, clipthreshold_near) + # img_output = cv2.blur(img_output, (3, 3)) + + # three channel, 8 bits per channel image + img_output2 = np.zeros_like(inputimages[count]) + img_output2[:, :, 0] = img_output / 256.0 + img_output2[:, :, 1] = img_output / 256.0 + img_output2[:, :, 2] = img_output / 256.0 + + # if 3dinpainting, store maps for processing in second pass + if inpaint: + inpaint_imgs.append(inputimages[count]) + inpaint_depths.append(img_output) + + # get generation parameters + if processed is not None and hasattr(processed, 'all_prompts') and opts.enable_pnginfo: + info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, + "", 0, count) + else: + info = None + + rgb_image = inputimages[count] + + # applying background masks after depth + if background_removal: + print('applying background masks') + background_removed_image = background_removed_images[count] + # maybe a threshold cut would be better on the line below. 
+ background_removed_array = np.array(background_removed_image) + bg_mask = (background_removed_array[:, :, 0] == 0) & (background_removed_array[:, :, 1] == 0) & ( + background_removed_array[:, :, 2] == 0) & (background_removed_array[:, :, 3] <= 0.2) + far_value = 255 if invert_depth else 0 + + img_output[bg_mask] = far_value * far_value # 255*255 or 0*0 + + # should this be optional + if (processed is not None): + images.save_image(background_removed_image, outpath, "", processed.all_seeds[count], + processed.all_prompts[count], opts.samples_format, info=info, p=processed, + suffix="_background_removed") + else: + images.save_image(background_removed_image, path=outpath, basename=basename, seed=None, prompt=None, + extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, + grid=False, pnginfo_section_name="extras", existing_info=None, + forced_filename=None, suffix="_background_removed") + outimages.append(background_removed_image) + if save_background_removal_masks: + bg_array = (1 - bg_mask.astype('int8')) * 255 + mask_array = np.stack((bg_array, bg_array, bg_array, bg_array), axis=2) + mask_image = Image.fromarray(mask_array.astype(np.uint8)) + if (processed is not None): + images.save_image(mask_image, outpath, "", processed.all_seeds[count], + processed.all_prompts[count], opts.samples_format, info=info, p=processed, + suffix="_foreground_mask") + else: + images.save_image(mask_image, path=outpath, basename=basename, seed=None, prompt=None, + extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, + grid=False, pnginfo_section_name="extras", existing_info=None, + forced_filename=None, suffix="_foreground_mask") + outimages.append(mask_image) + + img_concat = np.concatenate((rgb_image, img_output2), axis=combine_output_axis) + if show_depth: + if not combine_output: + outimages.append(Image.fromarray(img_output)) + else: + outimages.append(Image.fromarray(img_concat)) + + if not skipInvertAndSave: + if not combine_output: + if save_depth and processed is not None: + # only save 16 bit single channel image when PNG format is selected + if opts.samples_format == "png": + try: + images.save_image(Image.fromarray(img_output), outpath, "", processed.all_seeds[count], + processed.all_prompts[count], opts.samples_format, info=info, + p=processed, suffix="_depth") + except Exception as ve: + if not ('image has wrong mode' in str(ve) or 'I;16' in str(ve)): raise ve + print('Catched exception: image has wrong mode!') + traceback.print_exc() + else: + images.save_image(Image.fromarray(img_output2), outpath, "", processed.all_seeds[count], + processed.all_prompts[count], opts.samples_format, info=info, p=processed, + suffix="_depth") + elif save_depth: + # from depth tab + # only save 16 bit single channel image when PNG format is selected + if opts.samples_format == "png": + try: + images.save_image(Image.fromarray(img_output), path=outpath, basename=basename, + seed=None, prompt=None, extension=opts.samples_format, info=info, + short_filename=True, no_prompt=True, grid=False, + pnginfo_section_name="extras", existing_info=None, + forced_filename=None) + except Exception as ve: + if not ('image has wrong mode' in str(ve) or 'I;16' in str(ve)): raise ve + print('Catched exception: image has wrong mode!') + traceback.print_exc() + else: + images.save_image(Image.fromarray(img_output2), path=outpath, basename=basename, seed=None, + prompt=None, extension=opts.samples_format, info=info, + short_filename=True, no_prompt=True, grid=False, + 
pnginfo_section_name="extras", existing_info=None, forced_filename=None) + else: + if save_depth and processed is not None: + images.save_image(Image.fromarray(img_concat), outpath, "", processed.all_seeds[count], + processed.all_prompts[count], opts.samples_format, info=info, p=processed, + suffix="_depth") + elif save_depth: + # from tab + images.save_image(Image.fromarray(img_concat), path=outpath, basename=basename, seed=None, + prompt=None, extension=opts.samples_format, info=info, short_filename=True, + no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, + forced_filename=None) + if show_heat: + heatmap = colorize(img_output, cmap='inferno') + outimages.append(heatmap) + + if gen_stereo: + print("Generating stereoscopic images..") + + stereomodes = stereo_modes + stereoimages = create_stereoimages(inputimages[count], img_output, stereo_divergence, stereo_separation, + stereomodes, stereo_balance, stereo_fill) + + for c in range(0, len(stereoimages)): + outimages.append(stereoimages[c]) + if processed is not None: + images.save_image(stereoimages[c], outpath, "", processed.all_seeds[count], + processed.all_prompts[count], opts.samples_format, info=info, p=processed, + suffix=f"_{stereomodes[c]}") + else: + # from tab + images.save_image(stereoimages[c], path=outpath, basename=basename, seed=None, + prompt=None, extension=opts.samples_format, info=info, short_filename=True, + no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, + forced_filename=None, suffix=f"_{stereomodes[c]}") + + if gen_normal: + # taken from @graemeniedermayer, hidden, for api use only, will remove in future. + # take gradients + zx = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 1, 0, ksize=3) + zy = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 0, 1, ksize=3) + + # combine and normalize gradients. 
+ normal = np.dstack((zx, -zy, np.ones_like(img_output))) + n = np.linalg.norm(normal, axis=2) + normal[:, :, 0] /= n + normal[:, :, 1] /= n + normal[:, :, 2] /= n + + # offset and rescale values to be in 0-255 + normal += 1 + normal /= 2 + normal *= 255 + normal = normal.astype(np.uint8) + + outimages.append(Image.fromarray(normal)) + + # gen mesh + if gen_mesh: + print(f"\nGenerating (occluded) mesh ..") + + meshsimple_fi = get_uniquefn(outpath, basename, 'obj') + meshsimple_fi = os.path.join(outpath, meshsimple_fi + '_simple.obj') + + depthi = prediction + # try to map output to sensible values for non zoedepth models, boost, or custom maps + if model_type < 7 or boost or (custom_depthmap and custom_depthmap_img != None): + # invert if midas + if model_type > 0 or ((custom_depthmap and custom_depthmap_img != None) and not invert_depth): + depthi = depth_max - depthi + depth_min + depth_max = depthi.max() + depth_min = depthi.min() + # make positive + if depth_min < 0: + depthi = depthi - depth_min + depth_max = depthi.max() + depth_min = depthi.min() + # scale down + if depthi.max() > 10: + depthi = 4 * (depthi - depth_min) / (depth_max - depth_min) + # offset + depthi = depthi + 1 + + mesh = create_mesh(inputimages[count], depthi, keep_edges=not mesh_occlude, spherical=mesh_spherical) + save_mesh_obj(meshsimple_fi, mesh) + + print("Done.") + + except RuntimeError as e: + if 'out of memory' in str(e): + print("ERROR: out of memory, could not generate depthmap !") + else: + print(e) + + finally: + if not (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): + if 'model' in locals(): + del model + if boost and 'pix2pixmodel' in locals(): + del pix2pixmodel + else: + if 'model' in locals(): + model.to(devices.cpu) + + gc.collect() + devices.torch_gc() + reload_sd_model() + try: + if inpaint: + unload_sd_model() + mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, fnExt, vid_ssaa, inpaint_vids) + finally: + reload_sd_model() + print("All done.") + + return outimages, mesh_fi, meshsimple_fi + @njit(parallel=True) def clipdepthmap(img, clipthreshold_far, clipthreshold_near): - clipped_img = img - w, h = img.shape - min = img.min() - max = img.max() - drange = max - min - clipthreshold_far = min + (clipthreshold_far * drange) - clipthreshold_near = min + (clipthreshold_near * drange) - - for x in prange(w): - for y in range(h): - if clipped_img[x,y] < clipthreshold_far: - clipped_img[x,y] = 0 - elif clipped_img[x,y] > clipthreshold_near: - clipped_img[x,y] = 65535 - else: - clipped_img[x,y] = ((clipped_img[x,y] + min) / drange * 65535) - - return clipped_img + clipped_img = img + w, h = img.shape + min = img.min() + max = img.max() + drange = max - min + clipthreshold_far = min + (clipthreshold_far * drange) + clipthreshold_near = min + (clipthreshold_near * drange) + + for x in prange(w): + for y in range(h): + if clipped_img[x, y] < clipthreshold_far: + clipped_img[x, y] = 0 + elif clipped_img[x, y] > clipthreshold_near: + clipped_img[x, y] = 65535 + else: + clipped_img[x, y] = ((clipped_img[x, y] + min) / drange * 65535) + + return clipped_img + def get_uniquefn(outpath, basename, ext): - # unique filename - basecount = get_next_sequence_number(outpath, basename) - if basecount > 0: basecount = basecount - 1 - fullfn = None - for i in range(500): - fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" - fullfn = os.path.join(outpath, f"{fn}.{ext}") - if not os.path.exists(fullfn): - break - basename = 
Path(fullfn).stem - - return basename + # unique filename + basecount = get_next_sequence_number(outpath, basename) + if basecount > 0: basecount = basecount - 1 + fullfn = None + for i in range(500): + fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" + fullfn = os.path.join(outpath, f"{fn}.{ext}") + if not os.path.exists(fullfn): + break + basename = Path(fullfn).stem + + return basename + def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, fnExt, vid_ssaa, inpaint_vids): - mesh_fi = '' - try: - print("Running 3D Photo Inpainting .. ") - edgemodel_path = './models/3dphoto/edge_model.pth' - depthmodel_path = './models/3dphoto/depth_model.pth' - colormodel_path = './models/3dphoto/color_model.pth' - # create paths to model if not present - os.makedirs('./models/3dphoto/', exist_ok=True) - - ensure_file_downloaded(edgemodel_path,"https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth") - ensure_file_downloaded(depthmodel_path,"https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth") - ensure_file_downloaded(colormodel_path,"https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth") - - print("Loading edge model ..") - depth_edge_model = Inpaint_Edge_Net(init_weights=True) - depth_edge_weight = torch.load(edgemodel_path, map_location=torch.device(device)) - depth_edge_model.load_state_dict(depth_edge_weight) - depth_edge_model = depth_edge_model.to(device) - depth_edge_model.eval() - print("Loading depth model ..") - depth_feat_model = Inpaint_Depth_Net() - depth_feat_weight = torch.load(depthmodel_path, map_location=torch.device(device)) - depth_feat_model.load_state_dict(depth_feat_weight, strict=True) - depth_feat_model = depth_feat_model.to(device) - depth_feat_model.eval() - depth_feat_model = depth_feat_model.to(device) - print("Loading rgb model ..") - rgb_model = Inpaint_Color_Net() - rgb_feat_weight = torch.load(colormodel_path, map_location=torch.device(device)) - rgb_model.load_state_dict(rgb_feat_weight) - rgb_model.eval() - rgb_model = rgb_model.to(device) - - config = {} - config["gpu_ids"] = 0 - config['extrapolation_thickness'] = 60 - config['extrapolate_border'] = True - config['depth_threshold'] = 0.04 - config['redundant_number'] = 12 - config['ext_edge_threshold'] = 0.002 - config['background_thickness'] = 70 - config['context_thickness'] = 140 - config['background_thickness_2'] = 70 - config['context_thickness_2'] = 70 - config['log_depth'] = True - config['depth_edge_dilate'] = 10 - config['depth_edge_dilate_2'] = 5 - config['largest_size'] = 512 - config['repeat_inpaint_edge'] = True - config['ply_fmt'] = "bin" - - config['save_ply'] = False - if hasattr(opts, 'depthmap_script_save_ply') and opts.depthmap_script_save_ply: - config['save_ply'] = True - - config['save_obj'] = True - - - if device == torch.device("cpu"): - config["gpu_ids"] = -1 - - # process all inputs - numimages = len(img_rgb) - for count in trange(0, numimages): - - basename = 'depthmap' - if inputnames is not None: - if inputnames[count] is not None: - p = Path(inputnames[count]) - basename = p.stem - - basename = get_uniquefn(outpath, basename, 'obj') - mesh_fi = os.path.join(outpath, basename + '.obj') - - print(f"\nGenerating inpainted mesh .. 
(go make some coffee) ..") - - # from inpaint.utils.get_MiDaS_samples - W = img_rgb[count].width - H = img_rgb[count].height - int_mtx = np.array([[max(H, W), 0, W//2], [0, max(H, W), H//2], [0, 0, 1]]).astype(np.float32) - if int_mtx.max() > 1: - int_mtx[0, :] = int_mtx[0, :] / float(W) - int_mtx[1, :] = int_mtx[1, :] / float(H) - - # how inpaint.utils.read_MiDaS_depth() imports depthmap - disp = img_depth[count].astype(np.float32) - disp = disp - disp.min() - disp = cv2.blur(disp / disp.max(), ksize=(3, 3)) * disp.max() - disp = (disp / disp.max()) * 3.0 - depth = 1. / np.maximum(disp, 0.05) - - # rgb input - img = np.asarray(img_rgb[count]) - - # run sparse bilateral filter - config['sparse_iter'] = 5 - config['filter_size'] = [7, 7, 5, 5, 5] - config['sigma_s'] = 4.0 - config['sigma_r'] = 0.5 - vis_photos, vis_depths = sparse_bilateral_filtering(depth.copy(), img.copy(), config, num_iter=config['sparse_iter'], spdb=False) - depth = vis_depths[-1] - - #bilat_fn = os.path.join(outpath, basename +'_bilatdepth.png') - #cv2.imwrite(bilat_fn, depth) - - rt_info = write_mesh(img, - depth, - int_mtx, - mesh_fi, - config, - rgb_model, - depth_edge_model, - depth_edge_model, - depth_feat_model) - - if rt_info is not False and inpaint_vids: - run_3dphoto_videos(mesh_fi, basename, outpath, 300, 40, - [0.03, 0.03, 0.05, 0.03], - ['double-straight-line', 'double-straight-line', 'circle', 'circle'], - [0.00, 0.00, -0.015, -0.015], - [0.00, 0.00, -0.015, -0.00], - [-0.05, -0.05, -0.05, -0.05], - ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, fnExt, vid_ssaa) - - devices.torch_gc() - - finally: - del rgb_model - rgb_model = None - del depth_edge_model - depth_edge_model = None - del depth_feat_model - depth_feat_model = None - devices.torch_gc() - - return mesh_fi - -def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range, y_shift_range, z_shift_range, video_postfix, vid_dolly, fnExt, vid_ssaa): - - if platform.system() == 'Windows': - vispy.use(app='PyQt5') - elif platform.system() == 'Darwin': - vispy.use('PyQt6') - else: - vispy.use(app='egl') - - # read ply - global video_mesh_data, video_mesh_fn - if video_mesh_fn == None or video_mesh_fn != mesh_fi: - del video_mesh_data - video_mesh_fn = mesh_fi - video_mesh_data = read_mesh(mesh_fi) - - verts, colors, faces, Height, Width, hFov, vFov, mean_loc_depth = video_mesh_data - - original_w = output_w = W = Width - original_h = output_h = H = Height - int_mtx = np.array([[max(H, W), 0, W//2], [0, max(H, W), H//2], [0, 0, 1]]).astype(np.float32) - if int_mtx.max() > 1: - int_mtx[0, :] = int_mtx[0, :] / float(W) - int_mtx[1, :] = int_mtx[1, :] / float(H) - - config = {} - config['video_folder'] = outpath - config['num_frames'] = num_frames - config['fps'] = fps - config['crop_border'] = crop_border - config['traj_types'] = traj_types - config['x_shift_range'] = x_shift_range - config['y_shift_range'] = y_shift_range - config['z_shift_range'] = z_shift_range - config['video_postfix'] = video_postfix - config['ssaa'] = vid_ssaa - - # from inpaint.utils.get_MiDaS_samples - generic_pose = np.eye(4) - assert len(config['traj_types']) == len(config['x_shift_range']) ==\ - len(config['y_shift_range']) == len(config['z_shift_range']) == len(config['video_postfix']), \ - "The number of elements in 'traj_types', 'x_shift_range', 'y_shift_range', 'z_shift_range' and \ - 'video_postfix' should be equal." 
- tgt_pose = [[generic_pose * 1]] - tgts_poses = [] - for traj_idx in range(len(config['traj_types'])): - tgt_poses = [] - sx, sy, sz = path_planning(config['num_frames'], config['x_shift_range'][traj_idx], config['y_shift_range'][traj_idx], - config['z_shift_range'][traj_idx], path_type=config['traj_types'][traj_idx]) - for xx, yy, zz in zip(sx, sy, sz): - tgt_poses.append(generic_pose * 1.) - tgt_poses[-1][:3, -1] = np.array([xx, yy, zz]) - tgts_poses += [tgt_poses] - tgt_pose = generic_pose * 1 - - # seems we only need the depthmap to calc mean_loc_depth, which is only used when doing 'dolly' - # width and height are already in the ply file in the comments .. - # might try to add the mean_loc_depth to it too - # did just that - #mean_loc_depth = img_depth[img_depth.shape[0]//2, img_depth.shape[1]//2] - - print("Generating videos ..") - - normal_canvas, all_canvas = None, None - videos_poses, video_basename = copy.deepcopy(tgts_poses), basename - top = (original_h // 2 - int_mtx[1, 2] * output_h) - left = (original_w // 2 - int_mtx[0, 2] * output_w) - down, right = top + output_h, left + output_w - border = [int(xx) for xx in [top, down, left, right]] - normal_canvas, all_canvas, fn_saved = output_3d_photo(verts.copy(), colors.copy(), faces.copy(), copy.deepcopy(Height), copy.deepcopy(Width), copy.deepcopy(hFov), copy.deepcopy(vFov), - copy.deepcopy(tgt_pose), config['video_postfix'], copy.deepcopy(generic_pose), copy.deepcopy(config['video_folder']), - None, copy.deepcopy(int_mtx), config, None, - videos_poses, video_basename, original_h, original_w, border=border, depth=None, normal_canvas=normal_canvas, all_canvas=all_canvas, - mean_loc_depth=mean_loc_depth, dolly=vid_dolly, fnExt=fnExt) - return fn_saved + mesh_fi = '' + try: + print("Running 3D Photo Inpainting .. 
") + edgemodel_path = './models/3dphoto/edge_model.pth' + depthmodel_path = './models/3dphoto/depth_model.pth' + colormodel_path = './models/3dphoto/color_model.pth' + # create paths to model if not present + os.makedirs('./models/3dphoto/', exist_ok=True) + + ensure_file_downloaded(edgemodel_path, + "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth") + ensure_file_downloaded(depthmodel_path, + "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth") + ensure_file_downloaded(colormodel_path, + "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth") + + print("Loading edge model ..") + depth_edge_model = Inpaint_Edge_Net(init_weights=True) + depth_edge_weight = torch.load(edgemodel_path, map_location=torch.device(device)) + depth_edge_model.load_state_dict(depth_edge_weight) + depth_edge_model = depth_edge_model.to(device) + depth_edge_model.eval() + print("Loading depth model ..") + depth_feat_model = Inpaint_Depth_Net() + depth_feat_weight = torch.load(depthmodel_path, map_location=torch.device(device)) + depth_feat_model.load_state_dict(depth_feat_weight, strict=True) + depth_feat_model = depth_feat_model.to(device) + depth_feat_model.eval() + depth_feat_model = depth_feat_model.to(device) + print("Loading rgb model ..") + rgb_model = Inpaint_Color_Net() + rgb_feat_weight = torch.load(colormodel_path, map_location=torch.device(device)) + rgb_model.load_state_dict(rgb_feat_weight) + rgb_model.eval() + rgb_model = rgb_model.to(device) + + config = {} + config["gpu_ids"] = 0 + config['extrapolation_thickness'] = 60 + config['extrapolate_border'] = True + config['depth_threshold'] = 0.04 + config['redundant_number'] = 12 + config['ext_edge_threshold'] = 0.002 + config['background_thickness'] = 70 + config['context_thickness'] = 140 + config['background_thickness_2'] = 70 + config['context_thickness_2'] = 70 + config['log_depth'] = True + config['depth_edge_dilate'] = 10 + config['depth_edge_dilate_2'] = 5 + config['largest_size'] = 512 + config['repeat_inpaint_edge'] = True + config['ply_fmt'] = "bin" + + config['save_ply'] = False + if hasattr(opts, 'depthmap_script_save_ply') and opts.depthmap_script_save_ply: + config['save_ply'] = True + + config['save_obj'] = True + + if device == torch.device("cpu"): + config["gpu_ids"] = -1 + + # process all inputs + numimages = len(img_rgb) + for count in trange(0, numimages): + + basename = 'depthmap' + if inputnames is not None: + if inputnames[count] is not None: + p = Path(inputnames[count]) + basename = p.stem + + basename = get_uniquefn(outpath, basename, 'obj') + mesh_fi = os.path.join(outpath, basename + '.obj') + + print(f"\nGenerating inpainted mesh .. (go make some coffee) ..") + + # from inpaint.utils.get_MiDaS_samples + W = img_rgb[count].width + H = img_rgb[count].height + int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32) + if int_mtx.max() > 1: + int_mtx[0, :] = int_mtx[0, :] / float(W) + int_mtx[1, :] = int_mtx[1, :] / float(H) + + # how inpaint.utils.read_MiDaS_depth() imports depthmap + disp = img_depth[count].astype(np.float32) + disp = disp - disp.min() + disp = cv2.blur(disp / disp.max(), ksize=(3, 3)) * disp.max() + disp = (disp / disp.max()) * 3.0 + depth = 1. 
/ np.maximum(disp, 0.05) + + # rgb input + img = np.asarray(img_rgb[count]) + + # run sparse bilateral filter + config['sparse_iter'] = 5 + config['filter_size'] = [7, 7, 5, 5, 5] + config['sigma_s'] = 4.0 + config['sigma_r'] = 0.5 + vis_photos, vis_depths = sparse_bilateral_filtering(depth.copy(), img.copy(), config, + num_iter=config['sparse_iter'], spdb=False) + depth = vis_depths[-1] + + # bilat_fn = os.path.join(outpath, basename +'_bilatdepth.png') + # cv2.imwrite(bilat_fn, depth) + + rt_info = write_mesh(img, + depth, + int_mtx, + mesh_fi, + config, + rgb_model, + depth_edge_model, + depth_edge_model, + depth_feat_model) + + if rt_info is not False and inpaint_vids: + run_3dphoto_videos(mesh_fi, basename, outpath, 300, 40, + [0.03, 0.03, 0.05, 0.03], + ['double-straight-line', 'double-straight-line', 'circle', 'circle'], + [0.00, 0.00, -0.015, -0.015], + [0.00, 0.00, -0.015, -0.00], + [-0.05, -0.05, -0.05, -0.05], + ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, fnExt, vid_ssaa) + + devices.torch_gc() + + finally: + del rgb_model + rgb_model = None + del depth_edge_model + depth_edge_model = None + del depth_feat_model + depth_feat_model = None + devices.torch_gc() + + return mesh_fi + + +def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range, + y_shift_range, z_shift_range, video_postfix, vid_dolly, fnExt, vid_ssaa): + if platform.system() == 'Windows': + vispy.use(app='PyQt5') + elif platform.system() == 'Darwin': + vispy.use('PyQt6') + else: + vispy.use(app='egl') + + # read ply + global video_mesh_data, video_mesh_fn + if video_mesh_fn == None or video_mesh_fn != mesh_fi: + del video_mesh_data + video_mesh_fn = mesh_fi + video_mesh_data = read_mesh(mesh_fi) + + verts, colors, faces, Height, Width, hFov, vFov, mean_loc_depth = video_mesh_data + + original_w = output_w = W = Width + original_h = output_h = H = Height + int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32) + if int_mtx.max() > 1: + int_mtx[0, :] = int_mtx[0, :] / float(W) + int_mtx[1, :] = int_mtx[1, :] / float(H) + + config = {} + config['video_folder'] = outpath + config['num_frames'] = num_frames + config['fps'] = fps + config['crop_border'] = crop_border + config['traj_types'] = traj_types + config['x_shift_range'] = x_shift_range + config['y_shift_range'] = y_shift_range + config['z_shift_range'] = z_shift_range + config['video_postfix'] = video_postfix + config['ssaa'] = vid_ssaa + + # from inpaint.utils.get_MiDaS_samples + generic_pose = np.eye(4) + assert len(config['traj_types']) == len(config['x_shift_range']) == \ + len(config['y_shift_range']) == len(config['z_shift_range']) == len(config['video_postfix']), \ + "The number of elements in 'traj_types', 'x_shift_range', 'y_shift_range', 'z_shift_range' and \ + 'video_postfix' should be equal." + tgt_pose = [[generic_pose * 1]] + tgts_poses = [] + for traj_idx in range(len(config['traj_types'])): + tgt_poses = [] + sx, sy, sz = path_planning(config['num_frames'], config['x_shift_range'][traj_idx], + config['y_shift_range'][traj_idx], + config['z_shift_range'][traj_idx], path_type=config['traj_types'][traj_idx]) + for xx, yy, zz in zip(sx, sy, sz): + tgt_poses.append(generic_pose * 1.) 
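+ # each frame's pose starts as a fresh copy of the 4x4 identity matrix; only its translation
+ # column ([:3, -1]) is filled in below with the planned (x, y, z) offset, so the virtual
+ # camera translates along the trajectory without rotating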
+ tgt_poses[-1][:3, -1] = np.array([xx, yy, zz]) + tgts_poses += [tgt_poses] + tgt_pose = generic_pose * 1 + + # seems we only need the depthmap to calc mean_loc_depth, which is only used when doing 'dolly' + # width and height are already in the ply file in the comments .. + # might try to add the mean_loc_depth to it too + # did just that + # mean_loc_depth = img_depth[img_depth.shape[0]//2, img_depth.shape[1]//2] + + print("Generating videos ..") + + normal_canvas, all_canvas = None, None + videos_poses, video_basename = copy.deepcopy(tgts_poses), basename + top = (original_h // 2 - int_mtx[1, 2] * output_h) + left = (original_w // 2 - int_mtx[0, 2] * output_w) + down, right = top + output_h, left + output_w + border = [int(xx) for xx in [top, down, left, right]] + normal_canvas, all_canvas, fn_saved = output_3d_photo(verts.copy(), colors.copy(), faces.copy(), + copy.deepcopy(Height), copy.deepcopy(Width), + copy.deepcopy(hFov), copy.deepcopy(vFov), + copy.deepcopy(tgt_pose), config['video_postfix'], + copy.deepcopy(generic_pose), + copy.deepcopy(config['video_folder']), + None, copy.deepcopy(int_mtx), config, None, + videos_poses, video_basename, original_h, original_w, + border=border, depth=None, normal_canvas=normal_canvas, + all_canvas=all_canvas, + mean_loc_depth=mean_loc_depth, dolly=vid_dolly, fnExt=fnExt) + return fn_saved + # called from gen vid tab button def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_border, dolly, vid_format, vid_ssaa): - if len(fn_mesh) == 0 or not os.path.exists(fn_mesh): - raise Exception("Could not open mesh.") - - # file type - fnExt = "mp4" if vid_format == 0 else "webm" - - vid_ssaa = vid_ssaa + 1 - - # traj type - if vid_traj == 0: - vid_traj = ['straight-line'] - elif vid_traj == 1: - vid_traj = ['double-straight-line'] - elif vid_traj == 2: - vid_traj = ['circle'] - - num_fps = int(vid_fps) - num_frames = int(vid_numframes) - shifts = vid_shift.split(',') - if len(shifts) != 3: - raise Exception("Translate requires 3 elements.") - x_shift_range = [ float(shifts[0]) ] - y_shift_range = [ float(shifts[1]) ] - z_shift_range = [ float(shifts[2]) ] - - borders = vid_border.split(',') - if len(borders) != 4: - raise Exception("Crop Border requires 4 elements.") - crop_border = [float(borders[0]), float(borders[1]), float(borders[2]), float(borders[3])] - - # output path and filename mess .. - basename = Path(fn_mesh).stem - outpath = opts.outdir_samples or opts.outdir_extras_samples - # unique filename - basecount = get_next_sequence_number(outpath, basename) - if basecount > 0: basecount = basecount - 1 - fullfn = None - for i in range(500): - fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" - fullfn = os.path.join(outpath, f"{fn}_." 
+ fnExt) - if not os.path.exists(fullfn): - break - basename = Path(fullfn).stem - basename = basename[:-1] - - print("Loading mesh ..") - - fn_saved = run_3dphoto_videos(fn_mesh, basename, outpath, num_frames, num_fps, crop_border, vid_traj, x_shift_range, y_shift_range, z_shift_range, [''], dolly, fnExt, vid_ssaa) - - return fn_saved[-1], fn_saved[-1], '' + if len(fn_mesh) == 0 or not os.path.exists(fn_mesh): + raise Exception("Could not open mesh.") + + # file type + fnExt = "mp4" if vid_format == 0 else "webm" + + vid_ssaa = vid_ssaa + 1 + + # traj type + if vid_traj == 0: + vid_traj = ['straight-line'] + elif vid_traj == 1: + vid_traj = ['double-straight-line'] + elif vid_traj == 2: + vid_traj = ['circle'] + + num_fps = int(vid_fps) + num_frames = int(vid_numframes) + shifts = vid_shift.split(',') + if len(shifts) != 3: + raise Exception("Translate requires 3 elements.") + x_shift_range = [float(shifts[0])] + y_shift_range = [float(shifts[1])] + z_shift_range = [float(shifts[2])] + + borders = vid_border.split(',') + if len(borders) != 4: + raise Exception("Crop Border requires 4 elements.") + crop_border = [float(borders[0]), float(borders[1]), float(borders[2]), float(borders[3])] + + # output path and filename mess .. + basename = Path(fn_mesh).stem + outpath = opts.outdir_samples or opts.outdir_extras_samples + # unique filename + basecount = get_next_sequence_number(outpath, basename) + if basecount > 0: basecount = basecount - 1 + fullfn = None + for i in range(500): + fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" + fullfn = os.path.join(outpath, f"{fn}_." + fnExt) + if not os.path.exists(fullfn): + break + basename = Path(fullfn).stem + basename = basename[:-1] + + print("Loading mesh ..") + + fn_saved = run_3dphoto_videos(fn_mesh, basename, outpath, num_frames, num_fps, crop_border, vid_traj, x_shift_range, + y_shift_range, z_shift_range, [''], dolly, fnExt, vid_ssaa) + + return fn_saved[-1], fn_saved[-1], '' # called from depth tab -def run_generate(depthmap_mode, - depthmap_image, - image_batch, - depthmap_batch_input_dir, - depthmap_batch_output_dir, - compute_device, - model_type, - net_width, - net_height, - match_size, - boost, - invert_depth, - clipdepth, - clipthreshold_far, - clipthreshold_near, - combine_output, - combine_output_axis, - save_depth, - show_depth, - show_heat, - gen_stereo, - stereo_modes, - stereo_divergence, - stereo_separation, - stereo_fill, - stereo_balance, - inpaint, - inpaint_vids, - background_removal, - save_background_removal_masks, - gen_normal, - - background_removal_model, - pre_depth_background_removal, - vid_format, - vid_ssaa, - custom_depthmap, - custom_depthmap_img, - depthmap_batch_reuse, - gen_mesh, mesh_occlude, mesh_spherical - ): - - - # file type - fnExt = "mp4" if vid_format == 0 else "webm" - - vid_ssaa = vid_ssaa + 1 - - imageArr = [] - # Also keep track of original file names - imageNameArr = [] - outputs = [] - - if depthmap_mode == 1: - #convert file to pillow image - for img in image_batch: - image = Image.open(os.path.abspath(img.name)) - imageArr.append(image) - imageNameArr.append(os.path.splitext(img.orig_name)[0]) - elif depthmap_mode == 2: - assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' - - if depthmap_batch_input_dir == '': - return outputs, "Please select an input directory.", '' - image_list = shared.listfiles(depthmap_batch_input_dir) - for img in image_list: - try: - image = Image.open(img) - except Exception: - continue - 
imageArr.append(image) - imageNameArr.append(img) - else: - imageArr.append(depthmap_image) - imageNameArr.append(None) - - if depthmap_mode == 2 and depthmap_batch_output_dir != '': - outpath = depthmap_batch_output_dir - else: - outpath = opts.outdir_samples or opts.outdir_extras_samples - - background_removed_images = [] - if background_removal: - if pre_depth_background_removal: - imageArr = batched_background_removal(imageArr, background_removal_model) - background_removed_images = imageArr - else: - background_removed_images = batched_background_removal(imageArr, background_removal_model) - - outputs, mesh_fi, meshsimple_fi = run_depthmap( - None, outpath, imageArr, imageNameArr, - compute_device, model_type, net_width, net_height, match_size, boost, invert_depth, clipdepth, clipthreshold_far, clipthreshold_near, combine_output, combine_output_axis, save_depth, show_depth, show_heat, gen_stereo, stereo_modes, stereo_divergence, stereo_separation, stereo_fill, stereo_balance, inpaint, inpaint_vids, background_removal, save_background_removal_masks, gen_normal, - background_removed_images, fnExt, vid_ssaa, custom_depthmap, custom_depthmap_img, depthmap_batch_reuse, gen_mesh, mesh_occlude, mesh_spherical) - - # use inpainted 3d mesh to show in 3d model output when enabled in settings - if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint and mesh_fi != None and len(mesh_fi) > 0: - meshsimple_fi = mesh_fi - - # don't show 3dmodel when disabled in settings - if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: - meshsimple_fi = None - - return outputs, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' +def run_generate(*inputs): + inputs = GradioComponentBundle.enkey_to_dict(inputs) + depthmap_mode = inputs['depthmap_mode'] + depthmap_batch_input_dir = inputs['depthmap_batch_input_dir'] + image_batch = inputs['image_batch'] + depthmap_input_image = inputs['depthmap_input_image'] + depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] + + imageArr = [] + # Also keep track of original file names + imageNameArr = [] + outputs = [] + + # TODO: this should not be here + # file type + inputs['fnExt'] = "mp4" if inputs['vid_format'] == 0 else "webm" + inputs['vid_ssaa'] = inputs['vid_ssaa'] + 1 + + if depthmap_mode == '0': # Single image + imageArr.append(depthmap_input_image) + imageNameArr.append(None) + if depthmap_mode == '1': # Batch Process + # convert files to pillow images + for img in image_batch: + image = Image.open(os.path.abspath(img.name)) + imageArr.append(image) + imageNameArr.append(os.path.splitext(img.orig_name)[0]) + elif depthmap_mode == '2': # Batch from Directory + assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' + if depthmap_batch_input_dir == '': + return outputs, "Please select an input directory.", '' + image_list = shared.listfiles(depthmap_batch_input_dir) + for img in image_list: + try: + image = Image.open(img) + imageArr.append(image) + imageNameArr.append(img) + except Exception: + print(f'Failed to load {img}, ignoring.') + + if depthmap_mode == '2' and depthmap_batch_output_dir != '': + outpath = depthmap_batch_output_dir + else: + outpath = opts.outdir_samples or opts.outdir_extras_samples + + # TODO: this should not be here + background_removed_images = [] + if inputs['background_removal']: + if inputs['pre_depth_background_removal']: + imageArr = batched_background_removal(imageArr, inputs['background_removal_model']) + 
background_removed_images = imageArr + else: + background_removed_images = batched_background_removal(imageArr, inputs['background_removal_model']) + outputs, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, imageArr, imageNameArr, inputs, + background_removed_images) + + # use inpainted 3d mesh to show in 3d model output when enabled in settings + if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint and mesh_fi != None and len(mesh_fi) > 0: + meshsimple_fi = mesh_fi + + # don't show 3dmodel when disabled in settings + if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: + meshsimple_fi = None + + return outputs, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' + def unload_models(): - global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type - depthmap_model_type = -1 - del depthmap_model_depth - del depthmap_model_pix2pix - depthmap_model_depth = None - depthmap_model_pix2pix = None - gc.collect() - devices.torch_gc() + global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type + depthmap_model_type = -1 + del depthmap_model_depth + del depthmap_model_pix2pix + depthmap_model_depth = None + depthmap_model_pix2pix = None + gc.collect() + devices.torch_gc() + def clear_mesh(): - return None + return None + def on_ui_settings(): section = ('depthmap-script', "Depthmap extension") - shared.opts.add_option("depthmap_script_keepmodels", shared.OptionInfo(False, "Keep depth models loaded.", section=section)) - shared.opts.add_option("depthmap_script_boost_rmax", shared.OptionInfo(1600, "Maximum wholesize for boost (Rmax)", section=section)) - shared.opts.add_option("depthmap_script_save_ply", shared.OptionInfo(False, "Save additional PLY file with 3D inpainted mesh.", section=section)) - shared.opts.add_option("depthmap_script_show_3d", shared.OptionInfo(True, "Enable showing 3D Meshes in output tab. (Experimental)", section=section)) - shared.opts.add_option("depthmap_script_show_3d_inpaint", shared.OptionInfo(True, "Also show 3D Inpainted Mesh in 3D Mesh output tab. (Experimental)", section=section)) - shared.opts.add_option("depthmap_script_mesh_maxsize", shared.OptionInfo(2048, "Max size for generating simple mesh.", section=section)) + shared.opts.add_option("depthmap_script_keepmodels", + shared.OptionInfo(False, "Keep depth models loaded.", + section=section)) + shared.opts.add_option("depthmap_script_boost_rmax", + shared.OptionInfo(1600, "Maximum wholesize for boost (Rmax)", + section=section)) + shared.opts.add_option("depthmap_script_save_ply", + shared.OptionInfo(False, "Save additional PLY file with 3D inpainted mesh.", + section=section)) + shared.opts.add_option("depthmap_script_show_3d", + shared.OptionInfo(True, "Enable showing 3D Meshes in output tab. (Experimental)", + section=section)) + shared.opts.add_option("depthmap_script_show_3d_inpaint", + shared.OptionInfo(True, "Also show 3D Inpainted Mesh in 3D Mesh output tab. 
(Experimental)", + section=section)) + shared.opts.add_option("depthmap_script_mesh_maxsize", + shared.OptionInfo(2048, "Max size for generating simple mesh.", + section=section)) + def on_ui_tabs(): + inp = GradioComponentBundle() with gr.Blocks(analytics_enabled=False) as depthmap_interface: - dummy_component = gr.Label(visible=False) with gr.Row().style(equal_height=False): with gr.Column(variant='panel'): - with gr.Tabs(elem_id="mode_depthmap"): - with gr.TabItem('Single Image'): + inp += 'depthmap_mode', gr.HTML(visible=False, value='0') + with gr.Tabs(): + with gr.TabItem('Single Image') as depthmap_mode_0: with gr.Row(): - depthmap_image = gr.Image(label="Source", source="upload", interactive=True, type="pil", elem_id="depthmap_input_image") + inp += gr.Image(label="Source", source="upload", interactive=True, type="pil", + elem_id="depthmap_input_image") with gr.Group(visible=False) as custom_depthmap_row_0: - custom_depthmap_img = gr.File(label="Custom DepthMap", file_count="single", interactive=True, type="file") - custom_depthmap = gr.Checkbox(label="Use custom DepthMap",value=False) - - with gr.TabItem('Batch Process'): - image_batch = gr.File(label="Batch Process", file_count="multiple", interactive=True, type="file") - - with gr.TabItem('Batch from Directory'): - depthmap_batch_input_dir = gr.Textbox(label="Input directory", **shared.hide_dirs, placeholder="A directory on the same machine where the server is running.") - depthmap_batch_output_dir = gr.Textbox(label="Output directory", **shared.hide_dirs, placeholder="Leave blank to save images to the default path.") - depthmap_batch_reuse = gr.Checkbox(label="Skip generation and use (edited/custom) depthmaps in output directory when a file exists.",value=True) - + inp += gr.File(label="Custom DepthMap", file_count="single", interactive=True, + type="file", elem_id='custom_depthmap_img') + inp += gr.Checkbox(elem_id="custom_depthmap", label="Use custom DepthMap", value=False) + with gr.TabItem('Batch Process') as depthmap_mode_1: + inp += gr.File(elem_id='image_batch', label="Batch Process", file_count="multiple", + interactive=True, type="file") + with gr.TabItem('Batch from Directory') as depthmap_mode_2: + inp += gr.Textbox(elem_id="depthmap_batch_input_dir", label="Input directory", + **shared.hide_dirs, + placeholder="A directory on the same machine where the server is running.") + inp += gr.Textbox(elem_id="depthmap_batch_output_dir", label="Output directory", + **shared.hide_dirs, + placeholder="Leave blank to save images to the default path.") + inp += gr.Checkbox(elem_id="depthmap_batch_reuse", + label="Skip generation and use (edited/custom) depthmaps in output directory when a file exists.", + value=True) submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary') - - # insert main panel - compute_device, model_type, net_width, net_height, match_size, boost, invert_depth, clipdepth, clipthreshold_far, clipthreshold_near, combine_output, combine_output_axis, save_depth, show_depth, show_heat, gen_stereo, stereo_modes, stereo_divergence, stereo_separation, stereo_fill, stereo_balance, inpaint, inpaint_vids, background_removal, save_background_removal_masks, gen_normal, pre_depth_background_removal, background_removal_model, gen_mesh, mesh_occlude, mesh_spherical = main_ui_panel(True) - + inp += main_ui_panel(True) unloadmodels = gr.Button('Unload models', elem_id="depthmap_unloadmodels") with gr.Column(variant='panel'): with gr.Tabs(elem_id="mode_depthmap_output"): with gr.TabItem('Depth Output'): - 
with gr.Group(): - result_images = gr.Gallery(label='Output', show_label=False, elem_id=f"depthmap_gallery").style(grid=4) + result_images = gr.Gallery(label='Output', show_label=False, + elem_id=f"depthmap_gallery").style(grid=4) with gr.Column(): html_info_x = gr.HTML() html_info = gr.HTML() @@ -1314,7 +1402,7 @@ def on_ui_tabs(): with gr.Row(): #loadmesh = gr.Button('Load') clearmesh = gr.Button('Clear') - + with gr.TabItem('Generate video'): # generate video with gr.Group(): @@ -1329,8 +1417,8 @@ def on_ui_tabs(): with gr.Row(): vid_numframes = gr.Textbox(label="Number of frames", value="300") vid_fps = gr.Textbox(label="Framerate", value="40") - vid_format = gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', type="index", elem_id="video_format") - vid_ssaa = gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', type="index", elem_id="video_ssaa") + inp += 'vid_format', gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', type="index", elem_id="video_format") + inp += 'vid_ssaa', gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', type="index", elem_id="video_ssaa") with gr.Row(): vid_traj = gr.Dropdown(label="Trajectory", choices=['straight-line', 'double-straight-line', 'circle'], value='double-straight-line', type="index", elem_id="video_trajectory") vid_shift = gr.Textbox(label="Translate: x, y, z", value="-0.015, 0.0, -0.05") @@ -1339,11 +1427,17 @@ def on_ui_tabs(): with gr.Row(): submit_vid = gr.Button('Generate Video', elem_id="depthmap_generatevideo", variant='primary') + inp += inp.enkey_tail() + + depthmap_mode_0.select(lambda: '0', None, inp['depthmap_mode']) + depthmap_mode_1.select(lambda: '1', None, inp['depthmap_mode']) + depthmap_mode_2.select(lambda: '2', None, inp['depthmap_mode']) + def custom_depthmap_visibility(v): return custom_depthmap_row_0.update(visible=v) - custom_depthmap.change( + inp['custom_depthmap'].change( fn=custom_depthmap_visibility, - inputs=[custom_depthmap], + inputs=[inp['custom_depthmap']], outputs=[custom_depthmap_row_0] ) @@ -1352,7 +1446,7 @@ def custom_depthmap_visibility(v): inputs=[], outputs=[] ) - + clearmesh.click( fn=clear_mesh, inputs=[], @@ -1361,54 +1455,11 @@ def custom_depthmap_visibility(v): submit.click( fn=wrap_gradio_gpu_call(run_generate), - _js="get_depthmap_tab_index", - inputs=[ - dummy_component, - depthmap_image, - image_batch, - depthmap_batch_input_dir, - depthmap_batch_output_dir, - - compute_device, - model_type, - net_width, - net_height, - match_size, - boost, - invert_depth, - clipdepth, - clipthreshold_far, - clipthreshold_near, - combine_output, - combine_output_axis, - save_depth, - show_depth, - show_heat, - gen_stereo, - stereo_modes, - stereo_divergence, - stereo_separation, - stereo_fill, - stereo_balance, - inpaint, - inpaint_vids, - background_removal, - save_background_removal_masks, - gen_normal, - - background_removal_model, - pre_depth_background_removal, - vid_format, - vid_ssaa, - custom_depthmap, - custom_depthmap_img, - depthmap_batch_reuse, - gen_mesh, mesh_occlude, mesh_spherical - ], + inputs=inp.enkey_body(), outputs=[ result_images, - fn_mesh, - result_depthmesh, + fn_mesh, + result_depthmesh, html_info_x, html_info ] @@ -1417,15 +1468,15 @@ def custom_depthmap_visibility(v): submit_vid.click( fn=wrap_gradio_gpu_call(run_makevideo), inputs=[ - fn_mesh, - vid_numframes, - vid_fps, - vid_traj, - vid_shift, - vid_border, - vid_dolly, - vid_format, - vid_ssaa + fn_mesh, + vid_numframes, + vid_fps, + vid_traj, + vid_shift, + vid_border, + 
vid_dolly, + inp['vid_format'], + inp['vid_ssaa'] ], outputs=[ depth_vid, @@ -1434,138 +1485,148 @@ def custom_depthmap_visibility(v): ] ) - return (depthmap_interface , "Depth", "depthmap_interface"), + return (depthmap_interface, "Depth", "depthmap_interface"), + script_callbacks.on_ui_settings(on_ui_settings) script_callbacks.on_ui_tabs(on_ui_tabs) +# TODO: code borrowed from the internet to be marked as such and to reside in separate files def batched_background_removal(inimages, model_name): - print('creating background masks') - outimages = [] - - # model path and name - bg_model_dir = Path.joinpath(Path().resolve(), "models/rem_bg") - os.makedirs(bg_model_dir, exist_ok=True) - os.environ["U2NET_HOME"] = str(bg_model_dir) - - #starting a session - background_removal_session = new_session(model_name) - for count in range(0, len(inimages)): - bg_remove_img = np.array(remove(inimages[count], session=background_removal_session)) - outimages.append(Image.fromarray(bg_remove_img)) - #The line below might be redundant - del background_removal_session - return outimages + print('creating background masks') + outimages = [] + + # model path and name + bg_model_dir = Path.joinpath(Path().resolve(), "models/rem_bg") + os.makedirs(bg_model_dir, exist_ok=True) + os.environ["U2NET_HOME"] = str(bg_model_dir) + + # starting a session + background_removal_session = new_session(model_name) + for count in range(0, len(inimages)): + bg_remove_img = np.array(remove(inimages[count], session=background_removal_session)) + outimages.append(Image.fromarray(bg_remove_img)) + # The line below might be redundant + del background_removal_session + return outimages + def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): - # Do not check the hash every time - it is somewhat time-consuming - if os.path.exists(filename): - return - - if type(url) is not list: - url = [url] - for cur_url in url: - try: - print("Downloading", cur_url, "to", filename) - torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) - if os.path.exists(filename): - return # The correct model was downloaded, no need to try more - except: - pass - raise RuntimeError('Download failed. Try again later or manually download the file to that location.') + # Do not check the hash every time - it is somewhat time-consuming + if os.path.exists(filename): + return + + if type(url) is not list: + url = [url] + for cur_url in url: + try: + print("Downloading", cur_url, "to", filename) + torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) + if os.path.exists(filename): + return # The correct model was downloaded, no need to try more + except: + pass + raise RuntimeError('Download failed. Try again later or manually download the file to that location.') + def estimatezoedepth(img, model, w, h): - #x = transforms.ToTensor()(img).unsqueeze(0) - #x = x.type(torch.float32) - #x.to(device) - #prediction = model.infer(x) - model.core.prep.resizer._Resize__width = w - model.core.prep.resizer._Resize__height = h - prediction = model.infer_pil(img) + # x = transforms.ToTensor()(img).unsqueeze(0) + # x = x.type(torch.float32) + # x.to(device) + # prediction = model.infer(x) + model.core.prep.resizer._Resize__width = w + model.core.prep.resizer._Resize__height = h + prediction = model.infer_pil(img) + + return prediction - return prediction def scale_torch(img): - """ - Scale the image and output it in torch.tensor. 
- :param img: input rgb is in shape [H, W, C], input depth/disp is in shape [H, W] - :param scale: the scale factor. float - :return: img. [C, H, W] - """ - if len(img.shape) == 2: - img = img[np.newaxis, :, :] - if img.shape[2] == 3: - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406) , (0.229, 0.224, 0.225) )]) - img = transform(img.astype(np.float32)) - else: - img = img.astype(np.float32) - img = torch.from_numpy(img) - return img - + """ + Scale the image and output it in torch.tensor. + :param img: input rgb is in shape [H, W, C], input depth/disp is in shape [H, W] + :param scale: the scale factor. float + :return: img. [C, H, W] + """ + if len(img.shape) == 2: + img = img[np.newaxis, :, :] + if img.shape[2] == 3: + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) + img = transform(img.astype(np.float32)) + else: + img = img.astype(np.float32) + img = torch.from_numpy(img) + return img + + def estimateleres(img, model, w, h): - # leres transform input - rgb_c = img[:, :, ::-1].copy() - A_resize = cv2.resize(rgb_c, (w, h)) - img_torch = scale_torch(A_resize)[None, :, :, :] - - # compute - with torch.no_grad(): - if device == torch.device("cuda"): - img_torch = img_torch.cuda() - prediction = model.depth_model(img_torch) - - prediction = prediction.squeeze().cpu().numpy() - prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) - - return prediction + # leres transform input + rgb_c = img[:, :, ::-1].copy() + A_resize = cv2.resize(rgb_c, (w, h)) + img_torch = scale_torch(A_resize)[None, :, :, :] + + # compute + with torch.no_grad(): + if device == torch.device("cuda"): + img_torch = img_torch.cuda() + prediction = model.depth_model(img_torch) + + prediction = prediction.squeeze().cpu().numpy() + prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) + + return prediction + def estimatemidas(img, model, w, h, resize_mode, normalization): - # init transform - transform = Compose( - [ - Resize( - w, - h, - resize_target=None, - keep_aspect_ratio=True, - ensure_multiple_of=32, - resize_method=resize_mode, - image_interpolation_method=cv2.INTER_CUBIC, - ), - normalization, - PrepareForNet(), - ] - ) - - # transform input - img_input = transform({"image": img})["image"] - - # compute - precision_scope = torch.autocast if shared.cmd_opts.precision == "autocast" and device == torch.device("cuda") else contextlib.nullcontext - with torch.no_grad(), precision_scope("cuda"): - sample = torch.from_numpy(img_input).to(device).unsqueeze(0) - if device == torch.device("cuda"): - sample = sample.to(memory_format=torch.channels_last) - if not cmd_opts.no_half: - sample = sample.half() - prediction = model.forward(sample) - prediction = ( - torch.nn.functional.interpolate( - prediction.unsqueeze(1), - size=img.shape[:2], - mode="bicubic", - align_corners=False, - ) - .squeeze() - .cpu() - .numpy() - ) - - return prediction + # init transform + transform = Compose( + [ + Resize( + w, + h, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method=resize_mode, + image_interpolation_method=cv2.INTER_CUBIC, + ), + normalization, + PrepareForNet(), + ] + ) + + # transform input + img_input = transform({"image": img})["image"] + + # compute + precision_scope = torch.autocast if shared.cmd_opts.precision == "autocast" and device == torch.device( + "cuda") else 
contextlib.nullcontext + with torch.no_grad(), precision_scope("cuda"): + sample = torch.from_numpy(img_input).to(device).unsqueeze(0) + if device == torch.device("cuda"): + sample = sample.to(memory_format=torch.channels_last) + if not cmd_opts.no_half: + sample = sample.half() + prediction = model.forward(sample) + prediction = ( + torch.nn.functional.interpolate( + prediction.unsqueeze(1), + size=img.shape[:2], + mode="bicubic", + align_corners=False, + ) + .squeeze() + .cpu() + .numpy() + ) + + return prediction + def estimatemidasBoost(img, model, w, h): - # init transform + # init transform transform = Compose( [ Resize( @@ -1582,17 +1643,17 @@ def estimatemidasBoost(img, model, w, h): ] ) - # transform input + # transform input img_input = transform({"image": img})["image"] # compute with torch.no_grad(): sample = torch.from_numpy(img_input).to(device).unsqueeze(0) if device == torch.device("cuda"): - sample = sample.to(memory_format=torch.channels_last) + sample = sample.to(memory_format=torch.channels_last) prediction = model.forward(sample) - prediction = prediction.squeeze().cpu().numpy() + prediction = prediction.squeeze().cpu().numpy() prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) # normalization @@ -1606,28 +1667,32 @@ def estimatemidasBoost(img, model, w, h): return prediction + def generatemask(size): # Generates a Guassian mask mask = np.zeros(size, dtype=np.float32) - sigma = int(size[0]/16) - k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1) - mask[int(0.15*size[0]):size[0] - int(0.15*size[0]), int(0.15*size[1]): size[1] - int(0.15*size[1])] = 1 + sigma = int(size[0] / 16) + k_size = int(2 * np.ceil(2 * int(size[0] / 16)) + 1) + mask[int(0.15 * size[0]):size[0] - int(0.15 * size[0]), int(0.15 * size[1]): size[1] - int(0.15 * size[1])] = 1 mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) mask = (mask - mask.min()) / (mask.max() - mask.min()) mask = mask.astype(np.float32) return mask + def resizewithpool(img, size): i_size = img.shape[0] - n = int(np.floor(i_size/size)) + n = int(np.floor(i_size / size)) out = skimage.measure.block_reduce(img, (n, n), np.max) return out + def rgb2gray(rgb): # Converts rgb to gray return np.dot(rgb[..., :3], [0.2989, 0.5870, 0.1140]) + def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, whole_size_threshold=3000): # Returns the R_x resolution described in section 5 of the main paper. @@ -1658,23 +1723,23 @@ def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, who grad[grad >= middle] = 1 # dilation kernel with size of the receptive field - kernel = np.ones((int(basesize/speed_scale), int(basesize/speed_scale)), float) + kernel = np.ones((int(basesize / speed_scale), int(basesize / speed_scale)), float) # dilation kernel with size of the a quarter of receptive field used to compute k # as described in section 6 of main paper - kernel2 = np.ones((int(basesize / (4*speed_scale)), int(basesize / (4*speed_scale))), float) + kernel2 = np.ones((int(basesize / (4 * speed_scale)), int(basesize / (4 * speed_scale))), float) # Output resolution limit set by the whole_size_threshold and scale_threshold. 
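    # i.e. R_x may exceed the larger side of the input by at most a factor of scale_threshold,
    # and is additionally capped at whole_size_threshold pixels.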
threshold = min(whole_size_threshold, scale_threshold * max(img.shape[:2])) outputsize_scale = basesize / speed_scale - for p_size in range(int(basesize/speed_scale), int(threshold/speed_scale), int(basesize / (2*speed_scale))): + for p_size in range(int(basesize / speed_scale), int(threshold / speed_scale), int(basesize / (2 * speed_scale))): grad_resized = resizewithpool(grad, p_size) grad_resized = cv2.resize(grad_resized, (p_size, p_size), cv2.INTER_NEAREST) grad_resized[grad_resized >= 0.5] = 1 grad_resized[grad_resized < 0.5] = 0 dilated = cv2.dilate(grad_resized, kernel, iterations=1) - meanvalue = (1-dilated).mean() + meanvalue = (1 - dilated).mean() if meanvalue > confidence: break else: @@ -1683,7 +1748,8 @@ def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, who grad_region = cv2.dilate(grad_resized, kernel2, iterations=1) patch_scale = grad_region.mean() - return int(outputsize_scale*speed_scale), patch_scale + return int(outputsize_scale * speed_scale), patch_scale + # Generate a double-input depth estimation def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel): @@ -1702,22 +1768,24 @@ def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel pix2pixmodel.test() visuals = pix2pixmodel.get_current_visuals() prediction_mapped = visuals['fake_B'] - prediction_mapped = (prediction_mapped+1)/2 + prediction_mapped = (prediction_mapped + 1) / 2 prediction_mapped = (prediction_mapped - torch.min(prediction_mapped)) / ( - torch.max(prediction_mapped) - torch.min(prediction_mapped)) + torch.max(prediction_mapped) - torch.min(prediction_mapped)) prediction_mapped = prediction_mapped.squeeze().cpu().numpy() return prediction_mapped + # Generate a single-input depth estimation def singleestimate(img, msize, model, net_type): - if net_type == 0: - return estimateleres(img, model, msize, msize) - elif net_type >= 7: - # np to PIL - return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize) - else: - return estimatemidasBoost(img, model, msize, msize) + if net_type == 0: + return estimateleres(img, model, msize, msize) + elif net_type >= 7: + # np to PIL + return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize) + else: + return estimatemidasBoost(img, model, msize, msize) + def applyGridpatch(blsize, stride, img, box): # Extract a simple grid patch. @@ -1734,25 +1802,25 @@ def applyGridpatch(blsize, stride, img, box): counter1 = counter1 + 1 return patch_bound_list + # Generating local patches to perform the local refinement described in section 6 of the main paper. def generatepatchs(img, base_size): - # Compute the gradients as a proxy of the contextual cues. img_gray = rgb2gray(img) - whole_grad = np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)) +\ - np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)) + whole_grad = np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)) + \ + np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)) threshold = whole_grad[whole_grad > 0].mean() whole_grad[whole_grad < threshold] = 0 # We use the integral image to speed-up the evaluation of the amount of gradients for each patch. - gf = whole_grad.sum()/len(whole_grad.reshape(-1)) + gf = whole_grad.sum() / len(whole_grad.reshape(-1)) grad_integral_image = cv2.integral(whole_grad) # Variables are selected such that the initial patch size would be the receptive field size # and the stride is set to 1/3 of the receptive field size. 
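    # e.g. for the 384-px MiDaS receptive field, base_size is 768, giving blsize = 384 and stride = 288.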
- blsize = int(round(base_size/2)) - stride = int(round(blsize*0.75)) + blsize = int(round(base_size / 2)) + stride = int(round(blsize * 0.75)) # Get initial Grid patch_bound_list = applyGridpatch(blsize, stride, img, [0, 0, 0, 0]) @@ -1767,22 +1835,24 @@ def generatepatchs(img, base_size): patchset = sorted(patch_bound_list.items(), key=lambda x: getitem(x[1], 'size'), reverse=True) return patchset + def getGF_fromintegral(integralimage, rect): # Computes the gradient density of a given patch from the gradient integral image. x1 = rect[1] - x2 = rect[1]+rect[3] + x2 = rect[1] + rect[3] y1 = rect[0] - y2 = rect[0]+rect[2] - value = integralimage[x2, y2]-integralimage[x1, y2]-integralimage[x2, y1]+integralimage[x1, y1] + y2 = rect[0] + rect[2] + value = integralimage[x2, y2] - integralimage[x1, y2] - integralimage[x2, y1] + integralimage[x1, y1] return value + # Adaptively select patches def adaptiveselection(integral_grad, patch_bound_list, gf): patchlist = {} count = 0 height, width = integral_grad.shape - search_step = int(32/factor) + search_step = int(32 / factor) # Go through all patches for c in range(len(patch_bound_list)): @@ -1790,7 +1860,7 @@ def adaptiveselection(integral_grad, patch_bound_list, gf): bbox = patch_bound_list[str(c)]['rect'] # Compute the amount of gradients present in the patch from the integral image. - cgf = getGF_fromintegral(integral_grad, bbox)/(bbox[2]*bbox[3]) + cgf = getGF_fromintegral(integral_grad, bbox) / (bbox[2] * bbox[3]) # Check if patching is beneficial by comparing the gradient density of the patch to # the gradient density of the whole image @@ -1802,8 +1872,8 @@ def adaptiveselection(integral_grad, patch_bound_list, gf): # to the whole image gradient density while True: - bbox_test[0] = bbox_test[0] - int(search_step/2) - bbox_test[1] = bbox_test[1] - int(search_step/2) + bbox_test[0] = bbox_test[0] - int(search_step / 2) + bbox_test[1] = bbox_test[1] - int(search_step / 2) bbox_test[2] = bbox_test[2] + search_step bbox_test[3] = bbox_test[3] + search_step @@ -1814,7 +1884,7 @@ def adaptiveselection(integral_grad, patch_bound_list, gf): break # Compare gradient density - cgf = getGF_fromintegral(integral_grad, bbox_test)/(bbox_test[2]*bbox_test[3]) + cgf = getGF_fromintegral(integral_grad, bbox_test) / (bbox_test[2] * bbox_test[3]) if cgf < gf: break bbox = bbox_test.copy() @@ -1823,10 +1893,11 @@ def adaptiveselection(integral_grad, patch_bound_list, gf): patchlist[str(count)]['rect'] = bbox patchlist[str(count)]['size'] = bbox[2] count = count + 1 - + # Return selected patches return patchlist + def impatch(image, rect): # Extract the given patch pixels from a given image. 
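    # rect is laid out as [x, y, width, height] in pixel coordinates, so the patch returned
    # below is image[y:y+height, x:x+width].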
w1 = rect[0] @@ -1836,6 +1907,7 @@ def impatch(image, rect): image_patch = image[h1:h2, w1:w2] return image_patch + class ImageandPatchs: def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1): self.root_dir = root_dir @@ -1844,7 +1916,7 @@ def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1): self.patchs = patchsinfo self.scale = scale - self.rgb_image = cv2.resize(rgb_image, (round(rgb_image.shape[1]*scale), round(rgb_image.shape[0]*scale)), + self.rgb_image = cv2.resize(rgb_image, (round(rgb_image.shape[1] * scale), round(rgb_image.shape[0] * scale)), interpolation=cv2.INTER_CUBIC) self.do_have_estimate = False @@ -1914,14 +1986,14 @@ def print_options(self, opt): def parse(self): """Parse our options, create checkpoints directory suffix, and set up gpu device.""" opt = self.gather_options() - opt.isTrain = self.isTrain # train or test + opt.isTrain = self.isTrain # train or test # process opt.suffix if opt.suffix: suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else '' opt.name = opt.name + suffix - #self.print_options(opt) + # self.print_options(opt) # set gpu ids str_ids = opt.gpu_ids.split(',') @@ -1930,7 +2002,7 @@ def parse(self): id = int(str_id) if id >= 0: opt.gpu_ids.append(id) - #if len(opt.gpu_ids) > 0: + # if len(opt.gpu_ids) > 0: # torch.cuda.set_device(opt.gpu_ids[0]) self.opt = opt @@ -1938,167 +2010,174 @@ def parse(self): def estimateboost(img, model, model_type, pix2pixmodel): - # get settings - if hasattr(opts, 'depthmap_script_boost_rmax'): - whole_size_threshold = opts.depthmap_script_boost_rmax - - if model_type == 0: #leres - net_receptive_field_size = 448 - patch_netsize = 2 * net_receptive_field_size - elif model_type == 1: #dpt_beit_large_512 - net_receptive_field_size = 512 - patch_netsize = 2 * net_receptive_field_size - else: #other midas - net_receptive_field_size = 384 - patch_netsize = 2 * net_receptive_field_size - - gc.collect() - devices.torch_gc() - - # Generate mask used to smoothly blend the local pathc estimations to the base estimate. - # It is arbitrarily large to avoid artifacts during rescaling for each crop. - mask_org = generatemask((3000, 3000)) - mask = mask_org.copy() - - # Value x of R_x defined in the section 5 of the main paper. - r_threshold_value = 0.2 - #if R0: - # r_threshold_value = 0 - - input_resolution = img.shape - scale_threshold = 3 # Allows up-scaling with a scale up to 3 - - # Find the best input resolution R-x. The resolution search described in section 5-double estimation of the main paper and section B of the - # supplementary material. - whole_image_optimal_size, patch_scale = calculateprocessingres(img, net_receptive_field_size, r_threshold_value, scale_threshold, whole_size_threshold) - - print('wholeImage being processed in :', whole_image_optimal_size) - - # Generate the base estimate using the double estimation. - whole_estimate = doubleestimate(img, net_receptive_field_size, whole_image_optimal_size, pix2pixsize, model, model_type, pix2pixmodel) - - # Compute the multiplier described in section 6 of the main paper to make sure our initial patch can select - # small high-density regions of the image. - global factor - factor = max(min(1, 4 * patch_scale * whole_image_optimal_size / whole_size_threshold), 0.2) - print('Adjust factor is:', 1/factor) - - # Compute the default target resolution. 
- if img.shape[0] > img.shape[1]: - a = 2 * whole_image_optimal_size - b = round(2 * whole_image_optimal_size * img.shape[1] / img.shape[0]) - else: - a = round(2 * whole_image_optimal_size * img.shape[0] / img.shape[1]) - b = 2 * whole_image_optimal_size - b = int(round(b / factor)) - a = int(round(a / factor)) - - """ - # recompute a, b and saturate to max res. - if max(a,b) > max_res: - print('Default Res is higher than max-res: Reducing final resolution') - if img.shape[0] > img.shape[1]: - a = max_res - b = round(option.max_res * img.shape[1] / img.shape[0]) - else: - a = round(option.max_res * img.shape[0] / img.shape[1]) - b = max_res - b = int(b) - a = int(a) - """ - - img = cv2.resize(img, (b, a), interpolation=cv2.INTER_CUBIC) - - # Extract selected patches for local refinement - base_size = net_receptive_field_size * 2 - patchset = generatepatchs(img, base_size) - - print('Target resolution: ', img.shape) - - # Computing a scale in case user prompted to generate the results as the same resolution of the input. - # Notice that our method output resolution is independent of the input resolution and this parameter will only - # enable a scaling operation during the local patch merge implementation to generate results with the same resolution - # as the input. - """ - if output_resolution == 1: - mergein_scale = input_resolution[0] / img.shape[0] - print('Dynamicly change merged-in resolution; scale:', mergein_scale) - else: - mergein_scale = 1 - """ - # always rescale to input res for now - mergein_scale = input_resolution[0] / img.shape[0] - - imageandpatchs = ImageandPatchs('', '', patchset, img, mergein_scale) - whole_estimate_resized = cv2.resize(whole_estimate, (round(img.shape[1]*mergein_scale), - round(img.shape[0]*mergein_scale)), interpolation=cv2.INTER_CUBIC) - imageandpatchs.set_base_estimate(whole_estimate_resized.copy()) - imageandpatchs.set_updated_estimate(whole_estimate_resized.copy()) - - print('Resulting depthmap resolution will be :', whole_estimate_resized.shape[:2]) - print('patches to process: '+str(len(imageandpatchs))) - - # Enumerate through all patches, generate their estimations and refining the base estimate. - for patch_ind in range(len(imageandpatchs)): - - # Get patch information - patch = imageandpatchs[patch_ind] # patch object - patch_rgb = patch['patch_rgb'] # rgb patch - patch_whole_estimate_base = patch['patch_whole_estimate_base'] # corresponding patch from base - rect = patch['rect'] # patch size and location - patch_id = patch['id'] # patch ID - org_size = patch_whole_estimate_base.shape # the original size from the unscaled input - print('\t processing patch', patch_ind, '/', len(imageandpatchs)-1, '|', rect) - - # We apply double estimation for patches. The high resolution value is fixed to twice the receptive - # field size of the network for patches to accelerate the process. - patch_estimation = doubleestimate(patch_rgb, net_receptive_field_size, patch_netsize, pix2pixsize, model, model_type, pix2pixmodel) - patch_estimation = cv2.resize(patch_estimation, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) - patch_whole_estimate_base = cv2.resize(patch_whole_estimate_base, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) - - # Merging the patch estimation into the base estimate using our merge network: - # We feed the patch estimation and the same region from the updated base estimate to the merge network - # to generate the target estimate for the corresponding region. 
- pix2pixmodel.set_input(patch_whole_estimate_base, patch_estimation) - - # Run merging network - pix2pixmodel.test() - visuals = pix2pixmodel.get_current_visuals() - - prediction_mapped = visuals['fake_B'] - prediction_mapped = (prediction_mapped+1)/2 - prediction_mapped = prediction_mapped.squeeze().cpu().numpy() - - mapped = prediction_mapped - - # We use a simple linear polynomial to make sure the result of the merge network would match the values of - # base estimate - p_coef = np.polyfit(mapped.reshape(-1), patch_whole_estimate_base.reshape(-1), deg=1) - merged = np.polyval(p_coef, mapped.reshape(-1)).reshape(mapped.shape) - - merged = cv2.resize(merged, (org_size[1],org_size[0]), interpolation=cv2.INTER_CUBIC) - - # Get patch size and location - w1 = rect[0] - h1 = rect[1] - w2 = w1 + rect[2] - h2 = h1 + rect[3] - - # To speed up the implementation, we only generate the Gaussian mask once with a sufficiently large size - # and resize it to our needed size while merging the patches. - if mask.shape != org_size: - mask = cv2.resize(mask_org, (org_size[1],org_size[0]), interpolation=cv2.INTER_LINEAR) - - tobemergedto = imageandpatchs.estimation_updated_image - - # Update the whole estimation: - # We use a simple Gaussian mask to blend the merged patch region with the base estimate to ensure seamless - # blending at the boundaries of the patch region. - tobemergedto[h1:h2, w1:w2] = np.multiply(tobemergedto[h1:h2, w1:w2], 1 - mask) + np.multiply(merged, mask) - imageandpatchs.set_updated_estimate(tobemergedto) - - # output - return cv2.resize(imageandpatchs.estimation_updated_image, (input_resolution[1], input_resolution[0]), interpolation=cv2.INTER_CUBIC) + # get settings + if hasattr(opts, 'depthmap_script_boost_rmax'): + whole_size_threshold = opts.depthmap_script_boost_rmax + + if model_type == 0: # leres + net_receptive_field_size = 448 + patch_netsize = 2 * net_receptive_field_size + elif model_type == 1: # dpt_beit_large_512 + net_receptive_field_size = 512 + patch_netsize = 2 * net_receptive_field_size + else: # other midas + net_receptive_field_size = 384 + patch_netsize = 2 * net_receptive_field_size + + gc.collect() + devices.torch_gc() + + # Generate mask used to smoothly blend the local pathc estimations to the base estimate. + # It is arbitrarily large to avoid artifacts during rescaling for each crop. + mask_org = generatemask((3000, 3000)) + mask = mask_org.copy() + + # Value x of R_x defined in the section 5 of the main paper. + r_threshold_value = 0.2 + # if R0: + # r_threshold_value = 0 + + input_resolution = img.shape + scale_threshold = 3 # Allows up-scaling with a scale up to 3 + + # Find the best input resolution R-x. The resolution search described in section 5-double estimation of the main paper and section B of the + # supplementary material. + whole_image_optimal_size, patch_scale = calculateprocessingres(img, net_receptive_field_size, r_threshold_value, + scale_threshold, whole_size_threshold) + + print('wholeImage being processed in :', whole_image_optimal_size) + + # Generate the base estimate using the double estimation. + whole_estimate = doubleestimate(img, net_receptive_field_size, whole_image_optimal_size, pix2pixsize, model, + model_type, pix2pixmodel) + + # Compute the multiplier described in section 6 of the main paper to make sure our initial patch can select + # small high-density regions of the image. 
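# A worked example with made-up numbers of the adjustment computed just below: the base
# estimate is upsampled to roughly twice the optimal whole-image resolution, and the
# "adjust factor" enlarges it further when the image contains small, dense details.
whole_size_threshold = 1600          # R_max, as defined near the top of this file
whole_image_optimal_size = 1344      # example output of calculateprocessingres()
patch_scale = 0.2                    # example patch density from the same call
img_h, img_w = 768, 1024             # example input shape

factor = max(min(1, 4 * patch_scale * whole_image_optimal_size / whole_size_threshold), 0.2)
# 4 * 0.2 * 1344 / 1600 = 0.672, i.e. the target resolution is enlarged by roughly 1.5x

if img_h > img_w:
    a, b = 2 * whole_image_optimal_size, round(2 * whole_image_optimal_size * img_w / img_h)
else:
    a, b = round(2 * whole_image_optimal_size * img_h / img_w), 2 * whole_image_optimal_size
a, b = int(round(a / factor)), int(round(b / factor))
print(a, b)   # 3000 4000: the resolution at which patches are selected and merged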
+ global factor + factor = max(min(1, 4 * patch_scale * whole_image_optimal_size / whole_size_threshold), 0.2) + print('Adjust factor is:', 1 / factor) + + # Compute the default target resolution. + if img.shape[0] > img.shape[1]: + a = 2 * whole_image_optimal_size + b = round(2 * whole_image_optimal_size * img.shape[1] / img.shape[0]) + else: + a = round(2 * whole_image_optimal_size * img.shape[0] / img.shape[1]) + b = 2 * whole_image_optimal_size + b = int(round(b / factor)) + a = int(round(a / factor)) + + """ + # recompute a, b and saturate to max res. + if max(a,b) > max_res: + print('Default Res is higher than max-res: Reducing final resolution') + if img.shape[0] > img.shape[1]: + a = max_res + b = round(option.max_res * img.shape[1] / img.shape[0]) + else: + a = round(option.max_res * img.shape[0] / img.shape[1]) + b = max_res + b = int(b) + a = int(a) + """ + + img = cv2.resize(img, (b, a), interpolation=cv2.INTER_CUBIC) + + # Extract selected patches for local refinement + base_size = net_receptive_field_size * 2 + patchset = generatepatchs(img, base_size) + + print('Target resolution: ', img.shape) + + # Computing a scale in case user prompted to generate the results as the same resolution of the input. + # Notice that our method output resolution is independent of the input resolution and this parameter will only + # enable a scaling operation during the local patch merge implementation to generate results with the same resolution + # as the input. + """ + if output_resolution == 1: + mergein_scale = input_resolution[0] / img.shape[0] + print('Dynamicly change merged-in resolution; scale:', mergein_scale) + else: + mergein_scale = 1 + """ + # always rescale to input res for now + mergein_scale = input_resolution[0] / img.shape[0] + + imageandpatchs = ImageandPatchs('', '', patchset, img, mergein_scale) + whole_estimate_resized = cv2.resize(whole_estimate, (round(img.shape[1] * mergein_scale), + round(img.shape[0] * mergein_scale)), + interpolation=cv2.INTER_CUBIC) + imageandpatchs.set_base_estimate(whole_estimate_resized.copy()) + imageandpatchs.set_updated_estimate(whole_estimate_resized.copy()) + + print('Resulting depthmap resolution will be :', whole_estimate_resized.shape[:2]) + print('patches to process: ' + str(len(imageandpatchs))) + + # Enumerate through all patches, generate their estimations and refining the base estimate. + for patch_ind in range(len(imageandpatchs)): + + # Get patch information + patch = imageandpatchs[patch_ind] # patch object + patch_rgb = patch['patch_rgb'] # rgb patch + patch_whole_estimate_base = patch['patch_whole_estimate_base'] # corresponding patch from base + rect = patch['rect'] # patch size and location + patch_id = patch['id'] # patch ID + org_size = patch_whole_estimate_base.shape # the original size from the unscaled input + print('\t processing patch', patch_ind, '/', len(imageandpatchs) - 1, '|', rect) + + # We apply double estimation for patches. The high resolution value is fixed to twice the receptive + # field size of the network for patches to accelerate the process. 
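# generatemask((3000, 3000)) above is not shown in this patch; the sketch below is an
# assumption, not the actual implementation. It shows one way to build the kind of mask
# the comment describes: 1.0 in the centre, falling off smoothly to 0.0 at the borders,
# so each merged patch can be alpha-blended into the base estimate without visible seams.
import cv2
import numpy as np

def smooth_blend_mask(size):
    h, w = size
    mask = np.zeros((h, w), dtype=np.float32)
    bh, bw = int(0.15 * h), int(0.15 * w)
    mask[bh:h - bh, bw:w - bw] = 1.0
    k = int(0.2 * min(h, w)) | 1            # GaussianBlur requires an odd kernel size
    mask = cv2.GaussianBlur(mask, (k, k), 0)
    return mask / mask.max()

demo = smooth_blend_mask((256, 256))        # the patch builds it once at a large size
per_patch = cv2.resize(demo, (512, 448))    # and resizes it to each merged patch's size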
+ patch_estimation = doubleestimate(patch_rgb, net_receptive_field_size, patch_netsize, pix2pixsize, model, + model_type, pix2pixmodel) + patch_estimation = cv2.resize(patch_estimation, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) + patch_whole_estimate_base = cv2.resize(patch_whole_estimate_base, (pix2pixsize, pix2pixsize), + interpolation=cv2.INTER_CUBIC) + + # Merging the patch estimation into the base estimate using our merge network: + # We feed the patch estimation and the same region from the updated base estimate to the merge network + # to generate the target estimate for the corresponding region. + pix2pixmodel.set_input(patch_whole_estimate_base, patch_estimation) + + # Run merging network + pix2pixmodel.test() + visuals = pix2pixmodel.get_current_visuals() + + prediction_mapped = visuals['fake_B'] + prediction_mapped = (prediction_mapped + 1) / 2 + prediction_mapped = prediction_mapped.squeeze().cpu().numpy() + + mapped = prediction_mapped + + # We use a simple linear polynomial to make sure the result of the merge network would match the values of + # base estimate + p_coef = np.polyfit(mapped.reshape(-1), patch_whole_estimate_base.reshape(-1), deg=1) + merged = np.polyval(p_coef, mapped.reshape(-1)).reshape(mapped.shape) + + merged = cv2.resize(merged, (org_size[1], org_size[0]), interpolation=cv2.INTER_CUBIC) + + # Get patch size and location + w1 = rect[0] + h1 = rect[1] + w2 = w1 + rect[2] + h2 = h1 + rect[3] + + # To speed up the implementation, we only generate the Gaussian mask once with a sufficiently large size + # and resize it to our needed size while merging the patches. + if mask.shape != org_size: + mask = cv2.resize(mask_org, (org_size[1], org_size[0]), interpolation=cv2.INTER_LINEAR) + + tobemergedto = imageandpatchs.estimation_updated_image + + # Update the whole estimation: + # We use a simple Gaussian mask to blend the merged patch region with the base estimate to ensure seamless + # blending at the boundaries of the patch region. + tobemergedto[h1:h2, w1:w2] = np.multiply(tobemergedto[h1:h2, w1:w2], 1 - mask) + np.multiply(merged, mask) + imageandpatchs.set_updated_estimate(tobemergedto) + + # output + return cv2.resize(imageandpatchs.estimation_updated_image, (input_resolution[1], input_resolution[0]), + interpolation=cv2.INTER_CUBIC) + def pano_depth_to_world_points(depth): """ @@ -2113,7 +2192,7 @@ def pano_depth_to_world_points(depth): radius = depth.flatten() lon = np.linspace(-np.pi, np.pi, depth.shape[1]) - lat = np.linspace(-np.pi/2, np.pi/2, depth.shape[0]) + lat = np.linspace(-np.pi / 2, np.pi / 2, depth.shape[0]) lon, lat = np.meshgrid(lon, lat) lon = lon.flatten() @@ -2128,6 +2207,7 @@ def pano_depth_to_world_points(depth): return pts3d + def depth_edges_mask(depth): """Returns a mask of edges in the depth map. 
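# The x/y/z computation of pano_depth_to_world_points() is elided by the hunk above; the
# standard equirectangular-to-Cartesian conversion it refers to looks like this sketch
# (the axis convention here is an assumption, shown only for illustration).
import numpy as np

depth = np.ones((4, 8))                      # toy equirectangular depth map
radius = depth.flatten()
lon = np.linspace(-np.pi, np.pi, depth.shape[1])
lat = np.linspace(-np.pi / 2, np.pi / 2, depth.shape[0])
lon, lat = np.meshgrid(lon, lat)
lon, lat = lon.flatten(), lat.flatten()

# Each pixel becomes a point on a sphere whose radius equals its depth value.
x = radius * np.cos(lat) * np.cos(lon)
y = radius * np.cos(lat) * np.sin(lon)
z = radius * np.sin(lat)
pts3d = np.stack([x, y, z], axis=1)          # shape (H*W, 3)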
Args: @@ -2143,40 +2223,42 @@ def depth_edges_mask(depth): mask = depth_grad > 0.05 return mask + def create_mesh(image, depth, keep_edges=False, spherical=False): - maxsize = 1024 - if hasattr(opts, 'depthmap_script_mesh_maxsize'): - maxsize = opts.depthmap_script_mesh_maxsize + maxsize = 1024 + if hasattr(opts, 'depthmap_script_mesh_maxsize'): + maxsize = opts.depthmap_script_mesh_maxsize + + # limit the size of the input image + image.thumbnail((maxsize, maxsize)) - # limit the size of the input image - image.thumbnail((maxsize, maxsize)) + if not spherical: + pts3d = depth_to_points(depth[None]) + else: + pts3d = pano_depth_to_world_points(depth) - if not spherical: - pts3d = depth_to_points(depth[None]) - else: - pts3d = pano_depth_to_world_points(depth) + pts3d = pts3d.reshape(-1, 3) - pts3d = pts3d.reshape(-1, 3) + verts = pts3d.reshape(-1, 3) + image = np.array(image) + if keep_edges: + triangles = create_triangles(image.shape[0], image.shape[1]) + else: + triangles = create_triangles(image.shape[0], image.shape[1], mask=~depth_edges_mask(depth)) + colors = image.reshape(-1, 3) - verts = pts3d.reshape(-1, 3) - image = np.array(image) - if keep_edges: - triangles = create_triangles(image.shape[0], image.shape[1]) - else: - triangles = create_triangles(image.shape[0], image.shape[1], mask=~depth_edges_mask(depth)) - colors = image.reshape(-1, 3) + mesh = trimesh.Trimesh(vertices=verts, faces=triangles, vertex_colors=colors) - mesh = trimesh.Trimesh(vertices=verts, faces=triangles, vertex_colors=colors) + # rotate 90deg over X when spherical + if spherical: + angle = math.pi / 2 + direction = [1, 0, 0] + center = [0, 0, 0] + rot_matrix = transformations.rotation_matrix(angle, direction, center) + mesh.apply_transform(rot_matrix) - # rotate 90deg over X when spherical - if spherical: - angle = math.pi / 2 - direction = [1, 0, 0] - center = [0, 0, 0] - rot_matrix = transformations.rotation_matrix(angle, direction, center) - mesh.apply_transform(rot_matrix) + return mesh - return mesh def save_mesh_obj(fn, mesh): - mesh.export(fn) + mesh.export(fn) diff --git a/scripts/gradio_args_transport.py b/scripts/gradio_args_transport.py new file mode 100644 index 0000000..7ae95ef --- /dev/null +++ b/scripts/gradio_args_transport.py @@ -0,0 +1,52 @@ +import gradio as gr + +class GradioComponentBundle: + """Allows easier transportation of massive ammount of named gradio inputs""" + def __init__(self): + self.internal = {} + + def append(self, thing): + if isinstance(thing, GradioComponentBundle): + keys = list(thing.internal.keys()) + for key in keys: + assert key not in self.internal, f"Already bundled component with name {key}." + self.internal[key] = thing[key] + elif isinstance(thing, tuple) and len(thing) == 2 and isinstance(thing[1], gr.components.Component): + assert thing[0] not in self.internal, f"Already bundled component with name {thing[0]}." + self.internal[thing[0]] = thing[1] + elif isinstance(thing, gr.components.Component) and thing.elem_id is not None: + assert thing.elem_id not in self.internal, f"Already bundled component with name {thing.elem_id}." 
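# create_triangles() comes from dzoedepth and is not shown here; the sketch below is a
# generic, simplified stand-in (an assumption, not the library's code) for the idea used
# in create_mesh(): triangulate the pixel grid, and use the inverted depth-edge mask so
# that no triangle connects pixels across a depth discontinuity, which avoids stretched
# faces between foreground and background.
import numpy as np

def grid_triangles(h, w, mask=None):
    tris = []
    for y in range(h - 1):
        for x in range(w - 1):
            a, b, c, d = y * w + x, y * w + x + 1, (y + 1) * w + x, (y + 1) * w + x + 1
            if mask is None or (mask[y, x] and mask[y, x + 1] and mask[y + 1, x] and mask[y + 1, x + 1]):
                tris.append([a, c, b])
                tris.append([b, c, d])
    return np.array(tris)

keep = np.ones((4, 4), dtype=bool)
keep[1, 2] = False                      # pretend this pixel sits on a depth edge
print(len(grid_triangles(4, 4)), len(grid_triangles(4, 4, keep)))   # 18 vs 10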
+ self.internal[thing.elem_id] = thing + else: + assert False, f"This object can not be bundled, {str(thing)}" + + def __iadd__(self, els): + self.append(els) + return self + + def __getitem__(self, key): + """Return the gradio component elem_id""" + return self.internal[key] + + # def send_format(self): + # return set(self.internal.values()) + + def enkey_tail(self): + keys = sorted(list(self.internal.keys())) + head = gr.HTML(elem_id="zzz_depthmap_enkey", value="\u222F" + "\u222F".join(keys), visible=False) + return head + + def enkey_body(self): + return [self.internal[x] for x in sorted(list(self.internal.keys()))] + + @staticmethod + def enkey_to_dict(inp): + """Enkey format: bunch of Gradio components, + then a Gradio component, which value is concatination of names of the previous Gradio objects""" + assert inp[-1].startswith("\u222F") + ret = {} + names = inp[-1].split("\u222F")[1:] + assert len(names) == len(inp) - 1 + for i, name in enumerate(names): + ret[name] = inp[i] + return ret diff --git a/scripts/stereoimage_generation.py b/scripts/stereoimage_generation.py index 7284000..80461f9 100644 --- a/scripts/stereoimage_generation.py +++ b/scripts/stereoimage_generation.py @@ -5,7 +5,8 @@ def create_stereoimages(original_image, depthmap, divergence, separation=0.0, modes=None, stereo_balance=0.0, fill_technique='polylines_sharp'): """Creates stereoscopic images. - An effort is made to make them look nice, but beware that the resulting image will have some distortion . + An effort is made to make them look nice, but beware that the resulting image will have some distortion. + The correctness was not rigurously tested. :param original_image: original image from which the 3D image (stereoimage) will be created :param depthmap: depthmap corresponding to the original image. White = near, black = far. From cd1d73ee64c066de461d4216ca53352fbe78642d Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Thu, 6 Jul 2023 10:10:47 +0300 Subject: [PATCH 02/16] Fix run_3dphoto_videos parameters leaking into run_depthmap Large refactor part, may be broken --- scripts/depthmap.py | 53 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/scripts/depthmap.py b/scripts/depthmap.py index 8be91bd..2206afe 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -200,14 +200,15 @@ def main_ui_panel(is_depth_tab): inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) - with gr.Group(visible=is_depth_tab): - with gr.Row(): - inp += "inpaint", gr.Checkbox( - label="Generate 3D inpainted mesh. (Sloooow, required for generating videos)", value=False, - visible=is_depth_tab) - with gr.Row(visible=False) as inpaint_options_row_0: - inp += "inpaint_vids", gr.Checkbox(label="Generate 4 demo videos with 3D inpainted mesh.", value=False, - visible=is_depth_tab) + if is_depth_tab: + with gr.Group(): + with gr.Row(): + inp += "inpaint", gr.Checkbox( + label="Generate 3D inpainted mesh. 
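# A short walk-through of the "enkey" transport format defined above, with made-up
# component names and values (no running Gradio app is needed; only the staticmethod
# defined above is used): all bundled component values travel as a flat list, followed by
# one hidden value that is the "\u222F"-joined list of their names.
sorted_names = ["boost", "compute_device", "net_width"]    # the order enkey_body() uses
tail_value = "\u222F" + "\u222F".join(sorted_names)        # value held by enkey_tail()'s hidden HTML

flat_inputs = [True, "GPU", 512, tail_value]               # what a click handler receives
decoded = GradioComponentBundle.enkey_to_dict(flat_inputs)
print(decoded)   # {'boost': True, 'compute_device': 'GPU', 'net_width': 512}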
(Sloooow, required for generating videos)", value=False) + with gr.Group(visible=False) as inpaint_options_row_0: + inp += "inpaint_vids", gr.Checkbox( + label="Generate 4 demo videos with 3D inpainted mesh.", value=False) + gr.HTML("More options can be found in the Generate video tab") with gr.Group(): with gr.Row(): @@ -408,8 +409,6 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp, background_re custom_depthmap = inp["custom_depthmap"] if "custom_depthmap" in inp else "False" custom_depthmap_img = inp["custom_depthmap_img"] if "custom_depthmap_img" in inp else None depthmap_batch_reuse = inp["depthmap_batch_reuse"] if "depthmap_batch_reuse" in inp else True - fnExt = inp["fnExt"] if "fnExt" in inp else "mp4" - vid_ssaa = inp["vid_ssaa"] if "vid_ssaa" in inp else 0 print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") @@ -923,7 +922,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp, background_re try: if inpaint: unload_sd_model() - mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, fnExt, vid_ssaa, inpaint_vids) + mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, inpaint_vids, 1, "mp4") finally: reload_sd_model() print("All done.") @@ -968,7 +967,7 @@ def get_uniquefn(outpath, basename, ext): return basename -def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, fnExt, vid_ssaa, inpaint_vids): +def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, vid_ssaa, vid_format): mesh_fi = '' try: print("Running 3D Photo Inpainting .. ") @@ -1094,7 +1093,7 @@ def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, fnExt, vid_ssaa [0.00, 0.00, -0.015, -0.015], [0.00, 0.00, -0.015, -0.00], [-0.05, -0.05, -0.05, -0.05], - ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, fnExt, vid_ssaa) + ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, vid_format, vid_ssaa) devices.torch_gc() @@ -1111,7 +1110,7 @@ def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, fnExt, vid_ssaa def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range, - y_shift_range, z_shift_range, video_postfix, vid_dolly, fnExt, vid_ssaa): + y_shift_range, z_shift_range, video_postfix, vid_dolly, vid_format, vid_ssaa): if platform.system() == 'Windows': vispy.use(app='PyQt5') elif platform.system() == 'Darwin': @@ -1190,7 +1189,7 @@ def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, videos_poses, video_basename, original_h, original_w, border=border, depth=None, normal_canvas=normal_canvas, all_canvas=all_canvas, - mean_loc_depth=mean_loc_depth, dolly=vid_dolly, fnExt=fnExt) + mean_loc_depth=mean_loc_depth, dolly=vid_dolly, fnExt=vid_format) return fn_saved @@ -1199,10 +1198,7 @@ def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_bord if len(fn_mesh) == 0 or not os.path.exists(fn_mesh): raise Exception("Could not open mesh.") - # file type - fnExt = "mp4" if vid_format == 0 else "webm" - - vid_ssaa = vid_ssaa + 1 + vid_ssaa = int(vid_ssaa) # traj type if vid_traj == 0: @@ -1235,7 +1231,7 @@ def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_bord fullfn = None for i in range(500): fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" - fullfn = os.path.join(outpath, f"{fn}_." + fnExt) + fullfn = os.path.join(outpath, f"{fn}_." 
+ vid_format) if not os.path.exists(fullfn): break basename = Path(fullfn).stem @@ -1244,7 +1240,7 @@ def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_bord print("Loading mesh ..") fn_saved = run_3dphoto_videos(fn_mesh, basename, outpath, num_frames, num_fps, crop_border, vid_traj, x_shift_range, - y_shift_range, z_shift_range, [''], dolly, fnExt, vid_ssaa) + y_shift_range, z_shift_range, [''], dolly, vid_format, vid_ssaa) return fn_saved[-1], fn_saved[-1], '' @@ -1263,11 +1259,6 @@ def run_generate(*inputs): imageNameArr = [] outputs = [] - # TODO: this should not be here - # file type - inputs['fnExt'] = "mp4" if inputs['vid_format'] == 0 else "webm" - inputs['vid_ssaa'] = inputs['vid_ssaa'] + 1 - if depthmap_mode == '0': # Single image imageArr.append(depthmap_input_image) imageNameArr.append(None) @@ -1383,7 +1374,7 @@ def on_ui_tabs(): label="Skip generation and use (edited/custom) depthmaps in output directory when a file exists.", value=True) submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary') - inp += main_ui_panel(True) + inp += main_ui_panel(True) # Main panel is inserted here unloadmodels = gr.Button('Unload models', elem_id="depthmap_unloadmodels") with gr.Column(variant='panel'): @@ -1417,8 +1408,8 @@ def on_ui_tabs(): with gr.Row(): vid_numframes = gr.Textbox(label="Number of frames", value="300") vid_fps = gr.Textbox(label="Framerate", value="40") - inp += 'vid_format', gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', type="index", elem_id="video_format") - inp += 'vid_ssaa', gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', type="index", elem_id="video_ssaa") + vid_format = gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', type="value", elem_id="video_format") + vid_ssaa = gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', type="value", elem_id="video_ssaa") with gr.Row(): vid_traj = gr.Dropdown(label="Trajectory", choices=['straight-line', 'double-straight-line', 'circle'], value='double-straight-line', type="index", elem_id="video_trajectory") vid_shift = gr.Textbox(label="Translate: x, y, z", value="-0.015, 0.0, -0.05") @@ -1475,8 +1466,8 @@ def custom_depthmap_visibility(v): vid_shift, vid_border, vid_dolly, - inp['vid_format'], - inp['vid_ssaa'] + vid_format, + vid_ssaa ], outputs=[ depth_vid, From c98cfa7840389de64ede7509f8eb7851b83ffb59 Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Thu, 6 Jul 2023 10:38:49 +0300 Subject: [PATCH 03/16] Move background_removed_images code from run_depthmap wrappers to run_depthmap Large refactor part, may be broken --- scripts/depthmap.py | 77 +++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 42 deletions(-) diff --git a/scripts/depthmap.py b/scripts/depthmap.py index 2206afe..2820940 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -340,19 +340,8 @@ def run(self, p, *inputs): continue inputimages.append(processed.images[count]) - # TODO: this should not be here - # remove on base image before depth calculation - background_removed_images = [] - if inp['background_removal']: - if inp['pre_depth_background_removal']: - inputimages = batched_background_removal(inputimages, inp['background_removal_model']) - background_removed_images = inputimages - else: - background_removed_images = batched_background_removal(inputimages, inp['background_removal_model']) - - newmaps, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inp, - 
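# A minimal sketch (not the extension's UI, assuming Gradio 3.x semantics) of why the
# index-to-string translation above could be dropped: with type="index" a gr.Dropdown
# hands its handler the position of the selection (0 or 1 here), while type="value" hands
# it the choice string itself, so 'mp4'/'webm' arrives ready to use.
import gradio as gr

with gr.Blocks() as demo:
    by_index = gr.Dropdown(choices=['mp4', 'webm'], value='mp4', type="index")   # handler gets 0 or 1
    by_value = gr.Dropdown(choices=['mp4', 'webm'], value='mp4', type="value")   # handler gets 'mp4' or 'webm'
    out = gr.Textbox()
    by_value.change(fn=lambda fmt: f"container: {fmt}", inputs=[by_value], outputs=[out])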
background_removed_images) - for img in newmaps: + outimages, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inp) + for img in outimages: processed.images.append(img) return processed @@ -370,7 +359,7 @@ def reload_sd_model(): shared.sd_model.first_stage_model.to(devices.device) -def run_depthmap(processed, outpath, inputimages, inputnames, inp, background_removed_images): +def run_depthmap(processed, outpath, inputimages, inputnames, inp): if len(inputimages) == 0 or inputimages[0] is None: return [], [] @@ -410,10 +399,22 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp, background_re custom_depthmap_img = inp["custom_depthmap_img"] if "custom_depthmap_img" in inp else None depthmap_batch_reuse = inp["depthmap_batch_reuse"] if "depthmap_batch_reuse" in inp else True + # TODO: run_depthmap should not save outputs nor assign filenames, it should be done by a wrapper. + # Rationale: allowing webui-independent (stand-alone) wrapers print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") unload_sd_model() + # TODO: this still should not be here + background_removed_images = [] + # remove on base image before depth calculation + if background_removal: + if pre_depth_background_removal: + inputimages = batched_background_removal(inputimages, background_removal_model) + background_removed_images = inputimages + else: + background_removed_images = batched_background_removal(inputimages, background_removal_model) + meshsimple_fi = None mesh_fi = None @@ -422,8 +423,11 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp, background_re # init torch device global device + if depthmap_compute_device == 'GPU' and not torch.cuda.is_available(): + print('WARNING: Cuda device was not found, cpu will be used') + depthmap_compute_device = 'CPU' if depthmap_compute_device == 'GPU': - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda") else: device = torch.device("cpu") print("device: %s" % device) @@ -831,22 +835,21 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp, background_re if gen_stereo: print("Generating stereoscopic images..") - stereomodes = stereo_modes stereoimages = create_stereoimages(inputimages[count], img_output, stereo_divergence, stereo_separation, - stereomodes, stereo_balance, stereo_fill) + stereo_modes, stereo_balance, stereo_fill) for c in range(0, len(stereoimages)): outimages.append(stereoimages[c]) if processed is not None: images.save_image(stereoimages[c], outpath, "", processed.all_seeds[count], processed.all_prompts[count], opts.samples_format, info=info, p=processed, - suffix=f"_{stereomodes[c]}") + suffix=f"_{stereo_modes[c]}") else: # from tab images.save_image(stereoimages[c], path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, - forced_filename=None, suffix=f"_{stereomodes[c]}") + forced_filename=None, suffix=f"_{stereo_modes[c]}") if gen_normal: # taken from @graemeniedermayer, hidden, for api use only, will remove in future. 
@@ -1254,30 +1257,30 @@ def run_generate(*inputs): depthmap_input_image = inputs['depthmap_input_image'] depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] - imageArr = [] + inputimages = [] # Also keep track of original file names - imageNameArr = [] - outputs = [] + inputnames = [] + outimages = [] if depthmap_mode == '0': # Single image - imageArr.append(depthmap_input_image) - imageNameArr.append(None) + inputimages.append(depthmap_input_image) + inputnames.append(None) if depthmap_mode == '1': # Batch Process # convert files to pillow images for img in image_batch: image = Image.open(os.path.abspath(img.name)) - imageArr.append(image) - imageNameArr.append(os.path.splitext(img.orig_name)[0]) + inputimages.append(image) + inputnames.append(os.path.splitext(img.orig_name)[0]) elif depthmap_mode == '2': # Batch from Directory assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' if depthmap_batch_input_dir == '': - return outputs, "Please select an input directory.", '' + return outimages, "Please select an input directory.", '' image_list = shared.listfiles(depthmap_batch_input_dir) for img in image_list: try: image = Image.open(img) - imageArr.append(image) - imageNameArr.append(img) + inputimages.append(image) + inputnames.append(img) except Exception: print(f'Failed to load {img}, ignoring.') @@ -1286,26 +1289,16 @@ def run_generate(*inputs): else: outpath = opts.outdir_samples or opts.outdir_extras_samples - # TODO: this should not be here - background_removed_images = [] - if inputs['background_removal']: - if inputs['pre_depth_background_removal']: - imageArr = batched_background_removal(imageArr, inputs['background_removal_model']) - background_removed_images = imageArr - else: - background_removed_images = batched_background_removal(imageArr, inputs['background_removal_model']) - outputs, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, imageArr, imageNameArr, inputs, - background_removed_images) + outimages, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, inputimages, inputnames, inputs) # use inpainted 3d mesh to show in 3d model output when enabled in settings if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint and mesh_fi != None and len(mesh_fi) > 0: meshsimple_fi = mesh_fi - - # don't show 3dmodel when disabled in settings + # however, don't show 3dmodel when disabled in settings if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: meshsimple_fi = None - return outputs, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' + return outimages, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' def unload_models(): From c414acdb2f7f9579d1be215b6123ffc9eea71209 Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Thu, 6 Jul 2023 14:06:19 +0300 Subject: [PATCH 04/16] Move image saving from run_depthmap to wrappers Large refactor part, may be broken --- scripts/depthmap.py | 203 +++++++++++++----------------- scripts/stereoimage_generation.py | 2 + 2 files changed, 87 insertions(+), 118 deletions(-) diff --git a/scripts/depthmap.py b/scripts/depthmap.py index 2820940..1a16882 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -159,9 +159,9 @@ def main_ui_panel(is_depth_tab): with gr.Group(): with gr.Row(): - inp += "combine_output", gr.Checkbox(label="Combine into one image", value=False) + inp += "combine_output", gr.Checkbox(label="Combine input and corresponding depthmap into one image", value=False) inp += "combine_output_axis", 
gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], - value='Horizontal', type="index") + value='Horizontal', type="index", visible=False) with gr.Row(): inp += "save_depth", gr.Checkbox(label="Save DepthMap", value=True) inp += "show_depth", gr.Checkbox(label="Show DepthMap", value=True) @@ -211,6 +211,7 @@ def main_ui_panel(is_depth_tab): gr.HTML("More options can be found in the Generate video tab") with gr.Group(): + # TODO: it should be clear from the UI that the background removal does not use the model selected above with gr.Row(): inp += "background_removal", gr.Checkbox(label="Remove background", value=False) with gr.Row(visible=False) as bgrem_options_row_1: @@ -240,6 +241,12 @@ def main_ui_panel(is_depth_tab): outputs=[options_depend_on_match_size] ) + inp['combine_output'].change( + fn=lambda v: inp['combine_output_axis'].update(visible=v), + inputs=[inp['combine_output']], + outputs=[inp['combine_output_axis']] + ) + inp['clipdepth'].change( fn=lambda v: clip_options_row_1.update(visible=v), inputs=[inp['clipdepth']], @@ -272,25 +279,20 @@ def stereo_options_visibility(v): return stereo_options_row_0.update(visible=v), \ stereo_options_row_1.update(visible=v), \ stereo_options_row_2.update(visible=v) - inp['gen_stereo'].change( fn=stereo_options_visibility, inputs=[inp['gen_stereo']], outputs=[stereo_options_row_0, stereo_options_row_1, stereo_options_row_2] ) - def mesh_options_visibility(v): - return mesh_options_row_0.update(visible=v) - inp['gen_mesh'].change( - fn=mesh_options_visibility, + fn=lambda v: mesh_options_row_0.update(visible=v), inputs=[inp['gen_mesh']], outputs=[mesh_options_row_0] ) def inpaint_options_visibility(v): return inpaint_options_row_0.update(visible=v) - inp['inpaint'].change( fn=inpaint_options_visibility, inputs=[inp['inpaint']], @@ -300,7 +302,6 @@ def inpaint_options_visibility(v): def background_removal_options_visibility(v): return bgrem_options_row_1.update(visible=v), \ bgrem_options_row_2.update(visible=v) - inp['background_removal'].change( fn=background_removal_options_visibility, inputs=[inp['background_removal']], @@ -340,8 +341,22 @@ def run(self, p, *inputs): continue inputimages.append(processed.images[count]) - outimages, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inp) - for img in outimages: + show_images, save_images, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inp) + + for input_i, imgs in enumerate(save_images): + # get generation parameters + if hasattr(processed, 'all_prompts') and opts.enable_pnginfo: + info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, "", 0, input_i) + else: + info = None + + for image_type, image in list(imgs.items()): + images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], + prompt=processed.all_prompts[input_i], extension=opts.samples_format, info=info, + p=processed, + suffix=f"_{image_type}") + + for img in show_images: processed.images.append(img) return processed @@ -399,8 +414,8 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): custom_depthmap_img = inp["custom_depthmap_img"] if "custom_depthmap_img" in inp else None depthmap_batch_reuse = inp["depthmap_batch_reuse"] if "depthmap_batch_reuse" in inp else True - # TODO: run_depthmap should not save outputs nor assign filenames, it should be done by a wrapper. 
- # Rationale: allowing webui-independent (stand-alone) wrapers + # TODO: run_depthmap should not generate or save meshes, since these do not use generated depthmaps. + # Rationale: allowing webui-independent (stand-alone) wrappers. print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") unload_sd_model() @@ -449,11 +464,10 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): depthmap_model_depth = None loadmodels = True - outimages = [] try: if loadmodels and not (custom_depthmap and custom_depthmap_img != None): # TODO: loading model should be separated into a function that would return the model - # and the parameters needed for this. The rest of the run_depthmap should depend on what specific model + # and the parameters needed. The rest of the run_depthmap should not depend on what specific model # is actually used for the generation. print("Loading model weights from ", end=" ") @@ -627,10 +641,18 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): model = model.to(device) print("Computing depthmap(s) ..") + numimages = len(inputimages) + # Images that are meant to be shown in a GUI (if any) + show_images = [] + # Images that should be saved as an array of dictionaries. + # Every array element corresponds to particular input image. + # Dictionary keys are types of images that were derived from the input image. + save_images = [{} for _ in range(numimages)] + # TODO: ??? inpaint_imgs = [] + # TODO: ??? inpaint_depths = [] # iterate over input (generated) images - numimages = len(inputimages) for count in trange(0, numimages): print('\n') @@ -668,6 +690,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): img = cv2.cvtColor(np.asarray(inputimages[count]), cv2.COLOR_BGR2RGB) / 255.0 skipInvertAndSave = False + # TODO: custom depthmaps should be supplied in the same way as "custom depthmap" in single image mode if custom_depthmap_fn is not None: # use custom depthmap dimg = Image.open(os.path.abspath(custom_depthmap_fn)) @@ -727,13 +750,6 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): inpaint_imgs.append(inputimages[count]) inpaint_depths.append(img_output) - # get generation parameters - if processed is not None and hasattr(processed, 'all_prompts') and opts.enable_pnginfo: - info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, - "", 0, count) - else: - info = None - rgb_image = inputimages[count] # applying background masks after depth @@ -748,111 +764,47 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): img_output[bg_mask] = far_value * far_value # 255*255 or 0*0 - # should this be optional - if (processed is not None): - images.save_image(background_removed_image, outpath, "", processed.all_seeds[count], - processed.all_prompts[count], opts.samples_format, info=info, p=processed, - suffix="_background_removed") - else: - images.save_image(background_removed_image, path=outpath, basename=basename, seed=None, prompt=None, - extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, - grid=False, pnginfo_section_name="extras", existing_info=None, - forced_filename=None, suffix="_background_removed") - outimages.append(background_removed_image) + # saving should be optional + save_images[count]['background_removed'] = background_removed_image + show_images.append(background_removed_image) + if save_background_removal_masks: bg_array = (1 - bg_mask.astype('int8')) * 255 mask_array = np.stack((bg_array, bg_array, 
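# The shape of the save_images structure described above, filled with placeholder images:
# one dict per input image, keyed by the type of derived image; wrappers later turn each
# key into the filename suffix (e.g. "_depth", "_left-right") when calling images.save_image.
from PIL import Image

_dummy = Image.new("RGB", (8, 8))
save_images_example = [
    {"depth": _dummy, "left-right": _dummy},            # images derived from input 0
    {"depth": _dummy, "red-cyan-anaglyph": _dummy},     # images derived from input 1
]
for input_i, imgs in enumerate(save_images_example):
    for image_type, image in imgs.items():
        print(f"input {input_i}: would save with suffix _{image_type}")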
bg_array, bg_array), axis=2) mask_image = Image.fromarray(mask_array.astype(np.uint8)) - if (processed is not None): - images.save_image(mask_image, outpath, "", processed.all_seeds[count], - processed.all_prompts[count], opts.samples_format, info=info, p=processed, - suffix="_foreground_mask") - else: - images.save_image(mask_image, path=outpath, basename=basename, seed=None, prompt=None, - extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, - grid=False, pnginfo_section_name="extras", existing_info=None, - forced_filename=None, suffix="_foreground_mask") - outimages.append(mask_image) - img_concat = np.concatenate((rgb_image, img_output2), axis=combine_output_axis) + # saving should be optional + save_images[count]['foreground_mask'] = mask_image + show_images.append(mask_image) + + img_concat = Image.fromarray(np.concatenate((rgb_image, img_output2), axis=combine_output_axis)) if show_depth: if not combine_output: - outimages.append(Image.fromarray(img_output)) + show_images.append(Image.fromarray(img_output)) else: - outimages.append(Image.fromarray(img_concat)) + show_images.append(img_concat) + if not skipInvertAndSave: # TODO: skipInvertAndSave is not intuitive + if save_depth: + if combine_output: + save_images[count]['concat_depth'] = img_concat + else: + save_images[count]['depth'] = Image.fromarray(img_output) - if not skipInvertAndSave: - if not combine_output: - if save_depth and processed is not None: - # only save 16 bit single channel image when PNG format is selected - if opts.samples_format == "png": - try: - images.save_image(Image.fromarray(img_output), outpath, "", processed.all_seeds[count], - processed.all_prompts[count], opts.samples_format, info=info, - p=processed, suffix="_depth") - except Exception as ve: - if not ('image has wrong mode' in str(ve) or 'I;16' in str(ve)): raise ve - print('Catched exception: image has wrong mode!') - traceback.print_exc() - else: - images.save_image(Image.fromarray(img_output2), outpath, "", processed.all_seeds[count], - processed.all_prompts[count], opts.samples_format, info=info, p=processed, - suffix="_depth") - elif save_depth: - # from depth tab - # only save 16 bit single channel image when PNG format is selected - if opts.samples_format == "png": - try: - images.save_image(Image.fromarray(img_output), path=outpath, basename=basename, - seed=None, prompt=None, extension=opts.samples_format, info=info, - short_filename=True, no_prompt=True, grid=False, - pnginfo_section_name="extras", existing_info=None, - forced_filename=None) - except Exception as ve: - if not ('image has wrong mode' in str(ve) or 'I;16' in str(ve)): raise ve - print('Catched exception: image has wrong mode!') - traceback.print_exc() - else: - images.save_image(Image.fromarray(img_output2), path=outpath, basename=basename, seed=None, - prompt=None, extension=opts.samples_format, info=info, - short_filename=True, no_prompt=True, grid=False, - pnginfo_section_name="extras", existing_info=None, forced_filename=None) - else: - if save_depth and processed is not None: - images.save_image(Image.fromarray(img_concat), outpath, "", processed.all_seeds[count], - processed.all_prompts[count], opts.samples_format, info=info, p=processed, - suffix="_depth") - elif save_depth: - # from tab - images.save_image(Image.fromarray(img_concat), path=outpath, basename=basename, seed=None, - prompt=None, extension=opts.samples_format, info=info, short_filename=True, - no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, - 
forced_filename=None) if show_heat: heatmap = colorize(img_output, cmap='inferno') - outimages.append(heatmap) + show_images.append(heatmap) if gen_stereo: print("Generating stereoscopic images..") stereoimages = create_stereoimages(inputimages[count], img_output, stereo_divergence, stereo_separation, stereo_modes, stereo_balance, stereo_fill) - for c in range(0, len(stereoimages)): - outimages.append(stereoimages[c]) - if processed is not None: - images.save_image(stereoimages[c], outpath, "", processed.all_seeds[count], - processed.all_prompts[count], opts.samples_format, info=info, p=processed, - suffix=f"_{stereo_modes[c]}") - else: - # from tab - images.save_image(stereoimages[c], path=outpath, basename=basename, seed=None, - prompt=None, extension=opts.samples_format, info=info, short_filename=True, - no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, - forced_filename=None, suffix=f"_{stereo_modes[c]}") - - if gen_normal: - # taken from @graemeniedermayer, hidden, for api use only, will remove in future. + show_images.append(stereoimages[c]) + save_images[count][stereo_modes[c]] = stereoimages[c] + + if gen_normal: # TODO: should be moved into a separate file when redesigned + # taken from @graemeniedermayer # take gradients zx = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 1, 0, ksize=3) zy = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 0, 1, ksize=3) @@ -870,7 +822,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): normal *= 255 normal = normal.astype(np.uint8) - outimages.append(Image.fromarray(normal)) + show_images.append(Image.fromarray(normal)) # gen mesh if gen_mesh: @@ -930,7 +882,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): reload_sd_model() print("All done.") - return outimages, mesh_fi, meshsimple_fi + return show_images, save_images, mesh_fi, meshsimple_fi @njit(parallel=True) @@ -956,7 +908,7 @@ def clipdepthmap(img, clipthreshold_far, clipthreshold_near): def get_uniquefn(outpath, basename, ext): - # unique filename + # Inefficient and may fail, maybe use unbounded binary search? 
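# One possible shape of the "unbounded binary search" suggested by the TODO above (a
# sketch, not part of the patch): gallop to find a free index, then bisect, so the next
# free sequence number is found in O(log n) existence checks instead of a fixed 500-step
# scan. It assumes existing files use consecutive indices, so taken() is monotone.
import os

def next_free_index(outpath, basename, ext):
    def taken(i):
        return os.path.exists(os.path.join(outpath, f"{basename}-{i:04}.{ext}"))
    if not taken(0):
        return 0
    hi = 1
    while taken(hi):          # gallop: 1, 2, 4, 8, ... until a free index is found
        hi *= 2
    lo = hi // 2              # invariant: taken(lo) is True, taken(hi) is False
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if taken(mid):
            lo = mid
        else:
            hi = mid
    return hi                 # first index whose file does not exist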
basecount = get_next_sequence_number(outpath, basename) if basecount > 0: basecount = basecount - 1 fullfn = None @@ -1256,11 +1208,12 @@ def run_generate(*inputs): image_batch = inputs['image_batch'] depthmap_input_image = inputs['depthmap_input_image'] depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] + depthmap_batch_reuse = inputs['depthmap_batch_reuse'] inputimages = [] # Also keep track of original file names inputnames = [] - outimages = [] + show_images = [] if depthmap_mode == '0': # Single image inputimages.append(depthmap_input_image) @@ -1274,7 +1227,7 @@ def run_generate(*inputs): elif depthmap_mode == '2': # Batch from Directory assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' if depthmap_batch_input_dir == '': - return outimages, "Please select an input directory.", '' + return show_images, "Please select an input directory.", '' image_list = shared.listfiles(depthmap_batch_input_dir) for img in image_list: try: @@ -1289,7 +1242,21 @@ def run_generate(*inputs): else: outpath = opts.outdir_samples or opts.outdir_extras_samples - outimages, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, inputimages, inputnames, inputs) + show_images, save_images, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, inputimages, inputnames, inputs) + + # Saving images + for input_i, imgs in enumerate(save_images): + basename = 'depthmap' + if depthmap_batch_reuse and depthmap_mode == '2': + if inputnames[input_i] is not None: + basename = Path(inputnames[input_i]).stem + info = None + + for image_type, image in list(imgs.items()): + images.save_image(image, path=outpath, basename=basename, seed=None, + prompt=None, extension=opts.samples_format, info=info, short_filename=True, + no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, + forced_filename=None, suffix=f"_{image_type}") # use inpainted 3d mesh to show in 3d model output when enabled in settings if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint and mesh_fi != None and len(mesh_fi) > 0: @@ -1298,7 +1265,7 @@ def run_generate(*inputs): if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: meshsimple_fi = None - return outimages, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' + return show_images, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' def unload_models(): @@ -1315,7 +1282,7 @@ def unload_models(): def clear_mesh(): return None - +# TODO: some of them may be put into the main ui pane def on_ui_settings(): section = ('depthmap-script', "Depthmap extension") shared.opts.add_option("depthmap_script_keepmodels", diff --git a/scripts/stereoimage_generation.py b/scripts/stereoimage_generation.py index 80461f9..082e727 100644 --- a/scripts/stereoimage_generation.py +++ b/scripts/stereoimage_generation.py @@ -28,6 +28,8 @@ def create_stereoimages(original_image, depthmap, divergence, separation=0.0, mo modes = ['left-right'] if not isinstance(modes, list): modes = [modes] + if len(modes) == 0: + return [] original_image = np.asarray(original_image) balance = (stereo_balance + 1) / 2 From 701ec1798652f53c63e35953a0ed973069698436 Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Thu, 6 Jul 2023 15:06:34 +0300 Subject: [PATCH 05/16] Optimize imports a bit Large refactor part, may be broken --- scripts/depthmap.py | 53 +++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/scripts/depthmap.py 
b/scripts/depthmap.py index 1a16882..a4035cd 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -1,40 +1,36 @@ # Author: thygate # https://github.com/thygate/stable-diffusion-webui-depthmap-script -import modules.scripts as scripts +from operator import getitem +from pathlib import Path + import gradio as gr +from PIL import Image +from numba import njit, prange +from torchvision.transforms import Compose, transforms -from modules.call_queue import wrap_gradio_gpu_call, wrap_queued_call, wrap_gradio_call -from modules.ui import plaintext_to_html -from modules import processing, images, shared, sd_samplers, devices -from modules.processing import create_infotext, process_images, Processed -from modules.shared import opts, cmd_opts, state, Options +import modules.scripts as scripts +from modules import processing, images, shared, devices from modules import script_callbacks +from modules.call_queue import wrap_gradio_gpu_call from modules.images import get_next_sequence_number -from numba import njit, prange -from torchvision.transforms import Compose, transforms -from PIL import Image -from pathlib import Path -from operator import getitem -from tqdm import trange -from functools import reduce -from skimage.transform import resize -from trimesh import transformations +from modules.processing import create_infotext +from modules.shared import opts, cmd_opts +from modules.ui import plaintext_to_html + +try: + from tqdm import trange +except: + from builtins import range as trange import sys import torch, gc -import torch.nn as nn import cv2 import os.path -import contextlib -import matplotlib.pyplot as plt import numpy as np import skimage.measure import copy import platform -import vispy -import trimesh -import os import math import subprocess import traceback @@ -90,9 +86,7 @@ def ensure_gradio_temp_directory(): from dzoedepth.utils.misc import colorize from dzoedepth.utils.geometry import depth_to_points, create_triangles -# background removal -from rembg import new_session, remove - +# TODO: next two should not be here whole_size_threshold = 1600 # R_max from the paper pix2pixsize = 1024 scriptname = "DepthMap" @@ -109,8 +103,6 @@ def ensure_gradio_temp_directory(): depthmap_deviceidx = None commit_hash = None # TODO: understand why it would spam to stderr if changed to ... = get_commit_hash() - - def get_commit_hash(): global commit_hash if commit_hash is None: @@ -467,7 +459,8 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): try: if loadmodels and not (custom_depthmap and custom_depthmap_img != None): # TODO: loading model should be separated into a function that would return the model - # and the parameters needed. The rest of the run_depthmap should not depend on what specific model + # and the parameters (or maybe even functions) needed. + # The rest of the run_depthmap should not depend on what specific model # is actually used for the generation. 
print("Loading model weights from ", end=" ") @@ -1066,6 +1059,7 @@ def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, v def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range, y_shift_range, z_shift_range, video_postfix, vid_dolly, vid_format, vid_ssaa): + import vispy if platform.system() == 'Windows': vispy.use(app='PyQt5') elif platform.system() == 'Darwin': @@ -1445,6 +1439,7 @@ def custom_depthmap_visibility(v): # TODO: code borrowed from the internet to be marked as such and to reside in separate files def batched_background_removal(inimages, model_name): + from rembg import new_session, remove print('creating background masks') outimages = [] @@ -1531,6 +1526,7 @@ def estimateleres(img, model, w, h): def estimatemidas(img, model, w, h, resize_mode, normalization): + import contextlib # init transform transform = Compose( [ @@ -2176,6 +2172,7 @@ def depth_edges_mask(depth): def create_mesh(image, depth, keep_edges=False, spherical=False): + import trimesh maxsize = 1024 if hasattr(opts, 'depthmap_script_mesh_maxsize'): maxsize = opts.depthmap_script_mesh_maxsize @@ -2205,7 +2202,7 @@ def create_mesh(image, depth, keep_edges=False, spherical=False): angle = math.pi / 2 direction = [1, 0, 0] center = [0, 0, 0] - rot_matrix = transformations.rotation_matrix(angle, direction, center) + rot_matrix = trimesh.transformations.rotation_matrix(angle, direction, center) mesh.apply_transform(rot_matrix) return mesh From c57e8c50e1a741cad45af31c23358eead6dca920 Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Fri, 7 Jul 2023 13:06:27 +0300 Subject: [PATCH 06/16] Rework show_images of run_depthmap, improve UI Large refactor part, may be broken Also fix downloading models --- README.md | 3 + scripts/depthmap.py | 174 ++++++++++++++++++++++---------------------- 2 files changed, 88 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index 658a69d..5e90dd4 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,9 @@ video by [@graemeniedermayer](https://github.com/graemeniedermayer), more exampl images generated by [@semjon00](https://github.com/semjon00) from CC0 photos, more examples [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/56#issuecomment-1367596463). 
## Changelog +* v0.3.13 + * Large code refactor + * Improved interface * v0.3.12 * Fixed stereo image generation * Other bugfixes diff --git a/scripts/depthmap.py b/scripts/depthmap.py index a4035cd..27f8a59 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -90,7 +90,7 @@ def ensure_gradio_temp_directory(): whole_size_threshold = 1600 # R_max from the paper pix2pixsize = 1024 scriptname = "DepthMap" -scriptversion = "v0.3.12" +scriptversion = "v0.3.13" global video_mesh_data, video_mesh_fn video_mesh_data = None @@ -120,6 +120,7 @@ def get_commit_hash(): def main_ui_panel(is_depth_tab): inp = GradioComponentBundle() + # TODO: Greater visual separation with gr.Blocks(): with gr.Row(): inp += 'compute_device', gr.Radio(label="Compute on", choices=['GPU', 'CPU'], value='GPU') @@ -135,52 +136,57 @@ def main_ui_panel(is_depth_tab): with gr.Group(): with gr.Row(): inp += 'boost', gr.Checkbox(label="BOOST (multi-resolution merging)", value=True) - inp += 'invert_depth', gr.Checkbox(label="Invert DepthMap (black=near, white=far)", value=False) - with gr.Group(visible=False) as options_depend_on_boost: - inp += 'match_size', gr.Checkbox(label="Match input size", value=False) - with gr.Row() as options_depend_on_match_size: - inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=512) - inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=512) + with gr.Group(visible=False) as options_depend_on_boost: + inp += 'match_size', gr.Checkbox(label="Match net size to input size", value=False) + with gr.Row(visible=False) as options_depend_on_match_size: + inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=512) + inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=512) with gr.Group(): with gr.Row(): - inp += 'clipdepth', gr.Checkbox(label="Clip and renormalize", value=False) + inp += "save_outputs", gr.Checkbox(label="Save Outputs", value=True) # 50% of width + with gr.Group(): # 50% of width + inp += "output_depth", gr.Checkbox(label="Output DepthMap", value=True) + inp += "invert_depth", gr.Checkbox(label="Invert (black=near, white=far)", value=False) + with gr.Row() as options_depend_on_output_depth_1: + inp += "combine_output", gr.Checkbox( + label="Combine input and depthmap into one image", value=False) + inp += "combine_output_axis", gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], + value='Horizontal', type="index", visible=False) + with gr.Group(): + with gr.Row(): + inp += 'clipdepth', gr.Checkbox(label="Clip and renormalize DepthMap", value=False) with gr.Row(visible=False) as clip_options_row_1: inp += "clipthreshold_far", gr.Slider(minimum=0, maximum=1, step=0.001, label='Far clip', value=0) inp += "clipthreshold_near", gr.Slider(minimum=0, maximum=1, step=0.001, label='Near clip', value=1) with gr.Group(): with gr.Row(): - inp += "combine_output", gr.Checkbox(label="Combine input and corresponding depthmap into one image", value=False) - inp += "combine_output_axis", gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], - value='Horizontal', type="index", visible=False) - with gr.Row(): - inp += "save_depth", gr.Checkbox(label="Save DepthMap", value=True) - inp += "show_depth", gr.Checkbox(label="Show DepthMap", value=True) - inp += "show_heat", gr.Checkbox(label="Show HeatMap", value=False) + inp += "show_heat", gr.Checkbox(label="Generate HeatMap", value=False) + # gr.Checkbox(label="Generate 
NormalMap", value=False) # TODO: this is a fake door with gr.Group(): with gr.Row(): inp += "gen_stereo", gr.Checkbox(label="Generate stereoscopic image(s)", value=False) - with gr.Group(visible=False) as stereo_options_row_0: + with gr.Group(visible=False) as stereo_options: + with gr.Row(): with gr.Row(): inp += "stereo_modes", gr.CheckboxGroup( ["left-right", "right-left", "top-bottom", "bottom-top", "red-cyan-anaglyph"], label="Output", value=["left-right", "red-cyan-anaglyph"]) - - with gr.Row(visible=False) as stereo_options_row_1: - inp += "stereo_divergence", gr.Slider(minimum=0.05, maximum=10.005, step=0.01, - label='Divergence (3D effect)', - value=2.5) - inp += "stereo_separation", gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, - label='Separation (moves images apart)', - value=0.0) - with gr.Row(visible=False) as stereo_options_row_2: - inp += "stereo_fill", gr.Dropdown(label="Gap fill technique", - choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', - 'polylines_sharp'], value='polylines_sharp', type="value") - inp += "stereo_balance", gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, label='Balance between eyes', - value=0.0) + with gr.Row(): + inp += "stereo_divergence", gr.Slider(minimum=0.05, maximum=10.005, step=0.01, + label='Divergence (3D effect)', + value=2.5) + inp += "stereo_separation", gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, + label='Separation (moves images apart)', + value=0.0) + with gr.Row(): + inp += "stereo_fill", gr.Dropdown(label="Gap fill technique", + choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', + 'polylines_sharp'], value='polylines_sharp', type="value") + inp += "stereo_balance", gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, label='Balance between eyes', + value=0.0) with gr.Group(): with gr.Row(): @@ -200,7 +206,7 @@ def main_ui_panel(is_depth_tab): with gr.Group(visible=False) as inpaint_options_row_0: inp += "inpaint_vids", gr.Checkbox( label="Generate 4 demo videos with 3D inpainted mesh.", value=False) - gr.HTML("More options can be found in the Generate video tab") + gr.HTML("More options for generating video can be found in the Generate video tab") with gr.Group(): # TODO: it should be clear from the UI that the background removal does not use the model selected above @@ -223,16 +229,22 @@ def main_ui_panel(is_depth_tab): inp += "gen_normal", gr.Checkbox(label="Generate Normalmap (hidden! 
api only)", value=False, visible=False) inp['boost'].change( - fn=lambda a: options_depend_on_boost.update(visible=not a), - inputs=[inp['boost']], - outputs=[options_depend_on_boost] + fn=lambda a, b: (options_depend_on_boost.update(visible=not a), options_depend_on_match_size.update(visible=not a and not b)), + inputs=[inp['boost'], inp['match_size']], + outputs=[options_depend_on_boost, options_depend_on_match_size] ) inp['match_size'].change( - fn=lambda a: options_depend_on_match_size.update(visible=not a), - inputs=[inp['match_size']], + fn=lambda a, b: options_depend_on_match_size.update(visible=not a and not b), + inputs=[inp['boost'], inp['match_size']], outputs=[options_depend_on_match_size] ) + inp['output_depth'].change( + fn=lambda a: (inp['invert_depth'].update(visible=a), options_depend_on_output_depth_1.update(visible=a)), + inputs=[inp['output_depth']], + outputs=[inp['invert_depth'], options_depend_on_output_depth_1] + ) + inp['combine_output'].change( fn=lambda v: inp['combine_output_axis'].update(visible=v), inputs=[inp['combine_output']], @@ -268,13 +280,11 @@ def main_ui_panel(is_depth_tab): ) def stereo_options_visibility(v): - return stereo_options_row_0.update(visible=v), \ - stereo_options_row_1.update(visible=v), \ - stereo_options_row_2.update(visible=v) + return stereo_options.update(visible=v) inp['gen_stereo'].change( fn=stereo_options_visibility, inputs=[inp['gen_stereo']], - outputs=[stereo_options_row_0, stereo_options_row_1, stereo_options_row_2] + outputs=[stereo_options] ) inp['gen_mesh'].change( @@ -285,11 +295,12 @@ def stereo_options_visibility(v): def inpaint_options_visibility(v): return inpaint_options_row_0.update(visible=v) - inp['inpaint'].change( - fn=inpaint_options_visibility, - inputs=[inp['inpaint']], - outputs=[inpaint_options_row_0] - ) + if is_depth_tab: + inp['inpaint'].change( + fn=inpaint_options_visibility, + inputs=[inp['inpaint']], + outputs=[inpaint_options_row_0] + ) def background_removal_options_visibility(v): return bgrem_options_row_1.update(visible=v), \ @@ -320,7 +331,7 @@ def ui(self, is_img2img): # run from script in txt2img or img2img def run(self, p, *inputs): - inp = GradioComponentBundle.enkey_to_dict(inputs) + inputs = GradioComponentBundle.enkey_to_dict(inputs) # sd process processed = processing.process_images(p) @@ -333,24 +344,21 @@ def run(self, p, *inputs): continue inputimages.append(processed.images[count]) - show_images, save_images, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inp) + generated_images, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inputs) - for input_i, imgs in enumerate(save_images): + for input_i, imgs in enumerate(generated_images): # get generation parameters if hasattr(processed, 'all_prompts') and opts.enable_pnginfo: info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, "", 0, input_i) else: info = None - for image_type, image in list(imgs.items()): - images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], + processed.images.append(image) + if inputs["save_outputs"]: + images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], prompt=processed.all_prompts[input_i], extension=opts.samples_format, info=info, p=processed, suffix=f"_{image_type}") - - for img in show_images: - processed.images.append(img) - return processed @@ -380,7 +388,7 @@ def run_depthmap(processed, outpath, 
inputimages, inputnames, inp): combine_output_axis = inp["combine_output_axis"] depthmap_compute_device = inp["compute_device"] gen_mesh = inp["gen_mesh"] - gen_normal = inp["gen_normal"] + gen_normal = inp["gen_normal"] if "gen_normal" in inp else False gen_stereo = inp["gen_stereo"] inpaint = inp["inpaint"] inpaint_vids = inp["inpaint_vids"] @@ -393,8 +401,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): net_width = inp["net_width"] pre_depth_background_removal = inp["pre_depth_background_removal"] save_background_removal_masks = inp["save_background_removal_masks"] - save_depth = inp["save_depth"] - show_depth = inp["show_depth"] + output_depth = inp["output_depth"] show_heat = inp["show_heat"] stereo_balance = inp["stereo_balance"] stereo_divergence = inp["stereo_divergence"] @@ -402,7 +409,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): stereo_modes = inp["stereo_modes"] stereo_separation = inp["stereo_separation"] - custom_depthmap = inp["custom_depthmap"] if "custom_depthmap" in inp else "False" + custom_depthmap = inp["custom_depthmap"] if "custom_depthmap" in inp else False custom_depthmap_img = inp["custom_depthmap_img"] if "custom_depthmap_img" in inp else None depthmap_batch_reuse = inp["depthmap_batch_reuse"] if "depthmap_batch_reuse" in inp else True @@ -474,7 +481,6 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): "https://huggingface.co/lllyasviel/Annotators/resolve/5bc80eec2b4fddbb/res101.pth", ], "1d696b2ef3e8336b057d0c15bc82d2fecef821bfebe5ef9d7671a5ec5dde520b") - ensure_file_downloaded(model_path, "https://cloudstor.aarnet.edu.au/plus/s/lTIJF4vrvHCAI31/download") if depthmap_compute_device == 'GPU': checkpoint = torch.load(model_path) else: @@ -598,7 +604,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): # sfu.ca unfortunately is not very reliable, we use a mirror just in case ensure_file_downloaded( './models/pix2pix/latest_net_G.pth', - ["https://huggingface.co/lllyasviel/Annotators/blob/9a7d84251d487d11/latest_net_G.pth", + ["https://huggingface.co/lllyasviel/Annotators/resolve/9a7d84251d487d11/latest_net_G.pth", "https://sfu.ca/~yagiz/CVPR21/latest_net_G.pth"], '50ec735d74ed6499562d898f41b49343e521808b8dae589aa3c2f5c9ac9f7462') opt = TestOptions().parse() @@ -635,12 +641,10 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): print("Computing depthmap(s) ..") numimages = len(inputimages) - # Images that are meant to be shown in a GUI (if any) - show_images = [] - # Images that should be saved as an array of dictionaries. + # Images that will be returned. # Every array element corresponds to particular input image. # Dictionary keys are types of images that were derived from the input image. - save_images = [{} for _ in range(numimages)] + generated_images = [{} for _ in range(numimages)] # TODO: ??? inpaint_imgs = [] # TODO: ??? @@ -653,7 +657,8 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): # filename basename = 'depthmap' - # TODO: this should not use heuristics to figure out the mode, mode should ideally be abstracted away + # TODO: this should not use heuristics to figure out the mode, mode should ideally be abstracted away. 
+ # By the way, this is probably broken # figuring out the name of custom DepthMap custom_depthmap_fn = None # None means that DepthMap should be computed # find filename if in the single image mode @@ -661,7 +666,6 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): custom_depthmap_fn = custom_depthmap_img.name # find filename if in batch mode if inputnames is not None and depthmap_batch_reuse: - save_depth = True if inputnames[count] is not None: p = Path(inputnames[count]) basename = p.stem @@ -757,44 +761,33 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): img_output[bg_mask] = far_value * far_value # 255*255 or 0*0 - # saving should be optional - save_images[count]['background_removed'] = background_removed_image - show_images.append(background_removed_image) + generated_images[count]['background_removed'] = background_removed_image if save_background_removal_masks: bg_array = (1 - bg_mask.astype('int8')) * 255 mask_array = np.stack((bg_array, bg_array, bg_array, bg_array), axis=2) mask_image = Image.fromarray(mask_array.astype(np.uint8)) - # saving should be optional - save_images[count]['foreground_mask'] = mask_image - show_images.append(mask_image) + generated_images[count]['foreground_mask'] = mask_image - img_concat = Image.fromarray(np.concatenate((rgb_image, img_output2), axis=combine_output_axis)) - if show_depth: - if not combine_output: - show_images.append(Image.fromarray(img_output)) - else: - show_images.append(img_concat) if not skipInvertAndSave: # TODO: skipInvertAndSave is not intuitive - if save_depth: + if output_depth: if combine_output: - save_images[count]['concat_depth'] = img_concat + img_concat = Image.fromarray(np.concatenate((rgb_image, img_output2), axis=combine_output_axis)) + generated_images[count]['concat_depth'] = img_concat else: - save_images[count]['depth'] = Image.fromarray(img_output) + generated_images[count]['depth'] = Image.fromarray(img_output) if show_heat: heatmap = colorize(img_output, cmap='inferno') - show_images.append(heatmap) + generated_images[count]['heatmap'] = heatmap if gen_stereo: print("Generating stereoscopic images..") - stereoimages = create_stereoimages(inputimages[count], img_output, stereo_divergence, stereo_separation, stereo_modes, stereo_balance, stereo_fill) for c in range(0, len(stereoimages)): - show_images.append(stereoimages[c]) - save_images[count][stereo_modes[c]] = stereoimages[c] + generated_images[count][stereo_modes[c]] = stereoimages[c] if gen_normal: # TODO: should be moved into a separate file when redesigned # taken from @graemeniedermayer @@ -815,7 +808,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): normal *= 255 normal = normal.astype(np.uint8) - show_images.append(Image.fromarray(normal)) + generated_images[count]['normal'] = Image.fromarray(normal) # gen mesh if gen_mesh: @@ -875,7 +868,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): reload_sd_model() print("All done.") - return show_images, save_images, mesh_fi, meshsimple_fi + return generated_images, mesh_fi, meshsimple_fi @njit(parallel=True) @@ -1236,7 +1229,8 @@ def run_generate(*inputs): else: outpath = opts.outdir_samples or opts.outdir_extras_samples - show_images, save_images, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, inputimages, inputnames, inputs) + save_images, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, inputimages, inputnames, inputs) + show_images = [] # Saving images for input_i, imgs in enumerate(save_images): @@ -1247,7 
+1241,9 @@ def run_generate(*inputs): info = None for image_type, image in list(imgs.items()): - images.save_image(image, path=outpath, basename=basename, seed=None, + show_images += [image] + if inputs["save_outputs"]: + images.save_image(image, path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, forced_filename=None, suffix=f"_{image_type}") From eb82cd4345fa8e6447f6586c5654be8b419d2f06 Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Fri, 7 Jul 2023 16:57:41 +0300 Subject: [PATCH 07/16] Rework custom depthmaps option Large refactor part, may be broken Also restore "wrong mode" bug workaround --- scripts/depthmap.py | 157 +++++++++++++++++++++++++------------------- 1 file changed, 90 insertions(+), 67 deletions(-) diff --git a/scripts/depthmap.py b/scripts/depthmap.py index 27f8a59..d798bf2 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -118,6 +118,15 @@ def get_commit_hash(): return commit_hash +def convert_i16_to_rgb(image, like): + # three channel, 8 bits per channel image + output = np.zeros_like(like) + output[:, :, 0] = image / 256.0 + output[:, :, 1] = image / 256.0 + output[:, :, 2] = image / 256.0 + return output + + def main_ui_panel(is_depth_tab): inp = GradioComponentBundle() # TODO: Greater visual separation @@ -344,7 +353,7 @@ def run(self, p, *inputs): continue inputimages.append(processed.images[count]) - generated_images, mesh_fi, meshsimple_fi = run_depthmap(processed, p.outpath_samples, inputimages, None, inputs) + generated_images, mesh_fi, meshsimple_fi = run_depthmap(p.outpath_samples, inputimages, None, None, inputs) for input_i, imgs in enumerate(generated_images): # get generation parameters @@ -355,10 +364,15 @@ def run(self, p, *inputs): for image_type, image in list(imgs.items()): processed.images.append(image) if inputs["save_outputs"]: - images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], + try: + images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], prompt=processed.all_prompts[input_i], extension=opts.samples_format, info=info, p=processed, suffix=f"_{image_type}") + except Exception as e: + if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): raise e + print('Catched exception: image has wrong mode!') + traceback.print_exc() return processed @@ -374,9 +388,12 @@ def reload_sd_model(): shared.sd_model.first_stage_model.to(devices.device) -def run_depthmap(processed, outpath, inputimages, inputnames, inp): +def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): if len(inputimages) == 0 or inputimages[0] is None: - return [], [] + return [], '', '' + if len(inputdepthmaps) == 0: + inputdepthmaps = [None for _ in range(len(inputimages))] + inputdepthmaps_complete = all([x is not None for x in inputdepthmaps]) background_removal = inp["background_removal"] background_removal_model = inp["background_removal_model"] @@ -409,10 +426,6 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): stereo_modes = inp["stereo_modes"] stereo_separation = inp["stereo_separation"] - custom_depthmap = inp["custom_depthmap"] if "custom_depthmap" in inp else False - custom_depthmap_img = inp["custom_depthmap_img"] if "custom_depthmap_img" in inp else None - depthmap_batch_reuse = inp["depthmap_batch_reuse"] if "depthmap_batch_reuse" in inp else True - # TODO: run_depthmap should not 
generate or save meshes, since these do not use generated depthmaps. # Rationale: allowing webui-independent (stand-alone) wrappers. print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") @@ -429,9 +442,6 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): else: background_removed_images = batched_background_removal(inputimages, background_removal_model) - meshsimple_fi = None - mesh_fi = None - resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) @@ -455,7 +465,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): os.makedirs('./models/pix2pix', exist_ok=True) global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type, depthmap_device_idx - loadmodels = True + loadmodels = True # TODO: loadmodels is not intuitive if hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels: loadmodels = False if depthmap_model_type != model_type or depthmap_model_depth == None or depthmap_device_idx != depthmap_compute_device: @@ -464,7 +474,7 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): loadmodels = True try: - if loadmodels and not (custom_depthmap and custom_depthmap_img != None): + if loadmodels and not inputdepthmaps_complete: # TODO: loading model should be separated into a function that would return the model # and the parameters (or maybe even functions) needed. # The rest of the run_depthmap should not depend on what specific model @@ -645,10 +655,12 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): # Every array element corresponds to particular input image. # Dictionary keys are types of images that were derived from the input image. generated_images = [{} for _ in range(numimages)] + # TODO: ??? + meshsimple_fi = None inpaint_imgs = [] - # TODO: ??? inpaint_depths = [] + # iterate over input (generated) images for count in trange(0, numimages): @@ -657,25 +669,8 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): # filename basename = 'depthmap' - # TODO: this should not use heuristics to figure out the mode, mode should ideally be abstracted away. - # By the way, this is probably broken - # figuring out the name of custom DepthMap - custom_depthmap_fn = None # None means that DepthMap should be computed - # find filename if in the single image mode - if custom_depthmap and custom_depthmap_img is not None: - custom_depthmap_fn = custom_depthmap_img.name - # find filename if in batch mode - if inputnames is not None and depthmap_batch_reuse: - if inputnames[count] is not None: - p = Path(inputnames[count]) - basename = p.stem - if outpath != opts.outdir_extras_samples: - custom_depthmap_fn = os.path.join(outpath, basename + '-0000.' 
+ opts.samples_format) - if not os.path.isfile(custom_depthmap_fn): - custom_depthmap_fn = None - # override net size - if (match_size): + if match_size: net_width, net_height = inputimages[count].width, inputimages[count].height # Convert single channel input (PIL) images to rgb @@ -687,10 +682,9 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): img = cv2.cvtColor(np.asarray(inputimages[count]), cv2.COLOR_BGR2RGB) / 255.0 skipInvertAndSave = False - # TODO: custom depthmaps should be supplied in the same way as "custom depthmap" in single image mode - if custom_depthmap_fn is not None: + if inputdepthmaps is not None and inputdepthmaps[count] is not None: # use custom depthmap - dimg = Image.open(os.path.abspath(custom_depthmap_fn)) + dimg = inputdepthmaps[count] # resize if not same size as input if dimg.width != inputimages[count].width or dimg.height != inputimages[count].height: dimg = dimg.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS) @@ -736,12 +730,6 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): img_output = clipdepthmap(img_output, clipthreshold_far, clipthreshold_near) # img_output = cv2.blur(img_output, (3, 3)) - # three channel, 8 bits per channel image - img_output2 = np.zeros_like(inputimages[count]) - img_output2[:, :, 0] = img_output / 256.0 - img_output2[:, :, 1] = img_output / 256.0 - img_output2[:, :, 2] = img_output / 256.0 - # if 3dinpainting, store maps for processing in second pass if inpaint: inpaint_imgs.append(inputimages[count]) @@ -773,7 +761,8 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): if not skipInvertAndSave: # TODO: skipInvertAndSave is not intuitive if output_depth: if combine_output: - img_concat = Image.fromarray(np.concatenate((rgb_image, img_output2), axis=combine_output_axis)) + img_concat = Image.fromarray(np.concatenate( + (rgb_image, convert_i16_to_rgb(img_output, rgb_image)), axis=combine_output_axis)) generated_images[count]['concat_depth'] = img_concat else: generated_images[count]['depth'] = Image.fromarray(img_output) @@ -819,9 +808,9 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): depthi = prediction # try to map output to sensible values for non zoedepth models, boost, or custom maps - if model_type < 7 or boost or (custom_depthmap and custom_depthmap_img != None): + if model_type < 7 or boost or inputdepthmaps_complete: # invert if midas - if model_type > 0 or ((custom_depthmap and custom_depthmap_img != None) and not invert_depth): + if model_type > 0 or (inputdepthmaps_complete and not invert_depth): depthi = depth_max - depthi + depth_min depth_max = depthi.max() depth_min = depthi.min() @@ -860,13 +849,15 @@ def run_depthmap(processed, outpath, inputimages, inputnames, inp): gc.collect() devices.torch_gc() reload_sd_model() - try: - if inpaint: + + mesh_fi = None + if inpaint: + try: unload_sd_model() mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, inpaint_vids, 1, "mp4") - finally: - reload_sd_model() - print("All done.") + finally: + reload_sd_model() # Do not reload twice + print("All done.") return generated_images, mesh_fi, meshsimple_fi @@ -1196,15 +1187,27 @@ def run_generate(*inputs): depthmap_input_image = inputs['depthmap_input_image'] depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] depthmap_batch_reuse = inputs['depthmap_batch_reuse'] + custom_depthmap = inputs['custom_depthmap'] + custom_depthmap_img = inputs['custom_depthmap_img'] 
inputimages = [] + # Allow supplying custom depthmaps + inputdepthmaps = [] # Also keep track of original file names inputnames = [] - show_images = [] + + if depthmap_mode == '2' and depthmap_batch_output_dir != '': + outpath = depthmap_batch_output_dir + else: + outpath = opts.outdir_samples or opts.outdir_extras_samples if depthmap_mode == '0': # Single image inputimages.append(depthmap_input_image) inputnames.append(None) + if custom_depthmap: + inputdepthmaps.append(custom_depthmap_img) + else: + inputdepthmaps.append(None) if depthmap_mode == '1': # Batch Process # convert files to pillow images for img in image_batch: @@ -1214,39 +1217,58 @@ def run_generate(*inputs): elif depthmap_mode == '2': # Batch from Directory assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' if depthmap_batch_input_dir == '': - return show_images, "Please select an input directory.", '' + return [], "Please select an input directory.", "" + if depthmap_batch_input_dir == depthmap_batch_output_dir: + return [], "Please pick different directories for batch processing.", "" image_list = shared.listfiles(depthmap_batch_input_dir) - for img in image_list: + for path in image_list: try: - image = Image.open(img) - inputimages.append(image) - inputnames.append(img) - except Exception: - print(f'Failed to load {img}, ignoring.') + inputimages.append(Image.open(path)) + inputnames.append(path) - if depthmap_mode == '2' and depthmap_batch_output_dir != '': - outpath = depthmap_batch_output_dir - else: - outpath = opts.outdir_samples or opts.outdir_extras_samples + custom_depthmap = None + if depthmap_batch_reuse: + basename = Path(path).stem + # Custom names are not used in samples directory + if outpath != opts.outdir_extras_samples: + # Possible filenames that the custom depthmaps may have + name_candidates = [f'{basename}-0000_depth.{opts.samples_format}', # current format + f'{basename}-0000.{opts.samples_format}', # old format + f'{basename}.png', # human-intuitive format + f'{Path(path).name}'] # human-intuitive format (worse) + for fn_cand in name_candidates: + path_cand = os.path.join(outpath, fn_cand) + if os.path.isfile(path_cand): + custom_depthmap = Image.open(os.path.abspath(path_cand)) + break + inputdepthmaps.append(custom_depthmap) + except Exception: + print(f'Failed to load {path}, ignoring.') + inputdepthmaps_n = len([1 for x in inputdepthmaps if x is not None]) + print(f'{len(inputimages)} images will be processed, {inputdepthmaps_n} existing depthmaps will be reused') - save_images, mesh_fi, meshsimple_fi = run_depthmap(None, outpath, inputimages, inputnames, inputs) + save_images, mesh_fi, meshsimple_fi = run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inputs) show_images = [] # Saving images for input_i, imgs in enumerate(save_images): basename = 'depthmap' - if depthmap_batch_reuse and depthmap_mode == '2': - if inputnames[input_i] is not None: - basename = Path(inputnames[input_i]).stem + if depthmap_mode == '2' and inputnames[input_i] is not None and outpath != opts.outdir_extras_samples: + basename = Path(inputnames[input_i]).stem info = None for image_type, image in list(imgs.items()): show_images += [image] if inputs["save_outputs"]: - images.save_image(image, path=outpath, basename=basename, seed=None, + try: + images.save_image(image, path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, 
forced_filename=None, suffix=f"_{image_type}") + except Exception as e: + if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): raise e + print('Catched exception: image has wrong mode!') + traceback.print_exc() # use inpainted 3d mesh to show in 3d model output when enabled in settings if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint and mesh_fi != None and len(mesh_fi) > 0: @@ -1320,8 +1342,9 @@ def on_ui_tabs(): inp += gr.Textbox(elem_id="depthmap_batch_output_dir", label="Output directory", **shared.hide_dirs, placeholder="Leave blank to save images to the default path.") + gr.HTML("Files in the output directory may be overwritten") inp += gr.Checkbox(elem_id="depthmap_batch_reuse", - label="Skip generation and use (edited/custom) depthmaps in output directory when a file exists.", + label="Skip generation and use (edited/custom) depthmaps in output directory when a file already exists.", value=True) submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary') inp += main_ui_panel(True) # Main panel is inserted here From ff41b5ced3310cadba557e2dbcb7a6a38d5c5903 Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Sat, 8 Jul 2023 14:23:11 +0300 Subject: [PATCH 08/16] Rework depth prediction handling, invert and clipdepthmap Large refactor part, may be broken --- README.md | 1 + scripts/depthmap.py | 173 ++++++++++++++++++-------------------------- 2 files changed, 73 insertions(+), 101 deletions(-) diff --git a/README.md b/README.md index 5e90dd4..01c063d 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ images generated by [@semjon00](https://github.com/semjon00) from CC0 photos, mo * v0.3.13 * Large code refactor * Improved interface + * Slightly changed the behaviour of various options * v0.3.12 * Fixed stereo image generation * Other bugfixes diff --git a/scripts/depthmap.py b/scripts/depthmap.py index d798bf2..f87dfda 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -6,7 +6,6 @@ import gradio as gr from PIL import Image -from numba import njit, prange from torchvision.transforms import Compose, transforms import modules.scripts as scripts @@ -276,7 +275,7 @@ def main_ui_panel(is_depth_tab): outputs=[inp['clipthreshold_far']] ) - # Invert_depthmap must not be used with gen_stereo - otherwise stereo images look super-wrong + # invert_depth must not be used with gen_stereo - otherwise stereo images look super-wrong inp['gen_stereo'].change( fn=lambda a, b: False if b else a, inputs=[inp['invert_depth'], inp['gen_stereo']], @@ -365,10 +364,11 @@ def run(self, p, *inputs): processed.images.append(image) if inputs["save_outputs"]: try: + suffix = "" if image_type == "depth" else f"_{image_type}" images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], prompt=processed.all_prompts[input_i], extension=opts.samples_format, info=info, p=processed, - suffix=f"_{image_type}") + suffix=suffix) except Exception as e: if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): raise e print('Catched exception: image has wrong mode!') @@ -426,8 +426,7 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): stereo_modes = inp["stereo_modes"] stereo_separation = inp["stereo_separation"] - # TODO: run_depthmap should not generate or save meshes, since these do not use generated depthmaps. - # Rationale: allowing webui-independent (stand-alone) wrappers. 
+ # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") unload_sd_model() @@ -631,7 +630,7 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): model.eval() # optimize - if device == torch.device("cuda") and model_type < 7: + if device == torch.device("cuda") and model_type in [0, 1, 2, 3, 4, 5, 6]: model = model.to(memory_format=torch.channels_last) if not cmd_opts.no_half and model_type != 0 and not boost: model = model.half() @@ -650,11 +649,11 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): model = model.to(device) print("Computing depthmap(s) ..") - numimages = len(inputimages) - # Images that will be returned. - # Every array element corresponds to particular input image. - # Dictionary keys are types of images that were derived from the input image. - generated_images = [{} for _ in range(numimages)] + + generated_images = [{} for _ in range(len(inputimages))] + """Images that will be returned. + Every array element corresponds to particular input image. + Dictionary keys are types of images that were derived from the input image.""" # TODO: ??? meshsimple_fi = None @@ -662,14 +661,8 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): inpaint_depths = [] # iterate over input (generated) images - for count in trange(0, numimages): - - print('\n') - - # filename - basename = 'depthmap' - - # override net size + for count in trange(0, len(inputimages)): + # override net size (size may be different for different images) if match_size: net_width, net_height = inputimages[count].width, inputimages[count].height @@ -681,62 +674,63 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): # input image img = cv2.cvtColor(np.asarray(inputimages[count]), cv2.COLOR_BGR2RGB) / 255.0 - skipInvertAndSave = False + raw_prediction = None + """Raw prediction, as returned by a model. 
None if input depthmap is used.""" + raw_prediction_invert = False + """True if near=dark on raw_prediction""" + out = None if inputdepthmaps is not None and inputdepthmaps[count] is not None: # use custom depthmap dimg = inputdepthmaps[count] # resize if not same size as input if dimg.width != inputimages[count].width or dimg.height != inputimages[count].height: dimg = dimg.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS) + if dimg.mode == 'I' or dimg.mode == 'P' or dimg.mode == 'L': - prediction = np.asarray(dimg, dtype="float") + out = np.asarray(dimg, dtype="float") else: - prediction = np.asarray(dimg, dtype="float")[:, :, 0] - skipInvertAndSave = True # skip invert for leres model (0) + out = np.asarray(dimg, dtype="float")[:, :, 0] else: # compute depthmap if not boost: if model_type == 0: - prediction = estimateleres(img, model, net_width, net_height) - elif model_type >= 7: - prediction = estimatezoedepth(inputimages[count], model, net_width, net_height) + raw_prediction = estimateleres(img, model, net_width, net_height) + raw_prediction_invert = True + elif model_type in [7, 8, 9]: + raw_prediction = estimatezoedepth(inputimages[count], model, net_width, net_height) + raw_prediction_invert = True else: - prediction = estimatemidas(img, model, net_width, net_height, resize_mode, normalization) + raw_prediction = estimatemidas(img, model, net_width, net_height, resize_mode, normalization) else: - prediction = estimateboost(img, model, model_type, pix2pixmodel) + raw_prediction = estimateboost(img, model, model_type, pix2pixmodel) + + # output + if abs(raw_prediction.max() - raw_prediction.min()) > np.finfo("float").eps: + out = np.copy(raw_prediction) + # TODO: some models may output negative values, maybe these should be clamped to zero. + if raw_prediction_invert: + out *= -1 + if clipdepth: + out = (out - out.min()) / (out.max() - out.min()) # normalize to [0; 1] + out = np.clip(out, clipthreshold_far, clipthreshold_near) + else: + # Regretfully, the depthmap is broken and will be replaced with a black image + out = np.zeros(raw_prediction.shape) + out = (out - out.min()) / (out.max() - out.min()) # normalize to [0; 1] - # output - depth = prediction + # Single channel, 16 bit image. This loses some precision! 
+ # uint16 conversion uses round-down, therefore values should be [0; 2**16) numbytes = 2 - depth_min = depth.min() - depth_max = depth.max() - max_val = (2 ** (8 * numbytes)) - 1 - - # check output before normalizing and mapping to 16 bit - if depth_max - depth_min > np.finfo("float").eps: - out = max_val * (depth - depth_min) / (depth_max - depth_min) - else: - out = np.zeros(depth.shape) - - # single channel, 16 bit image + max_val = (2 ** (8 * numbytes)) + out = np.clip(out * max_val, 0, max_val - 0.1) # Clipping form above is needed to avoid overflowing img_output = out.astype("uint16") - - # invert depth map - if invert_depth ^ (((model_type == 0) or (model_type >= 7)) and not skipInvertAndSave): - img_output = cv2.bitwise_not(img_output) - - # apply depth clip and renormalize if enabled - if clipdepth: - img_output = clipdepthmap(img_output, clipthreshold_far, clipthreshold_near) - # img_output = cv2.blur(img_output, (3, 3)) + """Depthmap (near=bright), as uint16""" # if 3dinpainting, store maps for processing in second pass if inpaint: inpaint_imgs.append(inputimages[count]) inpaint_depths.append(img_output) - rgb_image = inputimages[count] - # applying background masks after depth if background_removal: print('applying background masks') @@ -745,9 +739,7 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): background_removed_array = np.array(background_removed_image) bg_mask = (background_removed_array[:, :, 0] == 0) & (background_removed_array[:, :, 1] == 0) & ( background_removed_array[:, :, 2] == 0) & (background_removed_array[:, :, 3] <= 0.2) - far_value = 255 if invert_depth else 0 - - img_output[bg_mask] = far_value * far_value # 255*255 or 0*0 + img_output[bg_mask] = 0 # far value generated_images[count]['background_removed'] = background_removed_image @@ -758,14 +750,17 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): generated_images[count]['foreground_mask'] = mask_image - if not skipInvertAndSave: # TODO: skipInvertAndSave is not intuitive + # A weird quirk: if user tries to save depthmap, whereas input depthmap is used, + # depthmap will be outputed, even if combine_output is used. 
+ if output_depth and inputdepthmaps[count] is None: if output_depth: + img_depth = cv2.bitwise_not(img_output) if invert_depth else img_output if combine_output: img_concat = Image.fromarray(np.concatenate( - (rgb_image, convert_i16_to_rgb(img_output, rgb_image)), axis=combine_output_axis)) + (inputimages[count], convert_i16_to_rgb(img_depth, inputimages[count])), axis=combine_output_axis)) generated_images[count]['concat_depth'] = img_concat else: - generated_images[count]['depth'] = Image.fromarray(img_output) + generated_images[count]['depth'] = Image.fromarray(img_depth) if show_heat: heatmap = colorize(img_output, cmap='inferno') @@ -802,15 +797,15 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): # gen mesh if gen_mesh: print(f"\nGenerating (occluded) mesh ..") - + basename = 'depthmap' meshsimple_fi = get_uniquefn(outpath, basename, 'obj') meshsimple_fi = os.path.join(outpath, meshsimple_fi + '_simple.obj') - depthi = prediction + depthi = raw_prediction if raw_prediction is not None else raw_prediction # try to map output to sensible values for non zoedepth models, boost, or custom maps - if model_type < 7 or boost or inputdepthmaps_complete: + if model_type not in [7, 8, 9] or boost or inputdepthmaps[count] is not None: # invert if midas - if model_type > 0 or (inputdepthmaps_complete and not invert_depth): + if model_type > 0 or inputdepthmaps[count] is not None: # TODO: Weird depthi = depth_max - depthi + depth_min depth_max = depthi.max() depth_min = depthi.min() @@ -826,7 +821,7 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): depthi = depthi + 1 mesh = create_mesh(inputimages[count], depthi, keep_edges=not mesh_occlude, spherical=mesh_spherical) - save_mesh_obj(meshsimple_fi, mesh) + mesh.export(meshsimple_fi) print("Done.") @@ -848,42 +843,19 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): gc.collect() devices.torch_gc() - reload_sd_model() mesh_fi = None if inpaint: try: - unload_sd_model() mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, inpaint_vids, 1, "mp4") - finally: - reload_sd_model() # Do not reload twice - print("All done.") + except Exception as e: + print(f'{str(e)}, some issue with generating inpainted mesh') + reload_sd_model() + print("All done.") return generated_images, mesh_fi, meshsimple_fi -@njit(parallel=True) -def clipdepthmap(img, clipthreshold_far, clipthreshold_near): - clipped_img = img - w, h = img.shape - min = img.min() - max = img.max() - drange = max - min - clipthreshold_far = min + (clipthreshold_far * drange) - clipthreshold_near = min + (clipthreshold_near * drange) - - for x in prange(w): - for y in range(h): - if clipped_img[x, y] < clipthreshold_far: - clipped_img[x, y] = 0 - elif clipped_img[x, y] > clipthreshold_near: - clipped_img[x, y] = 65535 - else: - clipped_img[x, y] = ((clipped_img[x, y] + min) / drange * 65535) - - return clipped_img - - def get_uniquefn(outpath, basename, ext): # Inefficient and may fail, maybe use unbounded binary search? 
basecount = get_next_sequence_number(outpath, basename) @@ -963,10 +935,7 @@ def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, v if device == torch.device("cpu"): config["gpu_ids"] = -1 - # process all inputs - numimages = len(img_rgb) - for count in trange(0, numimages): - + for count in trange(0, len(img_rgb)): basename = 'depthmap' if inputnames is not None: if inputnames[count] is not None: @@ -1205,6 +1174,8 @@ def run_generate(*inputs): inputimages.append(depthmap_input_image) inputnames.append(None) if custom_depthmap: + if custom_depthmap_img is None: + return [], None, None, "Custom depthmap is not specified. Please either supply it or disable this option.", "" inputdepthmaps.append(custom_depthmap_img) else: inputdepthmaps.append(None) @@ -1217,9 +1188,9 @@ def run_generate(*inputs): elif depthmap_mode == '2': # Batch from Directory assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' if depthmap_batch_input_dir == '': - return [], "Please select an input directory.", "" + return [], None, None, "Please select an input directory.", "" if depthmap_batch_input_dir == depthmap_batch_output_dir: - return [], "Please pick different directories for batch processing.", "" + return [], None, None, "Please pick different directories for batch processing.", "" image_list = shared.listfiles(depthmap_batch_input_dir) for path in image_list: try: @@ -1232,8 +1203,7 @@ def run_generate(*inputs): # Custom names are not used in samples directory if outpath != opts.outdir_extras_samples: # Possible filenames that the custom depthmaps may have - name_candidates = [f'{basename}-0000_depth.{opts.samples_format}', # current format - f'{basename}-0000.{opts.samples_format}', # old format + name_candidates = [f'{basename}-0000.{opts.samples_format}', # current format f'{basename}.png', # human-intuitive format f'{Path(path).name}'] # human-intuitive format (worse) for fn_cand in name_candidates: @@ -1261,10 +1231,11 @@ def run_generate(*inputs): show_images += [image] if inputs["save_outputs"]: try: + suffix = "" if image_type == "depth" else f"_{image_type}" images.save_image(image, path=outpath, basename=basename, seed=None, prompt=None, extension=opts.samples_format, info=info, short_filename=True, no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, - forced_filename=None, suffix=f"_{image_type}") + forced_filename=None, suffix=suffix) except Exception as e: if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): raise e print('Catched exception: image has wrong mode!') @@ -1276,7 +1247,7 @@ def run_generate(*inputs): # however, don't show 3dmodel when disabled in settings if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: meshsimple_fi = None - + # TODO: return more info return show_images, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' @@ -1342,7 +1313,7 @@ def on_ui_tabs(): inp += gr.Textbox(elem_id="depthmap_batch_output_dir", label="Output directory", **shared.hide_dirs, placeholder="Leave blank to save images to the default path.") - gr.HTML("Files in the output directory may be overwritten") + gr.HTML("Files in the output directory may be overwritten.") inp += gr.Checkbox(elem_id="depthmap_batch_reuse", label="Skip generation and use (edited/custom) depthmaps in output directory when a file already exists.", value=True) From f65d8a7b8cf392869309bd7210017648f7dfb1d7 Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Sun, 9 Jul 2023 21:52:23 +0300 
Subject: [PATCH 09/16] Rework model loading, split UI into a separate file Large refactor part, may be broken --- scripts/depthmap.py | 1079 ++++++++++-------------------------- scripts/interface_webui.py | 553 ++++++++++++++++++ scripts/main.py | 39 ++ 3 files changed, 874 insertions(+), 797 deletions(-) create mode 100644 scripts/interface_webui.py create mode 100644 scripts/main.py diff --git a/scripts/depthmap.py b/scripts/depthmap.py index f87dfda..4dd02fc 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -4,18 +4,12 @@ from operator import getitem from pathlib import Path -import gradio as gr from PIL import Image from torchvision.transforms import Compose, transforms -import modules.scripts as scripts -from modules import processing, images, shared, devices -from modules import script_callbacks -from modules.call_queue import wrap_gradio_gpu_call +from modules import shared, devices from modules.images import get_next_sequence_number -from modules.processing import create_infotext from modules.shared import opts, cmd_opts -from modules.ui import plaintext_to_html try: from tqdm import trange @@ -31,7 +25,6 @@ import copy import platform import math -import subprocess import traceback import pathlib import os @@ -56,8 +49,8 @@ def ensure_gradio_temp_directory(): ensure_gradio_temp_directory() # Our code +from scripts.main import * from scripts.stereoimage_generation import create_stereoimages -from scripts.gradio_args_transport import GradioComponentBundle # midas imports from dmidas.dpt_depth import DPTDepthModel @@ -88,292 +81,261 @@ def ensure_gradio_temp_directory(): # TODO: next two should not be here whole_size_threshold = 1600 # R_max from the paper pix2pixsize = 1024 -scriptname = "DepthMap" -scriptversion = "v0.3.13" global video_mesh_data, video_mesh_fn video_mesh_data = None video_mesh_fn = None -global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type, depthmap_deviceidx -depthmap_model_depth = None -depthmap_model_pix2pix = None -depthmap_model_type = None -depthmap_deviceidx = None - -commit_hash = None # TODO: understand why it would spam to stderr if changed to ... = get_commit_hash() -def get_commit_hash(): - global commit_hash - if commit_hash is None: - try: - commit_hash = subprocess.check_output( - [os.environ.get('GIT', "git"), "rev-parse", "HEAD"], - cwd=pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/'), - shell=False, - stderr=subprocess.DEVNULL, - encoding='utf8').strip()[0:8] - except Exception: - commit_hash = "" - return commit_hash - - -def convert_i16_to_rgb(image, like): - # three channel, 8 bits per channel image - output = np.zeros_like(like) - output[:, :, 0] = image / 256.0 - output[:, :, 1] = image / 256.0 - output[:, :, 2] = image / 256.0 - return output - - -def main_ui_panel(is_depth_tab): - inp = GradioComponentBundle() - # TODO: Greater visual separation - with gr.Blocks(): - with gr.Row(): - inp += 'compute_device', gr.Radio(label="Compute on", choices=['GPU', 'CPU'], value='GPU') - # TODO: Should return value instead of index. Maybe Enum should be used? 
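[Editor's note, not part of the patch: one possible answer to the TODO above, assuming a hypothetical ModelType enum so the dropdown could carry a stable value instead of a bare index. Purely illustrative.]

    from enum import Enum

    class ModelType(Enum):
        # Hypothetical identifiers; the UI in this patch still passes an integer index around.
        RES101 = 0
        DPT_BEIT_LARGE_512 = 1
        MIDAS_V21_SMALL = 6
        ZOEDEPTH_NK = 9

    # e.g. `if model_type is ModelType.RES101:` instead of `if model_type == 0:`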
- inp += 'model_type', gr.Dropdown(label="Model", - choices=['res101', 'dpt_beit_large_512 (midas 3.1)', - 'dpt_beit_large_384 (midas 3.1)', 'dpt_large_384 (midas 3.0)', - 'dpt_hybrid_384 (midas 3.0)', - 'midas_v21', 'midas_v21_small', - 'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk'], - value='res101', - type="index") - with gr.Group(): - with gr.Row(): - inp += 'boost', gr.Checkbox(label="BOOST (multi-resolution merging)", value=True) - with gr.Group(visible=False) as options_depend_on_boost: - inp += 'match_size', gr.Checkbox(label="Match net size to input size", value=False) - with gr.Row(visible=False) as options_depend_on_match_size: - inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=512) - inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=512) - - with gr.Group(): - with gr.Row(): - inp += "save_outputs", gr.Checkbox(label="Save Outputs", value=True) # 50% of width - with gr.Group(): # 50% of width - inp += "output_depth", gr.Checkbox(label="Output DepthMap", value=True) - inp += "invert_depth", gr.Checkbox(label="Invert (black=near, white=far)", value=False) - with gr.Row() as options_depend_on_output_depth_1: - inp += "combine_output", gr.Checkbox( - label="Combine input and depthmap into one image", value=False) - inp += "combine_output_axis", gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], - value='Horizontal', type="index", visible=False) - with gr.Group(): - with gr.Row(): - inp += 'clipdepth', gr.Checkbox(label="Clip and renormalize DepthMap", value=False) - with gr.Row(visible=False) as clip_options_row_1: - inp += "clipthreshold_far", gr.Slider(minimum=0, maximum=1, step=0.001, label='Far clip', value=0) - inp += "clipthreshold_near", gr.Slider(minimum=0, maximum=1, step=0.001, label='Near clip', value=1) - - with gr.Group(): - with gr.Row(): - inp += "show_heat", gr.Checkbox(label="Generate HeatMap", value=False) - # gr.Checkbox(label="Generate NormalMap", value=False) # TODO: this is a fake door - - with gr.Group(): - with gr.Row(): - inp += "gen_stereo", gr.Checkbox(label="Generate stereoscopic image(s)", value=False) - with gr.Group(visible=False) as stereo_options: - with gr.Row(): - with gr.Row(): - inp += "stereo_modes", gr.CheckboxGroup( - ["left-right", "right-left", "top-bottom", "bottom-top", "red-cyan-anaglyph"], - label="Output", value=["left-right", "red-cyan-anaglyph"]) - with gr.Row(): - inp += "stereo_divergence", gr.Slider(minimum=0.05, maximum=10.005, step=0.01, - label='Divergence (3D effect)', - value=2.5) - inp += "stereo_separation", gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, - label='Separation (moves images apart)', - value=0.0) - with gr.Row(): - inp += "stereo_fill", gr.Dropdown(label="Gap fill technique", - choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', - 'polylines_sharp'], value='polylines_sharp', type="value") - inp += "stereo_balance", gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, label='Balance between eyes', - value=0.0) - - with gr.Group(): - with gr.Row(): - inp += "gen_mesh", gr.Checkbox( - label="Generate simple 3D mesh. 
" - "(Fast, accurate only with ZoeDepth models and no boost, no custom maps)", - value=False, visible=True) - with gr.Row(visible=False) as mesh_options_row_0: - inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) - inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) - - if is_depth_tab: - with gr.Group(): - with gr.Row(): - inp += "inpaint", gr.Checkbox( - label="Generate 3D inpainted mesh. (Sloooow, required for generating videos)", value=False) - with gr.Group(visible=False) as inpaint_options_row_0: - inp += "inpaint_vids", gr.Checkbox( - label="Generate 4 demo videos with 3D inpainted mesh.", value=False) - gr.HTML("More options for generating video can be found in the Generate video tab") - - with gr.Group(): - # TODO: it should be clear from the UI that the background removal does not use the model selected above - with gr.Row(): - inp += "background_removal", gr.Checkbox(label="Remove background", value=False) - with gr.Row(visible=False) as bgrem_options_row_1: - inp += "save_background_removal_masks", gr.Checkbox(label="Save the foreground masks", value=False) - inp += "pre_depth_background_removal", gr.Checkbox(label="Pre-depth background removal", value=False) - with gr.Row(visible=False) as bgrem_options_row_2: - inp += "background_removal_model", gr.Dropdown(label="Rembg Model", - choices=['u2net', 'u2netp', 'u2net_human_seg', - 'silueta'], - value='u2net', type="value") - - with gr.Box(): - gr.HTML("Information, comment and share @ " - "https://github.com/thygate/stable-diffusion-webui-depthmap-script") - - inp += "gen_normal", gr.Checkbox(label="Generate Normalmap (hidden! api only)", value=False, visible=False) - - inp['boost'].change( - fn=lambda a, b: (options_depend_on_boost.update(visible=not a), options_depend_on_match_size.update(visible=not a and not b)), - inputs=[inp['boost'], inp['match_size']], - outputs=[options_depend_on_boost, options_depend_on_match_size] - ) - inp['match_size'].change( - fn=lambda a, b: options_depend_on_match_size.update(visible=not a and not b), - inputs=[inp['boost'], inp['match_size']], - outputs=[options_depend_on_match_size] - ) - - inp['output_depth'].change( - fn=lambda a: (inp['invert_depth'].update(visible=a), options_depend_on_output_depth_1.update(visible=a)), - inputs=[inp['output_depth']], - outputs=[inp['invert_depth'], options_depend_on_output_depth_1] - ) - - inp['combine_output'].change( - fn=lambda v: inp['combine_output_axis'].update(visible=v), - inputs=[inp['combine_output']], - outputs=[inp['combine_output_axis']] - ) - - inp['clipdepth'].change( - fn=lambda v: clip_options_row_1.update(visible=v), - inputs=[inp['clipdepth']], - outputs=[clip_options_row_1] - ) - inp['clipthreshold_far'].change( - fn=lambda a, b: a if b < a else b, - inputs=[inp['clipthreshold_far'], inp['clipthreshold_near']], - outputs=[inp['clipthreshold_near']] - ) - inp['clipthreshold_near'].change( - fn=lambda a, b: a if b > a else b, - inputs=[inp['clipthreshold_near'], inp['clipthreshold_far']], - outputs=[inp['clipthreshold_far']] - ) - - # invert_depth must not be used with gen_stereo - otherwise stereo images look super-wrong - inp['gen_stereo'].change( - fn=lambda a, b: False if b else a, - inputs=[inp['invert_depth'], inp['gen_stereo']], - outputs=[inp['invert_depth']] - ) - inp['gen_stereo'].change( - fn=lambda a, b: inp['invert_depth'].update(interactive=not b), - inputs=[inp['invert_depth'], inp['gen_stereo']], - outputs=[inp['invert_depth']] - ) - 
- def stereo_options_visibility(v): - return stereo_options.update(visible=v) - inp['gen_stereo'].change( - fn=stereo_options_visibility, - inputs=[inp['gen_stereo']], - outputs=[stereo_options] - ) - - inp['gen_mesh'].change( - fn=lambda v: mesh_options_row_0.update(visible=v), - inputs=[inp['gen_mesh']], - outputs=[mesh_options_row_0] - ) +class ModelHolder(): + def __init__(self): + self.depth_model = None + self.pix2pix_model = None + self.depth_model_type = None + self.device = None + + # Extra stuff + self.resize_mode = None + self.normalization = None + + def ensure_models(self, model_type, device: torch.device, boost: bool): + # TODO: could make it more granular + if model_type == -1 or model_type is None: + self.unload_models() + return + # Certain optimisations are irreversible and not device-agnostic, thus changing device requires reloading + if model_type != self.depth_model_type or boost != self.pix2pix_model is not None or device != self.device: + self.unload_models() + self.load_models(model_type, device, boost) + + def load_models(self, model_type, device: torch.device, boost: bool): + """Ensure that the depth model is loaded""" + # TODO: supply correct values for zoedepth + net_width = 512 + net_height = 512 + + # model path and name + model_dir = "./models/midas" + if model_type == 0: + model_dir = "./models/leres" + # create paths to model if not present + os.makedirs(model_dir, exist_ok=True) + os.makedirs('./models/pix2pix', exist_ok=True) + + print("Loading model weights from ", end=" ") + + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # TODO: net_w, net_h + model = None + if model_type == 0: # "res101" + model_path = f"{model_dir}/res101.pth" + print(model_path) + ensure_file_downloaded( + model_path, + ["https://cloudstor.aarnet.edu.au/plus/s/lTIJF4vrvHCAI31/download", + "https://huggingface.co/lllyasviel/Annotators/resolve/5bc80eec2b4fddbb/res101.pth", + ], + "1d696b2ef3e8336b057d0c15bc82d2fecef821bfebe5ef9d7671a5ec5dde520b") + if device == torch.device('gpu'): + checkpoint = torch.load(model_path) + else: + checkpoint = torch.load(model_path, map_location=torch.device('cpu')) + model = RelDepthModel(backbone='resnext101') + model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True) + del checkpoint + devices.torch_gc() - def inpaint_options_visibility(v): - return inpaint_options_row_0.update(visible=v) - if is_depth_tab: - inp['inpaint'].change( - fn=inpaint_options_visibility, - inputs=[inp['inpaint']], - outputs=[inpaint_options_row_0] + if model_type == 1: # "dpt_beit_large_512" midas 3.1 + model_path = f"{model_dir}/dpt_beit_large_512.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt") + model = DPTDepthModel( + path=model_path, + backbone="beitl16_512", + non_negative=True, + ) + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + if model_type == 2: # "dpt_beit_large_384" midas 3.1 + model_path = f"{model_dir}/dpt_beit_large_384.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt") + model = DPTDepthModel( + path=model_path, + backbone="beitl16_384", + non_negative=True, + ) + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + if model_type == 3: # "dpt_large_384" midas 3.0 + 
model_path = f"{model_dir}/dpt_large-midas-2f21e586.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt") + model = DPTDepthModel( + path=model_path, + backbone="vitl16_384", + non_negative=True, + ) + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == 4: # "dpt_hybrid_384" midas 3.0 + model_path = f"{model_dir}/dpt_hybrid-midas-501f0c75.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid-midas-501f0c75.pt") + model = DPTDepthModel( + path=model_path, + backbone="vitb_rn50_384", + non_negative=True, + ) + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == 5: # "midas_v21" + model_path = f"{model_dir}/midas_v21-f6b98070.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21-f6b98070.pt") + model = MidasNet(model_path, non_negative=True) + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) - def background_removal_options_visibility(v): - return bgrem_options_row_1.update(visible=v), \ - bgrem_options_row_2.update(visible=v) - inp['background_removal'].change( - fn=background_removal_options_visibility, - inputs=[inp['background_removal']], - outputs=[bgrem_options_row_1, bgrem_options_row_2] - ) - - return inp - - -class Script(scripts.Script): - def title(self): - return scriptname + elif model_type == 6: # "midas_v21_small" + model_path = f"{model_dir}/midas_v21_small-70d6b9c8.pt" + print(model_path) + ensure_file_downloaded(model_path, + "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21_small-70d6b9c8.pt") + model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, + non_negative=True, blocks={'expand': True}) + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) - def show(self, is_img2img): - return True + elif model_type == 7: # zoedepth_n + print("zoedepth_n\n") + conf = get_config("zoedepth", "infer") + conf.img_size = [net_width, net_height] + model = build_model(conf) + + elif model_type == 8: # zoedepth_k + print("zoedepth_k\n") + conf = get_config("zoedepth", "infer", config_version="kitti") + conf.img_size = [net_width, net_height] + model = build_model(conf) + + elif model_type == 9: # zoedepth_nk + print("zoedepth_nk\n") + conf = get_config("zoedepth_nk", "infer") + conf.img_size = [net_width, net_height] + model = build_model(conf) + + model.eval() # prepare for evaluation + # optimize + if device == torch.device("cuda") and model_type in [0, 1, 2, 3, 4, 5, 6]: + model = model.to(memory_format=torch.channels_last) + if not cmd_opts.no_half and model_type != 0 and not boost: + model = model.half() + model.to(device) # to correct device + + self.depth_model = model + self.depth_model_type = model_type + self.resize_mode = resize_mode + self.normalization = normalization + + # load merge network if boost enabled or keepmodels enabled + if boost or (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): + # sfu.ca unfortunately is not very reliable, we use a mirror just in case + ensure_file_downloaded( + './models/pix2pix/latest_net_G.pth', + 
["https://huggingface.co/lllyasviel/Annotators/resolve/9a7d84251d487d11/latest_net_G.pth", + "https://sfu.ca/~yagiz/CVPR21/latest_net_G.pth"], + '50ec735d74ed6499562d898f41b49343e521808b8dae589aa3c2f5c9ac9f7462') + opt = TestOptions().parse() + if device == torch.device('cpu'): + opt.gpu_ids = [] + pix2pix_model = Pix2Pix4DepthModel(opt) + pix2pix_model.save_dir = './models/pix2pix' + pix2pix_model.load_networks('latest') + pix2pix_model.eval() + model.to(device) + self.pix2pix_model = pix2pix_model - def ui(self, is_img2img): - gr.HTML() # Work around a Gradio bug - with gr.Column(variant='panel'): - gr.HTML() # Work around a Gradio bug - ret = main_ui_panel(False) - ret += ret.enkey_tail() - return ret.enkey_body() + devices.torch_gc() - # run from script in txt2img or img2img - def run(self, p, *inputs): - inputs = GradioComponentBundle.enkey_to_dict(inputs) + def get_default_net_size(self, model_type): + # TODO: fill in, use in the GUI + sizes = { + 1: [512, 512], + 2: [384, 384], + 3: [384, 384], + 4: [384, 384], + 5: [384, 384], + 6: [256, 256], + } + if model_type in sizes: + return sizes[model_type] + return [512, 512] + + def swap_to_cpu_memory(self): + if self.depth_model is not None: + self.depth_model.to(torch.device('cpu')) + if self.pix2pix_model is not None: + self.pix2pix_model.to(torch.device('cpu')) + + def unload_models(self): + if self.depth_model is not None or self.pix2pix_model is not None: + del self.depth_model + self.depth_model = None + del self.pix2pix_model + self.pix2pix_model = None + gc.collect() + devices.torch_gc() - # sd process - processed = processing.process_images(p) - processed.sampler = p.sampler # for create_infotext + self.depth_model_type = None + self.deviceidx = None + + def get_raw_prediction(self, input, net_width, net_height): + """Get prediction from the model currently loaded by the class. 
+ If boost is enabled, net_width and net_height will be ignored.""" + # input image + img = cv2.cvtColor(np.asarray(input), cv2.COLOR_BGR2RGB) / 255.0 + # compute depthmap + if not self.pix2pix_model != None: + if self.depth_model_type == 0: + raw_prediction = estimateleres(img, self.depth_model, net_width, net_height) + raw_prediction_invert = True + elif self.depth_model_type in [7, 8, 9]: + raw_prediction = estimatezoedepth(input, self.depth_model, net_width, net_height) + raw_prediction_invert = True + else: + raw_prediction = estimatemidas(img, self.depth_model, net_width, net_height, + model_holder.resize_mode, model_holder.normalization) + raw_prediction_invert = False + else: + raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model) + raw_prediction_invert = False + return raw_prediction, raw_prediction_invert - inputimages = [] - for count in range(0, len(processed.images)): - # skip first grid image - if count == 0 and len(processed.images) > 1 and opts.return_grid: - continue - inputimages.append(processed.images[count]) - generated_images, mesh_fi, meshsimple_fi = run_depthmap(p.outpath_samples, inputimages, None, None, inputs) +model_holder = ModelHolder() - for input_i, imgs in enumerate(generated_images): - # get generation parameters - if hasattr(processed, 'all_prompts') and opts.enable_pnginfo: - info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, "", 0, input_i) - else: - info = None - for image_type, image in list(imgs.items()): - processed.images.append(image) - if inputs["save_outputs"]: - try: - suffix = "" if image_type == "depth" else f"_{image_type}" - images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], - prompt=processed.all_prompts[input_i], extension=opts.samples_format, info=info, - p=processed, - suffix=suffix) - except Exception as e: - if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): raise e - print('Catched exception: image has wrong mode!') - traceback.print_exc() - return processed +def convert_i16_to_rgb(image, like): + # three channel, 8 bits per channel image + output = np.zeros_like(like) + output[:, :, 0] = image / 256.0 + output[:, :, 1] = image / 256.0 + output[:, :, 2] = image / 256.0 + return output def unload_sd_model(): @@ -392,7 +354,7 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): if len(inputimages) == 0 or inputimages[0] is None: return [], '', '' if len(inputdepthmaps) == 0: - inputdepthmaps = [None for _ in range(len(inputimages))] + inputdepthmaps: list[Image] = [None for _ in range(len(inputimages))] inputdepthmaps_complete = all([x is not None for x in inputdepthmaps]) background_removal = inp["background_removal"] @@ -427,7 +389,7 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): stereo_separation = inp["stereo_separation"] # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure - print(f"\n{scriptname} {scriptversion} ({get_commit_hash()})") + print(f"\n{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})") unload_sd_model() @@ -441,9 +403,6 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): else: background_removed_images = batched_background_removal(inputimages, background_removal_model) - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - # init torch device global device if depthmap_compute_device == 'GPU' and not 
torch.cuda.is_available(): @@ -455,212 +414,24 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): device = torch.device("cpu") print("device: %s" % device) - # model path and name - model_dir = "./models/midas" - if model_type == 0: - model_dir = "./models/leres" - # create paths to model if not present - os.makedirs(model_dir, exist_ok=True) - os.makedirs('./models/pix2pix', exist_ok=True) - - global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type, depthmap_device_idx - loadmodels = True # TODO: loadmodels is not intuitive - if hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels: - loadmodels = False - if depthmap_model_type != model_type or depthmap_model_depth == None or depthmap_device_idx != depthmap_compute_device: - del depthmap_model_depth - depthmap_model_depth = None - loadmodels = True + generated_images = [{} for _ in range(len(inputimages))] + """Images that will be returned. + Every array element corresponds to particular input image. + Dictionary keys are types of images that were derived from the input image.""" + # TODO: ??? + meshsimple_fi = None + inpaint_imgs = [] + inpaint_depths = [] try: - if loadmodels and not inputdepthmaps_complete: - # TODO: loading model should be separated into a function that would return the model - # and the parameters (or maybe even functions) needed. - # The rest of the run_depthmap should not depend on what specific model - # is actually used for the generation. - print("Loading model weights from ", end=" ") - - # "res101" - if model_type == 0: - model_path = f"{model_dir}/res101.pth" - print(model_path) - ensure_file_downloaded( - model_path, - ["https://cloudstor.aarnet.edu.au/plus/s/lTIJF4vrvHCAI31/download", - "https://huggingface.co/lllyasviel/Annotators/resolve/5bc80eec2b4fddbb/res101.pth", - ], - "1d696b2ef3e8336b057d0c15bc82d2fecef821bfebe5ef9d7671a5ec5dde520b") - if depthmap_compute_device == 'GPU': - checkpoint = torch.load(model_path) - else: - checkpoint = torch.load(model_path, map_location=torch.device('cpu')) - model = RelDepthModel(backbone='resnext101') - model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True) - del checkpoint - devices.torch_gc() - - # "dpt_beit_large_512" midas 3.1 - if model_type == 1: - model_path = f"{model_dir}/dpt_beit_large_512.pt" - print(model_path) - ensure_file_downloaded(model_path, - "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt") - model = DPTDepthModel( - path=model_path, - backbone="beitl16_512", - non_negative=True, - ) - net_w, net_h = 512, 512 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - # "dpt_beit_large_384" midas 3.1 - if model_type == 2: - model_path = f"{model_dir}/dpt_beit_large_384.pt" - print(model_path) - ensure_file_downloaded(model_path, - "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt") - model = DPTDepthModel( - path=model_path, - backbone="beitl16_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - # "dpt_large_384" midas 3.0 - if model_type == 3: - model_path = f"{model_dir}/dpt_large-midas-2f21e586.pt" - print(model_path) - ensure_file_downloaded(model_path, - "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt") - model = DPTDepthModel( - path=model_path, - backbone="vitl16_384", - 
non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - # "dpt_hybrid_384" midas 3.0 - elif model_type == 4: - model_path = f"{model_dir}/dpt_hybrid-midas-501f0c75.pt" - print(model_path) - ensure_file_downloaded(model_path, - "https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid-midas-501f0c75.pt") - model = DPTDepthModel( - path=model_path, - backbone="vitb_rn50_384", - non_negative=True, - ) - net_w, net_h = 384, 384 - resize_mode = "minimal" - normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - - # "midas_v21" - elif model_type == 5: - model_path = f"{model_dir}/midas_v21-f6b98070.pt" - print(model_path) - ensure_file_downloaded(model_path, - "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21-f6b98070.pt") - model = MidasNet(model_path, non_negative=True) - net_w, net_h = 384, 384 - resize_mode = "upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - # "midas_v21_small" - elif model_type == 6: - model_path = f"{model_dir}/midas_v21_small-70d6b9c8.pt" - print(model_path) - ensure_file_downloaded(model_path, - "https://github.com/AlexeyAB/MiDaS/releases/download/midas_dpt/midas_v21_small-70d6b9c8.pt") - model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, - non_negative=True, blocks={'expand': True}) - net_w, net_h = 256, 256 - resize_mode = "upper_bound" - normalization = NormalizeImage( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ) - - # zoedepth_n - elif model_type == 7: - print("zoedepth_n\n") - conf = get_config("zoedepth", "infer") - conf.img_size = [net_width, net_height] - model = build_model(conf) - - # zoedepth_k - elif model_type == 8: - print("zoedepth_k\n") - conf = get_config("zoedepth", "infer", config_version="kitti") - conf.img_size = [net_width, net_height] - model = build_model(conf) - - # zoedepth_nk - elif model_type == 9: - print("zoedepth_nk\n") - conf = get_config("zoedepth_nk", "infer") - conf.img_size = [net_width, net_height] - model = build_model(conf) - - pix2pixmodel = None - # load merge network if boost enabled or keepmodels enabled - if boost or (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): - # sfu.ca unfortunately is not very reliable, we use a mirror just in case - ensure_file_downloaded( - './models/pix2pix/latest_net_G.pth', - ["https://huggingface.co/lllyasviel/Annotators/resolve/9a7d84251d487d11/latest_net_G.pth", - "https://sfu.ca/~yagiz/CVPR21/latest_net_G.pth"], - '50ec735d74ed6499562d898f41b49343e521808b8dae589aa3c2f5c9ac9f7462') - opt = TestOptions().parse() - if depthmap_compute_device == "CPU": - opt.gpu_ids = [] - pix2pixmodel = Pix2Pix4DepthModel(opt) - pix2pixmodel.save_dir = './models/pix2pix' - pix2pixmodel.load_networks('latest') - pix2pixmodel.eval() - - devices.torch_gc() - - # prepare for evaluation - model.eval() - - # optimize - if device == torch.device("cuda") and model_type in [0, 1, 2, 3, 4, 5, 6]: - model = model.to(memory_format=torch.channels_last) - if not cmd_opts.no_half and model_type != 0 and not boost: - model = model.half() - - model.to(device) - - depthmap_model_depth = model - depthmap_model_pix2pix = pix2pixmodel - depthmap_model_type = model_type - depthmap_device_idx = depthmap_compute_device - - if not loadmodels: - model = depthmap_model_depth - pix2pixmodel = depthmap_model_pix2pix - if device == 
torch.device("cuda"): - model = model.to(device) - - print("Computing depthmap(s) ..") - - generated_images = [{} for _ in range(len(inputimages))] - """Images that will be returned. - Every array element corresponds to particular input image. - Dictionary keys are types of images that were derived from the input image.""" - - # TODO: ??? - meshsimple_fi = None - inpaint_imgs = [] - inpaint_depths = [] - - # iterate over input (generated) images + if not inputdepthmaps_complete: + print("Loading model(s) ..") + model_holder.ensure_models(model_type, device, boost) + model = model_holder.depth_model + pix2pix_model = model_holder.pix2pix_model + + print("Computing output(s) ..") + # iterate over input images for count in trange(0, len(inputimages)): # override net size (size may be different for different images) if match_size: @@ -671,9 +442,6 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): inputimages[count].point(lambda p: p * 0.0039063096, mode='RGB') inputimages[count] = inputimages[count].convert('RGB') - # input image - img = cv2.cvtColor(np.asarray(inputimages[count]), cv2.COLOR_BGR2RGB) / 255.0 - raw_prediction = None """Raw prediction, as returned by a model. None if input depthmap is used.""" raw_prediction_invert = False @@ -691,18 +459,8 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): else: out = np.asarray(dimg, dtype="float")[:, :, 0] else: - # compute depthmap - if not boost: - if model_type == 0: - raw_prediction = estimateleres(img, model, net_width, net_height) - raw_prediction_invert = True - elif model_type in [7, 8, 9]: - raw_prediction = estimatezoedepth(inputimages[count], model, net_width, net_height) - raw_prediction_invert = True - else: - raw_prediction = estimatemidas(img, model, net_width, net_height, resize_mode, normalization) - else: - raw_prediction = estimateboost(img, model, model_type, pix2pixmodel) + raw_prediction, raw_prediction_invert = \ + model_holder.get_raw_prediction(inputimages[count], net_width, net_height) # output if abs(raw_prediction.max() - raw_prediction.min()) > np.finfo("float").eps: @@ -757,7 +515,8 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): img_depth = cv2.bitwise_not(img_output) if invert_depth else img_output if combine_output: img_concat = Image.fromarray(np.concatenate( - (inputimages[count], convert_i16_to_rgb(img_depth, inputimages[count])), axis=combine_output_axis)) + (inputimages[count], convert_i16_to_rgb(img_depth, inputimages[count])), + axis=combine_output_axis)) generated_images[count]['concat_depth'] = img_concat else: generated_images[count]['depth'] = Image.fromarray(img_depth) @@ -776,7 +535,7 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): if gen_normal: # TODO: should be moved into a separate file when redesigned # taken from @graemeniedermayer # take gradients - zx = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 1, 0, ksize=3) + zx = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 1, 0, ksize=3) # TODO: CV_64F ? zy = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 0, 1, ksize=3) # combine and normalize gradients. 
@@ -801,58 +560,59 @@ def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): meshsimple_fi = get_uniquefn(outpath, basename, 'obj') meshsimple_fi = os.path.join(outpath, meshsimple_fi + '_simple.obj') - depthi = raw_prediction if raw_prediction is not None else raw_prediction + depthi = raw_prediction if raw_prediction is not None else out + depthi_min, depthi_max = depthi.min(), depthi.max() # try to map output to sensible values for non zoedepth models, boost, or custom maps if model_type not in [7, 8, 9] or boost or inputdepthmaps[count] is not None: # invert if midas if model_type > 0 or inputdepthmaps[count] is not None: # TODO: Weird - depthi = depth_max - depthi + depth_min + depthi = depthi_max - depthi + depthi_min depth_max = depthi.max() depth_min = depthi.min() # make positive - if depth_min < 0: - depthi = depthi - depth_min + if depthi_min < 0: + depthi = depthi - depthi_min depth_max = depthi.max() depth_min = depthi.min() # scale down - if depthi.max() > 10: - depthi = 4 * (depthi - depth_min) / (depth_max - depth_min) + if depthi.max() > 10.0: + depthi = 4.0 * (depthi - depthi_min) / (depthi_max - depthi_min) # offset - depthi = depthi + 1 + depthi = depthi + 1.0 mesh = create_mesh(inputimages[count], depthi, keep_edges=not mesh_occlude, spherical=mesh_spherical) mesh.export(meshsimple_fi) - print("Done.") - + print("Computing output(s) done.") except RuntimeError as e: + # TODO: display in UI if 'out of memory' in str(e): print("ERROR: out of memory, could not generate depthmap !") else: print(e) - finally: if not (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): if 'model' in locals(): del model if boost and 'pix2pixmodel' in locals(): - del pix2pixmodel + del pix2pix_model + model_holder.unload_models() else: - if 'model' in locals(): - model.to(devices.cpu) + model_holder.swap_to_cpu_memory() gc.collect() devices.torch_gc() + # TODO: This should not be here mesh_fi = None if inpaint: try: mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, inpaint_vids, 1, "mp4") except Exception as e: print(f'{str(e)}, some issue with generating inpainted mesh') + reload_sd_model() print("All done.") - return generated_images, mesh_fi, meshsimple_fi @@ -1022,7 +782,7 @@ def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, # read ply global video_mesh_data, video_mesh_fn - if video_mesh_fn == None or video_mesh_fn != mesh_fi: + if video_mesh_fn is None or video_mesh_fn != mesh_fi: del video_mesh_data video_mesh_fn = mesh_fi video_mesh_data = read_mesh(mesh_fi) @@ -1147,284 +907,9 @@ def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_bord return fn_saved[-1], fn_saved[-1], '' -# called from depth tab -def run_generate(*inputs): - inputs = GradioComponentBundle.enkey_to_dict(inputs) - depthmap_mode = inputs['depthmap_mode'] - depthmap_batch_input_dir = inputs['depthmap_batch_input_dir'] - image_batch = inputs['image_batch'] - depthmap_input_image = inputs['depthmap_input_image'] - depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] - depthmap_batch_reuse = inputs['depthmap_batch_reuse'] - custom_depthmap = inputs['custom_depthmap'] - custom_depthmap_img = inputs['custom_depthmap_img'] - - inputimages = [] - # Allow supplying custom depthmaps - inputdepthmaps = [] - # Also keep track of original file names - inputnames = [] - - if depthmap_mode == '2' and depthmap_batch_output_dir != '': - outpath = depthmap_batch_output_dir - else: - outpath = 
opts.outdir_samples or opts.outdir_extras_samples - - if depthmap_mode == '0': # Single image - inputimages.append(depthmap_input_image) - inputnames.append(None) - if custom_depthmap: - if custom_depthmap_img is None: - return [], None, None, "Custom depthmap is not specified. Please either supply it or disable this option.", "" - inputdepthmaps.append(custom_depthmap_img) - else: - inputdepthmaps.append(None) - if depthmap_mode == '1': # Batch Process - # convert files to pillow images - for img in image_batch: - image = Image.open(os.path.abspath(img.name)) - inputimages.append(image) - inputnames.append(os.path.splitext(img.orig_name)[0]) - elif depthmap_mode == '2': # Batch from Directory - assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' - if depthmap_batch_input_dir == '': - return [], None, None, "Please select an input directory.", "" - if depthmap_batch_input_dir == depthmap_batch_output_dir: - return [], None, None, "Please pick different directories for batch processing.", "" - image_list = shared.listfiles(depthmap_batch_input_dir) - for path in image_list: - try: - inputimages.append(Image.open(path)) - inputnames.append(path) - - custom_depthmap = None - if depthmap_batch_reuse: - basename = Path(path).stem - # Custom names are not used in samples directory - if outpath != opts.outdir_extras_samples: - # Possible filenames that the custom depthmaps may have - name_candidates = [f'{basename}-0000.{opts.samples_format}', # current format - f'{basename}.png', # human-intuitive format - f'{Path(path).name}'] # human-intuitive format (worse) - for fn_cand in name_candidates: - path_cand = os.path.join(outpath, fn_cand) - if os.path.isfile(path_cand): - custom_depthmap = Image.open(os.path.abspath(path_cand)) - break - inputdepthmaps.append(custom_depthmap) - except Exception: - print(f'Failed to load {path}, ignoring.') - inputdepthmaps_n = len([1 for x in inputdepthmaps if x is not None]) - print(f'{len(inputimages)} images will be processed, {inputdepthmaps_n} existing depthmaps will be reused') - - save_images, mesh_fi, meshsimple_fi = run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inputs) - show_images = [] - - # Saving images - for input_i, imgs in enumerate(save_images): - basename = 'depthmap' - if depthmap_mode == '2' and inputnames[input_i] is not None and outpath != opts.outdir_extras_samples: - basename = Path(inputnames[input_i]).stem - info = None - - for image_type, image in list(imgs.items()): - show_images += [image] - if inputs["save_outputs"]: - try: - suffix = "" if image_type == "depth" else f"_{image_type}" - images.save_image(image, path=outpath, basename=basename, seed=None, - prompt=None, extension=opts.samples_format, info=info, short_filename=True, - no_prompt=True, grid=False, pnginfo_section_name="extras", existing_info=None, - forced_filename=None, suffix=suffix) - except Exception as e: - if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): raise e - print('Catched exception: image has wrong mode!') - traceback.print_exc() - - # use inpainted 3d mesh to show in 3d model output when enabled in settings - if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint and mesh_fi != None and len(mesh_fi) > 0: - meshsimple_fi = mesh_fi - # however, don't show 3dmodel when disabled in settings - if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: - meshsimple_fi = None - # TODO: return more info - return show_images, mesh_fi, 
meshsimple_fi, plaintext_to_html('info'), '' - - def unload_models(): - global depthmap_model_depth, depthmap_model_pix2pix, depthmap_model_type - depthmap_model_type = -1 - del depthmap_model_depth - del depthmap_model_pix2pix - depthmap_model_depth = None - depthmap_model_pix2pix = None - gc.collect() - devices.torch_gc() - - -def clear_mesh(): - return None - -# TODO: some of them may be put into the main ui pane -def on_ui_settings(): - section = ('depthmap-script', "Depthmap extension") - shared.opts.add_option("depthmap_script_keepmodels", - shared.OptionInfo(False, "Keep depth models loaded.", - section=section)) - shared.opts.add_option("depthmap_script_boost_rmax", - shared.OptionInfo(1600, "Maximum wholesize for boost (Rmax)", - section=section)) - shared.opts.add_option("depthmap_script_save_ply", - shared.OptionInfo(False, "Save additional PLY file with 3D inpainted mesh.", - section=section)) - shared.opts.add_option("depthmap_script_show_3d", - shared.OptionInfo(True, "Enable showing 3D Meshes in output tab. (Experimental)", - section=section)) - shared.opts.add_option("depthmap_script_show_3d_inpaint", - shared.OptionInfo(True, "Also show 3D Inpainted Mesh in 3D Mesh output tab. (Experimental)", - section=section)) - shared.opts.add_option("depthmap_script_mesh_maxsize", - shared.OptionInfo(2048, "Max size for generating simple mesh.", - section=section)) - - -def on_ui_tabs(): - inp = GradioComponentBundle() - with gr.Blocks(analytics_enabled=False) as depthmap_interface: - with gr.Row().style(equal_height=False): - with gr.Column(variant='panel'): - inp += 'depthmap_mode', gr.HTML(visible=False, value='0') - with gr.Tabs(): - with gr.TabItem('Single Image') as depthmap_mode_0: - with gr.Row(): - inp += gr.Image(label="Source", source="upload", interactive=True, type="pil", - elem_id="depthmap_input_image") - with gr.Group(visible=False) as custom_depthmap_row_0: - inp += gr.File(label="Custom DepthMap", file_count="single", interactive=True, - type="file", elem_id='custom_depthmap_img') - inp += gr.Checkbox(elem_id="custom_depthmap", label="Use custom DepthMap", value=False) - with gr.TabItem('Batch Process') as depthmap_mode_1: - inp += gr.File(elem_id='image_batch', label="Batch Process", file_count="multiple", - interactive=True, type="file") - with gr.TabItem('Batch from Directory') as depthmap_mode_2: - inp += gr.Textbox(elem_id="depthmap_batch_input_dir", label="Input directory", - **shared.hide_dirs, - placeholder="A directory on the same machine where the server is running.") - inp += gr.Textbox(elem_id="depthmap_batch_output_dir", label="Output directory", - **shared.hide_dirs, - placeholder="Leave blank to save images to the default path.") - gr.HTML("Files in the output directory may be overwritten.") - inp += gr.Checkbox(elem_id="depthmap_batch_reuse", - label="Skip generation and use (edited/custom) depthmaps in output directory when a file already exists.", - value=True) - submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary') - inp += main_ui_panel(True) # Main panel is inserted here - unloadmodels = gr.Button('Unload models', elem_id="depthmap_unloadmodels") - - with gr.Column(variant='panel'): - with gr.Tabs(elem_id="mode_depthmap_output"): - with gr.TabItem('Depth Output'): - with gr.Group(): - result_images = gr.Gallery(label='Output', show_label=False, - elem_id=f"depthmap_gallery").style(grid=4) - with gr.Column(): - html_info_x = gr.HTML() - html_info = gr.HTML() - - with gr.TabItem('3D Mesh'): - with gr.Group(): - 
result_depthmesh = gr.Model3D(label="3d Mesh", clear_color=[1.0, 1.0, 1.0, 1.0]) - with gr.Row(): - #loadmesh = gr.Button('Load') - clearmesh = gr.Button('Clear') - - with gr.TabItem('Generate video'): - # generate video - with gr.Group(): - with gr.Row(): - gr.Markdown("Generate video from inpainted(!) mesh.") - with gr.Row(): - depth_vid = gr.Video(interactive=False) - with gr.Column(): - vid_html_info_x = gr.HTML() - vid_html_info = gr.HTML() - fn_mesh = gr.Textbox(label="Input Mesh (.ply | .obj)", **shared.hide_dirs, placeholder="A file on the same machine where the server is running.") - with gr.Row(): - vid_numframes = gr.Textbox(label="Number of frames", value="300") - vid_fps = gr.Textbox(label="Framerate", value="40") - vid_format = gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', type="value", elem_id="video_format") - vid_ssaa = gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', type="value", elem_id="video_ssaa") - with gr.Row(): - vid_traj = gr.Dropdown(label="Trajectory", choices=['straight-line', 'double-straight-line', 'circle'], value='double-straight-line', type="index", elem_id="video_trajectory") - vid_shift = gr.Textbox(label="Translate: x, y, z", value="-0.015, 0.0, -0.05") - vid_border = gr.Textbox(label="Crop: top, left, bottom, right", value="0.03, 0.03, 0.05, 0.03") - vid_dolly = gr.Checkbox(label="Dolly", value=False, elem_classes="smalltxt") - with gr.Row(): - submit_vid = gr.Button('Generate Video', elem_id="depthmap_generatevideo", variant='primary') - - inp += inp.enkey_tail() - - depthmap_mode_0.select(lambda: '0', None, inp['depthmap_mode']) - depthmap_mode_1.select(lambda: '1', None, inp['depthmap_mode']) - depthmap_mode_2.select(lambda: '2', None, inp['depthmap_mode']) - - def custom_depthmap_visibility(v): - return custom_depthmap_row_0.update(visible=v) - inp['custom_depthmap'].change( - fn=custom_depthmap_visibility, - inputs=[inp['custom_depthmap']], - outputs=[custom_depthmap_row_0] - ) - - unloadmodels.click( - fn=unload_models, - inputs=[], - outputs=[] - ) - - clearmesh.click( - fn=clear_mesh, - inputs=[], - outputs=[result_depthmesh] - ) - - submit.click( - fn=wrap_gradio_gpu_call(run_generate), - inputs=inp.enkey_body(), - outputs=[ - result_images, - fn_mesh, - result_depthmesh, - html_info_x, - html_info - ] - ) - - submit_vid.click( - fn=wrap_gradio_gpu_call(run_makevideo), - inputs=[ - fn_mesh, - vid_numframes, - vid_fps, - vid_traj, - vid_shift, - vid_border, - vid_dolly, - vid_format, - vid_ssaa - ], - outputs=[ - depth_vid, - vid_html_info_x, - vid_html_info - ] - ) - - return (depthmap_interface, "Depth", "depthmap_interface"), - + model_holder.unload_models() -script_callbacks.on_ui_settings(on_ui_settings) -script_callbacks.on_ui_tabs(on_ui_tabs) # TODO: code borrowed from the internet to be marked as such and to reside in separate files diff --git a/scripts/interface_webui.py b/scripts/interface_webui.py new file mode 100644 index 0000000..f16b4e6 --- /dev/null +++ b/scripts/interface_webui.py @@ -0,0 +1,553 @@ +import gradio as gr +import traceback +import modules.scripts as scripts +from modules import processing, images, shared +from modules import script_callbacks +from modules.call_queue import wrap_gradio_gpu_call +from modules.processing import create_infotext +from modules.shared import opts +from modules.ui import plaintext_to_html +from pathlib import Path + +from scripts.gradio_args_transport import GradioComponentBundle +from scripts.main import * +from scripts.depthmap import run_depthmap, 
unload_models, run_makevideo +from PIL import Image + + +def main_ui_panel(is_depth_tab): + inp = GradioComponentBundle() + # TODO: Greater visual separation + with gr.Blocks(): + with gr.Row(): + inp += 'compute_device', gr.Radio(label="Compute on", choices=['GPU', 'CPU'], value='GPU') + # TODO: Should return value instead of index. Maybe Enum should be used? + inp += 'model_type', gr.Dropdown(label="Model", + choices=['res101', 'dpt_beit_large_512 (midas 3.1)', + 'dpt_beit_large_384 (midas 3.1)', 'dpt_large_384 (midas 3.0)', + 'dpt_hybrid_384 (midas 3.0)', + 'midas_v21', 'midas_v21_small', + 'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk'], + value='res101', + type="index") + with gr.Group(): + with gr.Row(): + inp += 'boost', gr.Checkbox(label="BOOST (multi-resolution merging)", value=True) + with gr.Group(visible=False) as options_depend_on_boost: + inp += 'match_size', gr.Checkbox(label="Match net size to input size", value=False) + with gr.Row(visible=False) as options_depend_on_match_size: + inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=512) + inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=512) + + with gr.Group(): + with gr.Row(): + inp += "save_outputs", gr.Checkbox(label="Save Outputs", value=True) # 50% of width + with gr.Group(): # 50% of width + inp += "output_depth", gr.Checkbox(label="Output DepthMap", value=True) + inp += "invert_depth", gr.Checkbox(label="Invert (black=near, white=far)", value=False) + with gr.Row() as options_depend_on_output_depth_1: + inp += "combine_output", gr.Checkbox( + label="Combine input and depthmap into one image", value=False) + inp += "combine_output_axis", gr.Radio(label="Combine axis", choices=['Vertical', 'Horizontal'], + value='Horizontal', type="index", visible=False) + with gr.Group(): + with gr.Row(): + inp += 'clipdepth', gr.Checkbox(label="Clip and renormalize DepthMap", value=False) + with gr.Row(visible=False) as clip_options_row_1: + inp += "clipthreshold_far", gr.Slider(minimum=0, maximum=1, step=0.001, label='Far clip', value=0) + inp += "clipthreshold_near", gr.Slider(minimum=0, maximum=1, step=0.001, label='Near clip', value=1) + + with gr.Group(): + with gr.Row(): + inp += "show_heat", gr.Checkbox(label="Generate HeatMap", value=False) + # gr.Checkbox(label="Generate NormalMap", value=False) # TODO: this is a fake door + + with gr.Group(): + with gr.Row(): + inp += "gen_stereo", gr.Checkbox(label="Generate stereoscopic image(s)", value=False) + with gr.Group(visible=False) as stereo_options: + with gr.Row(): + with gr.Row(): + inp += "stereo_modes", gr.CheckboxGroup( + ["left-right", "right-left", "top-bottom", "bottom-top", "red-cyan-anaglyph"], + label="Output", value=["left-right", "red-cyan-anaglyph"]) + with gr.Row(): + inp += "stereo_divergence", gr.Slider(minimum=0.05, maximum=10.005, step=0.01, + label='Divergence (3D effect)', + value=2.5) + inp += "stereo_separation", gr.Slider(minimum=-5.0, maximum=5.0, step=0.01, + label='Separation (moves images apart)', + value=0.0) + with gr.Row(): + inp += "stereo_fill", gr.Dropdown(label="Gap fill technique", + choices=['none', 'naive', 'naive_interpolating', 'polylines_soft', + 'polylines_sharp'], value='polylines_sharp', + type="value") + inp += "stereo_balance", gr.Slider(minimum=-1.0, maximum=1.0, step=0.05, + label='Balance between eyes', + value=0.0) + + with gr.Group(): + with gr.Row(): + inp += "gen_mesh", gr.Checkbox( + label="Generate simple 3D mesh. 
" + "(Fast, accurate only with ZoeDepth models and no boost, no custom maps)", + value=False, visible=True) + with gr.Row(visible=False) as mesh_options_row_0: + inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) + inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) + + if is_depth_tab: + with gr.Group(): + with gr.Row(): + inp += "inpaint", gr.Checkbox( + label="Generate 3D inpainted mesh. (Sloooow, required for generating videos)", value=False) + with gr.Group(visible=False) as inpaint_options_row_0: + inp += "inpaint_vids", gr.Checkbox( + label="Generate 4 demo videos with 3D inpainted mesh.", value=False) + gr.HTML("More options for generating video can be found in the Generate video tab") + + with gr.Group(): + # TODO: it should be clear from the UI that the background removal does not use the model selected above + with gr.Row(): + inp += "background_removal", gr.Checkbox(label="Remove background", value=False) + with gr.Row(visible=False) as bgrem_options_row_1: + inp += "save_background_removal_masks", gr.Checkbox(label="Save the foreground masks", value=False) + inp += "pre_depth_background_removal", gr.Checkbox(label="Pre-depth background removal", value=False) + with gr.Row(visible=False) as bgrem_options_row_2: + inp += "background_removal_model", gr.Dropdown(label="Rembg Model", + choices=['u2net', 'u2netp', 'u2net_human_seg', + 'silueta'], + value='u2net', type="value") + + with gr.Box(): + gr.HTML("Information, comment and share @ " + "https://github.com/thygate/stable-diffusion-webui-depthmap-script") + + inp += "gen_normal", gr.Checkbox(label="Generate Normalmap (hidden! api only)", value=False, visible=False) + + inp['boost'].change( + fn=lambda a, b: (options_depend_on_boost.update(visible=not a), + options_depend_on_match_size.update(visible=not a and not b)), + inputs=[inp['boost'], inp['match_size']], + outputs=[options_depend_on_boost, options_depend_on_match_size] + ) + inp['match_size'].change( + fn=lambda a, b: options_depend_on_match_size.update(visible=not a and not b), + inputs=[inp['boost'], inp['match_size']], + outputs=[options_depend_on_match_size] + ) + + inp['output_depth'].change( + fn=lambda a: (inp['invert_depth'].update(visible=a), options_depend_on_output_depth_1.update(visible=a)), + inputs=[inp['output_depth']], + outputs=[inp['invert_depth'], options_depend_on_output_depth_1] + ) + + inp['combine_output'].change( + fn=lambda v: inp['combine_output_axis'].update(visible=v), + inputs=[inp['combine_output']], + outputs=[inp['combine_output_axis']] + ) + + inp['clipdepth'].change( + fn=lambda v: clip_options_row_1.update(visible=v), + inputs=[inp['clipdepth']], + outputs=[clip_options_row_1] + ) + inp['clipthreshold_far'].change( + fn=lambda a, b: a if b < a else b, + inputs=[inp['clipthreshold_far'], inp['clipthreshold_near']], + outputs=[inp['clipthreshold_near']] + ) + inp['clipthreshold_near'].change( + fn=lambda a, b: a if b > a else b, + inputs=[inp['clipthreshold_near'], inp['clipthreshold_far']], + outputs=[inp['clipthreshold_far']] + ) + + # invert_depth must not be used with gen_stereo - otherwise stereo images look super-wrong + inp['gen_stereo'].change( + fn=lambda a, b: False if b else a, + inputs=[inp['invert_depth'], inp['gen_stereo']], + outputs=[inp['invert_depth']] + ) + inp['gen_stereo'].change( + fn=lambda a, b: inp['invert_depth'].update(interactive=not b), + inputs=[inp['invert_depth'], inp['gen_stereo']], + outputs=[inp['invert_depth']] + ) 
+ + def stereo_options_visibility(v): + return stereo_options.update(visible=v) + + inp['gen_stereo'].change( + fn=stereo_options_visibility, + inputs=[inp['gen_stereo']], + outputs=[stereo_options] + ) + + inp['gen_mesh'].change( + fn=lambda v: mesh_options_row_0.update(visible=v), + inputs=[inp['gen_mesh']], + outputs=[mesh_options_row_0] + ) + + def inpaint_options_visibility(v): + return inpaint_options_row_0.update(visible=v) + + if is_depth_tab: + inp['inpaint'].change( + fn=inpaint_options_visibility, + inputs=[inp['inpaint']], + outputs=[inpaint_options_row_0] + ) + + def background_removal_options_visibility(v): + return bgrem_options_row_1.update(visible=v), \ + bgrem_options_row_2.update(visible=v) + + inp['background_removal'].change( + fn=background_removal_options_visibility, + inputs=[inp['background_removal']], + outputs=[bgrem_options_row_1, bgrem_options_row_2] + ) + + return inp + + +class Script(scripts.Script): + def title(self): + return SCRIPT_NAME + + def show(self, is_img2img): + return True + + def ui(self, is_img2img): + gr.HTML() # Work around a Gradio bug + with gr.Column(variant='panel'): + gr.HTML() # Work around a Gradio bug + ret = main_ui_panel(False) + ret += ret.enkey_tail() + return ret.enkey_body() + + # run from script in txt2img or img2img + def run(self, p, *inputs): + inputs = GradioComponentBundle.enkey_to_dict(inputs) + + # sd process + processed = processing.process_images(p) + processed.sampler = p.sampler # for create_infotext + + inputimages = [] + for count in range(0, len(processed.images)): + # skip first grid image + if count == 0 and len(processed.images) > 1 and opts.return_grid: + continue + inputimages.append(processed.images[count]) + + generated_images, mesh_fi, meshsimple_fi = run_depthmap(p.outpath_samples, inputimages, None, None, inputs) + + for input_i, imgs in enumerate(generated_images): + # get generation parameters + if hasattr(processed, 'all_prompts') and opts.enable_pnginfo: + info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, + "", 0, input_i) + else: + info = None + for image_type, image in list(imgs.items()): + processed.images.append(image) + if inputs["save_outputs"]: + try: + suffix = "" if image_type == "depth" else f"_{image_type}" + images.save_image(image, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], + prompt=processed.all_prompts[input_i], extension=opts.samples_format, + info=info, + p=processed, + suffix=suffix) + except Exception as e: + if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): + raise e + print('Catched exception: image has wrong mode!') + traceback.print_exc() + return processed + + +# TODO: some of them may be put into the main ui pane +def on_ui_settings(): + section = ('depthmap-script', "Depthmap extension") + shared.opts.add_option("depthmap_script_keepmodels", + shared.OptionInfo(False, "Keep depth models loaded.", + section=section)) + shared.opts.add_option("depthmap_script_boost_rmax", + shared.OptionInfo(1600, "Maximum wholesize for boost (Rmax)", + section=section)) + shared.opts.add_option("depthmap_script_save_ply", + shared.OptionInfo(False, "Save additional PLY file with 3D inpainted mesh.", + section=section)) + shared.opts.add_option("depthmap_script_show_3d", + shared.OptionInfo(True, "Enable showing 3D Meshes in output tab. 
(Experimental)", + section=section)) + shared.opts.add_option("depthmap_script_show_3d_inpaint", + shared.OptionInfo(True, "Also show 3D Inpainted Mesh in 3D Mesh output tab. (Experimental)", + section=section)) + shared.opts.add_option("depthmap_script_mesh_maxsize", + shared.OptionInfo(2048, "Max size for generating simple mesh.", + section=section)) + + +def on_ui_tabs(): + inp = GradioComponentBundle() + with gr.Blocks(analytics_enabled=False) as depthmap_interface: + with gr.Row().style(equal_height=False): + with gr.Column(variant='panel'): + inp += 'depthmap_mode', gr.HTML(visible=False, value='0') + with gr.Tabs(): + with gr.TabItem('Single Image') as depthmap_mode_0: + with gr.Row(): + inp += gr.Image(label="Source", source="upload", interactive=True, type="pil", + elem_id="depthmap_input_image") + with gr.Group(visible=False) as custom_depthmap_row_0: + inp += gr.File(label="Custom DepthMap", file_count="single", interactive=True, + type="file", elem_id='custom_depthmap_img') + inp += gr.Checkbox(elem_id="custom_depthmap", label="Use custom DepthMap", value=False) + with gr.TabItem('Batch Process') as depthmap_mode_1: + inp += gr.File(elem_id='image_batch', label="Batch Process", file_count="multiple", + interactive=True, type="file") + with gr.TabItem('Batch from Directory') as depthmap_mode_2: + inp += gr.Textbox(elem_id="depthmap_batch_input_dir", label="Input directory", + **shared.hide_dirs, + placeholder="A directory on the same machine where the server is running.") + inp += gr.Textbox(elem_id="depthmap_batch_output_dir", label="Output directory", + **shared.hide_dirs, + placeholder="Leave blank to save images to the default path.") + gr.HTML("Files in the output directory may be overwritten.") + inp += gr.Checkbox(elem_id="depthmap_batch_reuse", + label="Skip generation and use (edited/custom) depthmaps " + "in output directory when a file already exists.", + value=True) + submit = gr.Button('Generate', elem_id="depthmap_generate", variant='primary') + inp += main_ui_panel(True) # Main panel is inserted here + unloadmodels = gr.Button('Unload models', elem_id="depthmap_unloadmodels") + + with gr.Column(variant='panel'): + with gr.Tabs(elem_id="mode_depthmap_output"): + with gr.TabItem('Depth Output'): + with gr.Group(): + result_images = gr.Gallery(label='Output', show_label=False, + elem_id=f"depthmap_gallery").style(grid=4) + with gr.Column(): + html_info_x = gr.HTML() + html_info = gr.HTML() + + with gr.TabItem('3D Mesh'): + with gr.Group(): + result_depthmesh = gr.Model3D(label="3d Mesh", clear_color=[1.0, 1.0, 1.0, 1.0]) + with gr.Row(): + # loadmesh = gr.Button('Load') + clearmesh = gr.Button('Clear') + + with gr.TabItem('Generate video'): + # generate video + with gr.Group(): + with gr.Row(): + gr.Markdown("Generate video from inpainted(!) 
mesh.") + with gr.Row(): + depth_vid = gr.Video(interactive=False) + with gr.Column(): + vid_html_info_x = gr.HTML() + vid_html_info = gr.HTML() + fn_mesh = gr.Textbox(label="Input Mesh (.ply | .obj)", **shared.hide_dirs, + placeholder="A file on the same machine where " + "the server is running.") + with gr.Row(): + vid_numframes = gr.Textbox(label="Number of frames", value="300") + vid_fps = gr.Textbox(label="Framerate", value="40") + vid_format = gr.Dropdown(label="Format", choices=['mp4', 'webm'], value='mp4', + type="value", elem_id="video_format") + vid_ssaa = gr.Dropdown(label="SSAA", choices=['1', '2', '3', '4'], value='3', + type="value", elem_id="video_ssaa") + with gr.Row(): + vid_traj = gr.Dropdown(label="Trajectory", + choices=['straight-line', 'double-straight-line', 'circle'], + value='double-straight-line', type="index", + elem_id="video_trajectory") + vid_shift = gr.Textbox(label="Translate: x, y, z", value="-0.015, 0.0, -0.05") + vid_border = gr.Textbox(label="Crop: top, left, bottom, right", + value="0.03, 0.03, 0.05, 0.03") + vid_dolly = gr.Checkbox(label="Dolly", value=False, elem_classes="smalltxt") + with gr.Row(): + submit_vid = gr.Button('Generate Video', elem_id="depthmap_generatevideo", + variant='primary') + + inp += inp.enkey_tail() + + depthmap_mode_0.select(lambda: '0', None, inp['depthmap_mode']) + depthmap_mode_1.select(lambda: '1', None, inp['depthmap_mode']) + depthmap_mode_2.select(lambda: '2', None, inp['depthmap_mode']) + + def custom_depthmap_visibility(v): + return custom_depthmap_row_0.update(visible=v) + + inp['custom_depthmap'].change( + fn=custom_depthmap_visibility, + inputs=[inp['custom_depthmap']], + outputs=[custom_depthmap_row_0] + ) + + unloadmodels.click( + fn=unload_models, + inputs=[], + outputs=[] + ) + + clearmesh.click( + fn=lambda: None, + inputs=[], + outputs=[result_depthmesh] + ) + + submit.click( + fn=wrap_gradio_gpu_call(run_generate), + inputs=inp.enkey_body(), + outputs=[ + result_images, + fn_mesh, + result_depthmesh, + html_info_x, + html_info + ] + ) + + submit_vid.click( + fn=wrap_gradio_gpu_call(run_makevideo), + inputs=[ + fn_mesh, + vid_numframes, + vid_fps, + vid_traj, + vid_shift, + vid_border, + vid_dolly, + vid_format, + vid_ssaa + ], + outputs=[ + depth_vid, + vid_html_info_x, + vid_html_info + ] + ) + + return (depthmap_interface, "Depth", "depthmap_interface"), + + +# called from depth tab +def run_generate(*inputs): + inputs = GradioComponentBundle.enkey_to_dict(inputs) + depthmap_mode = inputs['depthmap_mode'] + depthmap_batch_input_dir = inputs['depthmap_batch_input_dir'] + image_batch = inputs['image_batch'] + depthmap_input_image = inputs['depthmap_input_image'] + depthmap_batch_output_dir = inputs['depthmap_batch_output_dir'] + depthmap_batch_reuse = inputs['depthmap_batch_reuse'] + custom_depthmap = inputs['custom_depthmap'] + custom_depthmap_img = inputs['custom_depthmap_img'] + + inputimages = [] + # Allow supplying custom depthmaps + inputdepthmaps = [] + # Also keep track of original file names + inputnames = [] + + if depthmap_mode == '2' and depthmap_batch_output_dir != '': + outpath = depthmap_batch_output_dir + else: + outpath = opts.outdir_samples or opts.outdir_extras_samples + + if depthmap_mode == '0': # Single image + inputimages.append(depthmap_input_image) + inputnames.append(None) + if custom_depthmap: + if custom_depthmap_img is None: + return [], None, None, "Custom depthmap is not specified. 
" \ + "Please either supply it or disable this option.", "" + inputdepthmaps.append(custom_depthmap_img) + else: + inputdepthmaps.append(None) + if depthmap_mode == '1': # Batch Process + # convert files to pillow images + for img in image_batch: + image = Image.open(os.path.abspath(img.name)) + inputimages.append(image) + inputnames.append(os.path.splitext(img.orig_name)[0]) + elif depthmap_mode == '2': # Batch from Directory + assert not shared.cmd_opts.hide_ui_dir_config, '--hide-ui-dir-config option must be disabled' + if depthmap_batch_input_dir == '': + return [], None, None, "Please select an input directory.", "" + if depthmap_batch_input_dir == depthmap_batch_output_dir: + return [], None, None, "Please pick different directories for batch processing.", "" + image_list = shared.listfiles(depthmap_batch_input_dir) + for path in image_list: + try: + inputimages.append(Image.open(path)) + inputnames.append(path) + + custom_depthmap = None + if depthmap_batch_reuse: + basename = Path(path).stem + # Custom names are not used in samples directory + if outpath != opts.outdir_extras_samples: + # Possible filenames that the custom depthmaps may have + name_candidates = [f'{basename}-0000.{opts.samples_format}', # current format + f'{basename}.png', # human-intuitive format + f'{Path(path).name}'] # human-intuitive format (worse) + for fn_cand in name_candidates: + path_cand = os.path.join(outpath, fn_cand) + if os.path.isfile(path_cand): + custom_depthmap = Image.open(os.path.abspath(path_cand)) + break + inputdepthmaps.append(custom_depthmap) + except Exception as e: + print(f'Failed to load {path}, ignoring. Exception: {str(e)}') + inputdepthmaps_n = len([1 for x in inputdepthmaps if x is not None]) + print(f'{len(inputimages)} images will be processed, {inputdepthmaps_n} existing depthmaps will be reused') + + save_images, mesh_fi, meshsimple_fi = run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inputs) + show_images = [] + + # Saving images + for input_i, imgs in enumerate(save_images): + basename = 'depthmap' + if depthmap_mode == '2' and inputnames[input_i] is not None and outpath != opts.outdir_extras_samples: + basename = Path(inputnames[input_i]).stem + + for image_type, image in list(imgs.items()): + show_images += [image] + if inputs["save_outputs"]: + try: + suffix = "" if image_type == "depth" else f"_{image_type}" + images.save_image(image, path=outpath, basename=basename, seed=None, + prompt=None, extension=opts.samples_format, short_filename=True, + no_prompt=True, grid=False, pnginfo_section_name="extras", + suffix=suffix) + except Exception as e: + if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): + raise e + print('Catched exception: image has wrong mode!') + traceback.print_exc() + + # use inpainted 3d mesh to show in 3d model output when enabled in settings + if hasattr(opts, 'depthmap_script_show_3d_inpaint') and opts.depthmap_script_show_3d_inpaint \ + and mesh_fi is not None and len(mesh_fi) > 0: + meshsimple_fi = mesh_fi + # however, don't show 3dmodel when disabled in settings + if hasattr(opts, 'depthmap_script_show_3d') and not opts.depthmap_script_show_3d: + meshsimple_fi = None + # TODO: return more info + return show_images, mesh_fi, meshsimple_fi, plaintext_to_html('info'), '' + + +script_callbacks.on_ui_settings(on_ui_settings) +script_callbacks.on_ui_tabs(on_ui_tabs) diff --git a/scripts/main.py b/scripts/main.py new file mode 100644 index 0000000..2360588 --- /dev/null +++ b/scripts/main.py @@ -0,0 +1,39 @@ +import subprocess 
+import os +import pathlib + +SCRIPT_NAME = "DepthMap" +SCRIPT_VERSION = "v0.3.13" + +commit_hash = None # TODO: understand why it would spam to stderr if changed to ... = get_commit_hash() +def get_commit_hash(): + global commit_hash + if commit_hash is None: + try: + commit_hash = subprocess.check_output( + [os.environ.get('GIT', "git"), "rev-parse", "HEAD"], + cwd=pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/'), + shell=False, + stderr=subprocess.DEVNULL, + encoding='utf8').strip()[0:8] + except Exception: + commit_hash = "" + return commit_hash + + +def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): + # Do not check the hash every time - it is somewhat time-consuming + if os.path.exists(filename): + return + + if type(url) is not list: + url = [url] + for cur_url in url: + try: + print("Downloading", cur_url, "to", filename) + torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) + if os.path.exists(filename): + return # The correct model was downloaded, no need to try more + except: + pass + raise RuntimeError('Download failed. Try again later or manually download the file to that location.') From 72517c459f64963fb4639d20068c2b9a64ad3b5e Mon Sep 17 00:00:00 2001 From: Semjon Kravtsenko Date: Mon, 10 Jul 2023 12:23:42 +0300 Subject: [PATCH 10/16] Moved code around Large refactor part, may be broken --- scripts/core.py | 748 +++++++++ .../{depthmap.py => depthmap_generation.py} | 1419 +++++------------ scripts/interface_webui.py | 36 +- scripts/main.py | 1 + scripts/stereoimage_generation.py | 17 +- 5 files changed, 1157 insertions(+), 1064 deletions(-) create mode 100644 scripts/core.py rename scripts/{depthmap.py => depthmap_generation.py} (53%) diff --git a/scripts/core.py b/scripts/core.py new file mode 100644 index 0000000..49a6c7d --- /dev/null +++ b/scripts/core.py @@ -0,0 +1,748 @@ +from pathlib import Path + +from PIL import Image + +from modules import shared, devices +from modules.images import get_next_sequence_number +from modules.shared import opts, cmd_opts + +try: + from tqdm import trange +except: + from builtins import range as trange + +import torch, gc +import cv2 +import os.path +import numpy as np +import copy +import platform +import math +import traceback + +# Our code +from scripts.main import * +from scripts.stereoimage_generation import create_stereoimages +from scripts.depthmap_generation import ModelHolder + +# 3d-photo-inpainting imports +from inpaint.mesh import write_mesh, read_mesh, output_3d_photo +from inpaint.networks import Inpaint_Color_Net, Inpaint_Depth_Net, Inpaint_Edge_Net +from inpaint.utils import path_planning +from inpaint.bilateral_filtering import sparse_bilateral_filtering + +global video_mesh_data, video_mesh_fn +video_mesh_data = None +video_mesh_fn = None + +model_holder = ModelHolder() + + +def convert_i16_to_rgb(image, like): + # three channel, 8 bits per channel image + output = np.zeros_like(like) + output[:, :, 0] = image / 256.0 + output[:, :, 1] = image / 256.0 + output[:, :, 2] = image / 256.0 + return output + + +def unload_sd_model(): + if shared.sd_model is not None: + shared.sd_model.cond_stage_model.to(devices.cpu) + shared.sd_model.first_stage_model.to(devices.cpu) + + +def reload_sd_model(): + if shared.sd_model is not None: + shared.sd_model.cond_stage_model.to(devices.device) + shared.sd_model.first_stage_model.to(devices.device) + + +def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp): + if len(inputimages) == 0 or 
inputimages[0] is None: + return [], '', '' + if len(inputdepthmaps) == 0: + inputdepthmaps: list[Image] = [None for _ in range(len(inputimages))] + inputdepthmaps_complete = all([x is not None for x in inputdepthmaps]) + + background_removal = inp["background_removal"] + background_removal_model = inp["background_removal_model"] + boost = inp["boost"] + clipdepth = inp["clipdepth"] + clipthreshold_far = inp["clipthreshold_far"] + clipthreshold_near = inp["clipthreshold_near"] + combine_output = inp["combine_output"] + combine_output_axis = inp["combine_output_axis"] + depthmap_compute_device = inp["compute_device"] + gen_mesh = inp["gen_mesh"] + gen_normal = inp["gen_normal"] if "gen_normal" in inp else False + gen_stereo = inp["gen_stereo"] + inpaint = inp["inpaint"] + inpaint_vids = inp["inpaint_vids"] + invert_depth = inp["invert_depth"] + match_size = inp["match_size"] + mesh_occlude = inp["mesh_occlude"] + mesh_spherical = inp["mesh_spherical"] + model_type = inp["model_type"] + net_height = inp["net_height"] + net_width = inp["net_width"] + pre_depth_background_removal = inp["pre_depth_background_removal"] + save_background_removal_masks = inp["save_background_removal_masks"] + output_depth = inp["output_depth"] + show_heat = inp["show_heat"] + stereo_balance = inp["stereo_balance"] + stereo_divergence = inp["stereo_divergence"] + stereo_fill = inp["stereo_fill"] + stereo_modes = inp["stereo_modes"] + stereo_separation = inp["stereo_separation"] + + # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure + print(f"\n{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})") + + unload_sd_model() + + # TODO: this still should not be here + background_removed_images = [] + # remove on base image before depth calculation + if background_removal: + if pre_depth_background_removal: + inputimages = batched_background_removal(inputimages, background_removal_model) + background_removed_images = inputimages + else: + background_removed_images = batched_background_removal(inputimages, background_removal_model) + + # init torch device + if depthmap_compute_device == 'GPU' and not torch.cuda.is_available(): + print('WARNING: Cuda device was not found, cpu will be used') + depthmap_compute_device = 'CPU' + if depthmap_compute_device == 'GPU': + device = torch.device("cuda") + else: + device = torch.device("cpu") + print("device: %s" % device) + + generated_images = [{} for _ in range(len(inputimages))] + """Images that will be returned. + Every array element corresponds to particular input image. + Dictionary keys are types of images that were derived from the input image.""" + # TODO: ??? + meshsimple_fi = None + inpaint_imgs = [] + inpaint_depths = [] + + try: + if not inputdepthmaps_complete: + print("Loading model(s) ..") + model_holder.ensure_models(model_type, device, boost) + model = model_holder.depth_model + pix2pix_model = model_holder.pix2pix_model + + print("Computing output(s) ..") + # iterate over input images + for count in trange(0, len(inputimages)): + # Convert single channel input (PIL) images to rgb + if inputimages[count].mode == 'I': + inputimages[count].point(lambda p: p * 0.0039063096, mode='RGB') + inputimages[count] = inputimages[count].convert('RGB') + + raw_prediction = None + """Raw prediction, as returned by a model. 
None if input depthmap is used.""" + raw_prediction_invert = False + """True if near=dark on raw_prediction""" + out = None + if inputdepthmaps is not None and inputdepthmaps[count] is not None: + # use custom depthmap + dimg = inputdepthmaps[count] + # resize if not same size as input + if dimg.width != inputimages[count].width or dimg.height != inputimages[count].height: + dimg = dimg.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS) + + if dimg.mode == 'I' or dimg.mode == 'P' or dimg.mode == 'L': + out = np.asarray(dimg, dtype="float") + else: + out = np.asarray(dimg, dtype="float")[:, :, 0] + else: + # override net size (size may be different for different images) + if match_size: + net_width, net_height = inputimages[count].width, inputimages[count].height + raw_prediction, raw_prediction_invert = \ + model_holder.get_raw_prediction(inputimages[count], net_width, net_height) + + # output + if abs(raw_prediction.max() - raw_prediction.min()) > np.finfo("float").eps: + out = np.copy(raw_prediction) + # TODO: some models may output negative values, maybe these should be clamped to zero. + if raw_prediction_invert: + out *= -1 + if clipdepth: + out = (out - out.min()) / (out.max() - out.min()) # normalize to [0; 1] + out = np.clip(out, clipthreshold_far, clipthreshold_near) + else: + # Regretfully, the depthmap is broken and will be replaced with a black image + out = np.zeros(raw_prediction.shape) + out = (out - out.min()) / (out.max() - out.min()) # normalize to [0; 1] + + # Single channel, 16 bit image. This loses some precision! + # uint16 conversion uses round-down, therefore values should be [0; 2**16) + numbytes = 2 + max_val = (2 ** (8 * numbytes)) + out = np.clip(out * max_val, 0, max_val - 0.1) # Clipping form above is needed to avoid overflowing + img_output = out.astype("uint16") + """Depthmap (near=bright), as uint16""" + + # if 3dinpainting, store maps for processing in second pass + if inpaint: + inpaint_imgs.append(inputimages[count]) + inpaint_depths.append(img_output) + + # applying background masks after depth + if background_removal: + print('applying background masks') + background_removed_image = background_removed_images[count] + # maybe a threshold cut would be better on the line below. + background_removed_array = np.array(background_removed_image) + bg_mask = (background_removed_array[:, :, 0] == 0) & (background_removed_array[:, :, 1] == 0) & ( + background_removed_array[:, :, 2] == 0) & (background_removed_array[:, :, 3] <= 0.2) + img_output[bg_mask] = 0 # far value + + generated_images[count]['background_removed'] = background_removed_image + + if save_background_removal_masks: + bg_array = (1 - bg_mask.astype('int8')) * 255 + mask_array = np.stack((bg_array, bg_array, bg_array, bg_array), axis=2) + mask_image = Image.fromarray(mask_array.astype(np.uint8)) + + generated_images[count]['foreground_mask'] = mask_image + + # A weird quirk: if user tries to save depthmap, whereas input depthmap is used, + # depthmap will be outputed, even if combine_output is used. 
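+            # Note: the nested `if output_depth:` below is redundant, since the outer
+            # condition already requires output_depth to be true.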
+ if output_depth and inputdepthmaps[count] is None: + if output_depth: + img_depth = cv2.bitwise_not(img_output) if invert_depth else img_output + if combine_output: + img_concat = Image.fromarray(np.concatenate( + (inputimages[count], convert_i16_to_rgb(img_depth, inputimages[count])), + axis=combine_output_axis)) + generated_images[count]['concat_depth'] = img_concat + else: + generated_images[count]['depth'] = Image.fromarray(img_depth) + + if show_heat: + from dzoedepth.utils.misc import colorize + heatmap = colorize(img_output, cmap='inferno') + generated_images[count]['heatmap'] = heatmap + + if gen_stereo: + print("Generating stereoscopic images..") + stereoimages = create_stereoimages(inputimages[count], img_output, stereo_divergence, stereo_separation, + stereo_modes, stereo_balance, stereo_fill) + for c in range(0, len(stereoimages)): + generated_images[count][stereo_modes[c]] = stereoimages[c] + + if gen_normal: # TODO: should be moved into a separate file when redesigned + # taken from @graemeniedermayer + # take gradients + zx = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 1, 0, ksize=3) # TODO: CV_64F ? + zy = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 0, 1, ksize=3) + + # combine and normalize gradients. + normal = np.dstack((zx, -zy, np.ones_like(img_output))) + n = np.linalg.norm(normal, axis=2) + normal[:, :, 0] /= n + normal[:, :, 1] /= n + normal[:, :, 2] /= n + + # offset and rescale values to be in 0-255 + normal += 1 + normal /= 2 + normal *= 255 + normal = normal.astype(np.uint8) + + generated_images[count]['normal'] = Image.fromarray(normal) + + # gen mesh + if gen_mesh: + print(f"\nGenerating (occluded) mesh ..") + basename = 'depthmap' + meshsimple_fi = get_uniquefn(outpath, basename, 'obj') + meshsimple_fi = os.path.join(outpath, meshsimple_fi + '_simple.obj') + + depthi = raw_prediction if raw_prediction is not None else out + depthi_min, depthi_max = depthi.min(), depthi.max() + # try to map output to sensible values for non zoedepth models, boost, or custom maps + if model_type not in [7, 8, 9] or boost or inputdepthmaps[count] is not None: + # invert if midas + if model_type > 0 or inputdepthmaps[count] is not None: # TODO: Weird + depthi = depthi_max - depthi + depthi_min + depth_max = depthi.max() + depth_min = depthi.min() + # make positive + if depthi_min < 0: + depthi = depthi - depthi_min + depth_max = depthi.max() + depth_min = depthi.min() + # scale down + if depthi.max() > 10.0: + depthi = 4.0 * (depthi - depthi_min) / (depthi_max - depthi_min) + # offset + depthi = depthi + 1.0 + + mesh = create_mesh(inputimages[count], depthi, keep_edges=not mesh_occlude, spherical=mesh_spherical) + mesh.export(meshsimple_fi) + + print("Computing output(s) done.") + except RuntimeError as e: + # TODO: display in UI + if 'out of memory' in str(e): + suggestion = "ERROR: out of memory, could not generate depthmap!\nPlease try a different model" + if device != torch.device("cpu"): + suggestion += ", or try using the CPU" + if boost: + suggestion += ", or disable BOOST" + print(f"{suggestion}.") + else: + raise e + finally: + if not (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): + if 'model' in locals(): + del model + if 'pix2pixmodel' in locals(): + del pix2pix_model + model_holder.unload_models() + else: + model_holder.swap_to_cpu_memory() + + gc.collect() + devices.torch_gc() + + # TODO: This should not be here + mesh_fi = None + if inpaint: + try: + mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, 
outpath, inpaint_vids, 1, "mp4") + except Exception as e: + print(f'{str(e)}, some issue with generating inpainted mesh') + + reload_sd_model() + print("All done.") + return generated_images, mesh_fi, meshsimple_fi + + +def get_uniquefn(outpath, basename, ext): + # Inefficient and may fail, maybe use unbounded binary search? + basecount = get_next_sequence_number(outpath, basename) + if basecount > 0: basecount = basecount - 1 + fullfn = None + for i in range(500): + fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" + fullfn = os.path.join(outpath, f"{fn}.{ext}") + if not os.path.exists(fullfn): + break + basename = Path(fullfn).stem + + return basename + + +def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, vid_ssaa, vid_format): + mesh_fi = '' + try: + print("Running 3D Photo Inpainting .. ") + edgemodel_path = './models/3dphoto/edge_model.pth' + depthmodel_path = './models/3dphoto/depth_model.pth' + colormodel_path = './models/3dphoto/color_model.pth' + # create paths to model if not present + os.makedirs('./models/3dphoto/', exist_ok=True) + + ensure_file_downloaded(edgemodel_path, + "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth") + ensure_file_downloaded(depthmodel_path, + "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth") + ensure_file_downloaded(colormodel_path, + "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth") + + print("Loading edge model ..") + depth_edge_model = Inpaint_Edge_Net(init_weights=True) + depth_edge_weight = torch.load(edgemodel_path, map_location=torch.device(device)) + depth_edge_model.load_state_dict(depth_edge_weight) + depth_edge_model = depth_edge_model.to(device) + depth_edge_model.eval() + print("Loading depth model ..") + depth_feat_model = Inpaint_Depth_Net() + depth_feat_weight = torch.load(depthmodel_path, map_location=torch.device(device)) + depth_feat_model.load_state_dict(depth_feat_weight, strict=True) + depth_feat_model = depth_feat_model.to(device) + depth_feat_model.eval() + depth_feat_model = depth_feat_model.to(device) + print("Loading rgb model ..") + rgb_model = Inpaint_Color_Net() + rgb_feat_weight = torch.load(colormodel_path, map_location=torch.device(device)) + rgb_model.load_state_dict(rgb_feat_weight) + rgb_model.eval() + rgb_model = rgb_model.to(device) + + config = {} + config["gpu_ids"] = 0 + config['extrapolation_thickness'] = 60 + config['extrapolate_border'] = True + config['depth_threshold'] = 0.04 + config['redundant_number'] = 12 + config['ext_edge_threshold'] = 0.002 + config['background_thickness'] = 70 + config['context_thickness'] = 140 + config['background_thickness_2'] = 70 + config['context_thickness_2'] = 70 + config['log_depth'] = True + config['depth_edge_dilate'] = 10 + config['depth_edge_dilate_2'] = 5 + config['largest_size'] = 512 + config['repeat_inpaint_edge'] = True + config['ply_fmt'] = "bin" + + config['save_ply'] = False + if hasattr(opts, 'depthmap_script_save_ply') and opts.depthmap_script_save_ply: + config['save_ply'] = True + + config['save_obj'] = True + + if device == torch.device("cpu"): + config["gpu_ids"] = -1 + + for count in trange(0, len(img_rgb)): + basename = 'depthmap' + if inputnames is not None: + if inputnames[count] is not None: + p = Path(inputnames[count]) + basename = p.stem + + basename = get_uniquefn(outpath, basename, 'obj') + mesh_fi = os.path.join(outpath, basename + '.obj') + + print(f"\nGenerating inpainted mesh .. 
(go make some coffee) ..") + + # from inpaint.utils.get_MiDaS_samples + W = img_rgb[count].width + H = img_rgb[count].height + int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32) + if int_mtx.max() > 1: + int_mtx[0, :] = int_mtx[0, :] / float(W) + int_mtx[1, :] = int_mtx[1, :] / float(H) + + # how inpaint.utils.read_MiDaS_depth() imports depthmap + disp = img_depth[count].astype(np.float32) + disp = disp - disp.min() + disp = cv2.blur(disp / disp.max(), ksize=(3, 3)) * disp.max() + disp = (disp / disp.max()) * 3.0 + depth = 1. / np.maximum(disp, 0.05) + + # rgb input + img = np.asarray(img_rgb[count]) + + # run sparse bilateral filter + config['sparse_iter'] = 5 + config['filter_size'] = [7, 7, 5, 5, 5] + config['sigma_s'] = 4.0 + config['sigma_r'] = 0.5 + vis_photos, vis_depths = sparse_bilateral_filtering(depth.copy(), img.copy(), config, + num_iter=config['sparse_iter'], spdb=False) + depth = vis_depths[-1] + + # bilat_fn = os.path.join(outpath, basename +'_bilatdepth.png') + # cv2.imwrite(bilat_fn, depth) + + rt_info = write_mesh(img, + depth, + int_mtx, + mesh_fi, + config, + rgb_model, + depth_edge_model, + depth_edge_model, + depth_feat_model) + + if rt_info is not False and inpaint_vids: + run_3dphoto_videos(mesh_fi, basename, outpath, 300, 40, + [0.03, 0.03, 0.05, 0.03], + ['double-straight-line', 'double-straight-line', 'circle', 'circle'], + [0.00, 0.00, -0.015, -0.015], + [0.00, 0.00, -0.015, -0.00], + [-0.05, -0.05, -0.05, -0.05], + ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, vid_format, vid_ssaa) + + devices.torch_gc() + + finally: + del rgb_model + rgb_model = None + del depth_edge_model + depth_edge_model = None + del depth_feat_model + depth_feat_model = None + devices.torch_gc() + + return mesh_fi + + +def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range, + y_shift_range, z_shift_range, video_postfix, vid_dolly, vid_format, vid_ssaa): + import vispy + if platform.system() == 'Windows': + vispy.use(app='PyQt5') + elif platform.system() == 'Darwin': + vispy.use('PyQt6') + else: + vispy.use(app='egl') + + # read ply + global video_mesh_data, video_mesh_fn + if video_mesh_fn is None or video_mesh_fn != mesh_fi: + del video_mesh_data + video_mesh_fn = mesh_fi + video_mesh_data = read_mesh(mesh_fi) + + verts, colors, faces, Height, Width, hFov, vFov, mean_loc_depth = video_mesh_data + + original_w = output_w = W = Width + original_h = output_h = H = Height + int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32) + if int_mtx.max() > 1: + int_mtx[0, :] = int_mtx[0, :] / float(W) + int_mtx[1, :] = int_mtx[1, :] / float(H) + + config = {} + config['video_folder'] = outpath + config['num_frames'] = num_frames + config['fps'] = fps + config['crop_border'] = crop_border + config['traj_types'] = traj_types + config['x_shift_range'] = x_shift_range + config['y_shift_range'] = y_shift_range + config['z_shift_range'] = z_shift_range + config['video_postfix'] = video_postfix + config['ssaa'] = vid_ssaa + + # from inpaint.utils.get_MiDaS_samples + generic_pose = np.eye(4) + assert len(config['traj_types']) == len(config['x_shift_range']) == \ + len(config['y_shift_range']) == len(config['z_shift_range']) == len(config['video_postfix']), \ + "The number of elements in 'traj_types', 'x_shift_range', 'y_shift_range', 'z_shift_range' and \ + 'video_postfix' should be equal." 
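+    # Build one list of camera poses per requested trajectory: path_planning() yields an
+    # (x, y, z) offset for every frame, and each pose is the 4x4 identity matrix with its
+    # translation column set to that offset.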
+ tgt_pose = [[generic_pose * 1]] + tgts_poses = [] + for traj_idx in range(len(config['traj_types'])): + tgt_poses = [] + sx, sy, sz = path_planning(config['num_frames'], config['x_shift_range'][traj_idx], + config['y_shift_range'][traj_idx], + config['z_shift_range'][traj_idx], path_type=config['traj_types'][traj_idx]) + for xx, yy, zz in zip(sx, sy, sz): + tgt_poses.append(generic_pose * 1.) + tgt_poses[-1][:3, -1] = np.array([xx, yy, zz]) + tgts_poses += [tgt_poses] + tgt_pose = generic_pose * 1 + + # seems we only need the depthmap to calc mean_loc_depth, which is only used when doing 'dolly' + # width and height are already in the ply file in the comments .. + # might try to add the mean_loc_depth to it too + # did just that + # mean_loc_depth = img_depth[img_depth.shape[0]//2, img_depth.shape[1]//2] + + print("Generating videos ..") + + normal_canvas, all_canvas = None, None + videos_poses, video_basename = copy.deepcopy(tgts_poses), basename + top = (original_h // 2 - int_mtx[1, 2] * output_h) + left = (original_w // 2 - int_mtx[0, 2] * output_w) + down, right = top + output_h, left + output_w + border = [int(xx) for xx in [top, down, left, right]] + normal_canvas, all_canvas, fn_saved = output_3d_photo(verts.copy(), colors.copy(), faces.copy(), + copy.deepcopy(Height), copy.deepcopy(Width), + copy.deepcopy(hFov), copy.deepcopy(vFov), + copy.deepcopy(tgt_pose), config['video_postfix'], + copy.deepcopy(generic_pose), + copy.deepcopy(config['video_folder']), + None, copy.deepcopy(int_mtx), config, None, + videos_poses, video_basename, original_h, original_w, + border=border, depth=None, normal_canvas=normal_canvas, + all_canvas=all_canvas, + mean_loc_depth=mean_loc_depth, dolly=vid_dolly, + fnExt=vid_format) + return fn_saved + + +# called from gen vid tab button +def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_border, dolly, vid_format, vid_ssaa): + if len(fn_mesh) == 0 or not os.path.exists(fn_mesh): + raise Exception("Could not open mesh.") + + vid_ssaa = int(vid_ssaa) + + # traj type + if vid_traj == 0: + vid_traj = ['straight-line'] + elif vid_traj == 1: + vid_traj = ['double-straight-line'] + elif vid_traj == 2: + vid_traj = ['circle'] + + num_fps = int(vid_fps) + num_frames = int(vid_numframes) + shifts = vid_shift.split(',') + if len(shifts) != 3: + raise Exception("Translate requires 3 elements.") + x_shift_range = [float(shifts[0])] + y_shift_range = [float(shifts[1])] + z_shift_range = [float(shifts[2])] + + borders = vid_border.split(',') + if len(borders) != 4: + raise Exception("Crop Border requires 4 elements.") + crop_border = [float(borders[0]), float(borders[1]), float(borders[2]), float(borders[3])] + + # output path and filename mess .. + basename = Path(fn_mesh).stem + outpath = opts.outdir_samples or opts.outdir_extras_samples + # unique filename + basecount = get_next_sequence_number(outpath, basename) + if basecount > 0: basecount = basecount - 1 + fullfn = None + for i in range(500): + fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" + fullfn = os.path.join(outpath, f"{fn}_." 
+ vid_format) + if not os.path.exists(fullfn): + break + basename = Path(fullfn).stem + basename = basename[:-1] + + print("Loading mesh ..") + + fn_saved = run_3dphoto_videos(fn_mesh, basename, outpath, num_frames, num_fps, crop_border, vid_traj, x_shift_range, + y_shift_range, z_shift_range, [''], dolly, vid_format, vid_ssaa) + + return fn_saved[-1], fn_saved[-1], '' + + +def unload_models(): + model_holder.unload_models() + + +# TODO: code borrowed from the internet to be marked as such and to reside in separate files + +def batched_background_removal(inimages, model_name): + from rembg import new_session, remove + print('creating background masks') + outimages = [] + + # model path and name + bg_model_dir = Path.joinpath(Path().resolve(), "models/rem_bg") + os.makedirs(bg_model_dir, exist_ok=True) + os.environ["U2NET_HOME"] = str(bg_model_dir) + + # starting a session + background_removal_session = new_session(model_name) + for count in range(0, len(inimages)): + bg_remove_img = np.array(remove(inimages[count], session=background_removal_session)) + outimages.append(Image.fromarray(bg_remove_img)) + # The line below might be redundant + del background_removal_session + return outimages + + +def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): + # Do not check the hash every time - it is somewhat time-consuming + if os.path.exists(filename): + return + + if type(url) is not list: + url = [url] + for cur_url in url: + try: + print("Downloading", cur_url, "to", filename) + torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) + if os.path.exists(filename): + return # The correct model was downloaded, no need to try more + except: + pass + raise RuntimeError('Download failed. Try again later or manually download the file to that location.') + + +def pano_depth_to_world_points(depth): + """ + 360 depth to world points + given 2D depth is an equirectangular projection of a spherical image + Treat depth as radius + longitude : -pi to pi + latitude : -pi/2 to pi/2 + """ + + # Convert depth to radius + radius = depth.flatten() + + lon = np.linspace(-np.pi, np.pi, depth.shape[1]) + lat = np.linspace(-np.pi / 2, np.pi / 2, depth.shape[0]) + + lon, lat = np.meshgrid(lon, lat) + lon = lon.flatten() + lat = lat.flatten() + + # Convert to cartesian coordinates + x = radius * np.cos(lat) * np.cos(lon) + y = radius * np.cos(lat) * np.sin(lon) + z = radius * np.sin(lat) + + pts3d = np.stack([x, y, z], axis=1) + + return pts3d + + +def depth_edges_mask(depth): + """Returns a mask of edges in the depth map. + Args: + depth: 2D numpy array of shape (H, W) with dtype float32. + Returns: + mask: 2D numpy array of shape (H, W) with dtype bool. + """ + # Compute the x and y gradients of the depth map. + depth_dx, depth_dy = np.gradient(depth) + # Compute the gradient magnitude. + depth_grad = np.sqrt(depth_dx ** 2 + depth_dy ** 2) + # Compute the edge mask. 
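+    # Pixels whose gradient magnitude exceeds the fixed 0.05 threshold are treated as
+    # depth discontinuities; create_mesh() uses the inverted mask to avoid triangulating
+    # across them unless keep_edges is set.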
+ mask = depth_grad > 0.05 + return mask + + +def create_mesh(image, depth, keep_edges=False, spherical=False): + import trimesh + from dzoedepth.utils.geometry import depth_to_points, create_triangles + maxsize = 1024 + if hasattr(opts, 'depthmap_script_mesh_maxsize'): + maxsize = opts.depthmap_script_mesh_maxsize + + # limit the size of the input image + image.thumbnail((maxsize, maxsize)) + + if not spherical: + pts3d = depth_to_points(depth[None]) + else: + pts3d = pano_depth_to_world_points(depth) + + pts3d = pts3d.reshape(-1, 3) + + verts = pts3d.reshape(-1, 3) + image = np.array(image) + if keep_edges: + triangles = create_triangles(image.shape[0], image.shape[1]) + else: + triangles = create_triangles(image.shape[0], image.shape[1], mask=~depth_edges_mask(depth)) + colors = image.reshape(-1, 3) + + mesh = trimesh.Trimesh(vertices=verts, faces=triangles, vertex_colors=colors) + + # rotate 90deg over X when spherical + if spherical: + angle = math.pi / 2 + direction = [1, 0, 0] + center = [0, 0, 0] + rot_matrix = trimesh.transformations.rotation_matrix(angle, direction, center) + mesh.apply_transform(rot_matrix) + + return mesh diff --git a/scripts/depthmap.py b/scripts/depthmap_generation.py similarity index 53% rename from scripts/depthmap.py rename to scripts/depthmap_generation.py index 4dd02fc..eb00eda 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap_generation.py @@ -1,56 +1,19 @@ -# Author: thygate -# https://github.com/thygate/stable-diffusion-webui-depthmap-script - from operator import getitem -from pathlib import Path from PIL import Image from torchvision.transforms import Compose, transforms from modules import shared, devices -from modules.images import get_next_sequence_number from modules.shared import opts, cmd_opts -try: - from tqdm import trange -except: - from builtins import range as trange - -import sys import torch, gc import cv2 import os.path import numpy as np import skimage.measure -import copy -import platform -import math -import traceback -import pathlib -import os - -# Not sure if this is needed -try: - script_dir = os.path.dirname(os.path.realpath(__file__)) - extension_dir = pathlib.Path(script_dir).parent - sys.path.append(extension_dir) -except: - sys.path.append('extensions/stable-diffusion-webui-depthmap-script') - -# Ugly workaround to fix gradio tempfile issue -def ensure_gradio_temp_directory(): - try: - import tempfile - path = os.path.join(tempfile.gettempdir(), 'gradio') - if not (os.path.exists(path)): - os.mkdir(path) - except Exception as e: - traceback.print_exc() -ensure_gradio_temp_directory() # Our code from scripts.main import * -from scripts.stereoimage_generation import create_stereoimages # midas imports from dmidas.dpt_depth import DPTDepthModel @@ -66,32 +29,19 @@ def ensure_gradio_temp_directory(): from pix2pix.options.test_options import TestOptions from pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel -# 3d-photo-inpainting imports -from inpaint.mesh import write_mesh, read_mesh, output_3d_photo -from inpaint.networks import Inpaint_Color_Net, Inpaint_Depth_Net, Inpaint_Edge_Net -from inpaint.utils import path_planning -from inpaint.bilateral_filtering import sparse_bilateral_filtering # zoedepth from dzoedepth.models.builder import build_model from dzoedepth.utils.config import get_config -from dzoedepth.utils.misc import colorize -from dzoedepth.utils.geometry import depth_to_points, create_triangles -# TODO: next two should not be here -whole_size_threshold = 1600 # R_max from the paper -pix2pixsize = 
1024 - -global video_mesh_data, video_mesh_fn -video_mesh_data = None -video_mesh_fn = None +global device class ModelHolder(): def __init__(self): self.depth_model = None self.pix2pix_model = None self.depth_model_type = None - self.device = None + self.device = None # Target device, the model may be swapped from VRAM into RAM. # Extra stuff self.resize_mode = None @@ -137,7 +87,7 @@ def load_models(self, model_type, device: torch.device, boost: bool): "https://huggingface.co/lllyasviel/Annotators/resolve/5bc80eec2b4fddbb/res101.pth", ], "1d696b2ef3e8336b057d0c15bc82d2fecef821bfebe5ef9d7671a5ec5dde520b") - if device == torch.device('gpu'): + if device != torch.device('cpu'): checkpoint = torch.load(model_path) else: checkpoint = torch.load(model_path, map_location=torch.device('cpu')) @@ -243,7 +193,7 @@ def load_models(self, model_type, device: torch.device, boost: bool): # optimize if device == torch.device("cuda") and model_type in [0, 1, 2, 3, 4, 5, 6]: model = model.to(memory_format=torch.channels_last) - if not cmd_opts.no_half and model_type != 0 and not boost: + if not cmd_opts.no_half and model_type != 0 and not boost: # TODO: zoedepth, too? model = model.half() model.to(device) # to correct device @@ -252,8 +202,9 @@ def load_models(self, model_type, device: torch.device, boost: bool): self.resize_mode = resize_mode self.normalization = normalization - # load merge network if boost enabled or keepmodels enabled - if boost or (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): + self.device = device + + if boost: # sfu.ca unfortunately is not very reliable, we use a mirror just in case ensure_file_downloaded( './models/pix2pix/latest_net_G.pth', @@ -263,12 +214,10 @@ def load_models(self, model_type, device: torch.device, boost: bool): opt = TestOptions().parse() if device == torch.device('cpu'): opt.gpu_ids = [] - pix2pix_model = Pix2Pix4DepthModel(opt) - pix2pix_model.save_dir = './models/pix2pix' - pix2pix_model.load_networks('latest') - pix2pix_model.eval() - model.to(device) - self.pix2pix_model = pix2pix_model + self.pix2pix_model = Pix2Pix4DepthModel(opt) + self.pix2pix_model.save_dir = './models/pix2pix' + self.pix2pix_model.load_networks('latest') + self.pix2pix_model.eval() devices.torch_gc() @@ -302,15 +251,17 @@ def unload_models(self): devices.torch_gc() self.depth_model_type = None - self.deviceidx = None + self.device = None def get_raw_prediction(self, input, net_width, net_height): """Get prediction from the model currently loaded by the class. 
If boost is enabled, net_width and net_height will be ignored.""" + global device + device = self.device # input image img = cv2.cvtColor(np.asarray(input), cv2.COLOR_BGR2RGB) / 255.0 # compute depthmap - if not self.pix2pix_model != None: + if self.pix2pix_model is None: if self.depth_model_type == 0: raw_prediction = estimateleres(img, self.depth_model, net_width, net_height) raw_prediction_invert = True @@ -319,7 +270,7 @@ def get_raw_prediction(self, input, net_width, net_height): raw_prediction_invert = True else: raw_prediction = estimatemidas(img, self.depth_model, net_width, net_height, - model_holder.resize_mode, model_holder.normalization) + self.resize_mode, self.normalization) raw_prediction_invert = False else: raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model) @@ -327,638 +278,20 @@ def get_raw_prediction(self, input, net_width, net_height): return raw_prediction, raw_prediction_invert -model_holder = ModelHolder() - -def convert_i16_to_rgb(image, like): - # three channel, 8 bits per channel image - output = np.zeros_like(like) - output[:, :, 0] = image / 256.0 - output[:, :, 1] = image / 256.0 - output[:, :, 2] = image / 256.0 - return output - - -def unload_sd_model(): - if shared.sd_model is not None: - shared.sd_model.cond_stage_model.to(devices.cpu) - shared.sd_model.first_stage_model.to(devices.cpu) - - -def reload_sd_model(): - if shared.sd_model is not None: - shared.sd_model.cond_stage_model.to(devices.device) - shared.sd_model.first_stage_model.to(devices.device) - - -def run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inp): - if len(inputimages) == 0 or inputimages[0] is None: - return [], '', '' - if len(inputdepthmaps) == 0: - inputdepthmaps: list[Image] = [None for _ in range(len(inputimages))] - inputdepthmaps_complete = all([x is not None for x in inputdepthmaps]) - - background_removal = inp["background_removal"] - background_removal_model = inp["background_removal_model"] - boost = inp["boost"] - clipdepth = inp["clipdepth"] - clipthreshold_far = inp["clipthreshold_far"] - clipthreshold_near = inp["clipthreshold_near"] - combine_output = inp["combine_output"] - combine_output_axis = inp["combine_output_axis"] - depthmap_compute_device = inp["compute_device"] - gen_mesh = inp["gen_mesh"] - gen_normal = inp["gen_normal"] if "gen_normal" in inp else False - gen_stereo = inp["gen_stereo"] - inpaint = inp["inpaint"] - inpaint_vids = inp["inpaint_vids"] - invert_depth = inp["invert_depth"] - match_size = inp["match_size"] - mesh_occlude = inp["mesh_occlude"] - mesh_spherical = inp["mesh_spherical"] - model_type = inp["model_type"] - net_height = inp["net_height"] - net_width = inp["net_width"] - pre_depth_background_removal = inp["pre_depth_background_removal"] - save_background_removal_masks = inp["save_background_removal_masks"] - output_depth = inp["output_depth"] - show_heat = inp["show_heat"] - stereo_balance = inp["stereo_balance"] - stereo_divergence = inp["stereo_divergence"] - stereo_fill = inp["stereo_fill"] - stereo_modes = inp["stereo_modes"] - stereo_separation = inp["stereo_separation"] - - # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure - print(f"\n{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})") - - unload_sd_model() - - # TODO: this still should not be here - background_removed_images = [] - # remove on base image before depth calculation - if background_removal: - if pre_depth_background_removal: - inputimages = 
batched_background_removal(inputimages, background_removal_model) - background_removed_images = inputimages - else: - background_removed_images = batched_background_removal(inputimages, background_removal_model) - - # init torch device - global device - if depthmap_compute_device == 'GPU' and not torch.cuda.is_available(): - print('WARNING: Cuda device was not found, cpu will be used') - depthmap_compute_device = 'CPU' - if depthmap_compute_device == 'GPU': - device = torch.device("cuda") - else: - device = torch.device("cpu") - print("device: %s" % device) - - generated_images = [{} for _ in range(len(inputimages))] - """Images that will be returned. - Every array element corresponds to particular input image. - Dictionary keys are types of images that were derived from the input image.""" - # TODO: ??? - meshsimple_fi = None - inpaint_imgs = [] - inpaint_depths = [] - - try: - if not inputdepthmaps_complete: - print("Loading model(s) ..") - model_holder.ensure_models(model_type, device, boost) - model = model_holder.depth_model - pix2pix_model = model_holder.pix2pix_model - - print("Computing output(s) ..") - # iterate over input images - for count in trange(0, len(inputimages)): - # override net size (size may be different for different images) - if match_size: - net_width, net_height = inputimages[count].width, inputimages[count].height - - # Convert single channel input (PIL) images to rgb - if inputimages[count].mode == 'I': - inputimages[count].point(lambda p: p * 0.0039063096, mode='RGB') - inputimages[count] = inputimages[count].convert('RGB') - - raw_prediction = None - """Raw prediction, as returned by a model. None if input depthmap is used.""" - raw_prediction_invert = False - """True if near=dark on raw_prediction""" - out = None - if inputdepthmaps is not None and inputdepthmaps[count] is not None: - # use custom depthmap - dimg = inputdepthmaps[count] - # resize if not same size as input - if dimg.width != inputimages[count].width or dimg.height != inputimages[count].height: - dimg = dimg.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS) - - if dimg.mode == 'I' or dimg.mode == 'P' or dimg.mode == 'L': - out = np.asarray(dimg, dtype="float") - else: - out = np.asarray(dimg, dtype="float")[:, :, 0] - else: - raw_prediction, raw_prediction_invert = \ - model_holder.get_raw_prediction(inputimages[count], net_width, net_height) - - # output - if abs(raw_prediction.max() - raw_prediction.min()) > np.finfo("float").eps: - out = np.copy(raw_prediction) - # TODO: some models may output negative values, maybe these should be clamped to zero. - if raw_prediction_invert: - out *= -1 - if clipdepth: - out = (out - out.min()) / (out.max() - out.min()) # normalize to [0; 1] - out = np.clip(out, clipthreshold_far, clipthreshold_near) - else: - # Regretfully, the depthmap is broken and will be replaced with a black image - out = np.zeros(raw_prediction.shape) - out = (out - out.min()) / (out.max() - out.min()) # normalize to [0; 1] - - # Single channel, 16 bit image. This loses some precision! 
- # uint16 conversion uses round-down, therefore values should be [0; 2**16) - numbytes = 2 - max_val = (2 ** (8 * numbytes)) - out = np.clip(out * max_val, 0, max_val - 0.1) # Clipping form above is needed to avoid overflowing - img_output = out.astype("uint16") - """Depthmap (near=bright), as uint16""" - - # if 3dinpainting, store maps for processing in second pass - if inpaint: - inpaint_imgs.append(inputimages[count]) - inpaint_depths.append(img_output) - - # applying background masks after depth - if background_removal: - print('applying background masks') - background_removed_image = background_removed_images[count] - # maybe a threshold cut would be better on the line below. - background_removed_array = np.array(background_removed_image) - bg_mask = (background_removed_array[:, :, 0] == 0) & (background_removed_array[:, :, 1] == 0) & ( - background_removed_array[:, :, 2] == 0) & (background_removed_array[:, :, 3] <= 0.2) - img_output[bg_mask] = 0 # far value - - generated_images[count]['background_removed'] = background_removed_image - - if save_background_removal_masks: - bg_array = (1 - bg_mask.astype('int8')) * 255 - mask_array = np.stack((bg_array, bg_array, bg_array, bg_array), axis=2) - mask_image = Image.fromarray(mask_array.astype(np.uint8)) - - generated_images[count]['foreground_mask'] = mask_image - - # A weird quirk: if user tries to save depthmap, whereas input depthmap is used, - # depthmap will be outputed, even if combine_output is used. - if output_depth and inputdepthmaps[count] is None: - if output_depth: - img_depth = cv2.bitwise_not(img_output) if invert_depth else img_output - if combine_output: - img_concat = Image.fromarray(np.concatenate( - (inputimages[count], convert_i16_to_rgb(img_depth, inputimages[count])), - axis=combine_output_axis)) - generated_images[count]['concat_depth'] = img_concat - else: - generated_images[count]['depth'] = Image.fromarray(img_depth) - - if show_heat: - heatmap = colorize(img_output, cmap='inferno') - generated_images[count]['heatmap'] = heatmap - - if gen_stereo: - print("Generating stereoscopic images..") - stereoimages = create_stereoimages(inputimages[count], img_output, stereo_divergence, stereo_separation, - stereo_modes, stereo_balance, stereo_fill) - for c in range(0, len(stereoimages)): - generated_images[count][stereo_modes[c]] = stereoimages[c] - - if gen_normal: # TODO: should be moved into a separate file when redesigned - # taken from @graemeniedermayer - # take gradients - zx = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 1, 0, ksize=3) # TODO: CV_64F ? - zy = cv2.Sobel(np.float64(img_output), cv2.CV_64F, 0, 1, ksize=3) - - # combine and normalize gradients. 
- normal = np.dstack((zx, -zy, np.ones_like(img_output))) - n = np.linalg.norm(normal, axis=2) - normal[:, :, 0] /= n - normal[:, :, 1] /= n - normal[:, :, 2] /= n - - # offset and rescale values to be in 0-255 - normal += 1 - normal /= 2 - normal *= 255 - normal = normal.astype(np.uint8) - - generated_images[count]['normal'] = Image.fromarray(normal) - - # gen mesh - if gen_mesh: - print(f"\nGenerating (occluded) mesh ..") - basename = 'depthmap' - meshsimple_fi = get_uniquefn(outpath, basename, 'obj') - meshsimple_fi = os.path.join(outpath, meshsimple_fi + '_simple.obj') - - depthi = raw_prediction if raw_prediction is not None else out - depthi_min, depthi_max = depthi.min(), depthi.max() - # try to map output to sensible values for non zoedepth models, boost, or custom maps - if model_type not in [7, 8, 9] or boost or inputdepthmaps[count] is not None: - # invert if midas - if model_type > 0 or inputdepthmaps[count] is not None: # TODO: Weird - depthi = depthi_max - depthi + depthi_min - depth_max = depthi.max() - depth_min = depthi.min() - # make positive - if depthi_min < 0: - depthi = depthi - depthi_min - depth_max = depthi.max() - depth_min = depthi.min() - # scale down - if depthi.max() > 10.0: - depthi = 4.0 * (depthi - depthi_min) / (depthi_max - depthi_min) - # offset - depthi = depthi + 1.0 - - mesh = create_mesh(inputimages[count], depthi, keep_edges=not mesh_occlude, spherical=mesh_spherical) - mesh.export(meshsimple_fi) - - print("Computing output(s) done.") - except RuntimeError as e: - # TODO: display in UI - if 'out of memory' in str(e): - print("ERROR: out of memory, could not generate depthmap !") - else: - print(e) - finally: - if not (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): - if 'model' in locals(): - del model - if boost and 'pix2pixmodel' in locals(): - del pix2pix_model - model_holder.unload_models() - else: - model_holder.swap_to_cpu_memory() - - gc.collect() - devices.torch_gc() - - # TODO: This should not be here - mesh_fi = None - if inpaint: - try: - mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath, inpaint_vids, 1, "mp4") - except Exception as e: - print(f'{str(e)}, some issue with generating inpainted mesh') - - reload_sd_model() - print("All done.") - return generated_images, mesh_fi, meshsimple_fi - - -def get_uniquefn(outpath, basename, ext): - # Inefficient and may fail, maybe use unbounded binary search? - basecount = get_next_sequence_number(outpath, basename) - if basecount > 0: basecount = basecount - 1 - fullfn = None - for i in range(500): - fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" - fullfn = os.path.join(outpath, f"{fn}.{ext}") - if not os.path.exists(fullfn): - break - basename = Path(fullfn).stem - - return basename - - -def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, inpaint_vids, vid_ssaa, vid_format): - mesh_fi = '' - try: - print("Running 3D Photo Inpainting .. 
") - edgemodel_path = './models/3dphoto/edge_model.pth' - depthmodel_path = './models/3dphoto/depth_model.pth' - colormodel_path = './models/3dphoto/color_model.pth' - # create paths to model if not present - os.makedirs('./models/3dphoto/', exist_ok=True) - - ensure_file_downloaded(edgemodel_path, - "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth") - ensure_file_downloaded(depthmodel_path, - "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth") - ensure_file_downloaded(colormodel_path, - "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth") - - print("Loading edge model ..") - depth_edge_model = Inpaint_Edge_Net(init_weights=True) - depth_edge_weight = torch.load(edgemodel_path, map_location=torch.device(device)) - depth_edge_model.load_state_dict(depth_edge_weight) - depth_edge_model = depth_edge_model.to(device) - depth_edge_model.eval() - print("Loading depth model ..") - depth_feat_model = Inpaint_Depth_Net() - depth_feat_weight = torch.load(depthmodel_path, map_location=torch.device(device)) - depth_feat_model.load_state_dict(depth_feat_weight, strict=True) - depth_feat_model = depth_feat_model.to(device) - depth_feat_model.eval() - depth_feat_model = depth_feat_model.to(device) - print("Loading rgb model ..") - rgb_model = Inpaint_Color_Net() - rgb_feat_weight = torch.load(colormodel_path, map_location=torch.device(device)) - rgb_model.load_state_dict(rgb_feat_weight) - rgb_model.eval() - rgb_model = rgb_model.to(device) - - config = {} - config["gpu_ids"] = 0 - config['extrapolation_thickness'] = 60 - config['extrapolate_border'] = True - config['depth_threshold'] = 0.04 - config['redundant_number'] = 12 - config['ext_edge_threshold'] = 0.002 - config['background_thickness'] = 70 - config['context_thickness'] = 140 - config['background_thickness_2'] = 70 - config['context_thickness_2'] = 70 - config['log_depth'] = True - config['depth_edge_dilate'] = 10 - config['depth_edge_dilate_2'] = 5 - config['largest_size'] = 512 - config['repeat_inpaint_edge'] = True - config['ply_fmt'] = "bin" - - config['save_ply'] = False - if hasattr(opts, 'depthmap_script_save_ply') and opts.depthmap_script_save_ply: - config['save_ply'] = True - - config['save_obj'] = True - - if device == torch.device("cpu"): - config["gpu_ids"] = -1 - - for count in trange(0, len(img_rgb)): - basename = 'depthmap' - if inputnames is not None: - if inputnames[count] is not None: - p = Path(inputnames[count]) - basename = p.stem - - basename = get_uniquefn(outpath, basename, 'obj') - mesh_fi = os.path.join(outpath, basename + '.obj') - - print(f"\nGenerating inpainted mesh .. (go make some coffee) ..") - - # from inpaint.utils.get_MiDaS_samples - W = img_rgb[count].width - H = img_rgb[count].height - int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32) - if int_mtx.max() > 1: - int_mtx[0, :] = int_mtx[0, :] / float(W) - int_mtx[1, :] = int_mtx[1, :] / float(H) - - # how inpaint.utils.read_MiDaS_depth() imports depthmap - disp = img_depth[count].astype(np.float32) - disp = disp - disp.min() - disp = cv2.blur(disp / disp.max(), ksize=(3, 3)) * disp.max() - disp = (disp / disp.max()) * 3.0 - depth = 1. 
/ np.maximum(disp, 0.05) - - # rgb input - img = np.asarray(img_rgb[count]) - - # run sparse bilateral filter - config['sparse_iter'] = 5 - config['filter_size'] = [7, 7, 5, 5, 5] - config['sigma_s'] = 4.0 - config['sigma_r'] = 0.5 - vis_photos, vis_depths = sparse_bilateral_filtering(depth.copy(), img.copy(), config, - num_iter=config['sparse_iter'], spdb=False) - depth = vis_depths[-1] - - # bilat_fn = os.path.join(outpath, basename +'_bilatdepth.png') - # cv2.imwrite(bilat_fn, depth) - - rt_info = write_mesh(img, - depth, - int_mtx, - mesh_fi, - config, - rgb_model, - depth_edge_model, - depth_edge_model, - depth_feat_model) - - if rt_info is not False and inpaint_vids: - run_3dphoto_videos(mesh_fi, basename, outpath, 300, 40, - [0.03, 0.03, 0.05, 0.03], - ['double-straight-line', 'double-straight-line', 'circle', 'circle'], - [0.00, 0.00, -0.015, -0.015], - [0.00, 0.00, -0.015, -0.00], - [-0.05, -0.05, -0.05, -0.05], - ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, vid_format, vid_ssaa) - - devices.torch_gc() - - finally: - del rgb_model - rgb_model = None - del depth_edge_model - depth_edge_model = None - del depth_feat_model - depth_feat_model = None - devices.torch_gc() - - return mesh_fi - - -def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range, - y_shift_range, z_shift_range, video_postfix, vid_dolly, vid_format, vid_ssaa): - import vispy - if platform.system() == 'Windows': - vispy.use(app='PyQt5') - elif platform.system() == 'Darwin': - vispy.use('PyQt6') - else: - vispy.use(app='egl') - - # read ply - global video_mesh_data, video_mesh_fn - if video_mesh_fn is None or video_mesh_fn != mesh_fi: - del video_mesh_data - video_mesh_fn = mesh_fi - video_mesh_data = read_mesh(mesh_fi) - - verts, colors, faces, Height, Width, hFov, vFov, mean_loc_depth = video_mesh_data - - original_w = output_w = W = Width - original_h = output_h = H = Height - int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32) - if int_mtx.max() > 1: - int_mtx[0, :] = int_mtx[0, :] / float(W) - int_mtx[1, :] = int_mtx[1, :] / float(H) - - config = {} - config['video_folder'] = outpath - config['num_frames'] = num_frames - config['fps'] = fps - config['crop_border'] = crop_border - config['traj_types'] = traj_types - config['x_shift_range'] = x_shift_range - config['y_shift_range'] = y_shift_range - config['z_shift_range'] = z_shift_range - config['video_postfix'] = video_postfix - config['ssaa'] = vid_ssaa - - # from inpaint.utils.get_MiDaS_samples - generic_pose = np.eye(4) - assert len(config['traj_types']) == len(config['x_shift_range']) == \ - len(config['y_shift_range']) == len(config['z_shift_range']) == len(config['video_postfix']), \ - "The number of elements in 'traj_types', 'x_shift_range', 'y_shift_range', 'z_shift_range' and \ - 'video_postfix' should be equal." - tgt_pose = [[generic_pose * 1]] - tgts_poses = [] - for traj_idx in range(len(config['traj_types'])): - tgt_poses = [] - sx, sy, sz = path_planning(config['num_frames'], config['x_shift_range'][traj_idx], - config['y_shift_range'][traj_idx], - config['z_shift_range'][traj_idx], path_type=config['traj_types'][traj_idx]) - for xx, yy, zz in zip(sx, sy, sz): - tgt_poses.append(generic_pose * 1.) 
- tgt_poses[-1][:3, -1] = np.array([xx, yy, zz]) - tgts_poses += [tgt_poses] - tgt_pose = generic_pose * 1 - - # seems we only need the depthmap to calc mean_loc_depth, which is only used when doing 'dolly' - # width and height are already in the ply file in the comments .. - # might try to add the mean_loc_depth to it too - # did just that - # mean_loc_depth = img_depth[img_depth.shape[0]//2, img_depth.shape[1]//2] - - print("Generating videos ..") - - normal_canvas, all_canvas = None, None - videos_poses, video_basename = copy.deepcopy(tgts_poses), basename - top = (original_h // 2 - int_mtx[1, 2] * output_h) - left = (original_w // 2 - int_mtx[0, 2] * output_w) - down, right = top + output_h, left + output_w - border = [int(xx) for xx in [top, down, left, right]] - normal_canvas, all_canvas, fn_saved = output_3d_photo(verts.copy(), colors.copy(), faces.copy(), - copy.deepcopy(Height), copy.deepcopy(Width), - copy.deepcopy(hFov), copy.deepcopy(vFov), - copy.deepcopy(tgt_pose), config['video_postfix'], - copy.deepcopy(generic_pose), - copy.deepcopy(config['video_folder']), - None, copy.deepcopy(int_mtx), config, None, - videos_poses, video_basename, original_h, original_w, - border=border, depth=None, normal_canvas=normal_canvas, - all_canvas=all_canvas, - mean_loc_depth=mean_loc_depth, dolly=vid_dolly, fnExt=vid_format) - return fn_saved - - -# called from gen vid tab button -def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_border, dolly, vid_format, vid_ssaa): - if len(fn_mesh) == 0 or not os.path.exists(fn_mesh): - raise Exception("Could not open mesh.") - - vid_ssaa = int(vid_ssaa) - - # traj type - if vid_traj == 0: - vid_traj = ['straight-line'] - elif vid_traj == 1: - vid_traj = ['double-straight-line'] - elif vid_traj == 2: - vid_traj = ['circle'] - - num_fps = int(vid_fps) - num_frames = int(vid_numframes) - shifts = vid_shift.split(',') - if len(shifts) != 3: - raise Exception("Translate requires 3 elements.") - x_shift_range = [float(shifts[0])] - y_shift_range = [float(shifts[1])] - z_shift_range = [float(shifts[2])] - - borders = vid_border.split(',') - if len(borders) != 4: - raise Exception("Crop Border requires 4 elements.") - crop_border = [float(borders[0]), float(borders[1]), float(borders[2]), float(borders[3])] - - # output path and filename mess .. - basename = Path(fn_mesh).stem - outpath = opts.outdir_samples or opts.outdir_extras_samples - # unique filename - basecount = get_next_sequence_number(outpath, basename) - if basecount > 0: basecount = basecount - 1 - fullfn = None - for i in range(500): - fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}" - fullfn = os.path.join(outpath, f"{fn}_." 
+ vid_format) - if not os.path.exists(fullfn): - break - basename = Path(fullfn).stem - basename = basename[:-1] - - print("Loading mesh ..") - - fn_saved = run_3dphoto_videos(fn_mesh, basename, outpath, num_frames, num_fps, crop_border, vid_traj, x_shift_range, - y_shift_range, z_shift_range, [''], dolly, vid_format, vid_ssaa) - - return fn_saved[-1], fn_saved[-1], '' - - -def unload_models(): - model_holder.unload_models() - - -# TODO: code borrowed from the internet to be marked as such and to reside in separate files - -def batched_background_removal(inimages, model_name): - from rembg import new_session, remove - print('creating background masks') - outimages = [] - - # model path and name - bg_model_dir = Path.joinpath(Path().resolve(), "models/rem_bg") - os.makedirs(bg_model_dir, exist_ok=True) - os.environ["U2NET_HOME"] = str(bg_model_dir) - - # starting a session - background_removal_session = new_session(model_name) - for count in range(0, len(inimages)): - bg_remove_img = np.array(remove(inimages[count], session=background_removal_session)) - outimages.append(Image.fromarray(bg_remove_img)) - # The line below might be redundant - del background_removal_session - return outimages - - -def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): - # Do not check the hash every time - it is somewhat time-consuming - if os.path.exists(filename): - return - - if type(url) is not list: - url = [url] - for cur_url in url: - try: - print("Downloading", cur_url, "to", filename) - torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) - if os.path.exists(filename): - return # The correct model was downloaded, no need to try more - except: - pass - raise RuntimeError('Download failed. Try again later or manually download the file to that location.') +def estimateleres(img, model, w, h): + # leres transform input + rgb_c = img[:, :, ::-1].copy() + A_resize = cv2.resize(rgb_c, (w, h)) + img_torch = scale_torch(A_resize)[None, :, :, :] + # compute + with torch.no_grad(): + if device == torch.device("cuda"): + img_torch = img_torch.cuda() + prediction = model.depth_model(img_torch) -def estimatezoedepth(img, model, w, h): - # x = transforms.ToTensor()(img).unsqueeze(0) - # x = x.type(torch.float32) - # x.to(device) - # prediction = model.infer(x) - model.core.prep.resizer._Resize__width = w - model.core.prep.resizer._Resize__height = h - prediction = model.infer_pil(img) + prediction = prediction.squeeze().cpu().numpy() + prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) return prediction @@ -982,20 +315,14 @@ def scale_torch(img): return img -def estimateleres(img, model, w, h): - # leres transform input - rgb_c = img[:, :, ::-1].copy() - A_resize = cv2.resize(rgb_c, (w, h)) - img_torch = scale_torch(A_resize)[None, :, :, :] - - # compute - with torch.no_grad(): - if device == torch.device("cuda"): - img_torch = img_torch.cuda() - prediction = model.depth_model(img_torch) - - prediction = prediction.squeeze().cpu().numpy() - prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) +def estimatezoedepth(img, model, w, h): + # x = transforms.ToTensor()(img).unsqueeze(0) + # x = x.type(torch.float32) + # x.to(device) + # prediction = model.infer(x) + model.core.prep.resizer._Resize__width = w + model.core.prep.resizer._Resize__height = h + prediction = model.infer_pil(img) return prediction @@ -1047,287 +374,115 @@ def estimatemidas(img, model, w, h, resize_mode, normalization): return 
prediction -def estimatemidasBoost(img, model, w, h): - # init transform - transform = Compose( - [ - Resize( - w, - h, - resize_target=None, - keep_aspect_ratio=True, - ensure_multiple_of=32, - resize_method="upper_bound", - image_interpolation_method=cv2.INTER_CUBIC, - ), - NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - PrepareForNet(), - ] - ) - - # transform input - img_input = transform({"image": img})["image"] +class ImageandPatchs: + def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1): + self.root_dir = root_dir + self.patchsinfo = patchsinfo + self.name = name + self.patchs = patchsinfo + self.scale = scale - # compute - with torch.no_grad(): - sample = torch.from_numpy(img_input).to(device).unsqueeze(0) - if device == torch.device("cuda"): - sample = sample.to(memory_format=torch.channels_last) - prediction = model.forward(sample) + self.rgb_image = cv2.resize(rgb_image, (round(rgb_image.shape[1] * scale), round(rgb_image.shape[0] * scale)), + interpolation=cv2.INTER_CUBIC) - prediction = prediction.squeeze().cpu().numpy() - prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) + self.do_have_estimate = False + self.estimation_updated_image = None + self.estimation_base_image = None - # normalization - depth_min = prediction.min() - depth_max = prediction.max() + def __len__(self): + return len(self.patchs) - if depth_max - depth_min > np.finfo("float").eps: - prediction = (prediction - depth_min) / (depth_max - depth_min) - else: - prediction = 0 + def set_base_estimate(self, est): + self.estimation_base_image = est + if self.estimation_updated_image is not None: + self.do_have_estimate = True - return prediction + def set_updated_estimate(self, est): + self.estimation_updated_image = est + if self.estimation_base_image is not None: + self.do_have_estimate = True + def __getitem__(self, index): + patch_id = int(self.patchs[index][0]) + rect = np.array(self.patchs[index][1]['rect']) + msize = self.patchs[index][1]['size'] -def generatemask(size): - # Generates a Guassian mask - mask = np.zeros(size, dtype=np.float32) - sigma = int(size[0] / 16) - k_size = int(2 * np.ceil(2 * int(size[0] / 16)) + 1) - mask[int(0.15 * size[0]):size[0] - int(0.15 * size[0]), int(0.15 * size[1]): size[1] - int(0.15 * size[1])] = 1 - mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) - mask = (mask - mask.min()) / (mask.max() - mask.min()) - mask = mask.astype(np.float32) - return mask + ## applying scale to rect: + rect = np.round(rect * self.scale) + rect = rect.astype('int') + msize = round(msize * self.scale) + patch_rgb = impatch(self.rgb_image, rect) + if self.do_have_estimate: + patch_whole_estimate_base = impatch(self.estimation_base_image, rect) + patch_whole_estimate_updated = impatch(self.estimation_updated_image, rect) + return {'patch_rgb': patch_rgb, 'patch_whole_estimate_base': patch_whole_estimate_base, + 'patch_whole_estimate_updated': patch_whole_estimate_updated, 'rect': rect, + 'size': msize, 'id': patch_id} + else: + return {'patch_rgb': patch_rgb, 'rect': rect, 'size': msize, 'id': patch_id} -def resizewithpool(img, size): - i_size = img.shape[0] - n = int(np.floor(i_size / size)) + def print_options(self, opt): + """Print and save options - out = skimage.measure.block_reduce(img, (n, n), np.max) - return out + It will print both current options and default values(if different). 
+ It will save options into a text file / [checkpoints_dir] / opt.txt + """ + message = '' + message += '----------------- Options ---------------\n' + for k, v in sorted(vars(opt).items()): + comment = '' + default = self.parser.get_default(k) + if v != default: + comment = '\t[default: %s]' % str(default) + message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment) + message += '----------------- End -------------------' + print(message) + # save to the disk + """ + expr_dir = os.path.join(opt.checkpoints_dir, opt.name) + util.mkdirs(expr_dir) + file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase)) + with open(file_name, 'wt') as opt_file: + opt_file.write(message) + opt_file.write('\n') + """ -def rgb2gray(rgb): - # Converts rgb to gray - return np.dot(rgb[..., :3], [0.2989, 0.5870, 0.1140]) + def parse(self): + """Parse our options, create checkpoints directory suffix, and set up gpu device.""" + opt = self.gather_options() + opt.isTrain = self.isTrain # train or test + # process opt.suffix + if opt.suffix: + suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else '' + opt.name = opt.name + suffix -def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, whole_size_threshold=3000): - # Returns the R_x resolution described in section 5 of the main paper. + # self.print_options(opt) - # Parameters: - # img :input rgb image - # basesize : size the dilation kernel which is equal to receptive field of the network. - # confidence: value of x in R_x; allowed percentage of pixels that are not getting any contextual cue. - # scale_threshold: maximum allowed upscaling on the input image ; it has been set to 3. - # whole_size_threshold: maximum allowed resolution. (R_max from section 6 of the main paper) + # set gpu ids + str_ids = opt.gpu_ids.split(',') + opt.gpu_ids = [] + for str_id in str_ids: + id = int(str_id) + if id >= 0: + opt.gpu_ids.append(id) + # if len(opt.gpu_ids) > 0: + # torch.cuda.set_device(opt.gpu_ids[0]) - # Returns: - # outputsize_scale*speed_scale :The computed R_x resolution - # patch_scale: K parameter from section 6 of the paper + self.opt = opt + return self.opt - # speed scale parameter is to process every image in a smaller size to accelerate the R_x resolution search - speed_scale = 32 - image_dim = int(min(img.shape[0:2])) - gray = rgb2gray(img) - grad = np.abs(cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)) + np.abs(cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)) - grad = cv2.resize(grad, (image_dim, image_dim), cv2.INTER_AREA) - - # thresholding the gradient map to generate the edge-map as a proxy of the contextual cues - m = grad.min() - M = grad.max() - middle = m + (0.4 * (M - m)) - grad[grad < middle] = 0 - grad[grad >= middle] = 1 - - # dilation kernel with size of the receptive field - kernel = np.ones((int(basesize / speed_scale), int(basesize / speed_scale)), float) - # dilation kernel with size of the a quarter of receptive field used to compute k - # as described in section 6 of main paper - kernel2 = np.ones((int(basesize / (4 * speed_scale)), int(basesize / (4 * speed_scale))), float) - - # Output resolution limit set by the whole_size_threshold and scale_threshold. 
- threshold = min(whole_size_threshold, scale_threshold * max(img.shape[:2])) - - outputsize_scale = basesize / speed_scale - for p_size in range(int(basesize / speed_scale), int(threshold / speed_scale), int(basesize / (2 * speed_scale))): - grad_resized = resizewithpool(grad, p_size) - grad_resized = cv2.resize(grad_resized, (p_size, p_size), cv2.INTER_NEAREST) - grad_resized[grad_resized >= 0.5] = 1 - grad_resized[grad_resized < 0.5] = 0 - - dilated = cv2.dilate(grad_resized, kernel, iterations=1) - meanvalue = (1 - dilated).mean() - if meanvalue > confidence: - break - else: - outputsize_scale = p_size - - grad_region = cv2.dilate(grad_resized, kernel2, iterations=1) - patch_scale = grad_region.mean() - - return int(outputsize_scale * speed_scale), patch_scale - - -# Generate a double-input depth estimation -def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel): - # Generate the low resolution estimation - estimate1 = singleestimate(img, size1, model, net_type) - # Resize to the inference size of merge network. - estimate1 = cv2.resize(estimate1, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) - - # Generate the high resolution estimation - estimate2 = singleestimate(img, size2, model, net_type) - # Resize to the inference size of merge network. - estimate2 = cv2.resize(estimate2, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) - - # Inference on the merge model - pix2pixmodel.set_input(estimate1, estimate2) - pix2pixmodel.test() - visuals = pix2pixmodel.get_current_visuals() - prediction_mapped = visuals['fake_B'] - prediction_mapped = (prediction_mapped + 1) / 2 - prediction_mapped = (prediction_mapped - torch.min(prediction_mapped)) / ( - torch.max(prediction_mapped) - torch.min(prediction_mapped)) - prediction_mapped = prediction_mapped.squeeze().cpu().numpy() - - return prediction_mapped - - -# Generate a single-input depth estimation -def singleestimate(img, msize, model, net_type): - if net_type == 0: - return estimateleres(img, model, msize, msize) - elif net_type >= 7: - # np to PIL - return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize) - else: - return estimatemidasBoost(img, model, msize, msize) - - -def applyGridpatch(blsize, stride, img, box): - # Extract a simple grid patch. - counter1 = 0 - patch_bound_list = {} - for k in range(blsize, img.shape[1] - blsize, stride): - for j in range(blsize, img.shape[0] - blsize, stride): - patch_bound_list[str(counter1)] = {} - patchbounds = [j - blsize, k - blsize, j - blsize + 2 * blsize, k - blsize + 2 * blsize] - patch_bound = [box[0] + patchbounds[1], box[1] + patchbounds[0], patchbounds[3] - patchbounds[1], - patchbounds[2] - patchbounds[0]] - patch_bound_list[str(counter1)]['rect'] = patch_bound - patch_bound_list[str(counter1)]['size'] = patch_bound[2] - counter1 = counter1 + 1 - return patch_bound_list - - -# Generating local patches to perform the local refinement described in section 6 of the main paper. -def generatepatchs(img, base_size): - # Compute the gradients as a proxy of the contextual cues. - img_gray = rgb2gray(img) - whole_grad = np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)) + \ - np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)) - - threshold = whole_grad[whole_grad > 0].mean() - whole_grad[whole_grad < threshold] = 0 - - # We use the integral image to speed-up the evaluation of the amount of gradients for each patch. 
- gf = whole_grad.sum() / len(whole_grad.reshape(-1)) - grad_integral_image = cv2.integral(whole_grad) - - # Variables are selected such that the initial patch size would be the receptive field size - # and the stride is set to 1/3 of the receptive field size. - blsize = int(round(base_size / 2)) - stride = int(round(blsize * 0.75)) - - # Get initial Grid - patch_bound_list = applyGridpatch(blsize, stride, img, [0, 0, 0, 0]) - - # Refine initial Grid of patches by discarding the flat (in terms of gradients of the rgb image) ones. Refine - # each patch size to ensure that there will be enough depth cues for the network to generate a consistent depth map. - print("Selecting patches ...") - patch_bound_list = adaptiveselection(grad_integral_image, patch_bound_list, gf) - - # Sort the patch list to make sure the merging operation will be done with the correct order: starting from biggest - # patch - patchset = sorted(patch_bound_list.items(), key=lambda x: getitem(x[1], 'size'), reverse=True) - return patchset - - -def getGF_fromintegral(integralimage, rect): - # Computes the gradient density of a given patch from the gradient integral image. - x1 = rect[1] - x2 = rect[1] + rect[3] - y1 = rect[0] - y2 = rect[0] + rect[2] - value = integralimage[x2, y2] - integralimage[x1, y2] - integralimage[x2, y1] + integralimage[x1, y1] - return value - - -# Adaptively select patches -def adaptiveselection(integral_grad, patch_bound_list, gf): - patchlist = {} - count = 0 - height, width = integral_grad.shape - - search_step = int(32 / factor) - - # Go through all patches - for c in range(len(patch_bound_list)): - # Get patch - bbox = patch_bound_list[str(c)]['rect'] - - # Compute the amount of gradients present in the patch from the integral image. - cgf = getGF_fromintegral(integral_grad, bbox) / (bbox[2] * bbox[3]) - - # Check if patching is beneficial by comparing the gradient density of the patch to - # the gradient density of the whole image - if cgf >= gf: - bbox_test = bbox.copy() - patchlist[str(count)] = {} - - # Enlarge each patch until the gradient density of the patch is equal - # to the whole image gradient density - while True: - - bbox_test[0] = bbox_test[0] - int(search_step / 2) - bbox_test[1] = bbox_test[1] - int(search_step / 2) - - bbox_test[2] = bbox_test[2] + search_step - bbox_test[3] = bbox_test[3] + search_step - - # Check if we are still within the image - if bbox_test[0] < 0 or bbox_test[1] < 0 or bbox_test[1] + bbox_test[3] >= height \ - or bbox_test[0] + bbox_test[2] >= width: - break - - # Compare gradient density - cgf = getGF_fromintegral(integral_grad, bbox_test) / (bbox_test[2] * bbox_test[3]) - if cgf < gf: - break - bbox = bbox_test.copy() - - # Add patch to selected patches - patchlist[str(count)]['rect'] = bbox - patchlist[str(count)]['size'] = bbox[2] - count = count + 1 - - # Return selected patches - return patchlist - - -def impatch(image, rect): - # Extract the given patch pixels from a given image. - w1 = rect[0] - h1 = rect[1] - w2 = w1 + rect[2] - h2 = h1 + rect[3] - image_patch = image[h1:h2, w1:w2] - return image_patch +def impatch(image, rect): + # Extract the given patch pixels from a given image. + w1 = rect[0] + h1 = rect[1] + w2 = w1 + rect[2] + h2 = h1 + rect[3] + image_patch = image[h1:h2, w1:w2] + return image_patch class ImageandPatchs: @@ -1432,6 +587,8 @@ def parse(self): def estimateboost(img, model, model_type, pix2pixmodel): + pix2pixsize = 1024 # TODO: to setting? + whole_size_threshold = 1600 # R_max from the paper # TODO: to setting? 
# get settings if hasattr(opts, 'depthmap_script_boost_rmax'): whole_size_threshold = opts.depthmap_script_boost_rmax @@ -1475,7 +632,6 @@ def estimateboost(img, model, model_type, pix2pixmodel): # Compute the multiplier described in section 6 of the main paper to make sure our initial patch can select # small high-density regions of the image. - global factor factor = max(min(1, 4 * patch_scale * whole_image_optimal_size / whole_size_threshold), 0.2) print('Adjust factor is:', 1 / factor) @@ -1507,7 +663,7 @@ def estimateboost(img, model, model_type, pix2pixmodel): # Extract selected patches for local refinement base_size = net_receptive_field_size * 2 - patchset = generatepatchs(img, base_size) + patchset = generatepatchs(img, base_size, factor) print('Target resolution: ', img.shape) @@ -1601,87 +757,274 @@ def estimateboost(img, model, model_type, pix2pixmodel): interpolation=cv2.INTER_CUBIC) -def pano_depth_to_world_points(depth): - """ - 360 depth to world points - given 2D depth is an equirectangular projection of a spherical image - Treat depth as radius - longitude : -pi to pi - latitude : -pi/2 to pi/2 - """ +def generatemask(size): + # Generates a Guassian mask + mask = np.zeros(size, dtype=np.float32) + sigma = int(size[0] / 16) + k_size = int(2 * np.ceil(2 * int(size[0] / 16)) + 1) + mask[int(0.15 * size[0]):size[0] - int(0.15 * size[0]), int(0.15 * size[1]): size[1] - int(0.15 * size[1])] = 1 + mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) + mask = (mask - mask.min()) / (mask.max() - mask.min()) + mask = mask.astype(np.float32) + return mask - # Convert depth to radius - radius = depth.flatten() - lon = np.linspace(-np.pi, np.pi, depth.shape[1]) - lat = np.linspace(-np.pi / 2, np.pi / 2, depth.shape[0]) +def rgb2gray(rgb): + # Converts rgb to gray + return np.dot(rgb[..., :3], [0.2989, 0.5870, 0.1140]) - lon, lat = np.meshgrid(lon, lat) - lon = lon.flatten() - lat = lat.flatten() - # Convert to cartesian coordinates - x = radius * np.cos(lat) * np.cos(lon) - y = radius * np.cos(lat) * np.sin(lon) - z = radius * np.sin(lat) +def resizewithpool(img, size): + i_size = img.shape[0] + n = int(np.floor(i_size / size)) - pts3d = np.stack([x, y, z], axis=1) + out = skimage.measure.block_reduce(img, (n, n), np.max) + return out - return pts3d +def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, whole_size_threshold=3000): + # Returns the R_x resolution described in section 5 of the main paper. -def depth_edges_mask(depth): - """Returns a mask of edges in the depth map. - Args: - depth: 2D numpy array of shape (H, W) with dtype float32. - Returns: - mask: 2D numpy array of shape (H, W) with dtype bool. - """ - # Compute the x and y gradients of the depth map. - depth_dx, depth_dy = np.gradient(depth) - # Compute the gradient magnitude. - depth_grad = np.sqrt(depth_dx ** 2 + depth_dy ** 2) - # Compute the edge mask. - mask = depth_grad > 0.05 - return mask + # Parameters: + # img :input rgb image + # basesize : size the dilation kernel which is equal to receptive field of the network. + # confidence: value of x in R_x; allowed percentage of pixels that are not getting any contextual cue. + # scale_threshold: maximum allowed upscaling on the input image ; it has been set to 3. + # whole_size_threshold: maximum allowed resolution. 
(R_max from section 6 of the main paper) + + # Returns: + # outputsize_scale*speed_scale :The computed R_x resolution + # patch_scale: K parameter from section 6 of the paper + + # speed scale parameter is to process every image in a smaller size to accelerate the R_x resolution search + speed_scale = 32 + image_dim = int(min(img.shape[0:2])) + gray = rgb2gray(img) + grad = np.abs(cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)) + np.abs(cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)) + grad = cv2.resize(grad, (image_dim, image_dim), cv2.INTER_AREA) -def create_mesh(image, depth, keep_edges=False, spherical=False): - import trimesh - maxsize = 1024 - if hasattr(opts, 'depthmap_script_mesh_maxsize'): - maxsize = opts.depthmap_script_mesh_maxsize + # thresholding the gradient map to generate the edge-map as a proxy of the contextual cues + m = grad.min() + M = grad.max() + middle = m + (0.4 * (M - m)) + grad[grad < middle] = 0 + grad[grad >= middle] = 1 - # limit the size of the input image - image.thumbnail((maxsize, maxsize)) + # dilation kernel with size of the receptive field + kernel = np.ones((int(basesize / speed_scale), int(basesize / speed_scale)), float) + # dilation kernel with size of the a quarter of receptive field used to compute k + # as described in section 6 of main paper + kernel2 = np.ones((int(basesize / (4 * speed_scale)), int(basesize / (4 * speed_scale))), float) - if not spherical: - pts3d = depth_to_points(depth[None]) - else: - pts3d = pano_depth_to_world_points(depth) + # Output resolution limit set by the whole_size_threshold and scale_threshold. + threshold = min(whole_size_threshold, scale_threshold * max(img.shape[:2])) + + outputsize_scale = basesize / speed_scale + for p_size in range(int(basesize / speed_scale), int(threshold / speed_scale), int(basesize / (2 * speed_scale))): + grad_resized = resizewithpool(grad, p_size) + grad_resized = cv2.resize(grad_resized, (p_size, p_size), cv2.INTER_NEAREST) + grad_resized[grad_resized >= 0.5] = 1 + grad_resized[grad_resized < 0.5] = 0 + + dilated = cv2.dilate(grad_resized, kernel, iterations=1) + meanvalue = (1 - dilated).mean() + if meanvalue > confidence: + break + else: + outputsize_scale = p_size + + grad_region = cv2.dilate(grad_resized, kernel2, iterations=1) + patch_scale = grad_region.mean() + + return int(outputsize_scale * speed_scale), patch_scale + + +# Generate a double-input depth estimation +def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel): + # Generate the low resolution estimation + estimate1 = singleestimate(img, size1, model, net_type) + # Resize to the inference size of merge network. + estimate1 = cv2.resize(estimate1, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) + + # Generate the high resolution estimation + estimate2 = singleestimate(img, size2, model, net_type) + # Resize to the inference size of merge network. 
+ estimate2 = cv2.resize(estimate2, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) + + # Inference on the merge model + pix2pixmodel.set_input(estimate1, estimate2) + pix2pixmodel.test() + visuals = pix2pixmodel.get_current_visuals() + prediction_mapped = visuals['fake_B'] + prediction_mapped = (prediction_mapped + 1) / 2 + prediction_mapped = (prediction_mapped - torch.min(prediction_mapped)) / ( + torch.max(prediction_mapped) - torch.min(prediction_mapped)) + prediction_mapped = prediction_mapped.squeeze().cpu().numpy() + + return prediction_mapped - pts3d = pts3d.reshape(-1, 3) - verts = pts3d.reshape(-1, 3) - image = np.array(image) - if keep_edges: - triangles = create_triangles(image.shape[0], image.shape[1]) +# Generate a single-input depth estimation +def singleestimate(img, msize, model, net_type): + if net_type == 0: + return estimateleres(img, model, msize, msize) + elif net_type >= 7: + # np to PIL + return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize) else: - triangles = create_triangles(image.shape[0], image.shape[1], mask=~depth_edges_mask(depth)) - colors = image.reshape(-1, 3) + return estimatemidasBoost(img, model, msize, msize) + + +# Generating local patches to perform the local refinement described in section 6 of the main paper. +def generatepatchs(img, base_size, factor): + # Compute the gradients as a proxy of the contextual cues. + img_gray = rgb2gray(img) + whole_grad = np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)) + \ + np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)) + + threshold = whole_grad[whole_grad > 0].mean() + whole_grad[whole_grad < threshold] = 0 + + # We use the integral image to speed-up the evaluation of the amount of gradients for each patch. + gf = whole_grad.sum() / len(whole_grad.reshape(-1)) + grad_integral_image = cv2.integral(whole_grad) + + # Variables are selected such that the initial patch size would be the receptive field size + # and the stride is set to 1/3 of the receptive field size. + blsize = int(round(base_size / 2)) + stride = int(round(blsize * 0.75)) + + # Get initial Grid + patch_bound_list = applyGridpatch(blsize, stride, img, [0, 0, 0, 0]) + + # Refine initial Grid of patches by discarding the flat (in terms of gradients of the rgb image) ones. Refine + # each patch size to ensure that there will be enough depth cues for the network to generate a consistent depth map. + print("Selecting patches ...") + patch_bound_list = adaptiveselection(grad_integral_image, patch_bound_list, gf, factor) + + # Sort the patch list to make sure the merging operation will be done with the correct order: starting from biggest + # patch + patchset = sorted(patch_bound_list.items(), key=lambda x: getitem(x[1], 'size'), reverse=True) + return patchset + + +def applyGridpatch(blsize, stride, img, box): + # Extract a simple grid patch. 
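+    # The grid consists of square patches with side 2 * blsize, placed every `stride` pixels and
+    # offset by `box`; each entry maps a running counter to a 'rect' of [x, y, width, height] in
+    # image coordinates plus a 'size' equal to the patch width.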
+ counter1 = 0 + patch_bound_list = {} + for k in range(blsize, img.shape[1] - blsize, stride): + for j in range(blsize, img.shape[0] - blsize, stride): + patch_bound_list[str(counter1)] = {} + patchbounds = [j - blsize, k - blsize, j - blsize + 2 * blsize, k - blsize + 2 * blsize] + patch_bound = [box[0] + patchbounds[1], box[1] + patchbounds[0], patchbounds[3] - patchbounds[1], + patchbounds[2] - patchbounds[0]] + patch_bound_list[str(counter1)]['rect'] = patch_bound + patch_bound_list[str(counter1)]['size'] = patch_bound[2] + counter1 = counter1 + 1 + return patch_bound_list + + +# Adaptively select patches +def adaptiveselection(integral_grad, patch_bound_list, gf, factor): + patchlist = {} + count = 0 + height, width = integral_grad.shape + + search_step = int(32 / factor) + + # Go through all patches + for c in range(len(patch_bound_list)): + # Get patch + bbox = patch_bound_list[str(c)]['rect'] + + # Compute the amount of gradients present in the patch from the integral image. + cgf = getGF_fromintegral(integral_grad, bbox) / (bbox[2] * bbox[3]) + + # Check if patching is beneficial by comparing the gradient density of the patch to + # the gradient density of the whole image + if cgf >= gf: + bbox_test = bbox.copy() + patchlist[str(count)] = {} + + # Enlarge each patch until the gradient density of the patch is equal + # to the whole image gradient density + while True: + + bbox_test[0] = bbox_test[0] - int(search_step / 2) + bbox_test[1] = bbox_test[1] - int(search_step / 2) + + bbox_test[2] = bbox_test[2] + search_step + bbox_test[3] = bbox_test[3] + search_step + + # Check if we are still within the image + if bbox_test[0] < 0 or bbox_test[1] < 0 or bbox_test[1] + bbox_test[3] >= height \ + or bbox_test[0] + bbox_test[2] >= width: + break + + # Compare gradient density + cgf = getGF_fromintegral(integral_grad, bbox_test) / (bbox_test[2] * bbox_test[3]) + if cgf < gf: + break + bbox = bbox_test.copy() + + # Add patch to selected patches + patchlist[str(count)]['rect'] = bbox + patchlist[str(count)]['size'] = bbox[2] + count = count + 1 + + # Return selected patches + return patchlist + + +def getGF_fromintegral(integralimage, rect): + # Computes the gradient density of a given patch from the gradient integral image. 
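+    # Standard summed-area-table identity: ii[x2, y2] - ii[x1, y2] - ii[x2, y1] + ii[x1, y1]
+    # gives the sum of the gradient map over the rectangle, so the density of every candidate
+    # patch is evaluated in O(1) regardless of its area.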
+ x1 = rect[1] + x2 = rect[1] + rect[3] + y1 = rect[0] + y2 = rect[0] + rect[2] + value = integralimage[x2, y2] - integralimage[x1, y2] - integralimage[x2, y1] + integralimage[x1, y1] + return value + + +def estimatemidasBoost(img, model, w, h): + # init transform + transform = Compose( + [ + Resize( + w, + h, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="upper_bound", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + ] + ) - mesh = trimesh.Trimesh(vertices=verts, faces=triangles, vertex_colors=colors) + # transform input + img_input = transform({"image": img})["image"] - # rotate 90deg over X when spherical - if spherical: - angle = math.pi / 2 - direction = [1, 0, 0] - center = [0, 0, 0] - rot_matrix = trimesh.transformations.rotation_matrix(angle, direction, center) - mesh.apply_transform(rot_matrix) + # compute + with torch.no_grad(): + sample = torch.from_numpy(img_input).to(device).unsqueeze(0) + if device == torch.device("cuda"): + sample = sample.to(memory_format=torch.channels_last) + prediction = model.forward(sample) - return mesh + prediction = prediction.squeeze().cpu().numpy() + prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) + # normalization + depth_min = prediction.min() + depth_max = prediction.max() -def save_mesh_obj(fn, mesh): - mesh.export(fn) + if depth_max - depth_min > np.finfo("float").eps: + prediction = (prediction - depth_min) / (depth_max - depth_min) + else: + prediction = 0 + + return prediction diff --git a/scripts/interface_webui.py b/scripts/interface_webui.py index f16b4e6..515c989 100644 --- a/scripts/interface_webui.py +++ b/scripts/interface_webui.py @@ -11,10 +11,22 @@ from scripts.gradio_args_transport import GradioComponentBundle from scripts.main import * -from scripts.depthmap import run_depthmap, unload_models, run_makevideo +from scripts.core import core_generation_funnel, unload_models, run_makevideo from PIL import Image +# Ugly workaround to fix gradio tempfile issue +def ensure_gradio_temp_directory(): + try: + import tempfile + path = os.path.join(tempfile.gettempdir(), 'gradio') + if not (os.path.exists(path)): + os.mkdir(path) + except Exception as e: + traceback.print_exc() +ensure_gradio_temp_directory() + + def main_ui_panel(is_depth_tab): inp = GradioComponentBundle() # TODO: Greater visual separation @@ -167,18 +179,6 @@ def main_ui_panel(is_depth_tab): outputs=[inp['clipthreshold_far']] ) - # invert_depth must not be used with gen_stereo - otherwise stereo images look super-wrong - inp['gen_stereo'].change( - fn=lambda a, b: False if b else a, - inputs=[inp['invert_depth'], inp['gen_stereo']], - outputs=[inp['invert_depth']] - ) - inp['gen_stereo'].change( - fn=lambda a, b: inp['invert_depth'].update(interactive=not b), - inputs=[inp['invert_depth'], inp['gen_stereo']], - outputs=[inp['invert_depth']] - ) - def stereo_options_visibility(v): return stereo_options.update(visible=v) @@ -247,9 +247,9 @@ def run(self, p, *inputs): continue inputimages.append(processed.images[count]) - generated_images, mesh_fi, meshsimple_fi = run_depthmap(p.outpath_samples, inputimages, None, None, inputs) + outputs, mesh_fi, meshsimple_fi = core_generation_funnel(p.outpath_samples, inputimages, None, None, inputs) - for input_i, imgs in enumerate(generated_images): + for input_i, imgs in enumerate(outputs): # get generation parameters if hasattr(processed, 
'all_prompts') and opts.enable_pnginfo: info = create_infotext(processed, processed.all_prompts, processed.all_seeds, processed.all_subseeds, @@ -278,7 +278,7 @@ def run(self, p, *inputs): def on_ui_settings(): section = ('depthmap-script', "Depthmap extension") shared.opts.add_option("depthmap_script_keepmodels", - shared.OptionInfo(False, "Keep depth models loaded.", + shared.OptionInfo(False, "Do not unload depth and pix2pix models.", section=section)) shared.opts.add_option("depthmap_script_boost_rmax", shared.OptionInfo(1600, "Maximum wholesize for boost (Rmax)", @@ -514,11 +514,11 @@ def run_generate(*inputs): inputdepthmaps_n = len([1 for x in inputdepthmaps if x is not None]) print(f'{len(inputimages)} images will be processed, {inputdepthmaps_n} existing depthmaps will be reused') - save_images, mesh_fi, meshsimple_fi = run_depthmap(outpath, inputimages, inputdepthmaps, inputnames, inputs) + outputs, mesh_fi, meshsimple_fi = core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inputs) show_images = [] # Saving images - for input_i, imgs in enumerate(save_images): + for input_i, imgs in enumerate(outputs): basename = 'depthmap' if depthmap_mode == '2' and inputnames[input_i] is not None and outpath != opts.outdir_extras_samples: basename = Path(inputnames[input_i]).stem diff --git a/scripts/main.py b/scripts/main.py index 2360588..97aae56 100644 --- a/scripts/main.py +++ b/scripts/main.py @@ -1,6 +1,7 @@ import subprocess import os import pathlib +import torch SCRIPT_NAME = "DepthMap" SCRIPT_VERSION = "v0.3.13" diff --git a/scripts/stereoimage_generation.py b/scripts/stereoimage_generation.py index 082e727..056c6f6 100644 --- a/scripts/stereoimage_generation.py +++ b/scripts/stereoimage_generation.py @@ -2,25 +2,26 @@ import numpy as np from PIL import Image + def create_stereoimages(original_image, depthmap, divergence, separation=0.0, modes=None, stereo_balance=0.0, fill_technique='polylines_sharp'): """Creates stereoscopic images. An effort is made to make them look nice, but beware that the resulting image will have some distortion. - The correctness was not rigurously tested. + The correctness was not rigorously tested. :param original_image: original image from which the 3D image (stereoimage) will be created :param depthmap: depthmap corresponding to the original image. White = near, black = far. :param float divergence: the measure of 3D effect, in percentages. A good value will likely be somewhere in the [0.05; 10.0) interval. - :param float separation: measure by how much to move two halfs of the spereoimage apart from eachother. - Measured in percentages. Negative values move two parts closer togethert. + :param float separation: measure by how much to move two halves of the stereoimage apart from each-other. + Measured in percentages. Negative values move two parts closer together. Affects which parts of the image will be visible in left and/or right half. :param list modes: how the result will look like. By default only 'left-right' is generated - - a picture for the left eye will be on the left and the picture from the right eye - on the rigth. + - a picture for the left eye will be on the left and the picture from the right eye - on the right. The supported modes are: 'left-right', 'right-left', 'top-bottom', 'bottom-top', 'red-cyan-anaglyph'. :param float stereo_balance: has to do with how the divergence will be split among the two parts of the image, must be in the [-1.0; 1.0] interval. 
- :param str fill_technique: applying divergence inevidably creates some gaps in the image. + :param str fill_technique: applying divergence inevitably creates some gaps in the image. This parameter specifies the technique that will be used to fill in the blanks in the two resulting images. Must be one of the following: 'none', 'naive', 'naive_interpolating', 'polylines_soft', 'polylines_sharp'. """ @@ -185,7 +186,7 @@ def apply_stereo_divergence_polylines( # Since the proportions are equal, these lines have the same angle with an axe and are parallel. # So, these lines do not intersect. Now rotate the plot by 45 or -45 degrees and observe that # each dot of the polyline is further right from the last dot, - # which makes it impossible for the polyline to self-interset. QED. + # which makes it impossible for the polyline to self-intersect. QED. # sort segments and points using insertion sort # has a very good performance in practice, since these are almost sorted to begin with @@ -203,7 +204,7 @@ def apply_stereo_divergence_polylines( sg_pointer: int = 0 # and index of the point that should be processed next pt_i: int = 0 - for col in range(w): # iterate over regions (that will be rasterizeed into pixels) + for col in range(w): # iterate over regions (that will be rasterized into pixels) color = np.full(c, 0.5, dtype=np.float_) # we start with 0.5 because of how floats are converted to ints while pt[pt_i][0] < col: pt_i += 1 @@ -216,7 +217,7 @@ def apply_stereo_divergence_polylines( # the color at center point is the same as the average of color of segment part coord_center = coord_from + 0.5 * significance - # adding semgents that now may contribute + # adding segments that now may contribute while sg_pointer < sg_end and sg[sg_pointer][0] < coord_center: csg[csg_end] = sg[sg_pointer] sg_pointer += 1 From 88aa86f3f305ccf01ead18be46e847ed43ebb593 Mon Sep 17 00:00:00 2001 From: semjon00 Date: Mon, 10 Jul 2023 15:18:19 +0300 Subject: [PATCH 11/16] Post-refactor fixes --- README.md | 8 ++++---- scripts/core.py | 6 +++--- scripts/depthmap_generation.py | 25 ++++++++++++------------- scripts/interface_webui.py | 28 +++++++++++++++++++--------- scripts/main.py | 2 +- scripts/stereoimage_generation.py | 11 +++++++++-- 6 files changed, 48 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 01c063d..591290a 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ video by [@graemeniedermayer](https://github.com/graemeniedermayer), more exampl images generated by [@semjon00](https://github.com/semjon00) from CC0 photos, more examples [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/56#issuecomment-1367596463). 
## Changelog -* v0.3.13 - * Large code refactor - * Improved interface - * Slightly changed the behaviour of various options +* v0.4.0 large code refactor + * UI improvements + * slightly changed the behaviour of various options + * extension may partially work even if some of the dependencies are unmet * v0.3.12 * Fixed stereo image generation * Other bugfixes diff --git a/scripts/core.py b/scripts/core.py index 49a6c7d..456abf2 100644 --- a/scripts/core.py +++ b/scripts/core.py @@ -98,7 +98,7 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp stereo_separation = inp["stereo_separation"] # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure - print(f"\n{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})") + print(f"{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})") unload_sd_model() @@ -230,7 +230,7 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp if show_heat: from dzoedepth.utils.misc import colorize - heatmap = colorize(img_output, cmap='inferno') + heatmap = Image.fromarray(colorize(img_output, cmap='inferno')) generated_images[count]['heatmap'] = heatmap if gen_stereo: @@ -325,7 +325,7 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp print(f'{str(e)}, some issue with generating inpainted mesh') reload_sd_model() - print("All done.") + print("All done.\n") return generated_images, mesh_fi, meshsimple_fi diff --git a/scripts/depthmap_generation.py b/scripts/depthmap_generation.py index eb00eda..0116162 100644 --- a/scripts/depthmap_generation.py +++ b/scripts/depthmap_generation.py @@ -3,6 +3,7 @@ from PIL import Image from torchvision.transforms import Compose, transforms +# TODO: depthmap_generation should not depend on WebUI from modules import shared, devices from modules.shared import opts, cmd_opts @@ -29,7 +30,6 @@ from pix2pix.options.test_options import TestOptions from pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel - # zoedepth from dzoedepth.models.builder import build_model from dzoedepth.utils.config import get_config @@ -59,9 +59,6 @@ def ensure_models(self, model_type, device: torch.device, boost: bool): def load_models(self, model_type, device: torch.device, boost: bool): """Ensure that the depth model is loaded""" - # TODO: supply correct values for zoedepth - net_width = 512 - net_height = 512 # model path and name model_dir = "./models/midas" @@ -171,22 +168,21 @@ def load_models(self, model_type, device: torch.device, boost: bool): mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) + # When loading, zoedepth models will report the default net size. + # It will be overridden by the generation settings. 
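+        # (estimatezoedepth adjusts model.core.prep.resizer width and height for each call,
+        # so the net size no longer needs to be baked into the zoedepth config here.)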
elif model_type == 7: # zoedepth_n print("zoedepth_n\n") conf = get_config("zoedepth", "infer") - conf.img_size = [net_width, net_height] model = build_model(conf) elif model_type == 8: # zoedepth_k print("zoedepth_k\n") conf = get_config("zoedepth", "infer", config_version="kitti") - conf.img_size = [net_width, net_height] model = build_model(conf) elif model_type == 9: # zoedepth_nk print("zoedepth_nk\n") conf = get_config("zoedepth_nk", "infer") - conf.img_size = [net_width, net_height] model = build_model(conf) model.eval() # prepare for evaluation @@ -221,15 +217,20 @@ def load_models(self, model_type, device: torch.device, boost: bool): devices.torch_gc() - def get_default_net_size(self, model_type): + @staticmethod + def get_default_net_size(model_type): # TODO: fill in, use in the GUI sizes = { + 0: [448, 448], 1: [512, 512], 2: [384, 384], 3: [384, 384], 4: [384, 384], 5: [384, 384], 6: [256, 256], + 7: [384, 512], + 8: [384, 768], + 9: [384, 512] } if model_type in sizes: return sizes[model_type] @@ -254,8 +255,9 @@ def unload_models(self): self.device = None def get_raw_prediction(self, input, net_width, net_height): - """Get prediction from the model currently loaded by the class. + """Get prediction from the model currently loaded by the ModelHolder object. If boost is enabled, net_width and net_height will be ignored.""" + # TODO: supply net size for zoedepth global device device = self.device # input image @@ -264,17 +266,14 @@ def get_raw_prediction(self, input, net_width, net_height): if self.pix2pix_model is None: if self.depth_model_type == 0: raw_prediction = estimateleres(img, self.depth_model, net_width, net_height) - raw_prediction_invert = True elif self.depth_model_type in [7, 8, 9]: raw_prediction = estimatezoedepth(input, self.depth_model, net_width, net_height) - raw_prediction_invert = True else: raw_prediction = estimatemidas(img, self.depth_model, net_width, net_height, self.resize_mode, self.normalization) - raw_prediction_invert = False else: raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model) - raw_prediction_invert = False + raw_prediction_invert = self.depth_model_type in [0, 7, 8, 9] return raw_prediction, raw_prediction_invert diff --git a/scripts/interface_webui.py b/scripts/interface_webui.py index 515c989..7602071 100644 --- a/scripts/interface_webui.py +++ b/scripts/interface_webui.py @@ -8,11 +8,12 @@ from modules.shared import opts from modules.ui import plaintext_to_html from pathlib import Path +from PIL import Image from scripts.gradio_args_transport import GradioComponentBundle from scripts.main import * from scripts.core import core_generation_funnel, unload_models, run_makevideo -from PIL import Image +from scripts.depthmap_generation import ModelHolder # Ugly workaround to fix gradio tempfile issue @@ -102,10 +103,9 @@ def main_ui_panel(is_depth_tab): with gr.Group(): with gr.Row(): inp += "gen_mesh", gr.Checkbox( - label="Generate simple 3D mesh. 
" - "(Fast, accurate only with ZoeDepth models and no boost, no custom maps)", - value=False, visible=True) + label="Generate simple 3D mesh", value=False, visible=True) with gr.Row(visible=False) as mesh_options_row_0: + gr.Label(value="Generates fast, accurate only with ZoeDepth models and no boost, no custom maps") inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) @@ -113,8 +113,9 @@ def main_ui_panel(is_depth_tab): with gr.Group(): with gr.Row(): inp += "inpaint", gr.Checkbox( - label="Generate 3D inpainted mesh. (Sloooow, required for generating videos)", value=False) + label="Generate 3D inpainted mesh", value=False) with gr.Group(visible=False) as inpaint_options_row_0: + gr.Label("Generation is sloooow, required for generating videos") inp += "inpaint_vids", gr.Checkbox( label="Generate 4 demo videos with 3D inpainted mesh.", value=False) gr.HTML("More options for generating video can be found in the Generate video tab") @@ -139,6 +140,15 @@ def main_ui_panel(is_depth_tab): inp += "gen_normal", gr.Checkbox(label="Generate Normalmap (hidden! api only)", value=False, visible=False) + def update_delault_net_size(model_type): + w, h = ModelHolder.get_default_net_size(model_type) + return inp['net_width'].update(value=w), inp['net_height'].update(value=h) + inp['model_type'].change( + fn=update_delault_net_size, + inputs=inp['model_type'], + outputs=[inp['net_width'], inp['net_height']] + ) + inp['boost'].change( fn=lambda a, b: (options_depend_on_boost.update(visible=not a), options_depend_on_match_size.update(visible=not a and not b)), @@ -309,6 +319,7 @@ def on_ui_tabs(): inp += gr.Image(label="Source", source="upload", interactive=True, type="pil", elem_id="depthmap_input_image") with gr.Group(visible=False) as custom_depthmap_row_0: + # TODO: depthmap generation settings should disappear when using this inp += gr.File(label="Custom DepthMap", file_count="single", interactive=True, type="file", elem_id='custom_depthmap_img') inp += gr.Checkbox(elem_id="custom_depthmap", label="Use custom DepthMap", value=False) @@ -471,13 +482,12 @@ def run_generate(*inputs): inputnames.append(None) if custom_depthmap: if custom_depthmap_img is None: - return [], None, None, "Custom depthmap is not specified. " \ - "Please either supply it or disable this option.", "" - inputdepthmaps.append(custom_depthmap_img) + return [], None, None,\ + "Custom depthmap is not specified. Please either supply it or disable this option.", "" + inputdepthmaps.append(Image.open(os.path.abspath(custom_depthmap_img.name))) else: inputdepthmaps.append(None) if depthmap_mode == '1': # Batch Process - # convert files to pillow images for img in image_batch: image = Image.open(os.path.abspath(img.name)) inputimages.append(image) diff --git a/scripts/main.py b/scripts/main.py index 97aae56..80652da 100644 --- a/scripts/main.py +++ b/scripts/main.py @@ -4,7 +4,7 @@ import torch SCRIPT_NAME = "DepthMap" -SCRIPT_VERSION = "v0.3.13" +SCRIPT_VERSION = "v0.4.0" commit_hash = None # TODO: understand why it would spam to stderr if changed to ... 
= get_commit_hash()
 
 def get_commit_hash():
diff --git a/scripts/stereoimage_generation.py b/scripts/stereoimage_generation.py
index 056c6f6..aa20128 100644
--- a/scripts/stereoimage_generation.py
+++ b/scripts/stereoimage_generation.py
@@ -1,4 +1,11 @@
-from numba import njit, prange
+try:
+    from numba import njit, prange
+except Exception as e:
+    print(f"WARNING! Numba failed to import! Stereoimage generation will be much slower! ({str(e)})")
+    from builtins import range as prange
+    def njit(parallel=False):
+        def Inner(func): return lambda *args, **kwargs: func(*args, **kwargs)
+        return Inner
 import numpy as np
 from PIL import Image
 
@@ -73,7 +80,7 @@ def apply_stereo_divergence(original_image, depth, divergence, separation, fill_
     )
 
 
-@njit
+@njit(parallel=False)
 def apply_stereo_divergence_naive(
         original_image, normalized_depth, divergence_px: float, separation_px: float, fill_technique):
     h, w, c = original_image.shape

From a9f2e1ff295f29e025e7519640533eb4295eef2a Mon Sep 17 00:00:00 2001
From: semjon00
Date: Mon, 17 Jul 2023 13:22:41 +0300
Subject: [PATCH 12/16] Moving files

Does not work! Imports updated in the next commit.
---
 scripts/{interface_webui.py => depthmap.py} | 0
 {scripts => src}/core.py                    | 0
 {scripts => src}/depthmap_generation.py     | 0
 {scripts => src}/gradio_args_transport.py   | 0
 {scripts => src}/main.py                    | 0
 {scripts => src}/stereoimage_generation.py  | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{interface_webui.py => depthmap.py} (100%)
 rename {scripts => src}/core.py (100%)
 rename {scripts => src}/depthmap_generation.py (100%)
 rename {scripts => src}/gradio_args_transport.py (100%)
 rename {scripts => src}/main.py (100%)
 rename {scripts => src}/stereoimage_generation.py (100%)

diff --git a/scripts/interface_webui.py b/scripts/depthmap.py
similarity index 100%
rename from scripts/interface_webui.py
rename to scripts/depthmap.py
diff --git a/scripts/core.py b/src/core.py
similarity index 100%
rename from scripts/core.py
rename to src/core.py
diff --git a/scripts/depthmap_generation.py b/src/depthmap_generation.py
similarity index 100%
rename from scripts/depthmap_generation.py
rename to src/depthmap_generation.py
diff --git a/scripts/gradio_args_transport.py b/src/gradio_args_transport.py
similarity index 100%
rename from scripts/gradio_args_transport.py
rename to src/gradio_args_transport.py
diff --git a/scripts/main.py b/src/main.py
similarity index 100%
rename from scripts/main.py
rename to src/main.py
diff --git a/scripts/stereoimage_generation.py b/src/stereoimage_generation.py
similarity index 100%
rename from scripts/stereoimage_generation.py
rename to src/stereoimage_generation.py

From cee5576b47654a9a0b2eb508fd46d05ca4e26df9 Mon Sep 17 00:00:00 2001
From: semjon00
Date: Mon, 17 Jul 2023 13:25:02 +0300
Subject: [PATCH 13/16] Fixes previous commit

---
 scripts/depthmap.py        | 8 ++++----
 src/core.py                | 6 +++---
 src/depthmap_generation.py | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/depthmap.py b/scripts/depthmap.py
index 7602071..3a542f4 100644
--- a/scripts/depthmap.py
+++ b/scripts/depthmap.py
@@ -10,10 +10,10 @@
 from pathlib import Path
 from PIL import Image
 
-from scripts.gradio_args_transport import GradioComponentBundle
-from scripts.main import *
-from scripts.core import core_generation_funnel,
unload_models, run_makevideo +from src.depthmap_generation import ModelHolder # Ugly workaround to fix gradio tempfile issue diff --git a/src/core.py b/src/core.py index 456abf2..64dbcd4 100644 --- a/src/core.py +++ b/src/core.py @@ -21,9 +21,9 @@ import traceback # Our code -from scripts.main import * -from scripts.stereoimage_generation import create_stereoimages -from scripts.depthmap_generation import ModelHolder +from src.main import * +from src.stereoimage_generation import create_stereoimages +from src.depthmap_generation import ModelHolder # 3d-photo-inpainting imports from inpaint.mesh import write_mesh, read_mesh, output_3d_photo diff --git a/src/depthmap_generation.py b/src/depthmap_generation.py index 0116162..0915d7c 100644 --- a/src/depthmap_generation.py +++ b/src/depthmap_generation.py @@ -14,7 +14,7 @@ import skimage.measure # Our code -from scripts.main import * +from src.main import * # midas imports from dmidas.dpt_depth import DPTDepthModel From 3fa9185cd9c0a63cf7566b8478e195dc9292f013 Mon Sep 17 00:00:00 2001 From: semjon00 Date: Tue, 11 Jul 2023 20:23:50 +0300 Subject: [PATCH 14/16] Post-refactor fixes vol. 2 * Reload model before generation, if it is offloaded to CPU * Load model if boost got selected * Do not try to offload pix2pix * Net dimensions are multiple of 32 regardless of match size * Change the default net size to default net size of the default model * Fixed script mode * UI fixes --- scripts/depthmap.py | 20 +++++++++++--------- src/core.py | 16 +++++++++------- src/depthmap_generation.py | 23 +++++++++++++++++++---- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/scripts/depthmap.py b/scripts/depthmap.py index 3a542f4..f80fecb 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -49,8 +49,8 @@ def main_ui_panel(is_depth_tab): with gr.Group(visible=False) as options_depend_on_boost: inp += 'match_size', gr.Checkbox(label="Match net size to input size", value=False) with gr.Row(visible=False) as options_depend_on_match_size: - inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=512) - inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=512) + inp += 'net_width', gr.Slider(minimum=64, maximum=2048, step=64, label='Net width', value=448) + inp += 'net_height', gr.Slider(minimum=64, maximum=2048, step=64, label='Net height', value=448) with gr.Group(): with gr.Row(): @@ -104,10 +104,12 @@ def main_ui_panel(is_depth_tab): with gr.Row(): inp += "gen_mesh", gr.Checkbox( label="Generate simple 3D mesh", value=False, visible=True) - with gr.Row(visible=False) as mesh_options_row_0: - gr.Label(value="Generates fast, accurate only with ZoeDepth models and no boost, no custom maps") - inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) - inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) + with gr.Group(visible=False) as mesh_options: + with gr.Row(): + gr.HTML(value="Generates fast, accurate only with ZoeDepth models and no boost, no custom maps") + with gr.Row(): + inp += "mesh_occlude", gr.Checkbox(label="Remove occluded edges", value=True, visible=True) + inp += "mesh_spherical", gr.Checkbox(label="Equirectangular projection", value=False, visible=True) if is_depth_tab: with gr.Group(): @@ -115,7 +117,7 @@ def main_ui_panel(is_depth_tab): inp += "inpaint", gr.Checkbox( label="Generate 3D inpainted mesh", value=False) with gr.Group(visible=False) as inpaint_options_row_0: - 
gr.Label("Generation is sloooow, required for generating videos") + gr.HTML("Generation is sloooow, required for generating videos") inp += "inpaint_vids", gr.Checkbox( label="Generate 4 demo videos with 3D inpainted mesh.", value=False) gr.HTML("More options for generating video can be found in the Generate video tab") @@ -199,9 +201,9 @@ def stereo_options_visibility(v): ) inp['gen_mesh'].change( - fn=lambda v: mesh_options_row_0.update(visible=v), + fn=lambda v: mesh_options.update(visible=v), inputs=[inp['gen_mesh']], - outputs=[mesh_options_row_0] + outputs=[mesh_options] ) def inpaint_options_visibility(v): diff --git a/src/core.py b/src/core.py index 64dbcd4..fee792e 100644 --- a/src/core.py +++ b/src/core.py @@ -62,7 +62,7 @@ def reload_sd_model(): def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp): if len(inputimages) == 0 or inputimages[0] is None: return [], '', '' - if len(inputdepthmaps) == 0: + if inputdepthmaps is None or len(inputdepthmaps) == 0: inputdepthmaps: list[Image] = [None for _ in range(len(inputimages))] inputdepthmaps_complete = all([x is not None for x in inputdepthmaps]) @@ -78,8 +78,8 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp gen_mesh = inp["gen_mesh"] gen_normal = inp["gen_normal"] if "gen_normal" in inp else False gen_stereo = inp["gen_stereo"] - inpaint = inp["inpaint"] - inpaint_vids = inp["inpaint_vids"] + inpaint = inp["inpaint"] if "inpaint" in inp else False + inpaint_vids = inp["inpaint_vids"] if "inpaint_vids" in inp else False invert_depth = inp["invert_depth"] match_size = inp["match_size"] mesh_occlude = inp["mesh_occlude"] @@ -165,7 +165,9 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp else: # override net size (size may be different for different images) if match_size: - net_width, net_height = inputimages[count].width, inputimages[count].height + # Round up to a multiple of 32 to avoid potential issues + net_width = (inputimages[count].width + 31) // 32 * 32 + net_height = (inputimages[count].height + 31) // 32 * 32 raw_prediction, raw_prediction_invert = \ model_holder.get_raw_prediction(inputimages[count], net_width, net_height) @@ -304,14 +306,14 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp else: raise e finally: - if not (hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels): + if hasattr(opts, 'depthmap_script_keepmodels') and opts.depthmap_script_keepmodels: + model_holder.offload() # Swap to CPU memory + else: if 'model' in locals(): del model if 'pix2pixmodel' in locals(): del pix2pix_model model_holder.unload_models() - else: - model_holder.swap_to_cpu_memory() gc.collect() devices.torch_gc() diff --git a/src/depthmap_generation.py b/src/depthmap_generation.py index 0915d7c..8346b1e 100644 --- a/src/depthmap_generation.py +++ b/src/depthmap_generation.py @@ -42,6 +42,7 @@ def __init__(self): self.pix2pix_model = None self.depth_model_type = None self.device = None # Target device, the model may be swapped from VRAM into RAM. 
+ self.offloaded = False # True means current device is not the target device # Extra stuff self.resize_mode = None @@ -53,9 +54,10 @@ def ensure_models(self, model_type, device: torch.device, boost: bool): self.unload_models() return # Certain optimisations are irreversible and not device-agnostic, thus changing device requires reloading - if model_type != self.depth_model_type or boost != self.pix2pix_model is not None or device != self.device: + if model_type != self.depth_model_type or boost != (self.pix2pix_model is not None) or device != self.device: self.unload_models() self.load_models(model_type, device, boost) + self.reload() def load_models(self, model_type, device: torch.device, boost: bool): """Ensure that the depth model is loaded""" @@ -236,11 +238,24 @@ def get_default_net_size(model_type): return sizes[model_type] return [512, 512] - def swap_to_cpu_memory(self): + def offload(self): + """Move to RAM to conserve VRAM""" + if self.device != torch.device('cpu') and not self.offloaded: + self.move_models_to(torch.device('cpu')) + self.offloaded = True + + def reload(self): + """Undoes offload""" + if self.offloaded: + self.move_models_to(self.device) + self.offloaded = True + + def move_models_to(self, device): if self.depth_model is not None: - self.depth_model.to(torch.device('cpu')) + self.depth_model.to(device) if self.pix2pix_model is not None: - self.pix2pix_model.to(torch.device('cpu')) + pass + # TODO: pix2pix offloading not implemented def unload_models(self): if self.depth_model is not None or self.pix2pix_model is not None: From 381536d00295d276c809c0078daf0a651978215e Mon Sep 17 00:00:00 2001 From: semjon00 Date: Tue, 18 Jul 2023 11:49:28 +0300 Subject: [PATCH 15/16] Minor fixes --- README.md | 1 + scripts/depthmap.py | 5 +++++ src/core.py | 20 +------------------- src/main.py | 28 +++++++++++++--------------- 4 files changed, 20 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 591290a..ccb66cc 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ images generated by [@semjon00](https://github.com/semjon00) from CC0 photos, mo ## Changelog * v0.4.0 large code refactor * UI improvements + * improved Batch from Directory, Clip and renormalize DepthMap * slightly changed the behaviour of various options * extension may partially work even if some of the dependencies are unmet * v0.3.12 diff --git a/scripts/depthmap.py b/scripts/depthmap.py index f80fecb..a0e85a1 100644 --- a/scripts/depthmap.py +++ b/scripts/depthmap.py @@ -136,6 +136,7 @@ def main_ui_panel(is_depth_tab): value='u2net', type="value") with gr.Box(): + gr.HTML(f"{SCRIPT_FULL_NAME}
") gr.HTML("Information, comment and share @ " "https://github.com/thygate/stable-diffusion-webui-depthmap-script") @@ -480,6 +481,8 @@ def run_generate(*inputs): outpath = opts.outdir_samples or opts.outdir_extras_samples if depthmap_mode == '0': # Single image + if depthmap_input_image is None: + return [], None, None, "Please select an input image!", "" inputimages.append(depthmap_input_image) inputnames.append(None) if custom_depthmap: @@ -490,6 +493,8 @@ def run_generate(*inputs): else: inputdepthmaps.append(None) if depthmap_mode == '1': # Batch Process + if image_batch is None: + return [], None, None, "Please select input images!", "" for img in image_batch: image = Image.open(os.path.abspath(img.name)) inputimages.append(image) diff --git a/src/core.py b/src/core.py index fee792e..ccc3e6e 100644 --- a/src/core.py +++ b/src/core.py @@ -98,7 +98,7 @@ def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp stereo_separation = inp["stereo_separation"] # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure - print(f"{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})") + print(SCRIPT_FULL_NAME) unload_sd_model() @@ -649,24 +649,6 @@ def batched_background_removal(inimages, model_name): return outimages -def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): - # Do not check the hash every time - it is somewhat time-consuming - if os.path.exists(filename): - return - - if type(url) is not list: - url = [url] - for cur_url in url: - try: - print("Downloading", cur_url, "to", filename) - torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) - if os.path.exists(filename): - return # The correct model was downloaded, no need to try more - except: - pass - raise RuntimeError('Download failed. Try again later or manually download the file to that location.') - - def pano_depth_to_world_points(depth): """ 360 depth to world points diff --git a/src/main.py b/src/main.py index 80652da..d3fed1d 100644 --- a/src/main.py +++ b/src/main.py @@ -3,23 +3,21 @@ import pathlib import torch +def get_commit_hash(): + try: + return subprocess.check_output( + [os.environ.get("GIT", "git"), "rev-parse", "HEAD"], + cwd=pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/'), + shell=False, + stderr=subprocess.DEVNULL, + encoding='utf8').strip()[0:8] + except Exception: + return "" + + SCRIPT_NAME = "DepthMap" SCRIPT_VERSION = "v0.4.0" - -commit_hash = None # TODO: understand why it would spam to stderr if changed to ... 
= get_commit_hash()
-def get_commit_hash():
-    global commit_hash
-    if commit_hash is None:
-        try:
-            commit_hash = subprocess.check_output(
-                [os.environ.get('GIT', "git"), "rev-parse", "HEAD"],
-                cwd=pathlib.Path.cwd().joinpath('extensions/stable-diffusion-webui-depthmap-script/'),
-                shell=False,
-                stderr=subprocess.DEVNULL,
-                encoding='utf8').strip()[0:8]
-        except Exception:
-            commit_hash = ""
-    return commit_hash
+SCRIPT_FULL_NAME = f"{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})"
 
 
 def ensure_file_downloaded(filename, url, sha256_hash_prefix=None):

From bc3755ec78cd06dc132016fbd35e2a1b8a7c3a76 Mon Sep 17 00:00:00 2001
From: semjon00
Date: Wed, 19 Jul 2023 10:53:41 +0300
Subject: [PATCH 16/16] Bugfix: fix offloading

---
 src/depthmap_generation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/depthmap_generation.py b/src/depthmap_generation.py
index 8346b1e..6812d81 100644
--- a/src/depthmap_generation.py
+++ b/src/depthmap_generation.py
@@ -248,7 +248,7 @@ def reload(self):
         """Undoes offload"""
         if self.offloaded:
             self.move_models_to(self.device)
-            self.offloaded = True
+            self.offloaded = False
 
     def move_models_to(self, device):
         if self.depth_model is not None:
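To illustrate the offload bookkeeping that PATCH 14 introduces and PATCH 16 corrects, here is a minimal self-contained sketch (class and attribute names are illustrative, not taken from the extension): reload() has to clear the flag, otherwise offload() is skipped on every following generation and the models are never parked back in RAM.

import torch


class OffloadableModel:
    def __init__(self, model, device):
        self.model = model
        self.device = device      # target device, e.g. torch.device('cuda')
        self.offloaded = False    # True while the weights are parked in RAM

    def offload(self):
        # Move to RAM to conserve VRAM
        if self.device != torch.device('cpu') and not self.offloaded:
            self.model.to(torch.device('cpu'))
            self.offloaded = True

    def reload(self):
        # Undo offload; resetting the flag is the one-line fix from PATCH 16
        if self.offloaded:
            self.model.to(self.device)
            self.offloaded = False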