@@ -494,7 +494,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
494494 struct ggml_tensor * output,
495495 int x,
496496 int y,
497- int overlap) {
497+ int overlap_x,
498+ int overlap_y,
499+ int x_skip = 0 ,
500+ int y_skip = 0 ) {
498501 int64_t width = input->ne [0 ];
499502 int64_t height = input->ne [1 ];
500503 int64_t channels = input->ne [2 ];
@@ -503,17 +506,17 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
503506 int64_t img_height = output->ne [1 ];
504507
505508 GGML_ASSERT (input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
506- for (int iy = 0 ; iy < height; iy++) {
507- for (int ix = 0 ; ix < width; ix++) {
509+ for (int iy = y_skip ; iy < height; iy++) {
510+ for (int ix = x_skip ; ix < width; ix++) {
508511 for (int k = 0 ; k < channels; k++) {
509512 float new_value = ggml_tensor_get_f32 (input, ix, iy, k);
510- if (overlap > 0 ) { // blend colors in overlapped area
513+ if (overlap_x > 0 || overlap_y > 0 ) { // blend colors in overlapped area
511514 float old_value = ggml_tensor_get_f32 (output, x + ix, y + iy, k);
512515
513- const float x_f_0 = (x > 0 ) ? ix / float (overlap ) : 1 ;
514- const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float (overlap ) : 1 ;
515- const float y_f_0 = (y > 0 ) ? iy / float (overlap ) : 1 ;
516- const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float (overlap ) : 1 ;
516+ const float x_f_0 = (overlap_x > 0 && x > 0 ) ? ( ix - x_skip) / float (overlap_x ) : 1 ;
517+ const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float (overlap_x ) : 1 ;
518+ const float y_f_0 = (overlap_y > 0 && y > 0 ) ? ( iy - y_skip) / float (overlap_y ) : 1 ;
519+ const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float (overlap_y ) : 1 ;
517520
518521 const float x_f = std::min (std::min (x_f_0, x_f_1), 1 .f );
519522 const float y_f = std::min (std::min (y_f_0, y_f_1), 1 .f );
@@ -745,22 +748,102 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_chunk(struct ggml_contex
745748
// Per-tile callback used by the tiling routines. The visible tiling loop calls it as
// on_processing(input_tile, output_tile, false) once for every tile it processes.
// NOTE(review): the meaning of the trailing bool is not visible from here — presumably
// an "init"/warm-up flag (an earlier revision called it once with a NULL output and `true`); confirm.
746749typedef std::function<void (ggml_tensor*, ggml_tensor*, bool )> on_tile_process;
747750
751+ __STATIC_INLINE__ void sd_tiling_calc_tiles (int & num_tiles_dim,
752+ float & tile_overlap_factor_dim,
753+ int small_dim,
754+ int tile_size,
755+ const float tile_overlap_factor) {
756+ int tile_overlap = (tile_size * tile_overlap_factor);
757+ int non_tile_overlap = tile_size - tile_overlap;
758+
759+ num_tiles_dim = (small_dim - tile_overlap) / non_tile_overlap;
760+ int overshoot_dim = ((num_tiles_dim + 1 ) * non_tile_overlap + tile_overlap) % small_dim;
761+
762+ if ((overshoot_dim != non_tile_overlap) && (overshoot_dim <= num_tiles_dim * (tile_size / 2 - tile_overlap))) {
763+ // if tiles don't fit perfectly using the desired overlap
764+ // and there is enough room to squeeze an extra tile without overlap becoming >0.5
765+ num_tiles_dim++;
766+ }
767+
768+ tile_overlap_factor_dim = (float )(tile_size * num_tiles_dim - small_dim) / (float )(tile_size * (num_tiles_dim - 1 ));
769+ if (num_tiles_dim <= 2 ) {
770+ if (small_dim <= tile_size) {
771+ num_tiles_dim = 1 ;
772+ tile_overlap_factor_dim = 0 ;
773+ } else {
774+ num_tiles_dim = 2 ;
775+ tile_overlap_factor_dim = (2 * tile_size - small_dim) / (float )tile_size;
776+ }
777+ }
778+ }
779+
748780// Tiling
749- __STATIC_INLINE__ void sd_tiling (ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
781+ __STATIC_INLINE__ void sd_tiling_non_square (ggml_tensor* input,
782+ ggml_tensor* output,
783+ const int scale,
784+ const int p_tile_size_x,
785+ const int p_tile_size_y,
786+ const float tile_overlap_factor,
787+ on_tile_process on_processing) {
750788 output = ggml_set_f32 (output, 0 );
751789
752790 int input_width = (int )input->ne [0 ];
753791 int input_height = (int )input->ne [1 ];
754792 int output_width = (int )output->ne [0 ];
755793 int output_height = (int )output->ne [1 ];
794+
795+ GGML_ASSERT (((input_width / output_width) == (input_height / output_height)) &&
796+ ((output_width / input_width) == (output_height / input_height)));
797+ GGML_ASSERT (((input_width / output_width) == scale) ||
798+ ((output_width / input_width) == scale));
799+
800+ int small_width = output_width;
801+ int small_height = output_height;
802+
803+ bool decode = output_width > input_width;
804+ if (decode) {
805+ small_width = input_width;
806+ small_height = input_height;
807+ }
808+
809+ int num_tiles_x;
810+ float tile_overlap_factor_x;
811+ sd_tiling_calc_tiles (num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor);
812+
813+ int num_tiles_y;
814+ float tile_overlap_factor_y;
815+ sd_tiling_calc_tiles (num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor);
816+
817+ LOG_DEBUG (" num tiles : %d, %d " , num_tiles_x, num_tiles_y);
818+ LOG_DEBUG (" optimal overlap : %f, %f (targeting %f)" , tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor);
819+
756820 GGML_ASSERT (input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0 ); // should be multiple of 2
757821
758- int tile_overlap = (int32_t )(tile_size * tile_overlap_factor);
759- int non_tile_overlap = tile_size - tile_overlap;
822+ int tile_overlap_x = (int32_t )(p_tile_size_x * tile_overlap_factor_x);
823+ int non_tile_overlap_x = p_tile_size_x - tile_overlap_x;
824+
825+ int tile_overlap_y = (int32_t )(p_tile_size_y * tile_overlap_factor_y);
826+ int non_tile_overlap_y = p_tile_size_y - tile_overlap_y;
827+
828+ int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width;
829+ int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height;
830+
831+ int input_tile_size_x = tile_size_x;
832+ int input_tile_size_y = tile_size_y;
833+ int output_tile_size_x = tile_size_x;
834+ int output_tile_size_y = tile_size_y;
835+
836+ if (decode) {
837+ output_tile_size_x *= scale;
838+ output_tile_size_y *= scale;
839+ } else {
840+ input_tile_size_x *= scale;
841+ input_tile_size_y *= scale;
842+ }
760843
761844 struct ggml_init_params params = {};
762- params.mem_size += tile_size * tile_size * input->ne [2 ] * sizeof (float ); // input chunk
763- params.mem_size += (tile_size * scale) * (tile_size * scale) * output->ne [2 ] * sizeof (float ); // output chunk
845+ params.mem_size += input_tile_size_x * input_tile_size_y * input->ne [2 ] * sizeof (float ); // input chunk
846+ params.mem_size += output_tile_size_x * output_tile_size_y * output->ne [2 ] * sizeof (float ); // output chunk
764847 params.mem_size += 3 * ggml_tensor_overhead ();
765848 params.mem_buffer = NULL ;
766849 params.no_alloc = false ;
@@ -775,29 +858,50 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
775858 }
776859
777860 // tiling
778- ggml_tensor* input_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, tile_size, tile_size, input->ne [2 ], 1 );
779- ggml_tensor* output_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne [2 ], 1 );
780- on_processing (input_tile, NULL , true );
781- int num_tiles = ceil ((float )input_width / non_tile_overlap) * ceil ((float )input_height / non_tile_overlap);
861+ ggml_tensor* input_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne [2 ], 1 );
862+ ggml_tensor* output_tile = ggml_new_tensor_4d (tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne [2 ], 1 );
863+ int num_tiles = num_tiles_x * num_tiles_y;
782864 LOG_INFO (" processing %i tiles" , num_tiles);
783- pretty_progress (1 , num_tiles, 0 .0f );
865+ pretty_progress (0 , num_tiles, 0 .0f );
784866 int tile_count = 1 ;
785867 bool last_y = false , last_x = false ;
786868 float last_time = 0 .0f ;
787- for (int y = 0 ; y < input_height && !last_y; y += non_tile_overlap) {
788- if (y + tile_size >= input_height) {
789- y = input_height - tile_size;
869+ for (int y = 0 ; y < small_height && !last_y; y += non_tile_overlap_y) {
870+ int dy = 0 ;
871+ if (y + tile_size_y >= small_height) {
872+ int _y = y;
873+ y = small_height - tile_size_y;
874+ dy = _y - y;
875+ if (decode) {
876+ dy *= scale;
877+ }
790878 last_y = true ;
791879 }
792- for (int x = 0 ; x < input_width && !last_x; x += non_tile_overlap) {
793- if (x + tile_size >= input_width) {
794- x = input_width - tile_size;
880+ for (int x = 0 ; x < small_width && !last_x; x += non_tile_overlap_x) {
881+ int dx = 0 ;
882+ if (x + tile_size_x >= small_width) {
883+ int _x = x;
884+ x = small_width - tile_size_x;
885+ dx = _x - x;
886+ if (decode) {
887+ dx *= scale;
888+ }
795889 last_x = true ;
796890 }
891+
892+ int x_in = decode ? x : scale * x;
893+ int y_in = decode ? y : scale * y;
894+ int x_out = decode ? x * scale : x;
895+ int y_out = decode ? y * scale : y;
896+
897+ int overlap_x_out = decode ? tile_overlap_x * scale : tile_overlap_x;
898+ int overlap_y_out = decode ? tile_overlap_y * scale : tile_overlap_y;
899+
797900 int64_t t1 = ggml_time_ms ();
798- ggml_split_tensor_2d (input, input_tile, x, y );
901+ ggml_split_tensor_2d (input, input_tile, x_in, y_in );
799902 on_processing (input_tile, output_tile, false );
800- ggml_merge_tensor_2d (output_tile, output, x * scale, y * scale, tile_overlap * scale);
903+ ggml_merge_tensor_2d (output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);
904+
801905 int64_t t2 = ggml_time_ms ();
802906 last_time = (t2 - t1) / 1000 .0f ;
803907 pretty_progress (tile_count, num_tiles, last_time);
@@ -811,6 +915,15 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
811915 ggml_free (tiles_ctx);
812916}
813917
918+ __STATIC_INLINE__ void sd_tiling (ggml_tensor* input,
919+ ggml_tensor* output,
920+ const int scale,
921+ const int tile_size,
922+ const float tile_overlap_factor,
923+ on_tile_process on_processing) {
924+ sd_tiling_non_square (input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing);
925+ }
926+
814927__STATIC_INLINE__ struct ggml_tensor * ggml_group_norm_32 (struct ggml_context * ctx,
815928 struct ggml_tensor * a) {
816929 const float eps = 1e-6f ; // default eps parameter
0 commit comments