Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 67 additions & 2 deletions adaptive_boxes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/extrema.h>
//STL
#include <vector>
// cuda call
Expand All @@ -18,6 +19,9 @@
// csv
#include "./include/csv_tools.h"
#include "./include/io_tools.h"
// partition system
#include "./include/partition_graph.h"
#include "./include/partition_kernels.h"


int main(int argc, char *argv[]){
Expand Down Expand Up @@ -92,6 +96,53 @@ int main(int argc, char *argv[]){
setup_kernel<<<grid, block>>>(devStates);
cudaDeviceSynchronize();

// Partition system initialization
const int partition_size = 32; // Default 32x32 tiles
const float density_threshold = 0.1f; // 10% filled pixels threshold

// Adaptive partitioning threshold - only use partitions for large matrices
// Eliminates overhead for small datasets while preserving gains for large ones
bool use_partitions = (m * n > 500000); // ~707x707 threshold

partition_t *partitions_d = nullptr;
int *adjacency_matrix_d = nullptr;
int partition_count = 0;

// Setup partition kernels grid/block dimensions (declared here for use in main loop)
dim3 partition_grid, partition_block, connectivity_grid, connectivity_block;

if (use_partitions) {
partition_count = calculate_partition_count(m, n, partition_size);
printf("Initializing partition system: %d partitions of size %dx%d\n", partition_count, partition_size, partition_size);

// Allocate partition device memory
CC(cudaMalloc((void**)&partitions_d, partition_count * sizeof(partition_t)));
CC(cudaMalloc((void**)&adjacency_matrix_d, partition_count * partition_count * sizeof(int)));

// Initialize partitions with zero values
CC(cudaMemset(partitions_d, 0, partition_count * sizeof(partition_t)));
CC(cudaMemset(adjacency_matrix_d, 0, partition_count * partition_count * sizeof(int)));

// Configure partition kernels grid/block dimensions
partition_grid = dim3(partition_count, 1, 1);
partition_block = dim3(min(256, partition_size * partition_size), 1, 1); // Max 256 threads per block

connectivity_grid = dim3((partition_count + 255) / 256, 1, 1);
connectivity_block = dim3(256, 1, 1);

// Calculate initial density and connectivity
compute_partition_density<<<partition_grid, partition_block>>>(data_d, m, n, partitions_d, partition_count, partition_size);
cudaDeviceSynchronize();

build_connectivity_graph<<<connectivity_grid, connectivity_block>>>(partitions_d, partition_count, adjacency_matrix_d, density_threshold, partition_size, n);
cudaDeviceSynchronize();

update_partition_priorities<<<connectivity_grid, connectivity_block>>>(partitions_d, partition_count, adjacency_matrix_d);
cudaDeviceSynchronize();
} else {
printf("Using original random exploration (matrix size %ldx%ld below threshold)\n", m, n);
}

// Loop
printf("Working...\n");
rectangle_t rec;
Expand All @@ -109,7 +160,7 @@ int main(int argc, char *argv[]){
int x1,x2,y1,y2;

for (int step=0; step<max_step; step++){
find_largest_rectangle<<<grid,block>>>(devStates,m,n,data_d,out_d, areas_d);
find_largest_rectangle<<<grid,block>>>(devStates,m,n,data_d,out_d, areas_d, partitions_d, partition_count);
cudaDeviceSynchronize();

thrust::device_vector<int>::iterator iter = thrust::max_element(t_areas_d.begin(), t_areas_d.end());
Expand Down Expand Up @@ -148,9 +199,21 @@ int main(int argc, char *argv[]){
recs.push_back(rec);
}

last_sum = sum;

// Update partitions every 10th rectangle removal (only if using partitions)
if (use_partitions && step % 10 == 0) {
// Update affected partitions after rectangle removal
update_affected_partitions<<<partition_grid, partition_block>>>(x1, x2, y1, y2, partitions_d, partition_count, data_d, m, n, partition_size);
cudaDeviceSynchronize();

// Recompute priorities for updated partitions
update_partition_priorities<<<connectivity_grid, connectivity_block>>>(partitions_d, partition_count, adjacency_matrix_d);
cudaDeviceSynchronize();
}

/*printf("sum = %d\n", sum); */

last_sum = sum;
if(sum<=0){
break;
}
Expand All @@ -177,6 +240,8 @@ int main(int argc, char *argv[]){

// Free memory
cudaFree(devStates);
cudaFree(partitions_d);
cudaFree(adjacency_matrix_d);
/*delete data;*/

return 0;
Expand Down
107 changes: 107 additions & 0 deletions include/partition_graph.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#ifndef PARTITION_GRAPH_H
#define PARTITION_GRAPH_H

#include <cuda_runtime.h>

/**
 * Partition structure representing a spatial tile in the 2D matrix.
 * Used for spatial partitioning to guide rectangle exploration:
 * kernels score each tile and threads preferentially search tiles
 * with the highest priority (see get_priority_guided_partition).
 */
struct partition_t {
    int x_start, x_end;   // Tile boundaries along x (inclusive on both ends)
    int y_start, y_end;   // Tile boundaries along y (inclusive on both ends)
    float density;        // Ratio of filled pixels inside the tile, range 0.0-1.0
    int connectivity;     // Number of connected neighboring tiles
    float priority;       // Search priority score; higher is preferred by get_priority_guided_partition
};

/**
 * Partition graph structure for spatial decomposition.
 * Bundles the partition array with its connectivity relationships.
 * NOTE(review): the visible main() manages these buffers through raw
 * device pointers rather than this aggregate — confirm it is used
 * elsewhere before extending it.
 */
struct partition_graph_t {
    partition_t* partitions;   // Device array of partitions
    int partition_count;       // Total number of partitions
    int* adjacency_matrix;     // Dense connectivity matrix, partition_count x partition_count entries
    int* priority_queue;       // Partition indices sorted by priority — TODO confirm who fills/consumes this
    int partition_size;        // Edge length of each square tile (e.g. 32 for 32x32)
};

/**
* Helper functions for partition management
*/

/**
 * Number of square tiles needed to fully cover an m x n matrix.
 *
 * Partial tiles at the right and bottom edges count as whole tiles,
 * hence the ceiling division on each axis.
 *
 * Args:
 *   m: Matrix height (rows)
 *   n: Matrix width (columns)
 *   partition_size: Edge length of one square tile
 *
 * Returns:
 *   Total tile count (tiles down * tiles across)
 */
__host__ __device__ inline int calculate_partition_count(int m, int n, int partition_size) {
    // Ceiling division: (a + b - 1) / b tiles per axis.
    const int tiles_down = (m + partition_size - 1) / partition_size;
    const int tiles_across = (n + partition_size - 1) / partition_size;
    return tiles_down * tiles_across;
}

/**
 * Map a matrix coordinate to the index of the tile containing it.
 *
 * Tiles are laid out row-major: index = tile_row * tiles_per_row + tile_col,
 * where tiles_per_row is the (ceiling) number of tiles across the width.
 *
 * Args:
 *   row: Matrix row of the pixel
 *   col: Matrix column of the pixel
 *   n: Matrix width (columns)
 *   partition_size: Edge length of one square tile
 *
 * Returns:
 *   Row-major tile index for (row, col)
 */
__host__ __device__ inline int get_partition_index(int row, int col, int n, int partition_size) {
    const int tiles_per_row = (n + partition_size - 1) / partition_size;  // ceiling division
    const int tile_row = row / partition_size;
    const int tile_col = col / partition_size;
    return tile_row * tiles_per_row + tile_col;
}

/**
 * Pick a high-priority partition for this thread without clustering.
 *
 * Rather than a full linear scan, each thread probes a small
 * pseudo-randomly strided subset of the partition array (at most 8
 * entries) and returns the highest-priority tile it saw. Distinct
 * thread ids start the stride at different offsets, spreading threads
 * across the partition space.
 *
 * Args:
 *   partitions: Device array of partitions (may be NULL)
 *   partition_count: Number of entries in partitions
 *   thread_id: Unique thread identifier used to seed the sampling
 *
 * Returns:
 *   Index of the best-priority sampled partition (0 if no partitions)
 */
__device__ inline int get_priority_guided_partition(partition_t* partitions, int partition_count, int thread_id) {
    // Degenerate input: no partition system in use.
    if (partitions == NULL || partition_count <= 0) return 0;

    // Probe at most 8 entries — cheap compared to a full scan.
    const int samples = min(8, partition_count);

    int winner = 0;
    float winner_priority = -1.0f;

    for (int s = 0; s < samples; s++) {
        // Constant stride of 1337 gives a pseudo-random spread of probe
        // positions per thread across the partition array.
        const int idx = (thread_id + s * 1337) % partition_count;
        const float candidate = partitions[idx].priority;
        if (candidate > winner_priority) {
            winner_priority = candidate;
            winner = idx;
        }
    }

    return winner;
}

#endif // PARTITION_GRAPH_H
Loading