Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 67 additions & 2 deletions adaptive_boxes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/extrema.h>
//STL
#include <vector>
// cuda call
Expand All @@ -18,6 +19,9 @@
// csv
#include "./include/csv_tools.h"
#include "./include/io_tools.h"
// partition system
#include "./include/partition_graph.h"
#include "./include/partition_kernels.h"


int main(int argc, char *argv[]){
Expand Down Expand Up @@ -92,6 +96,53 @@ int main(int argc, char *argv[]){
setup_kernel<<<grid, block>>>(devStates);
cudaDeviceSynchronize();

// Partition system initialization
const int partition_size = 32; // Default 32x32 tiles
const float density_threshold = 0.1f; // 10% filled pixels threshold

// Adaptive partitioning threshold - only use partitions for large matrices
// Eliminates overhead for small datasets while preserving gains for large ones
bool use_partitions = (m * n > 500000); // ~707x707 threshold

partition_t *partitions_d = nullptr;
int *adjacency_matrix_d = nullptr;
int partition_count = 0;

// Setup partition kernels grid/block dimensions (declared here for use in main loop)
dim3 partition_grid, partition_block, connectivity_grid, connectivity_block;

if (use_partitions) {
partition_count = calculate_partition_count(m, n, partition_size);
printf("Initializing partition system: %d partitions of size %dx%d\n", partition_count, partition_size, partition_size);

// Allocate partition device memory
CC(cudaMalloc((void**)&partitions_d, partition_count * sizeof(partition_t)));
CC(cudaMalloc((void**)&adjacency_matrix_d, partition_count * partition_count * sizeof(int)));

// Initialize partitions with zero values
CC(cudaMemset(partitions_d, 0, partition_count * sizeof(partition_t)));
CC(cudaMemset(adjacency_matrix_d, 0, partition_count * partition_count * sizeof(int)));

// Configure partition kernels grid/block dimensions
partition_grid = dim3(partition_count, 1, 1);
partition_block = dim3(min(256, partition_size * partition_size), 1, 1); // Max 256 threads per block

connectivity_grid = dim3((partition_count + 255) / 256, 1, 1);
connectivity_block = dim3(256, 1, 1);

// Calculate initial density and connectivity
compute_partition_density<<<partition_grid, partition_block>>>(data_d, m, n, partitions_d, partition_count, partition_size);
cudaDeviceSynchronize();

build_connectivity_graph<<<connectivity_grid, connectivity_block>>>(partitions_d, partition_count, adjacency_matrix_d, density_threshold, partition_size, n);
cudaDeviceSynchronize();

update_partition_priorities<<<connectivity_grid, connectivity_block>>>(partitions_d, partition_count, adjacency_matrix_d);
cudaDeviceSynchronize();
} else {
printf("Using original random exploration (matrix size %ldx%ld below threshold)\n", m, n);
}

// Loop
printf("Working...\n");
rectangle_t rec;
Expand All @@ -109,7 +160,7 @@ int main(int argc, char *argv[]){
int x1,x2,y1,y2;

for (int step=0; step<max_step; step++){
find_largest_rectangle<<<grid,block>>>(devStates,m,n,data_d,out_d, areas_d);
find_largest_rectangle<<<grid,block>>>(devStates,m,n,data_d,out_d, areas_d, partitions_d, partition_count);
cudaDeviceSynchronize();

thrust::device_vector<int>::iterator iter = thrust::max_element(t_areas_d.begin(), t_areas_d.end());
Expand Down Expand Up @@ -148,9 +199,21 @@ int main(int argc, char *argv[]){
recs.push_back(rec);
}

last_sum = sum;

// Update partitions every 10th rectangle removal (only if using partitions)
if (use_partitions && step % 10 == 0) {
// Update affected partitions after rectangle removal
update_affected_partitions<<<partition_grid, partition_block>>>(x1, x2, y1, y2, partitions_d, partition_count, data_d, m, n, partition_size);
cudaDeviceSynchronize();

// Recompute priorities for updated partitions
update_partition_priorities<<<connectivity_grid, connectivity_block>>>(partitions_d, partition_count, adjacency_matrix_d);
cudaDeviceSynchronize();
}

/*printf("sum = %d\n", sum); */

last_sum = sum;
if(sum<=0){
break;
}
Expand All @@ -177,6 +240,8 @@ int main(int argc, char *argv[]){

// Free memory
cudaFree(devStates);
cudaFree(partitions_d);
cudaFree(adjacency_matrix_d);
/*delete data;*/

return 0;
Expand Down
107 changes: 107 additions & 0 deletions include/partition_graph.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#ifndef PARTITION_GRAPH_H
#define PARTITION_GRAPH_H

#include <cuda_runtime.h>

/**
 * Partition structure representing a spatial tile in the 2D matrix.
 * Used for spatial partitioning to guide rectangle exploration:
 * kernels score each tile and threads preferentially search tiles
 * with the highest priority (see get_priority_guided_partition).
 */
struct partition_t {
    int x_start, x_end;   // Tile boundaries along x (inclusive on both ends)
    int y_start, y_end;   // Tile boundaries along y (inclusive on both ends)
    float density;        // Ratio of filled pixels inside the tile, range 0.0-1.0
    int connectivity;     // Number of connected neighboring tiles
    float priority;       // Search priority score; higher is preferred by get_priority_guided_partition
};

/**
 * Partition graph structure for spatial decomposition.
 * Bundles the partition array with its connectivity relationships.
 * NOTE(review): the visible main() manages these buffers through raw
 * device pointers rather than this aggregate — confirm it is used
 * elsewhere before extending it.
 */
struct partition_graph_t {
    partition_t* partitions;   // Device array of partitions
    int partition_count;       // Total number of partitions
    int* adjacency_matrix;     // Dense connectivity matrix, partition_count x partition_count entries
    int* priority_queue;       // Partition indices sorted by priority — TODO confirm who fills/consumes this
    int partition_size;        // Edge length of each square tile (e.g. 32 for 32x32)
};

/**
* Helper functions for partition management
*/

/**
 * Number of square tiles needed to fully cover an m x n matrix.
 *
 * Partial tiles at the right and bottom edges count as whole tiles,
 * hence the ceiling division on each axis.
 *
 * Args:
 *   m: Matrix height (rows)
 *   n: Matrix width (columns)
 *   partition_size: Edge length of one square tile
 *
 * Returns:
 *   Total tile count (tiles down * tiles across)
 */
__host__ __device__ inline int calculate_partition_count(int m, int n, int partition_size) {
    // Ceiling division: (a + b - 1) / b tiles per axis.
    const int tiles_down = (m + partition_size - 1) / partition_size;
    const int tiles_across = (n + partition_size - 1) / partition_size;
    return tiles_down * tiles_across;
}

/**
 * Map a matrix coordinate to the index of the tile containing it.
 *
 * Tiles are laid out row-major: index = tile_row * tiles_per_row + tile_col,
 * where tiles_per_row is the (ceiling) number of tiles across the width.
 *
 * Args:
 *   row: Matrix row of the pixel
 *   col: Matrix column of the pixel
 *   n: Matrix width (columns)
 *   partition_size: Edge length of one square tile
 *
 * Returns:
 *   Row-major tile index for (row, col)
 */
__host__ __device__ inline int get_partition_index(int row, int col, int n, int partition_size) {
    const int tiles_per_row = (n + partition_size - 1) / partition_size;  // ceiling division
    const int tile_row = row / partition_size;
    const int tile_col = col / partition_size;
    return tile_row * tiles_per_row + tile_col;
}

/**
 * Pick a high-priority partition for this thread without clustering.
 *
 * Rather than a full linear scan, each thread probes a small
 * pseudo-randomly strided subset of the partition array (at most 8
 * entries) and returns the highest-priority tile it saw. Distinct
 * thread ids start the stride at different offsets, spreading threads
 * across the partition space.
 *
 * Args:
 *   partitions: Device array of partitions (may be NULL)
 *   partition_count: Number of entries in partitions
 *   thread_id: Unique thread identifier used to seed the sampling
 *
 * Returns:
 *   Index of the best-priority sampled partition (0 if no partitions)
 */
__device__ inline int get_priority_guided_partition(partition_t* partitions, int partition_count, int thread_id) {
    // Degenerate input: no partition system in use.
    if (partitions == NULL || partition_count <= 0) return 0;

    // Probe at most 8 entries — cheap compared to a full scan.
    const int samples = min(8, partition_count);

    int winner = 0;
    float winner_priority = -1.0f;

    for (int s = 0; s < samples; s++) {
        // Constant stride of 1337 gives a pseudo-random spread of probe
        // positions per thread across the partition array.
        const int idx = (thread_id + s * 1337) % partition_count;
        const float candidate = partitions[idx].priority;
        if (candidate > winner_priority) {
            winner_priority = candidate;
            winner = idx;
        }
    }

    return winner;
}

#endif // PARTITION_GRAPH_H
Loading