Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ inline size_t TotalMemory(int device_idx) {
}

/**
* \fn inline int max_shared_memory(int device_idx)
* \fn inline int MaxSharedMemory(int device_idx)
*
* \brief Maximum shared memory per block on this device.
*
Expand All @@ -113,6 +113,23 @@ inline size_t MaxSharedMemory(int device_idx) {
return prop.sharedMemPerBlock;
}

/**
 * \fn inline size_t MaxSharedMemoryOptin(int device_idx)
 *
 * \brief Maximum dynamic shared memory per thread block on this device
 *        that can be opted into when using cudaFuncSetAttribute()
 *        (cudaFuncAttributeMaxDynamicSharedMemorySize).
 *
 * \param device_idx Zero-based index of the device.
 *
 * \return Opt-in shared-memory limit in bytes. May exceed the default
 *         per-block limit reported by MaxSharedMemory(); kernels must
 *         explicitly opt in via cudaFuncSetAttribute() to use the excess.
 */

inline size_t MaxSharedMemoryOptin(int device_idx) {
  // Attribute queries return int; widen to size_t on return.
  int max_shared_memory = 0;
  dh::safe_cuda(cudaDeviceGetAttribute
                (&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                 device_idx));
  return size_t(max_shared_memory);
}

inline void CheckComputeCapability() {
for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
cudaDeviceProp prop;
Expand Down
45 changes: 31 additions & 14 deletions src/tree/gpu_hist/histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,35 +150,52 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> d_ridx,
common::Span<GradientSumT> histogram,
GradientSumT rounding, bool shared) {
const size_t smem_size =
shared
? sizeof(GradientSumT) * matrix.NumBins()
: 0;
auto n_elements = d_ridx.size() * matrix.row_stride;
GradientSumT rounding) {
// decide whether to use shared memory
int device = 0;
dh::safe_cuda(cudaGetDevice(&device));
int max_shared_memory = dh::MaxSharedMemoryOptin(device);
size_t smem_size = sizeof(GradientSumT) * matrix.NumBins();
bool shared = smem_size <= max_shared_memory;
smem_size = shared ? smem_size : 0;

// opt into maximum shared memory for the kernel if necessary
auto kernel = SharedMemHistKernel<GradientSumT>;
if (shared) {
dh::safe_cuda(cudaFuncSetAttribute
(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
max_shared_memory));
}

// determine the launch configuration
unsigned block_threads = shared ? 1024 : 256;
int n_mps = 0;
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor
(&n_blocks_per_mp, kernel, block_threads, smem_size));
unsigned grid_size = n_blocks_per_mp * n_mps;

uint32_t items_per_thread = 8;
uint32_t block_threads = 256;
auto grid_size = static_cast<uint32_t>(
common::DivRoundUp(n_elements, items_per_thread * block_threads));
auto n_elements = d_ridx.size() * matrix.row_stride;
dh::LaunchKernel {grid_size, block_threads, smem_size} (
SharedMemHistKernel<GradientSumT>,
matrix, d_ridx, histogram.data(), gpair.data(), n_elements,
kernel, matrix, d_ridx, histogram.data(), gpair.data(), n_elements,
rounding, shared);
dh::safe_cuda(cudaGetLastError());
}

template void BuildGradientHistogram<GradientPair>(
EllpackDeviceAccessor const& matrix,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> ridx,
common::Span<GradientPair> histogram,
GradientPair rounding, bool shared);
GradientPair rounding);

template void BuildGradientHistogram<GradientPairPrecise>(
EllpackDeviceAccessor const& matrix,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> ridx,
common::Span<GradientPairPrecise> histogram,
GradientPairPrecise rounding, bool shared);
GradientPairPrecise rounding);

} // namespace tree
} // namespace xgboost
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
common::Span<GradientPair const> gpair,
common::Span<const uint32_t> ridx,
common::Span<GradientSumT> histogram,
GradientSumT rounding, bool shared);
GradientSumT rounding);
} // namespace tree
} // namespace xgboost

Expand Down
12 changes: 1 addition & 11 deletions src/tree/updater_gpu_hist.cu
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,6 @@ struct GPUHistMakerDevice {

TrainParam param;
bool deterministic_histogram;
bool use_shared_memory_histograms {false};

GradientSumT histogram_rounding;

Expand Down Expand Up @@ -596,7 +595,7 @@ struct GPUHistMakerDevice {
auto d_node_hist = hist.GetNodeHistogram(nidx);
auto d_ridx = row_partitioner->GetRows(nidx);
BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
histogram_rounding, use_shared_memory_histograms);
histogram_rounding);
}

void SubtractionTrick(int nidx_parent, int nidx_histogram,
Expand Down Expand Up @@ -910,15 +909,6 @@ inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
host_node_sum_gradients.resize(param.MaxNodes());
node_sum_gradients.resize(param.MaxNodes());

// check if we can use shared memory for building histograms
// (assuming atleast we need 2 CTAs per SM to maintain decent latency
// hiding)
auto histogram_size = sizeof(GradientSumT) * page->Cuts().TotalBins();
auto max_smem = dh::MaxSharedMemory(device_id);
if (histogram_size <= max_smem) {
use_shared_memory_histograms = true;
}

// Init histogram
hist.Init(device_id, page->Cuts().TotalBins());
}
Expand Down
6 changes: 3 additions & 3 deletions tests/cpp/tree/gpu_hist/test_histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ void TestDeterminsticHistogram() {

auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
d_histogram, rounding, true);
d_histogram, rounding);

for (size_t i = 0; i < kRounds; ++i) {
dh::device_vector<Gradient> new_histogram(kBins * kCols);
auto d_histogram = dh::ToSpan(new_histogram);

auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
d_histogram, rounding, true);
d_histogram, rounding);

for (size_t j = 0; j < new_histogram.size(); ++j) {
ASSERT_EQ(((Gradient)new_histogram[j]).GetGrad(),
Expand All @@ -50,7 +50,7 @@ void TestDeterminsticHistogram() {
gpair.SetDevice(0);
dh::device_vector<Gradient> baseline(kBins * kCols);
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
dh::ToSpan(baseline), rounding, true);
dh::ToSpan(baseline), rounding);
for (size_t i = 0; i < baseline.size(); ++i) {
EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),
((Gradient)baseline[i]).GetGrad() * 1e-3);
Expand Down
1 change: 0 additions & 1 deletion tests/cpp/tree/test_gpu_hist.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ void TestBuildHist(bool use_shared_memory_histograms) {
maker.hist.AllocateHistogram(0);
maker.gpair = gpair.DeviceSpan();

maker.use_shared_memory_histograms = use_shared_memory_histograms;
maker.BuildHist(0);
DeviceHistogram<GradientSumT> d_hist = maker.hist;

Expand Down