Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 51 additions & 2 deletions tests/cpp/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ SimpleLCG::StateType SimpleLCG::Max() const {
}

void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
CHECK(out);

SimpleLCG lcg{lcg_};
out->Resize(rows_ * cols_, 0);
auto &h_data = out->HostVector();
float sparsity = sparsity_ * (upper_ - lower_) + lower_;
Expand Down Expand Up @@ -202,7 +202,56 @@ std::string RandomDataGenerator::GenerateArrayInterface(
return out;
}

/*!
 * \brief Fill `storage` with dense data and describe it as consecutive batches.
 *
 * Calls GenerateDense() on `storage`, then builds `batches` array-interface
 * JSON documents that point into it.  Each of the first `batches - 1`
 * documents describes `rows_ / batches` rows; the last one describes all the
 * rows that are left.
 *
 * \param storage Buffer that receives the generated values.  Every returned
 *                interface embeds a pointer into this buffer.
 * \param batches Number of batches, must be greater than 0.
 *
 * \return The JSON string of each batch, and a single JSON string describing
 *         all `rows_` rows starting at offset 0.
 */
std::pair<std::vector<std::string>, std::string>
RandomDataGenerator::GenerateArrayInterfaceBatch(
    HostDeviceVector<float> *storage, size_t batches) const {
  // Zero batches would divide by zero below and wrap `batches - 1` around.
  CHECK_GT(batches, 0u);
  this->GenerateDense(storage);

  // Describe `rows` rows beginning `offset` elements into the storage.
  auto make_interface = [storage, this](size_t offset, size_t rows) {
    Json array_interface{Object()};
    array_interface["data"] = std::vector<Json>(2);
    if (device_ >= 0) {
      array_interface["data"][0] =
          Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
    } else {
      array_interface["data"][0] =
          Integer(reinterpret_cast<int64_t>(storage->HostPointer() + offset));
    }

    array_interface["data"][1] = Boolean(false);

    array_interface["shape"] = std::vector<Json>(2);
    array_interface["shape"][0] = rows;
    array_interface["shape"][1] = cols_;

    array_interface["typestr"] = String("<f4");
    array_interface["version"] = 1;
    return array_interface;
  };

  // Interface covering the whole storage.  Built before the batches, as the
  // pointer accessors on `storage` are invoked in that order.
  Json j_interface = make_interface(0, rows_);

  size_t const rows_per_batch = rows_ / batches;
  std::vector<std::string> result(batches);

  size_t offset = 0;
  for (size_t i = 0; i + 1 < batches; ++i) {
    Json j_batch = make_interface(offset, rows_per_batch);
    Json::Dump(j_batch, &result[i]);
    offset += rows_per_batch * cols_;
  }

  // Count the consumed rows directly rather than dividing the element offset
  // by cols_, which would divide by zero for a generator without columns.
  size_t const remaining = rows_ - rows_per_batch * (batches - 1);
  CHECK_LE(offset, rows_ * cols_);
  Json j_last = make_interface(offset, remaining);
  Json::Dump(j_last, &result[batches - 1]);

  std::string interface_str;
  Json::Dump(j_interface, &interface_str);
  return {result, interface_str};
}

std::string RandomDataGenerator::GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const {
Expand All @@ -225,8 +274,8 @@ void RandomDataGenerator::GenerateCSR(
auto& h_value = value->HostVector();
auto& h_rptr = row_ptr->HostVector();
auto& h_cols = columns->HostVector();
SimpleLCG lcg{lcg_};

SimpleLCG lcg{seed_};
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
float sparsity = sparsity_ * (upper_ - lower_) + lower_;

Expand Down
47 changes: 36 additions & 11 deletions tests/cpp/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,25 @@ bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
class SimpleLCG {
private:
using StateType = int64_t;
static StateType constexpr default_init_ = 3;
static StateType constexpr kDefaultInit = 3;
static StateType constexpr default_alpha_ = 61;
static StateType constexpr max_value_ = ((StateType)1 << 32) - 1;

StateType state_;
StateType const alpha_;
StateType const mod_;

StateType const seed_;
StateType seed_;

public:
SimpleLCG() : state_{default_init_},
SimpleLCG() : state_{kDefaultInit},
alpha_{default_alpha_}, mod_{max_value_}, seed_{state_}{}
SimpleLCG(SimpleLCG const& that) = default;
SimpleLCG(SimpleLCG&& that) = default;

/*!
 * \brief Re-seed the generator.
 *
 * Resets both the running state and the recorded seed, using the same rule
 * as the seeded constructor: a seed of 0 starts the state at kDefaultInit.
 * Updating only seed_ would leave state_ at its previous value, so copies of
 * this generator would not reflect the new seed.
 *
 * \param seed New seed value.
 */
void Seed(StateType seed) {
  state_ = seed == 0 ? kDefaultInit : seed;
  seed_ = seed;
}
/*!
* \brief Initialize SimpleLCG.
*
Expand All @@ -118,9 +124,9 @@ class SimpleLCG {
* \param alpha multiplier
* \param mod modulo
*/
SimpleLCG(StateType state,
StateType alpha=default_alpha_, StateType mod=max_value_)
: state_{state == 0 ? default_init_ : state},
explicit SimpleLCG(StateType state,
StateType alpha=default_alpha_, StateType mod=max_value_)
: state_{state == 0 ? kDefaultInit : state},
alpha_{alpha}, mod_{mod} , seed_{state} {}

StateType operator()();
Expand All @@ -131,8 +137,8 @@ class SimpleLCG {
template <typename ResultT>
class SimpleRealUniformDistribution {
private:
ResultT const lower;
ResultT const upper;
ResultT const lower_;
ResultT const upper_;

/*! \brief Over-simplified version of std::generate_canonical. */
template <size_t Bits, typename GeneratorT>
Expand All @@ -156,13 +162,13 @@ class SimpleRealUniformDistribution {

public:
SimpleRealUniformDistribution(ResultT l, ResultT u) :
lower{l}, upper{u} {}
lower_{l}, upper_{u} {}

template <typename GeneratorT>
ResultT operator()(GeneratorT* rng) const {
ResultT tmp = GenerateCanonical<std::numeric_limits<ResultT>::digits,
GeneratorT>(rng);
return (tmp * (upper - lower)) + lower;
return (tmp * (upper_ - lower_)) + lower_;
}
};

Expand All @@ -177,6 +183,7 @@ class RandomDataGenerator {

int32_t device_;
int32_t seed_;
SimpleLCG lcg_;

size_t bins_;

Expand All @@ -186,7 +193,7 @@ class RandomDataGenerator {
public:
RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
: rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
device_{-1}, seed_{0}, bins_{0} {}
device_{-1}, seed_{0}, lcg_{seed_}, bins_{0} {}

RandomDataGenerator &Lower(float v) {
lower_ = v;
Expand All @@ -202,6 +209,7 @@ class RandomDataGenerator {
}
/*! \brief Store the seed, forward it to the internal generator, and return *this for chaining. */
RandomDataGenerator& Seed(int32_t s) {
  lcg_.Seed(s);
  seed_ = s;
  return *this;
}
RandomDataGenerator& Bins(size_t b) {
Expand All @@ -210,9 +218,26 @@ class RandomDataGenerator {
}

void GenerateDense(HostDeviceVector<float>* out) const;

std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;

/*!
* \brief Generate batches of array interface stored in consecutive memory.
*
* \param storage The consecutive memory used to store the arrays.
* \param batches Number of batches.
*
* \return A vector storing JSON string representation of interface for each batch, and
* a single JSON string representing the consecutive memory as a whole
* (combining all the batches).
*/
std::pair<std::vector<std::string>, std::string>
GenerateArrayInterfaceBatch(HostDeviceVector<float> *storage,
size_t batches) const;

std::string GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const;

void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
HostDeviceVector<bst_feature_t>* columns) const;

Expand Down
26 changes: 26 additions & 0 deletions tests/cpp/test_helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <algorithm>

#include "helpers.h"
#include "../../src/data/array_interface.h"
namespace xgboost {

TEST(RandomDataGenerator, DMatrix) {
Expand Down Expand Up @@ -41,4 +42,29 @@ TEST(RandomDataGenerator, DMatrix) {
}
}

// Checks that the batched array interfaces are individually valid, all report
// the requested column count, and together account for every generated row;
// also checks the shape reported by the combined interface.
TEST(RandomDataGenerator, GenerateArrayInterfaceBatch) {
  size_t constexpr kRows{937};
  size_t constexpr kCols{100};
  size_t constexpr kBatches{13};
  float constexpr kSparsity{0.4f};

  HostDeviceVector<float> storage;
  std::vector<std::string> batches;
  std::string array;
  RandomDataGenerator generator{kRows, kCols, kSparsity};
  std::tie(batches, array) =
      generator.GenerateArrayInterfaceBatch(&storage, kBatches);
  CHECK_EQ(batches.size(), kBatches);

  // Sum the row counts reported by each batch.
  size_t total_rows = 0;
  for (size_t i = 0; i < batches.size(); ++i) {
    std::string const &batch_str = batches[i];
    Json j_batch = Json::Load({batch_str.c_str(), batch_str.size()});
    ArrayInterfaceHandler::Validate(get<Object const>(j_batch));
    CHECK_EQ(get<Integer>(j_batch["shape"][1]), kCols);
    total_rows += get<Integer>(j_batch["shape"][0]);
  }
  CHECK_EQ(total_rows, kRows);

  // The combined interface must describe the full matrix.
  Json j_whole = Json::Load({array.c_str(), array.size()});
  CHECK_EQ(get<Integer>(j_whole["shape"][0]), kRows);
  CHECK_EQ(get<Integer>(j_whole["shape"][1]), kCols);
}
} // namespace xgboost