Skip to content

Commit 4ab790e

Browse files
committed
NUMA bindings
1 parent 3ddfbaa commit 4ab790e

20 files changed

+451
-17
lines changed

cachelib/allocator/CacheAllocator-inl.h

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,28 +105,37 @@ CacheAllocator<CacheTrait>::~CacheAllocator() {
105105
}
106106

107107
template <typename CacheTrait>
108-
std::unique_ptr<MemoryAllocator>
109-
CacheAllocator<CacheTrait>::createNewMemoryAllocator() {
108+
ShmSegmentOpts CacheAllocator<CacheTrait>::createShmCacheOpts() {
110109
ShmSegmentOpts opts;
111110
opts.alignment = sizeof(Slab);
111+
auto memoryTierConfigs = config_.getMemoryTierConfigs();
112+
// TODO: only a single tier is supported so far
113+
XDCHECK_EQ(memoryTierConfigs.size(), 1ul);
114+
opts.memBindNumaNodes = memoryTierConfigs[0].getMemBind();
115+
116+
return opts;
117+
}
118+
119+
template <typename CacheTrait>
120+
std::unique_ptr<MemoryAllocator>
121+
CacheAllocator<CacheTrait>::createNewMemoryAllocator() {
112122
return std::make_unique<MemoryAllocator>(
113123
getAllocatorConfig(config_),
114124
shmManager_
115125
->createShm(detail::kShmCacheName, config_.size,
116-
config_.slabMemoryBaseAddr, opts)
126+
config_.slabMemoryBaseAddr, createShmCacheOpts())
117127
.addr,
118128
config_.size);
119129
}
120130

121131
template <typename CacheTrait>
122132
std::unique_ptr<MemoryAllocator>
123133
CacheAllocator<CacheTrait>::restoreMemoryAllocator() {
124-
ShmSegmentOpts opts;
125-
opts.alignment = sizeof(Slab);
126134
return std::make_unique<MemoryAllocator>(
127135
deserializer_->deserialize<MemoryAllocator::SerializationType>(),
128136
shmManager_
129-
->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, opts)
137+
->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr,
138+
createShmCacheOpts())
130139
.addr,
131140
config_.size,
132141
config_.disableFullCoredump);

cachelib/allocator/CacheAllocator.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1868,6 +1868,7 @@ class CacheAllocator : public CacheBase {
18681868
std::unique_ptr<T>& worker,
18691869
std::chrono::seconds timeout = std::chrono::seconds{0});
18701870

1871+
ShmSegmentOpts createShmCacheOpts();
18711872
std::unique_ptr<MemoryAllocator> createNewMemoryAllocator();
18721873
std::unique_ptr<MemoryAllocator> restoreMemoryAllocator();
18731874
std::unique_ptr<CCacheManager> restoreCCacheManager();

cachelib/allocator/CacheAllocatorConfig.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ class CacheAllocatorConfig {
210210
CacheAllocatorConfig& configureMemoryTiers(const MemoryTierConfigs& configs);
211211

212212
// Return reference to MemoryTierCacheConfigs.
213-
const MemoryTierConfigs& getMemoryTierConfigs();
213+
const MemoryTierConfigs& getMemoryTierConfigs() const noexcept;
214214

215215
// This turns on a background worker that periodically scans through the
216216
// access container and look for expired items and remove them.
@@ -877,7 +877,7 @@ CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::configureMemoryTiers(
877877

878878
template <typename T>
879879
const typename CacheAllocatorConfig<T>::MemoryTierConfigs&
880-
CacheAllocatorConfig<T>::getMemoryTierConfigs() {
880+
CacheAllocatorConfig<T>::getMemoryTierConfigs() const noexcept {
881881
return memoryTierConfigs;
882882
}
883883

cachelib/allocator/MemoryTierCacheConfig.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616

1717
#pragma once
1818

19-
#include <string>
20-
2119
#include "cachelib/shm/ShmCommon.h"
2220

2321
namespace facebook {
@@ -43,6 +41,16 @@ class MemoryTierCacheConfig {
4341

4442
size_t getRatio() const noexcept { return ratio; }
4543

44+
// Allocate memory only from specified NUMA nodes
45+
MemoryTierCacheConfig& setMemBind(const NumaBitMask& _numaNodes) {
46+
numaNodes = _numaNodes;
47+
return *this;
48+
}
49+
50+
const NumaBitMask& getMemBind() const noexcept {
51+
return numaNodes;
52+
}
53+
4654
size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) {
4755
// TODO: Call this method when tiers are enabled in allocator
4856
// to calculate tier sizes in bytes.
@@ -59,14 +67,17 @@ class MemoryTierCacheConfig {
5967
return getRatio() * (totalCacheSize / partitionNum);
6068
}
6169

70+
private:
6271
// Ratio is a number of parts of the total cache size to be allocated for this
6372
// tier. E.g. if X is a total cache size, Yi are ratios specified for memory
6473
// tiers, and Y is the sum of all Yi, then size of the i-th tier
6574
// Xi = (X / Y) * Yi. For example, to configure a 2-tier cache where each
6675
// tier is a half of the total cache size, set both tiers' ratios to 1.
6776
size_t ratio{1};
6877

69-
private:
78+
// Numa node(s) to bind the tier
79+
NumaBitMask numaNodes;
80+
7081
// TODO: introduce a container for tier settings when adding support for
7182
// file-mapped memory
7283
MemoryTierCacheConfig() = default;

cachelib/cachebench/cache/Cache-inl.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,18 @@ Cache<Allocator>::Cache(const CacheConfig& config,
8080

8181
allocatorConfig_.setCacheSize(config_.cacheSizeMB * (MB));
8282

83+
if (!cacheDir.empty()) {
84+
allocatorConfig_.cacheDir = cacheDir;
85+
}
86+
87+
if (config_.usePosixShm) {
88+
allocatorConfig_.usePosixForShm();
89+
}
90+
91+
if (!config_.memoryTierConfigs.empty()) {
92+
allocatorConfig_.configureMemoryTiers(config_.memoryTierConfigs);
93+
}
94+
8395
auto cleanupGuard = folly::makeGuard([&] {
8496
if (!nvmCacheFilePath_.empty()) {
8597
util::removePath(nvmCacheFilePath_);
@@ -222,8 +234,7 @@ Cache<Allocator>::Cache(const CacheConfig& config,
222234
allocatorConfig_.cacheName = "cachebench";
223235

224236
bool isRecovered = false;
225-
if (!cacheDir.empty()) {
226-
allocatorConfig_.cacheDir = cacheDir;
237+
if (!allocatorConfig_.cacheDir.empty()) {
227238
try {
228239
cache_ = std::make_unique<Allocator>(Allocator::SharedMemAttach,
229240
allocatorConfig_);
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// @nolint instantiates a small cache and performs a quick run of basic operations.
2+
{
3+
"cache_config" : {
4+
"cacheSizeMB" : 512,
5+
"usePosixShm" : false,
6+
"cacheDir" : "/tmp/mem-tiers",
7+
"memoryTiers" : [
8+
{
9+
"ratio": 1,
10+
"memBindNodes": "0"
11+
}
12+
],
13+
"poolRebalanceIntervalSec" : 1,
14+
"moveOnSlabRelease" : false,
15+
16+
"numPools" : 2,
17+
"poolSizes" : [0.3, 0.7]
18+
},
19+
"test_config" : {
20+
"numOps" : 100000,
21+
"numThreads" : 32,
22+
"numKeys" : 1000000,
23+
24+
"keySizeRange" : [1, 8, 64],
25+
"keySizeRangeProbability" : [0.3, 0.7],
26+
27+
"valSizeRange" : [1, 32, 10240, 409200],
28+
"valSizeRangeProbability" : [0.1, 0.2, 0.7],
29+
30+
"getRatio" : 0.15,
31+
"setRatio" : 0.8,
32+
"delRatio" : 0.05,
33+
"keyPoolDistribution": [0.4, 0.6],
34+
"opPoolDistribution" : [0.5, 0.5]
35+
}
36+
}

cachelib/cachebench/util/CacheConfig.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
8484

8585
JSONSetVal(configJson, memoryOnlyTTL);
8686

87+
JSONSetVal(configJson, usePosixShm);
88+
if (configJson.count("memoryTiers")) {
89+
for (auto& it : configJson["memoryTiers"]) {
90+
memoryTierConfigs.push_back(MemoryTierConfig(it).getMemoryTierCacheConfig());
91+
}
92+
}
93+
8794
JSONSetVal(configJson, useTraceTimeStamp);
8895
JSONSetVal(configJson, printNvmCounters);
8996
JSONSetVal(configJson, tickerSynchingSeconds);
@@ -95,7 +102,7 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
95102
// if you added new fields to the configuration, update the JSONSetVal
96103
// to make them available for the json configs and increment the size
97104
// below
98-
checkCorrectSize<CacheConfig, 696>();
105+
checkCorrectSize<CacheConfig, 728>();
99106

100107
if (numPools != poolSizes.size()) {
101108
throw std::invalid_argument(folly::sformat(
@@ -124,6 +131,13 @@ std::shared_ptr<RebalanceStrategy> CacheConfig::getRebalanceStrategy() const {
124131
RandomStrategy::Config{static_cast<unsigned int>(rebalanceMinSlabs)});
125132
}
126133
}
134+
135+
MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) {
136+
JSONSetVal(configJson, ratio);
137+
JSONSetVal(configJson, memBindNodes);
138+
139+
checkCorrectSize<MemoryTierConfig, 40>();
140+
}
127141
} // namespace cachebench
128142
} // namespace cachelib
129143
} // namespace facebook

cachelib/cachebench/util/CacheConfig.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,26 @@ class CacheMonitorFactory {
4141
virtual std::unique_ptr<CacheMonitor> create(Lru2QAllocator& cache) = 0;
4242
};
4343

44+
// Parse memory tiers configuration from JSON config
45+
struct MemoryTierConfig : public JSONConfig {
46+
MemoryTierConfig() {}
47+
48+
explicit MemoryTierConfig(const folly::dynamic& configJson);
49+
50+
// Returns MemoryTierCacheConfig parsed from JSON config
51+
MemoryTierCacheConfig getMemoryTierCacheConfig() {
52+
MemoryTierCacheConfig config = MemoryTierCacheConfig::fromShm();
53+
config.setRatio(ratio);
54+
config.setMemBind(NumaBitMask(memBindNodes));
55+
return config;
56+
}
57+
58+
// Specifies ratio of this memory tier to other tiers
59+
size_t ratio{0};
60+
// Allocate memory only from specified NUMA nodes
61+
std::string memBindNodes{""};
62+
};
63+
4464
struct CacheConfig : public JSONConfig {
4565
// By default, the LRU allocator. Can be set to LRU-2Q.
4666
std::string allocator{"LRU"};
@@ -194,6 +214,12 @@ struct CacheConfig : public JSONConfig {
194214
// Not used when its value is 0. In seconds.
195215
uint32_t memoryOnlyTTL{0};
196216

217+
// Use Posix Shm instead of SysVShm
218+
bool usePosixShm{false};
219+
220+
// Memory tiers configs
221+
std::vector<MemoryTierCacheConfig> memoryTierConfigs{};
222+
197223
// If enabled, we will use the timestamps from the trace file in the ticker
198224
// so that the cachebench will observe time based on timestamps from the trace
199225
// instead of the system time.

cachelib/shm/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ add_library (cachelib_shm
2424
add_dependencies(cachelib_shm thrift_generated_files)
2525
target_link_libraries(cachelib_shm PUBLIC
2626
cachelib_common
27+
numa
2728
)
2829

2930
install(TARGETS cachelib_shm

cachelib/shm/PosixShmSegment.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,15 @@
1616

1717
#include "cachelib/shm/PosixShmSegment.h"
1818

19+
#include <cstring>
20+
1921
#include <fcntl.h>
2022
#include <folly/logging/xlog.h>
2123
#include <sys/mman.h>
2224
#include <sys/stat.h>
2325
#include <sys/types.h>
26+
#include <numa.h>
27+
#include <numaif.h>
2428

2529
#include "cachelib/common/Utils.h"
2630

@@ -166,6 +170,29 @@ void munmapImpl(void* addr, size_t length) {
166170
}
167171
}
168172

173+
void getMempolicyImpl(int &oldMode, NumaBitMask &memBindNumaNodes) {
174+
auto nodeMask = memBindNumaNodes.getNativeBitmask();
175+
176+
long ret = get_mempolicy(&oldMode, nodeMask->maskp, nodeMask->size,
177+
nullptr, 0);
178+
179+
if (ret != 0) {
180+
util::throwSystemError(errno, folly::sformat("get_mempolicy() failed: {}",
181+
std::strerror(errno)));
182+
}
183+
}
184+
185+
void setMempolicyImpl(int oldMode, const NumaBitMask &memBindNumaNodes) {
186+
auto nodeMask = memBindNumaNodes.getNativeBitmask();
187+
188+
long ret = set_mempolicy(oldMode, nodeMask->maskp, nodeMask->size);
189+
190+
if (ret != 0) {
191+
util::throwSystemError(errno, folly::sformat("set_mempolicy() failed: {}",
192+
std::strerror(errno)));
193+
}
194+
}
195+
169196
} // namespace detail
170197

171198
PosixShmSegment::PosixShmSegment(ShmAttachT,
@@ -312,13 +339,50 @@ void* PosixShmSegment::mapAddress(void* addr) const {
312339
util::throwSystemError(EINVAL, "Address already mapped");
313340
}
314341
XDCHECK(retAddr == addr || addr == nullptr);
342+
memBind(addr);
315343
return retAddr;
316344
}
317345

318346
void PosixShmSegment::unMap(void* addr) const {
319347
detail::munmapImpl(addr, getSize());
320348
}
321349

350+
static void forcePageAllocation(void* addr, size_t size, size_t pageSize) {
351+
char* startAddr = reinterpret_cast<char*>(addr);
352+
char* endAddr = startAddr + size;
353+
for (volatile char* curAddr = startAddr; curAddr < endAddr; curAddr += pageSize) {
354+
*curAddr = *curAddr;
355+
}
356+
}
357+
358+
void PosixShmSegment::memBind(void* addr) const {
359+
if (opts_.memBindNumaNodes.empty()) {
360+
return;
361+
}
362+
363+
NumaBitMask oldMemBindNumaNodes;
364+
int oldMode = 0;
365+
366+
// mbind() cannot be used because mmap was called with MAP_SHARED flag
367+
// But we can set memory policy for current thread and force page allocation.
368+
// The following logic is used:
369+
// 1. Remember current memory policy for the current thread
370+
// 2. Set new memory policy as specified by config
371+
// 3. Force page allocation by touching every page in the segment
372+
// 4. Restore memory policy
373+
374+
// Remember current memory policy
375+
detail::getMempolicyImpl(oldMode, oldMemBindNumaNodes);
376+
377+
// Set memory bindings
378+
detail::setMempolicyImpl(MPOL_BIND, opts_.memBindNumaNodes);
379+
380+
forcePageAllocation(addr, getSize(), detail::getPageSize(opts_.pageSize));
381+
382+
// Restore memory policy for the thread
383+
detail::setMempolicyImpl(oldMode, oldMemBindNumaNodes);
384+
}
385+
322386
std::string PosixShmSegment::createKeyForName(
323387
const std::string& name) noexcept {
324388
// ensure that the slash is always there in the head. repetitive

0 commit comments

Comments
 (0)